├── .gitignore ├── LICENSE ├── README.md ├── gurobi.sh ├── source ├── c_solver │ ├── compile.sh │ └── gurobi_c.cpp ├── gurobi.env ├── planning │ └── ilp.py ├── rl │ ├── ac.py │ ├── plan_env.py │ └── rl.py ├── simulate │ ├── flow.py │ ├── spof.py │ ├── spofs.py │ └── traffic_matrix.py ├── test.py └── topology │ ├── ip │ ├── link.py │ ├── network.py │ └── router.py │ ├── optical │ ├── optic_fiber.py │ ├── optic_network.py │ ├── optic_node.py │ └── optic_path.py │ ├── topology.py │ └── utils │ └── node.py └── spinningup ├── .gitignore ├── .travis.yml ├── LICENSE ├── readme.md ├── readthedocs.yml ├── setup.py ├── spinup ├── __init__.py ├── algos │ ├── __init__.py │ ├── pytorch │ │ ├── ddpg │ │ │ ├── core.py │ │ │ └── ddpg.py │ │ ├── ppo │ │ │ ├── core.py │ │ │ └── ppo.py │ │ ├── sac │ │ │ ├── core.py │ │ │ └── sac.py │ │ ├── td3 │ │ │ ├── core.py │ │ │ └── td3.py │ │ ├── trpo │ │ │ └── trpo.py │ │ └── vpg │ │ │ ├── core.py │ │ │ └── vpg.py │ └── tf1 │ │ ├── ddpg │ │ ├── __init__.py │ │ ├── core.py │ │ └── ddpg.py │ │ ├── ppo │ │ ├── __init__.py │ │ ├── core.py │ │ └── ppo.py │ │ ├── sac │ │ ├── __init__.py │ │ ├── core.py │ │ └── sac.py │ │ ├── td3 │ │ ├── __init__.py │ │ ├── core.py │ │ └── td3.py │ │ ├── trpo │ │ ├── __init__.py │ │ ├── core.py │ │ └── trpo.py │ │ └── vpg │ │ ├── __init__.py │ │ ├── core.py │ │ └── vpg.py ├── examples │ ├── pytorch │ │ ├── bench_ppo_cartpole.py │ │ └── pg_math │ │ │ ├── 1_simple_pg.py │ │ │ └── 2_rtg_pg.py │ └── tf1 │ │ ├── bench_ppo_cartpole.py │ │ ├── pg_math │ │ ├── 1_simple_pg.py │ │ └── 2_rtg_pg.py │ │ └── train_mnist.py ├── exercises │ ├── common.py │ ├── pytorch │ │ ├── problem_set_1 │ │ │ ├── exercise1_1.py │ │ │ ├── exercise1_2.py │ │ │ ├── exercise1_2_auxiliary.py │ │ │ └── exercise1_3.py │ │ ├── problem_set_1_solutions │ │ │ ├── exercise1_1_soln.py │ │ │ └── exercise1_2_soln.py │ │ └── problem_set_2 │ │ │ └── exercise2_2.py │ └── tf1 │ │ ├── problem_set_1 │ │ ├── exercise1_1.py │ │ ├── exercise1_2.py │ │ └── exercise1_3.py │ │ ├── problem_set_1_solutions │ │ ├── exercise1_1_soln.py │ │ └── exercise1_2_soln.py │ │ └── problem_set_2 │ │ └── exercise2_2.py ├── run.py ├── user_config.py ├── utils │ ├── __init__.py │ ├── logx.py │ ├── mpi_pytorch.py │ ├── mpi_tf.py │ ├── mpi_tools.py │ ├── plot.py │ ├── run_entrypoint.py │ ├── run_utils.py │ ├── serialization_utils.py │ └── test_policy.py └── version.py └── travis_setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Python byte code 2 | *.pyc 3 | 4 | # Vim 5 | *.swp 6 | 7 | # Mac 8 | *.DS_Store 9 | 10 | # Configuration files 11 | .env 12 | .vscode 13 | .VSCodeCounter 14 | 15 | # results 16 | source/results/* 17 | 18 | # Gurobi solver related 19 | *.log 20 | *.so 21 | *.lp 22 | 23 | # data and model 24 | fb_data_anon* 25 | results 26 | 27 | # Others 28 | source/config.py 29 | sync.py 30 | test_bak.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- 204 | 205 | Code in python/ray/rllib/{evolution_strategies, dqn} adapted from 206 | https://github.com/openai (MIT License) 207 | 208 | Copyright (c) 2016 OpenAI (http://openai.com) 209 | 210 | Permission is hereby granted, free of charge, to any person obtaining a copy 211 | of this software and associated documentation files (the "Software"), to deal 212 | in the Software without restriction, including without limitation the rights 213 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 214 | copies of the Software, and to permit persons to whom the Software is 215 | furnished to do so, subject to the following conditions: 216 | 217 | The above copyright notice and this permission notice shall be included in 218 | all copies or substantial portions of the Software. 219 | 220 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 221 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 222 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 223 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 224 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 225 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 226 | THE SOFTWARE. 227 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 0. Introduction 2 | This repository contains the source code for our SIGCOMM'21 paper "Network Planning with Deep Reinforcement Learning". 3 | ### Notes 4 | The network topologies and the trained models used in the paper are not open-sourced. One can create synthetic topologies according to the problem formulation in the paper or modify the code for their own use case. 5 |
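For readers who want to experiment without the original inputs, the Python sketch below shows (with entirely made-up names, lengths and bandwidths) how the building blocks under `source/topology` and `source/simulate` fit together. It only illustrates the class APIs visible in this repo, not the authors' data format; wiring these objects into a full `Topology` instance still goes through the loaders in `topology/topology.py`. Run it from the `source/` directory with the dependencies from Step 4 installed.

```
# Hypothetical example: every name and number below is invented.
from topology.optical.optic_network import OpticNetwork
from topology.ip.network import Network
from simulate.traffic_matrix import TrafficMatrix
from simulate.spofs import Spofs

# optical layer: two nodes connected by one owned (non-leased) fiber
optic = OpticNetwork()
for n in ["O1", "O2"]:
    optic.register_node(n)
optic.register_fiber("O1-O2", optic.get_node_by_name("O1"), optic.get_node_by_name("O2"),
                     length=100, lease_flag=False, max_fp=4, lighted_fp=1, spectrum=4000)

# IP layer: two routers on top of the optical nodes, one IP link riding on the fiber
ip = Network()
ip.register_router("R1", optic.get_node_by_name("O1"), stub=False)
ip.register_router("R2", optic.get_node_by_name("O2"), stub=True)
ip.register_link("R1-R2", frozenset(["O1-O2"]), ip.get_router_by_name("R1"),
                 ip.get_router_by_name("R2"), idx=0, initial_bw=100, max_bw=4000,
                 fiber_map_spectrum={"O1-O2": 1}, cost=1)

# traffic matrix and failure scenarios
tm = TrafficMatrix()
tm.register_flow("f1", ip.get_router_by_name("R1"), ip.get_router_by_name("R2"), 50, "GOLD-ICP")
spofs = Spofs()
spofs.register_spof("spof_1", ["O1-O2"], ["GOLD-ICP", "SILVER"])
```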
6 | ## 1. Environment config 7 | ### AWS instance configurations 8 | - AMI image: "Deep Learning AMI (Ubuntu 16.04) Version 43.0 - ami-0774e48892bd5f116" 9 | - for First-stage: g4dn.4xlarge; `Threads 16` in `gurobi.env` 10 | - for others (ILP, ILP-heur, Second-stage): m5zn.12xlarge; `Threads 8` in `gurobi.env` 11 | 12 | ### Step 0: download the git repo 13 | ### Step 1: install Linux dependencies 14 | ``` 15 | sudo apt-get update 16 | sudo apt-get install build-essential libopenmpi-dev libboost-all-dev 17 | ``` 18 | ### Step 2: install Gurobi 19 | ``` 20 | cd <path_to_this_repo> 21 | ./gurobi.sh 22 | source ~/.bashrc 23 | ``` 24 | - Obtain and install a Gurobi license: https://www.gurobi.com/downloads/free-academic-license/ 25 | - Make sure your Gurobi solver works: `gurobi_cl /opt/gurobi902/linux64/examples/data/coins.lp` 26 | ### Step 3: set up and start a conda environment with Python 3.7.7 27 | If you use the AWS Deep Learning AMI, conda is preinstalled. 28 | ``` 29 | conda create --name <env_name> python=3.7.7 30 | conda activate <env_name> 31 | ``` 32 | ### Step 4: install python dependencies in the conda env 33 | ``` 34 | cd <path_to_this_repo>/spinningup 35 | pip install -e . 36 | pip install networkx pulp pybind11 xlrd==1.2.0 37 | ``` 38 | ### Step 5: compile C++ program with pybind11 39 | ``` 40 | cd <path_to_this_repo>/source/c_solver 41 | ./compile.sh 42 | ``` 43 | ## 2. Content 44 | - source 45 | - c_solver: C++ implementation with Gurobi APIs for ILP solver and network plan evaluator 46 | - planning: `ILP` and `ILP-heur` implementation 47 | - results: store the provided trained models and solutions, and the training log 48 | - rl: the implementations of Actor-Critic, RL environment and RL solver 49 | - simulate: python classes of flow, spof, and traffic matrix 50 | - topology: python classes of network topology (both optical layer and IP layer) 51 | - `test.py`: the main script used to reproduce results 52 | - spinningup 53 | - adapted from [OpenAI Spinning Up](https://github.com/openai/spinningup) 54 | - `gurobi.sh` 55 | - used to install the Gurobi solver 56 | ## 3. Reproduce results (for SIGCOMM'21 artifact evaluation) 57 | ### Notes 58 | - Some data points are time-consuming to get (i.e., First-stage for A-0, A-0.25, A-0.5, A-0.75 in Figure 8 and B, C, D, E in Figure 9). We provide pretrained models in `/source/results/trained/<topo_name>/`, which will be loaded by default. 59 | - We recommend distributing different data points and different experiments on multiple AWS instances to run simultaneously. 60 | - The default `epoch_num` for Figures 10, 11 and 12 is set to 1024 to guarantee convergence. The training process can be terminated manually if convergence is observed. 61 | ### How to reproduce 62 | - `cd <path_to_this_repo>/source` 63 | - Figure 7: `python test.py fig_7 <epoch_num>`; `epoch_num` can be set smaller than 10 (e.g. 2) to get results faster. 64 | - Figure 8: `python test.py single_dp_fig8 <alg> <adjust_factor>` produces one data point at a time, where `<alg>` is `ILP` or `NeuroPlan` (the default adjust_factor is 1). 65 | - For example, `python test.py single_dp_fig8 ILP 0.0` runs the ILP algorithm for `A-0`. 66 | - Pretrained models will be loaded by default if provided in `source/results/trained/`. To train from scratch, which is **NOT RECOMMENDED**, run `python test.py single_dp_fig8 <alg> <adjust_factor> False` 67 | - Figure 9&13: `python test.py single_dp_fig9 <topo_name> <alg>` produces one data point at a time, where `<alg>` is `ILP`, `ILP-heur` or `NeuroPlan`. 68 | - For example, `python test.py single_dp_fig9 E NeuroPlan` runs NeuroPlan (First-stage) for topology E with the pretrained model. To train from scratch, which is **NOT RECOMMENDED**, run `python test.py single_dp_fig9 E NeuroPlan False`. 69 | - `python test.py second_stage <topo_name> <sol_path> <rf>` loads the solution from the first stage in `<sol_path>` and runs the second stage with `relax_factor=<rf>` on topo `<topo_name>`. For example, `python test.py second_stage D "results/<log_dir>/opt_topo/***.txt" 1.5` (the solution file is a JSON dict; see the sketch below). 70 | - we also provide our results of First-stage in `results/trained/<topo_name>/<topo_name>.txt`, which can be used to run second-stage directly. For example, `python test.py second_stage C "results/trained/C/C.txt" 1.5` 71 | - Figure 10: `python test.py fig_10 <adjust_factor> <num_gnn_layer>`. 72 | - `adjust_factor={0.0, 0.5, 1.0}, num_gnn_layer={0, 2, 4}` 73 | - For example, `python test.py fig_10 0.5 2` runs NeuroPlan with `2`-layer GNNs for topology `A-0.5` 74 | - Figure 11: `python test.py fig_11 <adjust_factor> <mlp_hidden_size>`. 75 | - `adjust_factor={0.0, 0.5, 1.0}, mlp_hidden_size={64, 256, 512}` 76 | - For example, `python test.py fig_11 0.0 512` runs NeuroPlan with hidden_size=`512` for topology `A-0` 77 | - Figure 12: `python test.py fig_12 <adjust_factor> <max_unit_per_step>`. 78 | - `adjust_factor={0.0, 0.5, 1.0}, max_unit_per_step={1, 4, 16}` 79 | - For example, `python test.py fig_12 1.0 4` runs NeuroPlan with max_unit_per_step=`4` for topology `A-1` 80 |
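The first-stage solution file consumed by `second_stage` is the JSON dictionary that `rl/plan_env.py` writes under `results/<log_dir>/opt_topo/`: keys are IP link indices and values are the number of `delta_bw` capacity steps added on that link. A minimal sketch of loading and inspecting one, using the provided `results/trained/C/C.txt` as the example path:

```
import json

sol_path = "results/trained/C/C.txt"  # any first-stage solution file works here
with open(sol_path) as f:
    # keys are stored as strings in JSON; test.py converts them back to int link indices
    subopt_sol = {int(k): v for k, v in json.load(f).items()}

# links with the most added capacity steps first
print(sorted(subopt_sol.items(), key=lambda kv: kv[1], reverse=True))
```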
81 | ## 4. Contact 82 | For any question, please contact `hzhu at jhu dot edu`. 83 | -------------------------------------------------------------------------------- /gurobi.sh: -------------------------------------------------------------------------------- 1 | wget https://packages.gurobi.com/9.0/gurobi9.0.2_linux64.tar.gz 2 | sudo mv gurobi9.0.2_linux64.tar.gz /opt 3 | cd /opt;sudo tar xvfz gurobi9.0.2_linux64.tar.gz 4 | cd /opt/gurobi902/linux64/src/build/ 5 | sudo make 6 | sudo cp libgurobi_c++.a ../../lib/ 7 | 8 | # set env var 9 | cat <<EOT >> ~/.bashrc 10 | export GUROBI_HOME="/opt/gurobi902/linux64" 11 | export PATH="\${PATH}:\${GUROBI_HOME}/bin" 12 | export LD_LIBRARY_PATH="\${LD_LIBRARY_PATH}:\${GUROBI_HOME}/lib" 13 | EOT -------------------------------------------------------------------------------- /source/c_solver/compile.sh: -------------------------------------------------------------------------------- 1 | c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` gurobi_c.cpp -o ../gurobi_c`python3-config --extension-suffix` -I/opt/gurobi902/linux64/include -L/opt/gurobi902/linux64/lib -lgurobi_c++ -lgurobi90 2 | -------------------------------------------------------------------------------- /source/gurobi.env: -------------------------------------------------------------------------------- 1 | LogToConsole 0 2 | Threads 16 3 | 4 | -------------------------------------------------------------------------------- /source/planning/ilp.py: -------------------------------------------------------------------------------- 1 | #from planning.algorithm import PlanAlg 2 | from simulate.traffic_matrix import TrafficMatrix 3 | from topology.ip.router import Router 4 | from topology.ip.link import Link 5 | from topology.optical.optic_node import OpticNode 6 | from topology.optical.optic_fiber import OpticFiber 7 | from topology.topology import Topology 8 | 9 | import matplotlib.pyplot as plt 10 | import pdb, time, sys 11 | 12 | # pybind11, c++ impl 13 | import gurobi_c 14 | 15 | class ILP(object): 16 | def __init__(self, topo): 17 | self.topo = topo 18 | self.cost_opt = None 19 | 20 | def run_ilp(self, subopt_sol=None, delta_bw=100, relax_factor=1, mipgapabs=5e-2): 21 | ilp_solve_limit = -1 22 | 23 | non_direct_graph, init_cost = self.topo.ip.generate_non_direction_graph(1, subopt_sol, relax_factor) 24 | fiber_info = {} 25 | for fiber_name, fiber_inst in self.topo.optic.fibers.items(): 26 | if fiber_inst.lease_flag: 27 | max_spectrum = 0 28 | max_capa = fiber_inst.max_bw 29 | else: 30 | max_spectrum = fiber_inst.max_fp*fiber_inst.spectrum 31 | max_capa = 0 32 | 33 | fiber_info[fiber_name] = (fiber_inst.lease_flag, max_capa, max_spectrum) 34 | 35 | failed_links_for_spof_list = self.topo.failed_links_for_spof_list[:-1] 36 | print("start ilp_solve_c...", flush=True) 37 | start_time = time.time() 38 | (cost_opt, delta_capa_sum, opt_sol) = gurobi_c.ilp_solve_c(non_direct_graph, failed_links_for_spof_list, \ 39 | self.topo.tm.data['all'], self.topo.tm.data['no-bronze'], fiber_info, self.topo.l3node_map_stub, self.topo.load_factor, \ 40 | delta_bw, ilp_solve_limit, mipgapabs) 41 | print("ilp_solve result, running time: {} \nfinal_cost:{}, init_cost:{}, delta_cost:{}, delta_capa:{}".format( 42 | int(time.time()-start_time), cost_opt+init_cost, init_cost, cost_opt, delta_capa_sum), flush=True) 43 | 44 | print("opt_cost:{}".format(cost_opt), flush=True) 45 | self.cost_opt = cost_opt 46 | print(dict(sorted(opt_sol.items(), key=lambda item: item[1], reverse=True)), flush=True) 47 | 48 | def
run_ilp_heuristic(self, subopt_sol=None, delta_bw=1600, relax_factor=1, spof_group_size=10): 49 | ilp_solve_limit = -1 50 | mipgapabs = 0.05 51 | fiber_info = {} 52 | for fiber_name, fiber_inst in self.topo.optic.fibers.items(): 53 | if fiber_inst.lease_flag: 54 | max_spectrum = 0 55 | max_capa = fiber_inst.max_bw 56 | else: 57 | max_spectrum = fiber_inst.max_fp*fiber_inst.spectrum 58 | max_capa = 0 59 | 60 | fiber_info[fiber_name] = (fiber_inst.lease_flag, max_capa, max_spectrum) 61 | 62 | failed_links_for_spof_list = self.topo.failed_links_for_spof_list[:-1] 63 | 64 | spof_group = failed_links_for_spof_list[:spof_group_size] 65 | group_idx = 0 66 | total_cost = 0 67 | total_sol = {} 68 | total_start_time = time.time() 69 | while len(spof_group) > 0: 70 | non_direct_graph, init_cost = self.topo.ip.generate_non_direction_graph(1, subopt_sol, relax_factor) 71 | start_time = time.time() 72 | (cost_opt, delta_capa_sum, opt_sol) = gurobi_c.ilp_solve_c(non_direct_graph, spof_group, \ 73 | self.topo.tm.data['all'], self.topo.tm.data['no-bronze'], fiber_info, self.topo.l3node_map_stub, \ 74 | self.topo.load_factor, delta_bw, ilp_solve_limit, mipgapabs, 0) 75 | print("spof_group_idx:{}, opt_slo:{}, running time:{}".format(group_idx, opt_sol, time.time()-start_time)) 76 | 77 | for link_idx, step_size in opt_sol.items(): 78 | self.topo.ip.links[self.topo.ip.idx_map_link_name[link_idx]].incr_bw(step_size*delta_bw) 79 | try: 80 | total_sol[link_idx] += step_size 81 | except: 82 | total_sol[link_idx] = step_size 83 | 84 | total_cost += cost_opt 85 | group_idx += 1 86 | spof_group = failed_links_for_spof_list[spof_group_size*group_idx:spof_group_size*(group_idx+1)] 87 | 88 | print("heuristic total time:{}".format(time.time()-total_start_time)) 89 | print("opt_cost:{}".format(total_cost), flush=True) 90 | self.cost_opt = total_cost 91 | print(dict(sorted(total_sol.items(), key=lambda item: item[1], reverse=True)), flush=True) 92 | 93 | -------------------------------------------------------------------------------- /source/rl/ac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal, math 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.distributions.normal import Normal 9 | from torch.distributions.categorical import Categorical 10 | from torch.nn.parameter import Parameter 11 | 12 | import pdb, functools 13 | 14 | 15 | def mlp(sizes, activation, dropout_flag=False, dropout=0.5, output_activation=nn.Identity): 16 | layers = [] 17 | for j in range(len(sizes)-1): 18 | act = activation if j < len(sizes)-2 else output_activation 19 | if dropout_flag: 20 | layers += [nn.Linear(sizes[j], sizes[j+1]), act(), nn.Dropout(dropout)] 21 | else: 22 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 23 | return nn.Sequential(*layers) 24 | 25 | 26 | class SimpleGCN(nn.Module): 27 | """ 28 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 29 | """ 30 | def __init__(self, in_features, out_features): 31 | super(SimpleGCN, self).__init__() 32 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 33 | self.reset_parameters() 34 | 35 | def reset_parameters(self): 36 | stdv = 1. 
/ math.sqrt(self.weight.size(1)) 37 | self.weight.data.uniform_(-stdv, stdv) 38 | 39 | # adj_adjust is D^(-0.5)*(adj+I)*D^(0.5) 40 | def forward(self, h_0, adj_adjust): 41 | support = torch.matmul(h_0, self.weight) 42 | output = torch.matmul(adj_adjust, support) 43 | return output 44 | 45 | """ 46 | batch: return (batch_size, one-hot vector encoding for the graph) 47 | one sample: return one-hot vector encoding for the graph 48 | """ 49 | class GCN(nn.Module): 50 | def __init__(self, feature_num, ip_node_num, n_hidden, num_layer): 51 | super(GCN, self).__init__() 52 | self.ip_node_num = ip_node_num 53 | self.feature_num = feature_num 54 | 55 | self.gcn_list = [] 56 | for i in range(num_layer): 57 | if i == 0: 58 | self.gcn_list.append(SimpleGCN(feature_num, n_hidden)) 59 | elif i == num_layer-1: 60 | self.gcn_list.append(SimpleGCN(n_hidden, feature_num)) 61 | else: 62 | self.gcn_list.append(SimpleGCN(n_hidden, n_hidden)) 63 | print("num of gcn layer:{}".format(len(self.gcn_list))) 64 | self.gcn_list = nn.ModuleList(self.gcn_list) 65 | 66 | # node_num: n 67 | # state_node: batch_size*n*feature_num 68 | # state_adj: batch_size*n*n 69 | # obs: batch_size*n*(feature_num+n) 70 | def forward(self, obs): 71 | # reconstruct state_node and state_adj from flatten_obs 72 | if (len(obs.size())==3): 73 | # batch 74 | adj_adjust, h_0 = torch.split(obs,[self.ip_node_num, self.feature_num],dim=2) 75 | else: 76 | adj_adjust, h_0 = torch.split(obs,[self.ip_node_num, self.feature_num],dim=1) 77 | 78 | for gcn in self.gcn_list: 79 | h_0 = F.relu(gcn(h_0, adj_adjust)) 80 | 81 | if (len(h_0.size())==3): 82 | # batch 83 | bn_emb = torch.flatten(h_0,1) 84 | else: 85 | bn_emb = torch.flatten(h_0) 86 | return bn_emb 87 | 88 | class Actor(nn.Module): 89 | 90 | def _distribution(self, obs): 91 | raise NotImplementedError 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | raise NotImplementedError 95 | 96 | def forward(self, obs, act=None): 97 | # Produce action distributions for given observations, and 98 | # optionally compute the log likelihood of given actions under 99 | # those distributions. 100 | pi = self._distribution(obs) 101 | logp_a = None 102 | if act is not None: 103 | logp_a = self._log_prob_from_distribution(pi, act) 104 | return pi, logp_a 105 | 106 | 107 | class GCNCategoricalActor(Actor): 108 | 109 | def __init__(self, feature_num, ip_node_num, gcn, hidden_sizes, act_num, activation): 110 | super().__init__() 111 | self.GCN = gcn 112 | self.logits_net = mlp([feature_num*ip_node_num] + list(hidden_sizes) + [act_num], activation) 113 | 114 | # logits is the log probability, log_p = ln(p) 115 | def _distribution(self, obs): 116 | obs_emb = self.GCN(obs) 117 | logits = self.logits_net(obs_emb) 118 | return Categorical(logits=logits) 119 | 120 | def _get_logits(self, obs): 121 | obs_emb = self.GCN(obs) 122 | logits = self.logits_net(obs_emb) 123 | return logits 124 | 125 | def _log_prob_from_distribution(self, pi, act): 126 | return pi.log_prob(act) 127 | 128 | class GCNCritic(nn.Module): 129 | 130 | def __init__(self, feature_num, ip_node_num, gcn, hidden_sizes, activation): 131 | super().__init__() 132 | self.GCN = gcn 133 | self.v_net = mlp([feature_num*ip_node_num] + list(hidden_sizes) + [1], activation) 134 | 135 | def forward(self, obs): 136 | return torch.squeeze(self.v_net(self.GCN(obs)), -1) # Critical to ensure v has right shape. 
137 | 138 | 139 | 140 | class GCNActorCritic(nn.Module): 141 | def __init__(self, observation_space, action_space, graph_encoder_hidden=256, num_gnn_layer=2, 142 | hidden_sizes=(64,64), activation=nn.ReLU): 143 | super().__init__() 144 | 145 | ip_node_num = observation_space.shape[0] 146 | feature_num = observation_space.shape[1] - ip_node_num 147 | 148 | act_num = action_space.n 149 | self.GCN = GCN(feature_num, ip_node_num, graph_encoder_hidden, num_gnn_layer) 150 | self.pi = GCNCategoricalActor(feature_num, ip_node_num, self.GCN, hidden_sizes, act_num, activation) 151 | 152 | # build value function 153 | self.v = GCNCritic(feature_num, ip_node_num, self.GCN, hidden_sizes, activation) 154 | params_num = sum(functools.reduce( lambda a, b: a*b, x.size()) for x in self.parameters()) 155 | print("# of trainable params:{}".format(params_num)) 156 | 157 | def step(self, obs, mask): 158 | with torch.no_grad(): 159 | pi = self.pi._distribution(obs) 160 | 161 | pi_logits = self.pi._get_logits(obs) 162 | pi_logits_delta = torch.zeros(mask.size()).to(mask.device) 163 | pi_logits_delta[mask == 0] = float("-Inf") 164 | pi_logits += pi_logits_delta 165 | pi_mask = Categorical(logits=pi_logits) 166 | 167 | a = pi_mask.sample() 168 | logp_a = self.pi._log_prob_from_distribution(pi, a) 169 | 170 | v = self.v(obs) 171 | return a.cpu().numpy(), v.cpu().numpy(), logp_a.cpu().numpy() -------------------------------------------------------------------------------- /source/rl/plan_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from topology.topology import Topology 3 | from copy import deepcopy 4 | import numpy as np 5 | import pdb, os, time, json 6 | import networkx as nx 7 | import matplotlib.pyplot as plt 8 | import sys, math 9 | 10 | class PlanEnv(gym.Env): 11 | def __init__(self, topo: Topology, log_dir, graph_encoder, max_n_delta_bw, max_action=512,\ 12 | steps_per_epoch=2048, delta_bw=100, checker_mode="all"): 13 | self.max_action = max_action 14 | self.steps_per_epoch = steps_per_epoch 15 | self.checker_mode = checker_mode 16 | self.delta_bw = delta_bw 17 | self.max_rwd = None 18 | self.opt_target = None 19 | self.action_cnt = 0 20 | 21 | self.w1 = 2 22 | self.w2 = 2 23 | self.norm_param=1e-7 24 | self.graph_encoder = graph_encoder 25 | self.max_n_delta_bw = max_n_delta_bw 26 | self.max_ip_node = len(topo.ip.routers) 27 | 28 | # related to utils 29 | self.topo = topo 30 | self.topo_preprocess() 31 | self.original_topo = deepcopy(topo) 32 | 33 | obs, _ = self.get_observation() 34 | 35 | self.observation_space = gym.Space(shape=list(obs.shape)) 36 | print("obv_space size: {}".format(self.observation_space.shape)) 37 | 38 | self.action_space = gym.spaces.Discrete(len(self.topo.ip.links)*max_n_delta_bw) 39 | print("act_space size: {}".format(self.action_space.n)) 40 | 41 | self.cum_rwd = 0 42 | self.complete_cnt = 0 43 | 44 | self.max_rwd = None 45 | self.opt_target = None 46 | self.cost = 0 47 | self.optm_topo = self.topo 48 | self.optm_ob = None 49 | self.opt_action_list = [] 50 | self.optm_epoch_idx = 0 51 | self.action_list = [] 52 | self.epoch_idx = 0 53 | 54 | self.start_sec_ts = int(time.time()) 55 | 56 | action_path = "results/{}/actions.txt".format(log_dir) 57 | self.topo_path = "results/{}/opt_topo".format(log_dir) 58 | if not os.path.exists(self.topo_path): 59 | os.makedirs(self.topo_path) 60 | 61 | self.action_fpr = open(action_path,"w") 62 | 63 | self.action_cnt_cum = 0 # to record the epoch num 64 | self.traj_set = set() # traj set 
stores the vars (the set of l3 link candidates) that decide the ilp 65 | self.main_epoch_traj_stats_list= [] 66 | traj_path = "results/{}/traj.txt".format(log_dir) 67 | self.traj_fpr = open(traj_path,"w") 68 | 69 | self.main_epoch_traj_num = 0 70 | self.main_epoch_traj_num_visited = 0 71 | self.main_epoch_cache_hit_num = 0 72 | 73 | # cache for accelerating check_sf 74 | # each state is represented as a frozenset((l3_link_idx1, delta_bw), (l3_link_idx2, delta_bw),...) 75 | self.state_map_fp_cache = {} 76 | self.cache_max_entry = 1e6 77 | self.cache_path = "results/{}/cache".format(log_dir) 78 | if not os.path.exists(self.cache_path): 79 | os.makedirs(self.cache_path) 80 | 81 | def topo_preprocess(self): 82 | self.topo.get_edge2node_adj() 83 | 84 | def step(self, action): 85 | obs, reward, done, info = None, None, False, None 86 | 87 | violate_max_cstr_flag, visited_flag, cache_hit_flag = False, False, False 88 | adapt_tm = -1 89 | 90 | action_int, act_type = int(action), 0 91 | 92 | l3_link_idx = int(action_int/self.max_n_delta_bw) 93 | delta_bw_act = self.delta_bw*(int(action_int%self.max_n_delta_bw)+1) 94 | 95 | if act_type == 0: 96 | # add IP link capacity 97 | cost = self.topo.aug_l3_link_by_idx_nocheck_max(l3_link_idx, delta_bw=delta_bw_act) 98 | assert(cost >= 0) 99 | else: 100 | # remove IP link capacity 101 | cost = self.topo.aug_l3_link_by_idx_nocheck_max(l3_link_idx, delta_bw=-delta_bw_act) 102 | 103 | self.action_list.append((action_int, self.topo.spof_failed_point)) 104 | self.action_cnt += 1 105 | self.action_cnt_cum += 1 106 | obs, mask = self.get_observation() 107 | 108 | if cost >= 0: 109 | # check the spof constraints further 110 | sat_flag, cache_hit_flag, self.state_map_fp_cache = self.topo.check_spof(l3_link_idx, delta_bw_act, self.state_map_fp_cache, self.cache_max_entry, self.checker_mode) 111 | reward = -round(cost*self.norm_param, 10) 112 | self.cost += cost 113 | else: 114 | # cost < 0 means violating the max_cstrs 115 | sat_flag = False 116 | violate_max_cstr_flag = True 117 | reward = -400 118 | 119 | 120 | if sum(mask)==0: 121 | # no feasible action 122 | violate_max_cstr_flag = True 123 | 124 | if cache_hit_flag: 125 | self.main_epoch_cache_hit_num += 1 126 | 127 | if sat_flag or self.action_cnt >= self.max_action or violate_max_cstr_flag: 128 | done = 1 129 | else: 130 | done = 0 131 | 132 | if done or (self.action_cnt_cum%self.steps_per_epoch == 0): 133 | visited_flag = self.is_visited_sol() 134 | 135 | if done == 1: 136 | if sat_flag: 137 | reward += 0 138 | else: 139 | reward -= 1 140 | 141 | self.cum_rwd += reward 142 | 143 | action_idx_map_extra_rwd = None 144 | self.opt_sol_on_rl = -1 145 | if sat_flag: 146 | self.save_if_best() 147 | 148 | if done or (self.action_cnt_cum%self.steps_per_epoch == 0): 149 | # save trajectory and plan results 150 | self.save_trajectory(visited_flag, adapt_tm, self.cost, self.cum_rwd, violate_max_cstr_flag, sat_flag) 151 | info = {"log_ptr": self.traj_fpr, "extra_rwd": action_idx_map_extra_rwd} 152 | return obs, mask, reward, done, info 153 | 154 | def reset(self): 155 | self.action_cnt = 0 156 | self.cum_rwd = 0 157 | self.cost = 0 158 | 159 | self.topo.reset() 160 | self.epoch_idx += 1 161 | self.action_list = [] 162 | sys.stdout.flush() 163 | 164 | return self.get_observation() 165 | 166 | def get_observation(self): 167 | """ 168 | ob['ip_adj']:n*n --- E 169 | ob['ip_node']:n*d_n ---- F (longitute, latitude, in_traffic, out_traffic) 170 | """ 171 | E_origin = self.topo.edge2node_adj 172 | E_hat = E_origin + 
np.eye(E_origin.shape[0]) 173 | 174 | D = np.diag(np.sum(E_hat, axis=1)) 175 | 176 | # https://towardsdatascience.com/how-to-do-deep-learning-on-graphs-with-graph-convolutional-networks-62acf5b143d0 177 | D_spectral = np.sqrt(np.linalg.inv(D)) 178 | E = np.matmul(np.matmul(D_spectral, E_hat),D_spectral) 179 | 180 | F = self.topo.get_edge_feature() 181 | ob = np.concatenate((E,F), axis=1) 182 | 183 | mask = np.asarray(self.topo.get_feasible_action(self.max_n_delta_bw,self.delta_bw)) 184 | return ob, mask 185 | 186 | def terminate(self): 187 | self.action_fpr.write("epoch_cnt:{}, ip_node_num:{}\n".format(self.epoch_idx, self.max_ip_node)) 188 | self.action_fpr.write("total_time(sec):{}\n".format(int(time.time())-self.start_sec_ts)) 189 | self.action_fpr.close() 190 | 191 | def ilp_opt_on_rl(self, action_list): 192 | l3_link_idx_map_cnt = {} 193 | for (l3_link_idx,_) in action_list: 194 | try: 195 | l3_link_idx_map_cnt[l3_link_idx] += 1 196 | except: 197 | l3_link_idx_map_cnt[l3_link_idx] = 1 198 | 199 | return self.original_topo.ilp_solve(self.delta_bw, l3_link_idx_map_cnt, relax_factor=1) 200 | 201 | def save_if_best(self): 202 | self.complete_cnt += 1 203 | 204 | if self.opt_target == None or self.opt_target >= self.cost: 205 | self.max_rwd = self.cum_rwd 206 | self.opt_target = self.cost 207 | self.optm_topo = self.topo 208 | self.optm_ob = self.get_observation() 209 | self.optm_epoch_idx = self.epoch_idx 210 | self.opt_action_list = self.action_list[:] 211 | self.ip_idx_map_num_step = {} 212 | for i in self.opt_action_list: 213 | link_idx = int(i[0]/self.max_n_delta_bw) 214 | delta_bw_act = int(i[0]%self.max_n_delta_bw)+1 215 | try: 216 | self.ip_idx_map_num_step[link_idx] += delta_bw_act 217 | except: 218 | self.ip_idx_map_num_step[link_idx] = delta_bw_act 219 | 220 | main_epoch_idx = int((self.action_cnt_cum-1)/self.steps_per_epoch) 221 | self.action_fpr.write("local_opt:{} {} {} ilp_opt_ob_rl:{} {} {}\nip_idx_list:{}\n".format(self.epoch_idx, self.max_rwd, self.opt_target, self.opt_sol_on_rl, \ 222 | len(self.opt_action_list), self.opt_action_list, self.ip_idx_map_num_step)) 223 | self.action_fpr.flush() 224 | 225 | opt_topo_path = '{}/{}_main_epoch{}_cost{}.txt'.format(self.topo_path, int(time.time())-self.start_sec_ts, main_epoch_idx, self.cost) 226 | opt_topo_fpr = open(opt_topo_path,"w") 227 | opt_topo_fpr.write(json.dumps(self.ip_idx_map_num_step)) 228 | opt_topo_fpr.close() 229 | 230 | def is_visited_sol(self): 231 | link_cand_list = [int(action) for (action, cost) in self.action_list] 232 | link_cand_tuple = tuple(sorted(link_cand_list)) 233 | visited_flag = (tuple(link_cand_tuple) in self.traj_set) 234 | if visited_flag: 235 | self.main_epoch_traj_num_visited += 1 236 | self.main_epoch_traj_num += 1 237 | self.traj_set.add(link_cand_tuple) 238 | 239 | return visited_flag 240 | 241 | def save_trajectory(self, visited_flag, adapt_tm, cost, reward, violate_max_cstr_flag, sat_flag): 242 | main_epoch_idx = int((self.action_cnt_cum-1)/self.steps_per_epoch) 243 | 244 | self.traj_fpr.write("main epoch idx:{}, visited_flag:{}, adapt_tm:{}, cost:{}, rwd:{}, violate_max_cstr_flag:{}, sat_flag:{}\n action_list:{} {}\n ".\ 245 | format(main_epoch_idx, visited_flag, adapt_tm, cost, reward, violate_max_cstr_flag, sat_flag, len(self.action_list), self.action_list)) 246 | if self.action_cnt_cum%self.max_action == 0: 247 | # current epoch terminate 248 | self.traj_fpr.write("main epoch idx:{}, # of traj:{}, # of visited traj:{}, visited_ratio:{}, cache_hit_num:{}, cache_num:{}\n". 
249 | format(main_epoch_idx, self.main_epoch_traj_num, self.main_epoch_traj_num_visited, \ 250 | round(self.main_epoch_traj_num_visited/self.main_epoch_traj_num, 4), self.main_epoch_cache_hit_num, len(self.state_map_fp_cache))) 251 | self.main_epoch_traj_num_visited = 0 252 | self.main_epoch_traj_num = 0 253 | self.main_epoch_cache_hit_num = 0 254 | 255 | self.traj_fpr.flush() 256 | -------------------------------------------------------------------------------- /source/rl/rl.py: -------------------------------------------------------------------------------- 1 | import pdb, time, sys,torch 2 | 3 | from rl.plan_env import PlanEnv 4 | from rl.ac import GCNActorCritic 5 | sys.path.insert(0 ,"../spinningup/") 6 | from spinup import vpg_pytorch 7 | 8 | class RL(object): 9 | def __init__(self, topo, graph_encoder="GCN", num_gnn_layer=2, \ 10 | max_n_delta_bw=1, hidden_sizes=(256, 256), \ 11 | epoch_num=1024, max_action=512,steps_per_epoch=1024,\ 12 | delta_bw=100, checker_mode="all", model_path=None): 13 | 14 | self.topo = topo 15 | 16 | self.graph_encoder = graph_encoder 17 | self.num_gnn_layer = num_gnn_layer 18 | self.hidden_sizes = hidden_sizes 19 | 20 | self.epoch_num = epoch_num 21 | self.max_action = max_action 22 | self.steps_per_epoch = steps_per_epoch 23 | self.delta_bw = delta_bw 24 | self.max_n_delta_bw = max_n_delta_bw 25 | 26 | self.checker_mode = checker_mode 27 | self.model_path = model_path 28 | 29 | log_dir_name_list = [int(time.time()), len(self.topo.ip.links), self.graph_encoder, \ 30 | self.max_n_delta_bw, self.steps_per_epoch, self.delta_bw] 31 | self.log_dir = '_'.join([str(i) for i in log_dir_name_list]) 32 | 33 | def get_env(self): 34 | self.env = PlanEnv(self.topo, log_dir=self.log_dir, graph_encoder=self.graph_encoder, \ 35 | max_n_delta_bw=self.max_n_delta_bw, max_action=self.max_action, steps_per_epoch=self.steps_per_epoch, delta_bw=self.delta_bw, checker_mode=self.checker_mode) 36 | return self.env 37 | 38 | def run_training(self): 39 | logger_kwargs = dict(output_dir="results/{}".format(self.log_dir), exp_name="test") 40 | ac_kwargs = dict(graph_encoder_hidden=256,hidden_sizes=self.hidden_sizes, num_gnn_layer=self.num_gnn_layer) 41 | 42 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 43 | ac = GCNActorCritic 44 | 45 | vpg_pytorch(self.get_env, enable_mpi=False, non_blocking=False, gamma=1,actor_critic=ac,\ 46 | max_ep_len=self.max_action, seed=8, device=device, \ 47 | model_path=self.model_path, \ 48 | ac_kwargs=ac_kwargs,epochs=self.epoch_num,steps_per_epoch=self.steps_per_epoch,logger_kwargs=logger_kwargs) 49 | 50 | self.env.terminate() -------------------------------------------------------------------------------- /source/simulate/flow.py: -------------------------------------------------------------------------------- 1 | from topology.ip.router import Router 2 | 3 | class Flow: 4 | # cos represents the priority (GOLD-ICP, SILVER, BRONZE) 5 | def __init__(self, name, src:Router, dst:Router, capacity, cos): 6 | self.name = name 7 | self.src = src 8 | self.dst = dst 9 | self.capacity = capacity 10 | self.cos = cos 11 | -------------------------------------------------------------------------------- /source/simulate/spof.py: -------------------------------------------------------------------------------- 1 | class Spof: 2 | def __init__(self, name, fiber_name_list, cos_protect_list): 3 | self.name = name 4 | self.fiber_name_list = fiber_name_list 5 | self.cos_protect_list = cos_protect_list 6 | 
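# A Spof ("single point of failure") groups the fibers that fail together in one
# failure scenario, plus the classes of service protected under that scenario.
# A hypothetical instance (names invented for illustration) would look like:
#   Spof("spof_1", ["fiber_A_B", "fiber_B_C"], ["GOLD-ICP", "SILVER"])
# Instances are normally created via Spofs.register_spof() in spofs.py.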
-------------------------------------------------------------------------------- /source/simulate/spofs.py: -------------------------------------------------------------------------------- 1 | from simulate.spof import Spof 2 | class Spofs: 3 | def __init__(self): 4 | self.spof_list = [] 5 | 6 | def register_spof(self, name, fiber_name_list, cos_protect_list): 7 | self.spof_list.append(Spof(name, fiber_name_list, cos_protect_list)) 8 | 9 | -------------------------------------------------------------------------------- /source/simulate/traffic_matrix.py: -------------------------------------------------------------------------------- 1 | from simulate.flow import Flow 2 | 3 | class TrafficMatrix: 4 | def __init__(self): 5 | self.data = {"all":{},"no-bronze":{}} 6 | self.flows = {} 7 | 8 | def add_data(self, src, dst, traffic, type): 9 | if src not in self.data[type]: 10 | self.data[type][src] = {} 11 | 12 | # aggregate traffic with the same OD pair for different priorities(GOLD, SILVER, BRONZE) 13 | try: 14 | self.data[type][src][dst] += traffic 15 | except: 16 | self.data[type][src][dst] = traffic 17 | 18 | def register_flow(self, flow_name, src, dst, traffic, cos): 19 | assert(flow_name not in self.flows) 20 | self.flows[flow_name] = Flow(flow_name, src, dst, traffic, cos) 21 | if cos!="BRONZE": 22 | self.add_data(src.name, dst.name, traffic, "no-bronze") 23 | self.add_data(src.name, dst.name, traffic, "all") 24 | -------------------------------------------------------------------------------- /source/test.py: -------------------------------------------------------------------------------- 1 | import argparse, sys, time, collections 2 | import os, signal, socket, json, pdb 3 | import torch 4 | from rl.rl import RL 5 | from planning.ilp import ILP 6 | from topology.topology import Topology 7 | 8 | def read_topo(topo_name, adjust_factor_in=None): 9 | assert(topo_name in ["A", "B", "C", "D", "E"]) 10 | topo_name_map_file_path = {} 11 | 12 | file_path = topo_name_map_file_path[topo_name] 13 | 14 | topo = Topology(adjust_factor=adjust_factor) 15 | topo.import_fiber_from_file(file_path) 16 | topo.import_lease_from_file(file_path) 17 | topo.import_l3_node_from_file(file_path) 18 | topo.import_l3_link_from_file(file_path) 19 | topo.import_tm_from_file(file_path) 20 | topo.import_spof_from_file(file_path) 21 | 22 | topo.gen_failed_ip_link_and_spof_map() 23 | topo.generate_delta_bw_matrix_from_spof_list() 24 | 25 | return topo 26 | 27 | # implementation efficiency 28 | def fig_7(epoch_num=10): 29 | checker_mode_list = ["all", "sa", "vanilla"] 30 | topo_name_list = ["A", "B", "C", "D", "E"] 31 | 32 | result_log = collections.defaultdict(dict) 33 | for topo_name in topo_name_list: 34 | if topo_name == "A": 35 | checker_mode_list = ["all", "sa", "vanilla"] 36 | else: 37 | checker_mode_list = ["all", "sa"] 38 | 39 | for checker_mode in checker_mode_list: 40 | print(f'\n========== checker_mode:{checker_mode} topo_name:{topo_name} ==========\n') 41 | rl_solver = RL(topo=read_topo(topo_name), num_gnn_layer=2, max_n_delta_bw=1, checker_mode=checker_mode) 42 | rl_solver.run_training() 43 | 44 | # read the last line of the log file and calculate the avg time per epoch 45 | file_path = "results/{}/progress.txt".format(rl_solver.log_dir) 46 | with open(file_path, 'r') as f: 47 | last_line = f.readlines()[-1] 48 | ele_list = last_line.strip().split('\t') 49 | avg_rt = round(float(ele_list[-1])/(int(ele_list[0])+1), 2) 50 | print(f'========== average running time: {avg_rt} seconds ========') 51 | 
result_log[topo_name][checker_mode] = avg_rt 52 | print(f'==== before normalization:{result_log}') 53 | for topo_name, d in result_log.items(): 54 | norm_val = d["all"] 55 | print(f'\n======== final results of topo: {topo_name} ========') 56 | for k in d.keys(): 57 | result_log[topo_name][k] = round(result_log[topo_name][k]/norm_val, 5) 58 | print(f'{k}, {result_log[topo_name][k]}') 59 | 60 | # single data point, used for Figure 8 61 | # support ILP, First-stage and Second-stage 62 | def single_dp_fig8(alg, adjust_factor_in=1, load_trained=True): 63 | print(f'\n========== Fig8 start, A-{adjust_factor_in}, alg:{alg} ==========\n') 64 | 65 | if alg == "ILP": 66 | ilp_solver = ILP(topo=read_topo("A", adjust_factor_in=adjust_factor_in)) 67 | ilp_solver.run_ilp() 68 | print(f'========== Topo: A-{adjust_factor_in}, result: {ilp_solver.cost_opt} =========\n') 69 | elif alg == "NeuroPlan": 70 | if load_trained: 71 | if int(adjust_factor_in) == adjust_factor_in: 72 | af_file_name = int(adjust_factor_in) 73 | else: 74 | af_file_name = adjust_factor_in 75 | model_path = f'results/trained/A-{af_file_name}/' 76 | if af_file_name == 1: 77 | model_path = f'results/trained/A/' 78 | if os.path.exists(model_path + "pyt_save/model.pt") == False: 79 | model_path = None 80 | else: 81 | model_path = None 82 | print(f'\n========== Fig8, RL: Topo: A-{adjust_factor_in}, load pre-trained model: {model_path} ==========\n') 83 | rl_solver = RL(topo=read_topo("A", adjust_factor_in=adjust_factor_in), model_path=model_path, num_gnn_layer=2, max_n_delta_bw=1) 84 | rl_solver.run_training() 85 | print(f'========== first stage result: {rl_solver.env.opt_target} =========\n') 86 | subopt_sol = rl_solver.env.ip_idx_map_num_step 87 | print(f'\n========== ILP on second stage: adjust_factor_in:{adjust_factor_in} ==========\n') 88 | ilp_solver = ILP(topo=read_topo("A", adjust_factor_in=adjust_factor_in)) 89 | ilp_solver.run_ilp(subopt_sol=subopt_sol, relax_factor=1.5) 90 | print(f'========== second stage, adjust_factor_in: {adjust_factor_in}, result: {ilp_solver.cost_opt} =========\n') 91 | else: 92 | print("Illegal args") 93 | 94 | # single data point, used for Figure 9 95 | # support ILP, ILP-huer and First-stage 96 | def single_dp_fig9(topo_name, alg, adjust_factor_in=1.0, load_trained=True): 97 | print(f'\n========== start: topo_name:{topo_name} alg:{alg} adjust_factor_in:{adjust_factor_in}==========\n') 98 | 99 | if alg == "ILP": 100 | ilp_solver = ILP(topo=read_topo(topo_name, adjust_factor_in=adjust_factor_in)) 101 | ilp_solver.run_ilp() 102 | print(f'========== result: {ilp_solver.cost_opt} =========\n') 103 | elif alg == "ILP-heur": 104 | ilp_solver = ILP(topo=read_topo(topo_name)) 105 | ilp_solver.run_ilp_heuristic() 106 | print(f'========== result: {ilp_solver.cost_opt} =========\n') 107 | elif alg == "NeuroPlan": 108 | if load_trained: 109 | model_path = f'results/trained/{topo_name}/' 110 | if os.path.exists(model_path + "pyt_save/model.pt") == False: 111 | model_path = None 112 | else: 113 | model_path = None 114 | print(f'\n========== RL: topo_name:{topo_name}, load pre-trained model: {model_path} ==========\n') 115 | rl_solver = RL(topo=read_topo(topo_name), model_path=model_path, num_gnn_layer=2, max_n_delta_bw=1) 116 | rl_solver.run_training() 117 | print(f'========== first stage result: {rl_solver.env.opt_target} =========\n') 118 | else: 119 | print("Illegal args") 120 | 121 | # given the path of the sol form the first stage, run second stage 122 | def second_stage(topo_name, sol_path, rf=1.0): 123 | 124 | 
with open(sol_path) as json_file: 125 | json_dict = json.load(json_file) 126 | subopt_sol = {} 127 | for k, v in json_dict.items(): 128 | subopt_sol[int(k)] = v 129 | ilp_solver = ILP(topo=read_topo(topo_name)) 130 | ilp_solver.run_ilp(subopt_sol=subopt_sol, relax_factor=rf) 131 | print(f'========== sol from the first stage: {subopt_sol} ============\n') 132 | print(f'========== second stage, topo_name: {topo_name}, rf: {rf}, result: {ilp_solver.cost_opt} =========\n') 133 | 134 | # single data point, used for Figure 10, 11, 12 135 | def params_rl(adjust_factor_in=1.0, num_gnn_layer=2, max_n_delta_bw=1, hidden_sizes=(256, 256)): 136 | print(f'\n========== start: adjust_factor_in:{adjust_factor_in} num_gnn_layer:{num_gnn_layer}, max_n_delta_bw:{max_n_delta_bw}, hidden_sizes:{hidden_sizes} ==========\n') 137 | 138 | rl_solver = RL(topo=read_topo("A", adjust_factor_in=adjust_factor_in), num_gnn_layer=num_gnn_layer, \ 139 | max_n_delta_bw=max_n_delta_bw,hidden_sizes=hidden_sizes) 140 | rl_solver.run_training() 141 | print(f'\n========== end: adjust_factor_in:{adjust_factor_in} num_gnn_layer:{num_gnn_layer}, max_n_delta_bw:{max_n_delta_bw}, hidden_sizes:{hidden_sizes} ==========') 142 | print(f'result: {rl_solver.env.opt_target}') 143 | 144 | if __name__ == "__main__": 145 | arg = sys.argv[1] 146 | if arg == 'fig_7': 147 | fig_7(int(sys.argv[2])) 148 | elif arg == 'fig_8': 149 | fig_8() 150 | elif arg == 'fig_9_13': 151 | fig_9_13() 152 | elif arg == "single_dp_fig8": 153 | if len(sys.argv)==5 and sys.argv[4]=="False": 154 | single_dp_fig8(sys.argv[2], float(sys.argv[3]), load_trained=False) 155 | else: 156 | single_dp_fig8(sys.argv[2], float(sys.argv[3]), load_trained=True) 157 | elif arg == "single_dp_fig9": 158 | if len(sys.argv)==5 and sys.argv[4]=="False": 159 | single_dp_fig9(sys.argv[2], sys.argv[3], load_trained=False) 160 | else: 161 | single_dp_fig9(sys.argv[2], sys.argv[3], load_trained=True) 162 | elif arg == "second_stage": 163 | second_stage(sys.argv[2], sys.argv[3], float(sys.argv[4])) 164 | elif arg == "fig_10": 165 | params_rl(adjust_factor_in=float(sys.argv[2]), num_gnn_layer=int(sys.argv[3])) 166 | elif arg == "fig_11": 167 | params_rl(adjust_factor_in=float(sys.argv[2]), hidden_sizes=(int(sys.argv[3]), int(sys.argv[3]))) 168 | elif arg == "fig_12": 169 | params_rl(adjust_factor_in=float(sys.argv[2]), max_n_delta_bw=int(sys.argv[3])) 170 | else: 171 | print("Illegal args") 172 | 173 | -------------------------------------------------------------------------------- /source/topology/ip/link.py: -------------------------------------------------------------------------------- 1 | from topology.utils.node import Node 2 | 3 | 4 | class Link: 5 | 6 | def __init__(self, name, optic_set: frozenset, src: Node, dst: Node, idx=-1, initial_bw=0, max_bw=None, igp=0, fiber_map_spectrum=None, cost=None): 7 | self.name = name 8 | self.optic_set = optic_set 9 | self.src = src 10 | self.dst = dst 11 | self.idx = idx 12 | self.initial_bw = initial_bw 13 | self.bandwidth = initial_bw 14 | self.max_bw = max_bw 15 | self.igp = igp 16 | self.fiber_map_spectrum = fiber_map_spectrum 17 | self.cost = cost 18 | 19 | 20 | def incr_bw(self, delta_bw): 21 | self.bandwidth += delta_bw 22 | 23 | def reset_bw(self): 24 | self.bandwidth = self.initial_bw 25 | -------------------------------------------------------------------------------- /source/topology/ip/network.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | import topology.ip as ip 4 
| from topology.ip.router import Router 5 | from topology.ip.link import Link 6 | import collections, pdb, sys 7 | from pulp import * 8 | import numpy as np 9 | 10 | 11 | class Network: 12 | def __init__(self): 13 | self.routers = {} # Set of Router objects 14 | self.links = {} # Set of Link objects 15 | 16 | self.idx_map_router_name = {} 17 | self.router_name_map_idx = {} 18 | self.router_idx_cnt = 0 19 | 20 | self.link_name_map_cost = {} 21 | self.idx_map_link_name = {} 22 | self.link_name_map_idx = {} 23 | self.link_name_map_od_pair = {} 24 | self.link_name_map_fiber_and_spectrum = {} 25 | 26 | # utils for multiple edge between two routers 27 | self.od_pair_map_dup_cnt = {} 28 | 29 | def reset_link_bw(self): 30 | for link in self.links.values(): 31 | link.reset_bw() 32 | 33 | def clear_links(self): 34 | self.links = {} 35 | self.idx_map_link_name = {} 36 | self.link_name_map_idx = {} 37 | 38 | def register_router(self, router_name, l1_node, stub): 39 | self.routers[router_name] = Router(router_name, l1_node, stub) 40 | self.idx_map_router_name[self.router_idx_cnt] = router_name 41 | self.router_name_map_idx[router_name] = self.router_idx_cnt 42 | self.router_idx_cnt += 1 43 | 44 | def get_router_by_name(self, router_name): 45 | return self.routers[router_name] 46 | 47 | def register_link(self, link_name, optic_set, src, dst, idx=-1, initial_bw=0, max_bw=0, igp=0, fiber_map_spectrum=None, cost=None): 48 | try: 49 | assert(link_name not in self.links) 50 | except: 51 | raise Exception("exist link_name:{}".format(link_name)) 52 | 53 | self.idx_map_link_name[idx] = link_name 54 | self.link_name_map_idx[link_name] = idx 55 | 56 | self.links[link_name] = Link(link_name, optic_set, src, dst, idx=idx, initial_bw=initial_bw,\ 57 | max_bw=max_bw, igp=igp, fiber_map_spectrum=fiber_map_spectrum,cost=cost) 58 | 59 | def add_link(self, src_router, dst_router, bandwidth): 60 | src, dst = src_router.name, dst_router.name 61 | link_name = '%s-%s' % (min(src, dst), max(src, dst)) 62 | new_link = Link(src_router, dst_router, bandwidth) 63 | self.links[link_name] = new_link 64 | 65 | def generate_graph_from_multi_edge(self, failed_link_name_list=[]): 66 | graph = nx.DiGraph() 67 | capa_matrix = {} 68 | # Add nodes 69 | for link_name, link in self.links.items(): 70 | src_name = link.src.name 71 | dst_name = link.dst.name 72 | 73 | if link_name not in failed_link_name_list: 74 | # parallel edges between two nodes 75 | try: 76 | capa_matrix[(src_name, dst_name)] += link.bandwidth 77 | except: 78 | capa_matrix[(src_name, dst_name)] = link.bandwidth 79 | 80 | # add nodes and edges 81 | for (src_name, dst_name), bw in capa_matrix.items(): 82 | graph.add_edge(src_name, dst_name, capacity=bw) 83 | graph.add_edge(dst_name, src_name, capacity=bw) 84 | return graph 85 | 86 | # generate non direction graph, used to perform a complete ILP approach 87 | # max_cstr_sol is used to take the second step for RL approach 88 | def generate_non_direction_graph(self, adjust_factor=1.0, max_cstr_sol=None, relax_factor=1): 89 | graph = nx.MultiGraph() 90 | init_cost = 0 91 | for link_name, link in self.links.items(): 92 | 93 | src_name = link.src.name 94 | dst_name = link.dst.name 95 | if max_cstr_sol == None: 96 | max_delta_step = -1 97 | else: 98 | max_delta_step = int(max_cstr_sol.get(link.idx, 0)*relax_factor) 99 | 100 | graph.add_edge(src_name, dst_name, capacity=int(link.bandwidth*adjust_factor), name=link_name, cost=link.cost, \ 101 | fiber_map_spectrum=link.fiber_map_spectrum, idx=link.idx, max_delta_step=max_delta_step) 
102 | init_cost += link.bandwidth*link.cost 103 | 104 | return graph, init_cost 105 | 106 | def generate_graph(self, failed_links=None): 107 | graph = nx.DiGraph() 108 | 109 | # Add nodes 110 | graph.add_nodes_from(list(self.routers.keys())) 111 | 112 | # Add edges 113 | for link in self.links.values(): 114 | if failed_links is None or link not in failed_links: 115 | # add bidirectional edge 116 | graph.add_edge(link.src.name, link.dst.name, capacity=link.bandwidth) 117 | graph.add_edge(link.dst.name, link.src.name, capacity=link.bandwidth) 118 | 119 | return graph 120 | -------------------------------------------------------------------------------- /source/topology/ip/router.py: -------------------------------------------------------------------------------- 1 | from topology.utils.node import Node 2 | 3 | class Router(Node): 4 | def __init__(self, name, l1_node:Node, stub:bool): 5 | super().__init__(name) 6 | self.l1_node = l1_node 7 | self.stub = stub 8 | 9 | 10 | -------------------------------------------------------------------------------- /source/topology/optical/optic_fiber.py: -------------------------------------------------------------------------------- 1 | from topology.optical.optic_node import OpticNode 2 | 3 | class OpticFiber: 4 | def __init__(self, name, src: OpticNode, dst: OpticNode, length,lease_flag=False,max_fp=None,lighted_fp=None,spectrum=None,\ 5 | min_bw=None,max_bw=None): 6 | self.name = name 7 | self.src = src 8 | self.dst = dst 9 | self.length = length 10 | self.lease_flag = lease_flag 11 | 12 | # attributes for fibers owned: lease_flag=False 13 | self.max_fp = max_fp 14 | self.lighted_fp = lighted_fp 15 | self.spectrum = spectrum 16 | 17 | # attributes for fibers leased: lease_flag=True 18 | self.min_bw = min_bw 19 | self.max_bw = max_bw 20 | 21 | if lease_flag: 22 | assert(self.min_bw!=None) 23 | assert(self.max_bw!=None) 24 | else: 25 | assert(self.max_fp!=None) 26 | assert(self.spectrum!=None) 27 | -------------------------------------------------------------------------------- /source/topology/optical/optic_network.py: -------------------------------------------------------------------------------- 1 | from topology.optical.optic_node import OpticNode 2 | from topology.optical.optic_fiber import OpticFiber 3 | import networkx as nx 4 | 5 | class OpticNetwork: 6 | def __init__(self): 7 | self.nodes = {} # Set of OpticNode objects 8 | self.fibers = {} # Set of OpticFiber objects 9 | 10 | def register_node(self, node_name): 11 | self.nodes[node_name] = OpticNode(node_name) 12 | 13 | def get_node_by_name(self, node_name): 14 | return self.nodes[node_name] 15 | 16 | def register_fiber(self, fiber_name, src: OpticNode, dst: OpticNode,length,lease_flag=False,max_fp=None,lighted_fp=None,spectrum=None,\ 17 | min_bw=None,max_bw=None): 18 | self.fibers[fiber_name] = OpticFiber(fiber_name, src, dst, length=length, lease_flag=lease_flag, \ 19 | max_fp=max_fp, lighted_fp=lighted_fp, spectrum=spectrum,min_bw=min_bw,max_bw=max_bw) 20 | 21 | def generate_non_direction_graph(self, od_pair_map_optic): 22 | graph = nx.Graph() 23 | for optic_name in od_pair_map_optic.values(): 24 | optic_inst = self.fibers[optic_name] 25 | src_name = optic_inst.src.name 26 | dst_name = optic_inst.dst.name 27 | graph.add_edge(src_name, dst_name, name=optic_name, length=optic_inst.length) 28 | 29 | return graph 30 | -------------------------------------------------------------------------------- /source/topology/optical/optic_node.py: 
-------------------------------------------------------------------------------- 1 | from topology.utils.node import Node 2 | 3 | class OpticNode(Node): 4 | def __init__(self, name): 5 | super().__init__(name) 6 | -------------------------------------------------------------------------------- /source/topology/optical/optic_path.py: -------------------------------------------------------------------------------- 1 | from topology.optical.optic_node import OpticNode 2 | from topology.optical.optic_fiber import OpticFiber 3 | 4 | class OpticPath: 5 | def __init__(self, node_name_set: set, fiber_name_list: list, length): 6 | self.node_name_set = node_name_set 7 | self.fiber_name_list = fiber_name_list 8 | self.length = length -------------------------------------------------------------------------------- /source/topology/utils/node.py: -------------------------------------------------------------------------------- 1 | class Node: 2 | def __init__(self, name): 3 | self.name = name 4 | 5 | -------------------------------------------------------------------------------- /spinningup/.gitignore: -------------------------------------------------------------------------------- 1 | *.*~ 2 | __pycache__/ 3 | *.pkl 4 | data/ 5 | **/*.egg-info 6 | .python-version 7 | .idea/ 8 | .vscode/ 9 | .DS_Store 10 | _build/ 11 | -------------------------------------------------------------------------------- /spinningup/.travis.yml: -------------------------------------------------------------------------------- 1 | env: 2 | global: 3 | - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/travis/.mujoco/mujoco200/bin 4 | 5 | matrix: 6 | include: 7 | - os: linux 8 | language: python 9 | python: "3.6" 10 | 11 | before_install: 12 | - ./travis_setup.sh 13 | 14 | script: 15 | - pip3 install --upgrade -e .[mujoco] 16 | - python3 -c "import mujoco_py" 17 | - python3 -c "import spinup" 18 | - python3 -m pytest 19 | -------------------------------------------------------------------------------- /spinningup/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2018 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /spinningup/readme.md: -------------------------------------------------------------------------------- 1 | **Status:** Maintenance (expect bug fixes and minor updates) 2 | 3 | Welcome to Spinning Up in Deep RL! 4 | ================================== 5 | 6 | This is an educational resource produced by OpenAI that makes it easier to learn about deep reinforcement learning (deep RL). 7 | 8 | For the unfamiliar: [reinforcement learning](https://en.wikipedia.org/wiki/Reinforcement_learning) (RL) is a machine learning approach for teaching agents how to solve tasks by trial and error. Deep RL refers to the combination of RL with [deep learning](http://ufldl.stanford.edu/tutorial/). 9 | 10 | This module contains a variety of helpful resources, including: 11 | 12 | - a short [introduction](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html) to RL terminology, kinds of algorithms, and basic theory, 13 | - an [essay](https://spinningup.openai.com/en/latest/spinningup/spinningup.html) about how to grow into an RL research role, 14 | - a [curated list](https://spinningup.openai.com/en/latest/spinningup/keypapers.html) of important papers organized by topic, 15 | - a well-documented [code repo](https://github.com/openai/spinningup) of short, standalone implementations of key algorithms, 16 | - and a few [exercises](https://spinningup.openai.com/en/latest/spinningup/exercises.html) to serve as warm-ups. 17 | 18 | Get started at [spinningup.openai.com](https://spinningup.openai.com)! 19 | 20 | 21 | Citing Spinning Up 22 | ------------------ 23 | 24 | If you reference or use Spinning Up in your research, please cite: 25 | 26 | ``` 27 | @article{SpinningUp2018, 28 | author = {Achiam, Joshua}, 29 | title = {{Spinning Up in Deep Reinforcement Learning}}, 30 | year = {2018} 31 | } 32 | ``` -------------------------------------------------------------------------------- /spinningup/readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | 4 | python: 5 | version: 3.6 -------------------------------------------------------------------------------- /spinningup/setup.py: -------------------------------------------------------------------------------- 1 | from os.path import join, dirname, realpath 2 | from setuptools import setup 3 | import sys 4 | 5 | assert sys.version_info.major == 3 and sys.version_info.minor >= 6, \ 6 | "The Spinning Up repo is designed to work with Python 3.6 and greater." \ 7 | + "Please install it before proceeding." 8 | 9 | with open(join("spinup", "version.py")) as version_file: 10 | exec(version_file.read()) 11 | 12 | setup( 13 | name='spinup', 14 | py_modules=['spinup'], 15 | version=__version__,#'0.1', 16 | install_requires=[ 17 | 'cloudpickle==1.2.1', 18 | 'gym[atari,box2d,classic_control]~=0.15.3', 19 | 'ipython', 20 | 'joblib', 21 | 'matplotlib==3.1.1', 22 | 'mpi4py', 23 | 'numpy', 24 | 'pandas', 25 | 'pytest', 26 | 'psutil', 27 | 'scipy', 28 | 'seaborn==0.8.1', 29 | 'tensorflow>=1.8.0,<2.0', 30 | 'torch==1.3.1', 31 | 'tqdm' 32 | ], 33 | description="Teaching tools for introducing people to deep RL.", 34 | author="Joshua Achiam", 35 | ) 36 | -------------------------------------------------------------------------------- /spinningup/spinup/__init__.py: -------------------------------------------------------------------------------- 1 | # Disable TF deprecation warnings. 
2 | # Syntax from tf1 is not expected to be compatible with tf2. 3 | import tensorflow as tf 4 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 5 | 6 | # Algorithms 7 | from spinup.algos.tf1.ddpg.ddpg import ddpg as ddpg_tf1 8 | from spinup.algos.tf1.ppo.ppo import ppo as ppo_tf1 9 | from spinup.algos.tf1.sac.sac import sac as sac_tf1 10 | from spinup.algos.tf1.td3.td3 import td3 as td3_tf1 11 | from spinup.algos.tf1.trpo.trpo import trpo as trpo_tf1 12 | from spinup.algos.tf1.vpg.vpg import vpg as vpg_tf1 13 | 14 | from spinup.algos.pytorch.ddpg.ddpg import ddpg as ddpg_pytorch 15 | from spinup.algos.pytorch.ppo.ppo import ppo as ppo_pytorch 16 | from spinup.algos.pytorch.sac.sac import sac as sac_pytorch 17 | from spinup.algos.pytorch.td3.td3 import td3 as td3_pytorch 18 | from spinup.algos.pytorch.trpo.trpo import trpo as trpo_pytorch 19 | from spinup.algos.pytorch.vpg.vpg import vpg as vpg_pytorch 20 | 21 | # Loggers 22 | from spinup.utils.logx import Logger, EpochLogger 23 | 24 | # Version 25 | from spinup.version import __version__ 26 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/ddpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 
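        # Why the squeeze matters: self.q ends in nn.Linear(..., 1), so q has shape (batch, 1),
        # while the Bellman backup used in the MSE loss has shape (batch,). Without the squeeze,
        # (q - backup) would broadcast to a (batch, batch) matrix and silently corrupt the loss,
        # e.g. ((torch.zeros(32, 1) - torch.zeros(32)) ** 2).shape == torch.Size([32, 32]).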
44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/ppo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 
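        # (pi is a torch.distributions object; logp_a stays None unless act is given,
        # which the PPO update does when re-evaluating stored actions.)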
59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation) 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation) 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.numpy(), v.numpy(), logp_a.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/sac/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.distributions.normal import Normal 8 | 9 | 10 | def combined_shape(length, shape=None): 11 | if shape is None: 12 | return (length,) 13 | return (length, shape) if np.isscalar(shape) else (length, *shape) 14 | 15 | def mlp(sizes, activation, output_activation=nn.Identity): 16 | layers = [] 17 | for j in range(len(sizes)-1): 18 | act = activation if j < len(sizes)-2 else output_activation 19 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 20 | return nn.Sequential(*layers) 21 | 22 | def count_vars(module): 23 | return sum([np.prod(p.shape) for p in module.parameters()]) 24 | 25 | 26 | LOG_STD_MAX = 2 27 | LOG_STD_MIN = -20 28 | 29 | class 
SquashedGaussianMLPActor(nn.Module): 30 | 31 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 32 | super().__init__() 33 | self.net = mlp([obs_dim] + list(hidden_sizes), activation, activation) 34 | self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) 35 | self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim) 36 | self.act_limit = act_limit 37 | 38 | def forward(self, obs, deterministic=False, with_logprob=True): 39 | net_out = self.net(obs) 40 | mu = self.mu_layer(net_out) 41 | log_std = self.log_std_layer(net_out) 42 | log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) 43 | std = torch.exp(log_std) 44 | 45 | # Pre-squash distribution and sample 46 | pi_distribution = Normal(mu, std) 47 | if deterministic: 48 | # Only used for evaluating policy at test time. 49 | pi_action = mu 50 | else: 51 | pi_action = pi_distribution.rsample() 52 | 53 | if with_logprob: 54 | # Compute logprob from Gaussian, and then apply correction for Tanh squashing. 55 | # NOTE: The correction formula is a little bit magic. To get an understanding 56 | # of where it comes from, check out the original SAC paper (arXiv 1801.01290) 57 | # and look in appendix C. This is a more numerically-stable equivalent to Eq 21. 58 | # Try deriving it yourself as a (very difficult) exercise. :) 59 | logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1) 60 | logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=1) 61 | else: 62 | logp_pi = None 63 | 64 | pi_action = torch.tanh(pi_action) 65 | pi_action = self.act_limit * pi_action 66 | 67 | return pi_action, logp_pi 68 | 69 | 70 | class MLPQFunction(nn.Module): 71 | 72 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 73 | super().__init__() 74 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 75 | 76 | def forward(self, obs, act): 77 | q = self.q(torch.cat([obs, act], dim=-1)) 78 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 
79 | 80 | class MLPActorCritic(nn.Module): 81 | 82 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 83 | activation=nn.ReLU): 84 | super().__init__() 85 | 86 | obs_dim = observation_space.shape[0] 87 | act_dim = action_space.shape[0] 88 | act_limit = action_space.high[0] 89 | 90 | # build policy and value functions 91 | self.pi = SquashedGaussianMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 92 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 93 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 94 | 95 | def act(self, obs, deterministic=False): 96 | with torch.no_grad(): 97 | a, _ = self.pi(obs, deterministic, False) 98 | return a.numpy() 99 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/td3/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 59 | 60 | def act(self, obs): 61 | with torch.no_grad(): 62 | return self.pi(obs).numpy() 63 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/trpo/trpo.py: -------------------------------------------------------------------------------- 1 | def trpo(*args, **kwargs): 2 | print('\n\nUnfortunately, TRPO has not yet been implemented in PyTorch '\ 3 | + 'for Spinning Up. 
TRPO will migrate some time in the future.\n\n') 4 | raise NotImplementedError -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/vpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 
105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation) 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation) 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.numpy(), v.numpy(), logp_a.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ddpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/ddpg/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ddpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | def placeholders(*args): 9 | return [placeholder(dim) for dim in args] 10 | 11 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 12 | for h in hidden_sizes[:-1]: 13 | x = tf.layers.dense(x, units=h, activation=activation) 14 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 15 | 16 | def get_vars(scope): 17 | return [x for x in tf.global_variables() if scope in x.name] 18 | 19 | def count_vars(scope): 20 | v = get_vars(scope) 21 | return sum([np.prod(var.shape.as_list()) for var in v]) 22 | 23 | """ 24 | Actor-Critics 25 | """ 26 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 27 | output_activation=tf.tanh, action_space=None): 28 | act_dim = a.shape.as_list()[-1] 29 | act_limit = action_space.high[0] 30 | with tf.variable_scope('pi'): 31 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 32 | with tf.variable_scope('q'): 33 | q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 34 | with tf.variable_scope('q', reuse=True): 35 | q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 36 | return pi, q, q_pi 37 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | from spinup.algos.tf1.ddpg import core 6 | from spinup.algos.tf1.ddpg.core import get_vars 7 | from spinup.utils.logx import EpochLogger 8 | 9 | 10 | class ReplayBuffer: 11 | """ 12 | A simple FIFO experience replay buffer for DDPG agents. 
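    Transitions are stored as (obs, act, rew, next_obs, done) arrays; once the buffer
    is full, the oldest entries are overwritten (ptr wraps around modulo max_size).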
13 | """ 14 | 15 | def __init__(self, obs_dim, act_dim, size): 16 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 17 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 19 | self.rews_buf = np.zeros(size, dtype=np.float32) 20 | self.done_buf = np.zeros(size, dtype=np.float32) 21 | self.ptr, self.size, self.max_size = 0, 0, size 22 | 23 | def store(self, obs, act, rew, next_obs, done): 24 | self.obs1_buf[self.ptr] = obs 25 | self.obs2_buf[self.ptr] = next_obs 26 | self.acts_buf[self.ptr] = act 27 | self.rews_buf[self.ptr] = rew 28 | self.done_buf[self.ptr] = done 29 | self.ptr = (self.ptr+1) % self.max_size 30 | self.size = min(self.size+1, self.max_size) 31 | 32 | def sample_batch(self, batch_size=32): 33 | idxs = np.random.randint(0, self.size, size=batch_size) 34 | return dict(obs1=self.obs1_buf[idxs], 35 | obs2=self.obs2_buf[idxs], 36 | acts=self.acts_buf[idxs], 37 | rews=self.rews_buf[idxs], 38 | done=self.done_buf[idxs]) 39 | 40 | 41 | 42 | def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 43 | steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 44 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 45 | update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 46 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 47 | """ 48 | Deep Deterministic Policy Gradient (DDPG) 49 | 50 | 51 | Args: 52 | env_fn : A function which creates a copy of the environment. 53 | The environment must satisfy the OpenAI Gym API. 54 | 55 | actor_critic: A function which takes in placeholder symbols 56 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 57 | outputs from the agent's Tensorflow computation graph: 58 | 59 | =========== ================ ====================================== 60 | Symbol Shape Description 61 | =========== ================ ====================================== 62 | ``pi`` (batch, act_dim) | Deterministically computes actions 63 | | from policy given states. 64 | ``q`` (batch,) | Gives the current estimate of Q* for 65 | | states in ``x_ph`` and actions in 66 | | ``a_ph``. 67 | ``q_pi`` (batch,) | Gives the composition of ``q`` and 68 | | ``pi`` for states in ``x_ph``: 69 | | q(x, pi(x)). 70 | =========== ================ ====================================== 71 | 72 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 73 | function you provided to DDPG. 74 | 75 | seed (int): Seed for random number generators. 76 | 77 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 78 | for the agent and the environment in each epoch. 79 | 80 | epochs (int): Number of epochs to run and train agent. 81 | 82 | replay_size (int): Maximum length of replay buffer. 83 | 84 | gamma (float): Discount factor. (Always between 0 and 1.) 85 | 86 | polyak (float): Interpolation factor in polyak averaging for target 87 | networks. Target networks are updated towards main networks 88 | according to: 89 | 90 | .. math:: \\theta_{\\text{targ}} \\leftarrow 91 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 92 | 93 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 94 | close to 1.) 95 | 96 | pi_lr (float): Learning rate for policy. 97 | 98 | q_lr (float): Learning rate for Q-networks. 99 | 100 | batch_size (int): Minibatch size for SGD. 101 | 102 | start_steps (int): Number of steps for uniform-random action selection, 103 | before running real policy. Helps exploration. 
104 | 105 | update_after (int): Number of env interactions to collect before 106 | starting to do gradient descent updates. Ensures replay buffer 107 | is full enough for useful updates. 108 | 109 | update_every (int): Number of env interactions that should elapse 110 | between gradient descent updates. Note: Regardless of how long 111 | you wait between updates, the ratio of env steps to gradient steps 112 | is locked to 1. 113 | 114 | act_noise (float): Stddev for Gaussian exploration noise added to 115 | policy at training time. (At test time, no noise is added.) 116 | 117 | num_test_episodes (int): Number of episodes to test the deterministic 118 | policy at the end of each epoch. 119 | 120 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 121 | 122 | logger_kwargs (dict): Keyword args for EpochLogger. 123 | 124 | save_freq (int): How often (in terms of gap between epochs) to save 125 | the current policy and value function. 126 | 127 | """ 128 | 129 | logger = EpochLogger(**logger_kwargs) 130 | logger.save_config(locals()) 131 | 132 | tf.set_random_seed(seed) 133 | np.random.seed(seed) 134 | 135 | env, test_env = env_fn(), env_fn() 136 | obs_dim = env.observation_space.shape[0] 137 | act_dim = env.action_space.shape[0] 138 | 139 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 140 | act_limit = env.action_space.high[0] 141 | 142 | # Share information about action space with policy architecture 143 | ac_kwargs['action_space'] = env.action_space 144 | 145 | # Inputs to computation graph 146 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 147 | 148 | # Main outputs from computation graph 149 | with tf.variable_scope('main'): 150 | pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) 151 | 152 | # Target networks 153 | with tf.variable_scope('target'): 154 | # Note that the action placeholder going to actor_critic here is 155 | # irrelevant, because we only need q_targ(s, pi_targ(s)). 
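        # (a_ph is passed only to satisfy the actor_critic signature; the q output
        # evaluated at a_ph under the target scope is discarded via the underscore below.)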
156 | pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 157 | 158 | # Experience buffer 159 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 160 | 161 | # Count variables 162 | var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) 163 | print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts) 164 | 165 | # Bellman backup for Q function 166 | backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ) 167 | 168 | # DDPG losses 169 | pi_loss = -tf.reduce_mean(q_pi) 170 | q_loss = tf.reduce_mean((q-backup)**2) 171 | 172 | # Separate train ops for pi, q 173 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 174 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 175 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 176 | train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) 177 | 178 | # Polyak averaging for target variables 179 | target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) 180 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 181 | 182 | # Initializing targets to match main variables 183 | target_init = tf.group([tf.assign(v_targ, v_main) 184 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 185 | 186 | sess = tf.Session() 187 | sess.run(tf.global_variables_initializer()) 188 | sess.run(target_init) 189 | 190 | # Setup model saving 191 | logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q}) 192 | 193 | def get_action(o, noise_scale): 194 | a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0] 195 | a += noise_scale * np.random.randn(act_dim) 196 | return np.clip(a, -act_limit, act_limit) 197 | 198 | def test_agent(): 199 | for j in range(num_test_episodes): 200 | o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 201 | while not(d or (ep_len == max_ep_len)): 202 | # Take deterministic actions at test time (noise_scale=0) 203 | o, r, d, _ = test_env.step(get_action(o, 0)) 204 | ep_ret += r 205 | ep_len += 1 206 | logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 207 | 208 | # Prepare for interaction with environment 209 | total_steps = steps_per_epoch * epochs 210 | start_time = time.time() 211 | o, ep_ret, ep_len = env.reset(), 0, 0 212 | 213 | # Main loop: collect experience in env and update/log each epoch 214 | for t in range(total_steps): 215 | 216 | # Until start_steps have elapsed, randomly sample actions 217 | # from a uniform distribution for better exploration. Afterwards, 218 | # use the learned policy (with some noise, via act_noise). 219 | if t > start_steps: 220 | a = get_action(o, act_noise) 221 | else: 222 | a = env.action_space.sample() 223 | 224 | # Step the env 225 | o2, r, d, _ = env.step(a) 226 | ep_ret += r 227 | ep_len += 1 228 | 229 | # Ignore the "done" signal if it comes from hitting the time 230 | # horizon (that is, when it's an artificial terminal signal 231 | # that isn't based on the agent's state) 232 | d = False if ep_len==max_ep_len else d 233 | 234 | # Store experience to replay buffer 235 | replay_buffer.store(o, a, r, o2, d) 236 | 237 | # Super critical, easy to overlook step: make sure to update 238 | # most recent observation! 
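        # (otherwise the next stored transition would pair the stale observation with
        # the new action, breaking the (o, a, r, o2, d) alignment in the replay buffer.)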
239 | o = o2 240 | 241 | # End of trajectory handling 242 | if d or (ep_len == max_ep_len): 243 | logger.store(EpRet=ep_ret, EpLen=ep_len) 244 | o, ep_ret, ep_len = env.reset(), 0, 0 245 | 246 | # Update handling 247 | if t >= update_after and t % update_every == 0: 248 | for _ in range(update_every): 249 | batch = replay_buffer.sample_batch(batch_size) 250 | feed_dict = {x_ph: batch['obs1'], 251 | x2_ph: batch['obs2'], 252 | a_ph: batch['acts'], 253 | r_ph: batch['rews'], 254 | d_ph: batch['done'] 255 | } 256 | 257 | # Q-learning update 258 | outs = sess.run([q_loss, q, train_q_op], feed_dict) 259 | logger.store(LossQ=outs[0], QVals=outs[1]) 260 | 261 | # Policy update 262 | outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) 263 | logger.store(LossPi=outs[0]) 264 | 265 | # End of epoch wrap-up 266 | if (t+1) % steps_per_epoch == 0: 267 | epoch = (t+1) // steps_per_epoch 268 | 269 | # Save model 270 | if (epoch % save_freq == 0) or (epoch == epochs): 271 | logger.save_state({'env': env}, None) 272 | 273 | # Test the performance of the deterministic version of the agent. 274 | test_agent() 275 | 276 | # Log info about epoch 277 | logger.log_tabular('Epoch', epoch) 278 | logger.log_tabular('EpRet', with_min_and_max=True) 279 | logger.log_tabular('TestEpRet', with_min_and_max=True) 280 | logger.log_tabular('EpLen', average_only=True) 281 | logger.log_tabular('TestEpLen', average_only=True) 282 | logger.log_tabular('TotalEnvInteracts', t) 283 | logger.log_tabular('QVals', with_min_and_max=True) 284 | logger.log_tabular('LossPi', average_only=True) 285 | logger.log_tabular('LossQ', average_only=True) 286 | logger.log_tabular('Time', time.time()-start_time) 287 | logger.dump_tabular() 288 | 289 | if __name__ == '__main__': 290 | import argparse 291 | parser = argparse.ArgumentParser() 292 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 293 | parser.add_argument('--hid', type=int, default=256) 294 | parser.add_argument('--l', type=int, default=2) 295 | parser.add_argument('--gamma', type=float, default=0.99) 296 | parser.add_argument('--seed', '-s', type=int, default=0) 297 | parser.add_argument('--epochs', type=int, default=50) 298 | parser.add_argument('--exp_name', type=str, default='ddpg') 299 | args = parser.parse_args() 300 | 301 | from spinup.utils.run_utils import setup_logger_kwargs 302 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 303 | 304 | ddpg(lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic, 305 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 306 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 307 | logger_kwargs=logger_kwargs) 308 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ppo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/ppo/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ppo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def placeholder(dim=None): 14 
| return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 15 | 16 | def placeholders(*args): 17 | return [placeholder(dim) for dim in args] 18 | 19 | def placeholder_from_space(space): 20 | if isinstance(space, Box): 21 | return placeholder(space.shape) 22 | elif isinstance(space, Discrete): 23 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 24 | raise NotImplementedError 25 | 26 | def placeholders_from_spaces(*args): 27 | return [placeholder_from_space(space) for space in args] 28 | 29 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 30 | for h in hidden_sizes[:-1]: 31 | x = tf.layers.dense(x, units=h, activation=activation) 32 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 33 | 34 | def get_vars(scope=''): 35 | return [x for x in tf.trainable_variables() if scope in x.name] 36 | 37 | def count_vars(scope=''): 38 | v = get_vars(scope) 39 | return sum([np.prod(var.shape.as_list()) for var in v]) 40 | 41 | def gaussian_likelihood(x, mu, log_std): 42 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 43 | return tf.reduce_sum(pre_sum, axis=1) 44 | 45 | def discount_cumsum(x, discount): 46 | """ 47 | magic from rllab for computing discounted cumulative sums of vectors. 48 | 49 | input: 50 | vector x, 51 | [x0, 52 | x1, 53 | x2] 54 | 55 | output: 56 | [x0 + discount * x1 + discount^2 * x2, 57 | x1 + discount * x2, 58 | x2] 59 | """ 60 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 61 | 62 | 63 | """ 64 | Policies 65 | """ 66 | 67 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 68 | act_dim = action_space.n 69 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 70 | logp_all = tf.nn.log_softmax(logits) 71 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 72 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 73 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 74 | return pi, logp, logp_pi 75 | 76 | 77 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 78 | act_dim = a.shape.as_list()[-1] 79 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 80 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 81 | std = tf.exp(log_std) 82 | pi = mu + tf.random_normal(tf.shape(mu)) * std 83 | logp = gaussian_likelihood(a, mu, log_std) 84 | logp_pi = gaussian_likelihood(pi, mu, log_std) 85 | return pi, logp, logp_pi 86 | 87 | 88 | """ 89 | Actor-Critics 90 | """ 91 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 92 | output_activation=None, policy=None, action_space=None): 93 | 94 | # default policy builder depends on action space 95 | if policy is None and isinstance(action_space, Box): 96 | policy = mlp_gaussian_policy 97 | elif policy is None and isinstance(action_space, Discrete): 98 | policy = mlp_categorical_policy 99 | 100 | with tf.variable_scope('pi'): 101 | pi, logp, logp_pi = policy(x, a, hidden_sizes, activation, output_activation, action_space) 102 | with tf.variable_scope('v'): 103 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 104 | return pi, logp, logp_pi, v 105 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/sac/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/sac/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/sac/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | def placeholder(dim=None): 7 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 13 | for h in hidden_sizes[:-1]: 14 | x = tf.layers.dense(x, units=h, activation=activation) 15 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 16 | 17 | def get_vars(scope): 18 | return [x for x in tf.global_variables() if scope in x.name] 19 | 20 | def count_vars(scope): 21 | v = get_vars(scope) 22 | return sum([np.prod(var.shape.as_list()) for var in v]) 23 | 24 | def gaussian_likelihood(x, mu, log_std): 25 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 26 | return tf.reduce_sum(pre_sum, axis=1) 27 | 28 | 29 | """ 30 | Policies 31 | """ 32 | 33 | LOG_STD_MAX = 2 34 | LOG_STD_MIN = -20 35 | 36 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 37 | act_dim = a.shape.as_list()[-1] 38 | net = mlp(x, list(hidden_sizes), activation, activation) 39 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 40 | log_std = tf.layers.dense(net, act_dim, activation=None) 41 | log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) 42 | 43 | std = tf.exp(log_std) 44 | pi = mu + tf.random_normal(tf.shape(mu)) * std 45 | logp_pi = gaussian_likelihood(pi, mu, log_std) 46 | return mu, pi, logp_pi 47 | 48 | def apply_squashing_func(mu, pi, logp_pi): 49 | # Adjustment to log prob 50 | # NOTE: This formula is a little bit magic. To get an understanding of where it 51 | # comes from, check out the original SAC paper (arXiv 1801.01290) and look in 52 | # appendix C. This is a more numerically-stable equivalent to Eq 21. 53 | # Try deriving it yourself as a (very difficult) exercise. :) 54 | logp_pi -= tf.reduce_sum(2*(np.log(2) - pi - tf.nn.softplus(-2*pi)), axis=1) 55 | 56 | # Squash those unbounded actions! 
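    # (tanh maps the unbounded Gaussian sample into (-1, 1); the caller in
    # mlp_actor_critic then rescales by action_space.high[0].)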
57 | mu = tf.tanh(mu) 58 | pi = tf.tanh(pi) 59 | return mu, pi, logp_pi 60 | 61 | """ 62 | Actor-Critics 63 | """ 64 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 65 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 66 | # policy 67 | with tf.variable_scope('pi'): 68 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 69 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 70 | 71 | # make sure actions are in correct range 72 | action_scale = action_space.high[0] 73 | mu *= action_scale 74 | pi *= action_scale 75 | 76 | # vfs 77 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 78 | with tf.variable_scope('q1'): 79 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 80 | with tf.variable_scope('q2'): 81 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 82 | return mu, pi, logp_pi, q1, q2 83 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/td3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/td3/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/td3/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | def placeholders(*args): 9 | return [placeholder(dim) for dim in args] 10 | 11 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 12 | for h in hidden_sizes[:-1]: 13 | x = tf.layers.dense(x, units=h, activation=activation) 14 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 15 | 16 | def get_vars(scope): 17 | return [x for x in tf.global_variables() if scope in x.name] 18 | 19 | def count_vars(scope): 20 | v = get_vars(scope) 21 | return sum([np.prod(var.shape.as_list()) for var in v]) 22 | 23 | """ 24 | Actor-Critics 25 | """ 26 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 27 | output_activation=tf.tanh, action_space=None): 28 | act_dim = a.shape.as_list()[-1] 29 | act_limit = action_space.high[0] 30 | with tf.variable_scope('pi'): 31 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 32 | with tf.variable_scope('q1'): 33 | q1 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 34 | with tf.variable_scope('q2'): 35 | q2 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 36 | with tf.variable_scope('q1', reuse=True): 37 | q1_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 38 | return pi, q1, q2, q1_pi 39 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/trpo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/trpo/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/trpo/core.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def keys_as_sorted_list(dict): 14 | return sorted(list(dict.keys())) 15 | 16 | def values_as_sorted_list(dict): 17 | return [dict[k] for k in keys_as_sorted_list(dict)] 18 | 19 | def placeholder(dim=None): 20 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 21 | 22 | def placeholders(*args): 23 | return [placeholder(dim) for dim in args] 24 | 25 | def placeholder_from_space(space): 26 | if isinstance(space, Box): 27 | return placeholder(space.shape) 28 | elif isinstance(space, Discrete): 29 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 30 | raise NotImplementedError 31 | 32 | def placeholders_from_spaces(*args): 33 | return [placeholder_from_space(space) for space in args] 34 | 35 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 36 | for h in hidden_sizes[:-1]: 37 | x = tf.layers.dense(x, units=h, activation=activation) 38 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 39 | 40 | def get_vars(scope=''): 41 | return [x for x in tf.trainable_variables() if scope in x.name] 42 | 43 | def count_vars(scope=''): 44 | v = get_vars(scope) 45 | return sum([np.prod(var.shape.as_list()) for var in v]) 46 | 47 | def gaussian_likelihood(x, mu, log_std): 48 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 49 | return tf.reduce_sum(pre_sum, axis=1) 50 | 51 | def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): 52 | """ 53 | tf symbol for mean KL divergence between two batches of diagonal gaussian distributions, 54 | where distributions are specified by means and log stds. 55 | (https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions) 56 | """ 57 | var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) 58 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1 + EPS) - 1) + log_std1 - log_std0 59 | all_kls = tf.reduce_sum(pre_sum, axis=1) 60 | return tf.reduce_mean(all_kls) 61 | 62 | def categorical_kl(logp0, logp1): 63 | """ 64 | tf symbol for mean KL divergence between two batches of categorical probability distributions, 65 | where the distributions are input as log probs. 
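    Concretely, this is the batch mean of sum_a exp(logp1) * (logp1 - logp0), i.e. KL(P1 || P0).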
66 | """ 67 | all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1) 68 | return tf.reduce_mean(all_kls) 69 | 70 | def flat_concat(xs): 71 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 72 | 73 | def flat_grad(f, params): 74 | return flat_concat(tf.gradients(xs=params, ys=f)) 75 | 76 | def hessian_vector_product(f, params): 77 | # for H = grad**2 f, compute Hx 78 | g = flat_grad(f, params) 79 | x = tf.placeholder(tf.float32, shape=g.shape) 80 | return x, flat_grad(tf.reduce_sum(g*x), params) 81 | 82 | def assign_params_from_flat(x, params): 83 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 84 | splits = tf.split(x, [flat_size(p) for p in params]) 85 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 86 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 87 | 88 | def discount_cumsum(x, discount): 89 | """ 90 | magic from rllab for computing discounted cumulative sums of vectors. 91 | 92 | input: 93 | vector x, 94 | [x0, 95 | x1, 96 | x2] 97 | 98 | output: 99 | [x0 + discount * x1 + discount^2 * x2, 100 | x1 + discount * x2, 101 | x2] 102 | """ 103 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 104 | 105 | """ 106 | Policies 107 | """ 108 | 109 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 110 | act_dim = action_space.n 111 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 112 | logp_all = tf.nn.log_softmax(logits) 113 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 114 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 115 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 116 | 117 | old_logp_all = placeholder(act_dim) 118 | d_kl = categorical_kl(logp_all, old_logp_all) 119 | 120 | info = {'logp_all': logp_all} 121 | info_phs = {'logp_all': old_logp_all} 122 | 123 | return pi, logp, logp_pi, info, info_phs, d_kl 124 | 125 | 126 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 127 | act_dim = a.shape.as_list()[-1] 128 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 129 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 130 | std = tf.exp(log_std) 131 | pi = mu + tf.random_normal(tf.shape(mu)) * std 132 | logp = gaussian_likelihood(a, mu, log_std) 133 | logp_pi = gaussian_likelihood(pi, mu, log_std) 134 | 135 | old_mu_ph, old_log_std_ph = placeholders(act_dim, act_dim) 136 | d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) 137 | 138 | info = {'mu': mu, 'log_std': log_std} 139 | info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph} 140 | 141 | return pi, logp, logp_pi, info, info_phs, d_kl 142 | 143 | 144 | """ 145 | Actor-Critics 146 | """ 147 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 148 | output_activation=None, policy=None, action_space=None): 149 | 150 | # default policy builder depends on action space 151 | if policy is None and isinstance(action_space, Box): 152 | policy = mlp_gaussian_policy 153 | elif policy is None and isinstance(action_space, Discrete): 154 | policy = mlp_categorical_policy 155 | 156 | with tf.variable_scope('pi'): 157 | policy_outs = policy(x, a, hidden_sizes, activation, output_activation, action_space) 158 | pi, logp, logp_pi, info, info_phs, d_kl = policy_outs 159 | with tf.variable_scope('v'): 160 | v = tf.squeeze(mlp(x, 
list(hidden_sizes)+[1], activation, None), axis=1) 161 | return pi, logp, logp_pi, info, info_phs, d_kl, v 162 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/vpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/vpg/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/vpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def placeholder(dim=None): 14 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 15 | 16 | def placeholders(*args): 17 | return [placeholder(dim) for dim in args] 18 | 19 | def placeholder_from_space(space): 20 | if isinstance(space, Box): 21 | return placeholder(space.shape) 22 | elif isinstance(space, Discrete): 23 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 24 | raise NotImplementedError 25 | 26 | def placeholders_from_spaces(*args): 27 | return [placeholder_from_space(space) for space in args] 28 | 29 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 30 | for h in hidden_sizes[:-1]: 31 | x = tf.layers.dense(x, units=h, activation=activation) 32 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 33 | 34 | def get_vars(scope=''): 35 | return [x for x in tf.trainable_variables() if scope in x.name] 36 | 37 | def count_vars(scope=''): 38 | v = get_vars(scope) 39 | return sum([np.prod(var.shape.as_list()) for var in v]) 40 | 41 | def gaussian_likelihood(x, mu, log_std): 42 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 43 | return tf.reduce_sum(pre_sum, axis=1) 44 | 45 | def discount_cumsum(x, discount): 46 | """ 47 | magic from rllab for computing discounted cumulative sums of vectors. 
48 | 49 | input: 50 | vector x, 51 | [x0, 52 | x1, 53 | x2] 54 | 55 | output: 56 | [x0 + discount * x1 + discount^2 * x2, 57 | x1 + discount * x2, 58 | x2] 59 | """ 60 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 61 | 62 | 63 | """ 64 | Policies 65 | """ 66 | 67 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 68 | act_dim = action_space.n 69 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 70 | logp_all = tf.nn.log_softmax(logits) 71 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 72 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 73 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 74 | return pi, logp, logp_pi 75 | 76 | 77 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 78 | act_dim = a.shape.as_list()[-1] 79 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 80 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 81 | std = tf.exp(log_std) 82 | pi = mu + tf.random_normal(tf.shape(mu)) * std 83 | logp = gaussian_likelihood(a, mu, log_std) 84 | logp_pi = gaussian_likelihood(pi, mu, log_std) 85 | return pi, logp, logp_pi 86 | 87 | 88 | """ 89 | Actor-Critics 90 | """ 91 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 92 | output_activation=None, policy=None, action_space=None): 93 | 94 | # default policy builder depends on action space 95 | if policy is None and isinstance(action_space, Box): 96 | policy = mlp_gaussian_policy 97 | elif policy is None and isinstance(action_space, Discrete): 98 | policy = mlp_categorical_policy 99 | 100 | with tf.variable_scope('pi'): 101 | pi, logp, logp_pi = policy(x, a, hidden_sizes, activation, output_activation, action_space) 102 | with tf.variable_scope('v'): 103 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 104 | return pi, logp, logp_pi, v 105 | -------------------------------------------------------------------------------- /spinningup/spinup/examples/pytorch/bench_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | from spinup.utils.run_utils import ExperimentGrid 2 | from spinup import ppo_pytorch 3 | import torch 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--cpu', type=int, default=4) 9 | parser.add_argument('--num_runs', type=int, default=3) 10 | args = parser.parse_args() 11 | 12 | eg = ExperimentGrid(name='ppo-pyt-bench') 13 | eg.add('env_name', 'CartPole-v0', '', True) 14 | eg.add('seed', [10*i for i in range(args.num_runs)]) 15 | eg.add('epochs', 10) 16 | eg.add('steps_per_epoch', 4000) 17 | eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid') 18 | eg.add('ac_kwargs:activation', [torch.nn.Tanh, torch.nn.ReLU], '') 19 | eg.run(ppo_pytorch, num_cpu=args.cpu) -------------------------------------------------------------------------------- /spinningup/spinup/examples/pytorch/pg_math/1_simple_pg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions.categorical import Categorical 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | from gym.spaces import Discrete, Box 8 | 9 | def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity): 10 | # Build a feedforward neural network. 
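    # Added note (not part of the original file): for sizes=[4, 32, 2] with the default
    # activations, the loop below assembles
    #     nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2), nn.Identity())
    # i.e. Tanh after the hidden layer and Identity (raw logits) after the output layer.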
11 | layers = [] 12 | for j in range(len(sizes)-1): 13 | act = activation if j < len(sizes)-2 else output_activation 14 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 15 | return nn.Sequential(*layers) 16 | 17 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 18 | epochs=50, batch_size=5000, render=False): 19 | 20 | # make environment, check spaces, get obs / act dims 21 | env = gym.make(env_name) 22 | assert isinstance(env.observation_space, Box), \ 23 | "This example only works for envs with continuous state spaces." 24 | assert isinstance(env.action_space, Discrete), \ 25 | "This example only works for envs with discrete action spaces." 26 | 27 | obs_dim = env.observation_space.shape[0] 28 | n_acts = env.action_space.n 29 | 30 | # make core of policy network 31 | logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]) 32 | 33 | # make function to compute action distribution 34 | def get_policy(obs): 35 | logits = logits_net(obs) 36 | return Categorical(logits=logits) 37 | 38 | # make action selection function (outputs int actions, sampled from policy) 39 | def get_action(obs): 40 | return get_policy(obs).sample().item() 41 | 42 | # make loss function whose gradient, for the right data, is policy gradient 43 | def compute_loss(obs, act, weights): 44 | logp = get_policy(obs).log_prob(act) 45 | return -(logp * weights).mean() 46 | 47 | # make optimizer 48 | optimizer = Adam(logits_net.parameters(), lr=lr) 49 | 50 | # for training policy 51 | def train_one_epoch(): 52 | # make some empty lists for logging. 53 | batch_obs = [] # for observations 54 | batch_acts = [] # for actions 55 | batch_weights = [] # for R(tau) weighting in policy gradient 56 | batch_rets = [] # for measuring episode returns 57 | batch_lens = [] # for measuring episode lengths 58 | 59 | # reset episode-specific variables 60 | obs = env.reset() # first obs comes from starting distribution 61 | done = False # signal from environment that episode is over 62 | ep_rews = [] # list for rewards accrued throughout ep 63 | 64 | # render first episode of each epoch 65 | finished_rendering_this_epoch = False 66 | 67 | # collect experience by acting in the environment with current policy 68 | while True: 69 | 70 | # rendering 71 | if (not finished_rendering_this_epoch) and render: 72 | env.render() 73 | 74 | # save obs 75 | batch_obs.append(obs.copy()) 76 | 77 | # act in the environment 78 | act = get_action(torch.as_tensor(obs, dtype=torch.float32)) 79 | obs, rew, done, _ = env.step(act) 80 | 81 | # save action, reward 82 | batch_acts.append(act) 83 | ep_rews.append(rew) 84 | 85 | if done: 86 | # if episode is over, record info about episode 87 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 88 | batch_rets.append(ep_ret) 89 | batch_lens.append(ep_len) 90 | 91 | # the weight for each logprob(a|s) is R(tau) 92 | batch_weights += [ep_ret] * ep_len 93 | 94 | # reset episode-specific variables 95 | obs, done, ep_rews = env.reset(), False, [] 96 | 97 | # won't render again this epoch 98 | finished_rendering_this_epoch = True 99 | 100 | # end experience loop if we have enough of it 101 | if len(batch_obs) > batch_size: 102 | break 103 | 104 | # take a single policy gradient update step 105 | optimizer.zero_grad() 106 | batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32), 107 | act=torch.as_tensor(batch_acts, dtype=torch.int32), 108 | weights=torch.as_tensor(batch_weights, dtype=torch.float32) 109 | ) 110 | batch_loss.backward() 111 | optimizer.step() 112 | return batch_loss, batch_rets, 
batch_lens 113 | 114 | # training loop 115 | for i in range(epochs): 116 | batch_loss, batch_rets, batch_lens = train_one_epoch() 117 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 118 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 119 | 120 | if __name__ == '__main__': 121 | import argparse 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 124 | parser.add_argument('--render', action='store_true') 125 | parser.add_argument('--lr', type=float, default=1e-2) 126 | args = parser.parse_args() 127 | print('\nUsing simplest formulation of policy gradient.\n') 128 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinningup/spinup/examples/pytorch/pg_math/2_rtg_pg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions.categorical import Categorical 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | from gym.spaces import Discrete, Box 8 | 9 | def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity): 10 | # Build a feedforward neural network. 11 | layers = [] 12 | for j in range(len(sizes)-1): 13 | act = activation if j < len(sizes)-2 else output_activation 14 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 15 | return nn.Sequential(*layers) 16 | 17 | def reward_to_go(rews): 18 | n = len(rews) 19 | rtgs = np.zeros_like(rews) 20 | for i in reversed(range(n)): 21 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 22 | return rtgs 23 | 24 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 25 | epochs=50, batch_size=5000, render=False): 26 | 27 | # make environment, check spaces, get obs / act dims 28 | env = gym.make(env_name) 29 | assert isinstance(env.observation_space, Box), \ 30 | "This example only works for envs with continuous state spaces." 31 | assert isinstance(env.action_space, Discrete), \ 32 | "This example only works for envs with discrete action spaces." 33 | 34 | obs_dim = env.observation_space.shape[0] 35 | n_acts = env.action_space.n 36 | 37 | # make core of policy network 38 | logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]) 39 | 40 | # make function to compute action distribution 41 | def get_policy(obs): 42 | logits = logits_net(obs) 43 | return Categorical(logits=logits) 44 | 45 | # make action selection function (outputs int actions, sampled from policy) 46 | def get_action(obs): 47 | return get_policy(obs).sample().item() 48 | 49 | # make loss function whose gradient, for the right data, is policy gradient 50 | def compute_loss(obs, act, weights): 51 | logp = get_policy(obs).log_prob(act) 52 | return -(logp * weights).mean() 53 | 54 | # make optimizer 55 | optimizer = Adam(logits_net.parameters(), lr=lr) 56 | 57 | # for training policy 58 | def train_one_epoch(): 59 | # make some empty lists for logging. 
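        # Added note (not part of the original file): unlike 1_simple_pg.py, batch_weights
        # below holds the reward-to-go from each timestep rather than the whole-episode
        # return. For example, an episode with rewards [1., 1., 1.] contributes weights
        # [3., 2., 1.] here, versus [3., 3., 3.] in the simpler formulation.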
60 | batch_obs = [] # for observations 61 | batch_acts = [] # for actions 62 | batch_weights = [] # for reward-to-go weighting in policy gradient 63 | batch_rets = [] # for measuring episode returns 64 | batch_lens = [] # for measuring episode lengths 65 | 66 | # reset episode-specific variables 67 | obs = env.reset() # first obs comes from starting distribution 68 | done = False # signal from environment that episode is over 69 | ep_rews = [] # list for rewards accrued throughout ep 70 | 71 | # render first episode of each epoch 72 | finished_rendering_this_epoch = False 73 | 74 | # collect experience by acting in the environment with current policy 75 | while True: 76 | 77 | # rendering 78 | if (not finished_rendering_this_epoch) and render: 79 | env.render() 80 | 81 | # save obs 82 | batch_obs.append(obs.copy()) 83 | 84 | # act in the environment 85 | act = get_action(torch.as_tensor(obs, dtype=torch.float32)) 86 | obs, rew, done, _ = env.step(act) 87 | 88 | # save action, reward 89 | batch_acts.append(act) 90 | ep_rews.append(rew) 91 | 92 | if done: 93 | # if episode is over, record info about episode 94 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 95 | batch_rets.append(ep_ret) 96 | batch_lens.append(ep_len) 97 | 98 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 99 | batch_weights += list(reward_to_go(ep_rews)) 100 | 101 | # reset episode-specific variables 102 | obs, done, ep_rews = env.reset(), False, [] 103 | 104 | # won't render again this epoch 105 | finished_rendering_this_epoch = True 106 | 107 | # end experience loop if we have enough of it 108 | if len(batch_obs) > batch_size: 109 | break 110 | 111 | # take a single policy gradient update step 112 | optimizer.zero_grad() 113 | batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32), 114 | act=torch.as_tensor(batch_acts, dtype=torch.int32), 115 | weights=torch.as_tensor(batch_weights, dtype=torch.float32) 116 | ) 117 | batch_loss.backward() 118 | optimizer.step() 119 | return batch_loss, batch_rets, batch_lens 120 | 121 | # training loop 122 | for i in range(epochs): 123 | batch_loss, batch_rets, batch_lens = train_one_epoch() 124 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 125 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 126 | 127 | if __name__ == '__main__': 128 | import argparse 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 131 | parser.add_argument('--render', action='store_true') 132 | parser.add_argument('--lr', type=float, default=1e-2) 133 | args = parser.parse_args() 134 | print('\nUsing reward-to-go formulation of policy gradient.\n') 135 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinningup/spinup/examples/tf1/bench_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | from spinup.utils.run_utils import ExperimentGrid 2 | from spinup import ppo_tf1 3 | import tensorflow as tf 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--cpu', type=int, default=4) 9 | parser.add_argument('--num_runs', type=int, default=3) 10 | args = parser.parse_args() 11 | 12 | eg = ExperimentGrid(name='ppo-tf1-bench') 13 | eg.add('env_name', 'CartPole-v0', '', True) 14 | eg.add('seed', [10*i for i in range(args.num_runs)]) 15 | eg.add('epochs', 10) 16 | 
eg.add('steps_per_epoch', 4000) 17 | eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid') 18 | eg.add('ac_kwargs:activation', [tf.tanh, tf.nn.relu], '') 19 | eg.run(ppo_tf1, num_cpu=args.cpu) -------------------------------------------------------------------------------- /spinningup/spinup/examples/tf1/pg_math/1_simple_pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.spaces import Discrete, Box 5 | 6 | def mlp(x, sizes, activation=tf.tanh, output_activation=None): 7 | # Build a feedforward neural network. 8 | for size in sizes[:-1]: 9 | x = tf.layers.dense(x, units=size, activation=activation) 10 | return tf.layers.dense(x, units=sizes[-1], activation=output_activation) 11 | 12 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 13 | epochs=50, batch_size=5000, render=False): 14 | 15 | # make environment, check spaces, get obs / act dims 16 | env = gym.make(env_name) 17 | assert isinstance(env.observation_space, Box), \ 18 | "This example only works for envs with continuous state spaces." 19 | assert isinstance(env.action_space, Discrete), \ 20 | "This example only works for envs with discrete action spaces." 21 | 22 | obs_dim = env.observation_space.shape[0] 23 | n_acts = env.action_space.n 24 | 25 | # make core of policy network 26 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) 27 | logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts]) 28 | 29 | # make action selection op (outputs int actions, sampled from policy) 30 | actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1) 31 | 32 | # make loss function whose gradient, for the right data, is policy gradient 33 | weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32) 34 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32) 35 | action_masks = tf.one_hot(act_ph, n_acts) 36 | log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1) 37 | loss = -tf.reduce_mean(weights_ph * log_probs) 38 | 39 | # make train op 40 | train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) 41 | 42 | sess = tf.InteractiveSession() 43 | sess.run(tf.global_variables_initializer()) 44 | 45 | # for training policy 46 | def train_one_epoch(): 47 | # make some empty lists for logging. 
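        # Added note (not part of the original file): each call to this function gathers at
        # least batch_size environment steps and then takes a single gradient step on
        #     loss = -mean(weights * log pi(a|s)),
        # where every step of an episode is weighted by that episode's total return R(tau).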
48 | batch_obs = [] # for observations 49 | batch_acts = [] # for actions 50 | batch_weights = [] # for R(tau) weighting in policy gradient 51 | batch_rets = [] # for measuring episode returns 52 | batch_lens = [] # for measuring episode lengths 53 | 54 | # reset episode-specific variables 55 | obs = env.reset() # first obs comes from starting distribution 56 | done = False # signal from environment that episode is over 57 | ep_rews = [] # list for rewards accrued throughout ep 58 | 59 | # render first episode of each epoch 60 | finished_rendering_this_epoch = False 61 | 62 | # collect experience by acting in the environment with current policy 63 | while True: 64 | 65 | # rendering 66 | if (not finished_rendering_this_epoch) and render: 67 | env.render() 68 | 69 | # save obs 70 | batch_obs.append(obs.copy()) 71 | 72 | # act in the environment 73 | act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0] 74 | obs, rew, done, _ = env.step(act) 75 | 76 | # save action, reward 77 | batch_acts.append(act) 78 | ep_rews.append(rew) 79 | 80 | if done: 81 | # if episode is over, record info about episode 82 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 83 | batch_rets.append(ep_ret) 84 | batch_lens.append(ep_len) 85 | 86 | # the weight for each logprob(a|s) is R(tau) 87 | batch_weights += [ep_ret] * ep_len 88 | 89 | # reset episode-specific variables 90 | obs, done, ep_rews = env.reset(), False, [] 91 | 92 | # won't render again this epoch 93 | finished_rendering_this_epoch = True 94 | 95 | # end experience loop if we have enough of it 96 | if len(batch_obs) > batch_size: 97 | break 98 | 99 | # take a single policy gradient update step 100 | batch_loss, _ = sess.run([loss, train_op], 101 | feed_dict={ 102 | obs_ph: np.array(batch_obs), 103 | act_ph: np.array(batch_acts), 104 | weights_ph: np.array(batch_weights) 105 | }) 106 | return batch_loss, batch_rets, batch_lens 107 | 108 | # training loop 109 | for i in range(epochs): 110 | batch_loss, batch_rets, batch_lens = train_one_epoch() 111 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 112 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 113 | 114 | if __name__ == '__main__': 115 | import argparse 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 118 | parser.add_argument('--render', action='store_true') 119 | parser.add_argument('--lr', type=float, default=1e-2) 120 | args = parser.parse_args() 121 | print('\nUsing simplest formulation of policy gradient.\n') 122 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinningup/spinup/examples/tf1/pg_math/2_rtg_pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.spaces import Discrete, Box 5 | 6 | def mlp(x, sizes, activation=tf.tanh, output_activation=None): 7 | # Build a feedforward neural network. 
8 | for size in sizes[:-1]: 9 | x = tf.layers.dense(x, units=size, activation=activation) 10 | return tf.layers.dense(x, units=sizes[-1], activation=output_activation) 11 | 12 | def reward_to_go(rews): 13 | n = len(rews) 14 | rtgs = np.zeros_like(rews) 15 | for i in reversed(range(n)): 16 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 17 | return rtgs 18 | 19 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 20 | epochs=50, batch_size=5000, render=False): 21 | 22 | # make environment, check spaces, get obs / act dims 23 | env = gym.make(env_name) 24 | assert isinstance(env.observation_space, Box), \ 25 | "This example only works for envs with continuous state spaces." 26 | assert isinstance(env.action_space, Discrete), \ 27 | "This example only works for envs with discrete action spaces." 28 | 29 | obs_dim = env.observation_space.shape[0] 30 | n_acts = env.action_space.n 31 | 32 | # make core of policy network 33 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) 34 | logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts]) 35 | 36 | # make action selection op (outputs int actions, sampled from policy) 37 | actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1) 38 | 39 | # make loss function whose gradient, for the right data, is policy gradient 40 | weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32) 41 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32) 42 | action_masks = tf.one_hot(act_ph, n_acts) 43 | log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1) 44 | loss = -tf.reduce_mean(weights_ph * log_probs) 45 | 46 | # make train op 47 | train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) 48 | 49 | sess = tf.InteractiveSession() 50 | sess.run(tf.global_variables_initializer()) 51 | 52 | # for training policy 53 | def train_one_epoch(): 54 | # make some empty lists for logging. 
55 | batch_obs = [] # for observations 56 | batch_acts = [] # for actions 57 | batch_weights = [] # for reward-to-go weighting in policy gradient 58 | batch_rets = [] # for measuring episode returns 59 | batch_lens = [] # for measuring episode lengths 60 | 61 | # reset episode-specific variables 62 | obs = env.reset() # first obs comes from starting distribution 63 | done = False # signal from environment that episode is over 64 | ep_rews = [] # list for rewards accrued throughout ep 65 | 66 | # render first episode of each epoch 67 | finished_rendering_this_epoch = False 68 | 69 | # collect experience by acting in the environment with current policy 70 | while True: 71 | 72 | # rendering 73 | if (not finished_rendering_this_epoch) and render: 74 | env.render() 75 | 76 | # save obs 77 | batch_obs.append(obs.copy()) 78 | 79 | # act in the environment 80 | act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0] 81 | obs, rew, done, _ = env.step(act) 82 | 83 | # save action, reward 84 | batch_acts.append(act) 85 | ep_rews.append(rew) 86 | 87 | if done: 88 | # if episode is over, record info about episode 89 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 90 | batch_rets.append(ep_ret) 91 | batch_lens.append(ep_len) 92 | 93 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 94 | batch_weights += list(reward_to_go(ep_rews)) 95 | 96 | # reset episode-specific variables 97 | obs, done, ep_rews = env.reset(), False, [] 98 | 99 | # won't render again this epoch 100 | finished_rendering_this_epoch = True 101 | 102 | # end experience loop if we have enough of it 103 | if len(batch_obs) > batch_size: 104 | break 105 | 106 | # take a single policy gradient update step 107 | batch_loss, _ = sess.run([loss, train_op], 108 | feed_dict={ 109 | obs_ph: np.array(batch_obs), 110 | act_ph: np.array(batch_acts), 111 | weights_ph: np.array(batch_weights) 112 | }) 113 | return batch_loss, batch_rets, batch_lens 114 | 115 | # training loop 116 | for i in range(epochs): 117 | batch_loss, batch_rets, batch_lens = train_one_epoch() 118 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 119 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 120 | 121 | if __name__ == '__main__': 122 | import argparse 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 125 | parser.add_argument('--render', action='store_true') 126 | parser.add_argument('--lr', type=float, default=1e-2) 127 | args = parser.parse_args() 128 | print('\nUsing reward-to-go formulation of policy gradient.\n') 129 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinningup/spinup/examples/tf1/train_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from spinup.utils.logx import EpochLogger 5 | 6 | 7 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 8 | for h in hidden_sizes[:-1]: 9 | x = tf.layers.dense(x, units=h, activation=activation) 10 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 11 | 12 | 13 | # Simple script for training an MLP on MNIST. 
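# Added note (not part of the original file): besides the MLP itself, this script mainly
# demonstrates the EpochLogger workflow used by the RL algorithms -- store() metrics inside
# the inner loop, then log_tabular()/dump_tabular() once per epoch.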
14 | def train_mnist(steps_per_epoch=100, epochs=5, 15 | lr=1e-3, layers=2, hidden_size=64, 16 | logger_kwargs=dict(), save_freq=1): 17 | 18 | logger = EpochLogger(**logger_kwargs) 19 | logger.save_config(locals()) 20 | 21 | # Load and preprocess MNIST data 22 | (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() 23 | x_train = x_train.reshape(-1, 28*28) / 255.0 24 | 25 | # Define inputs & main outputs from computation graph 26 | x_ph = tf.placeholder(tf.float32, shape=(None, 28*28)) 27 | y_ph = tf.placeholder(tf.int32, shape=(None,)) 28 | logits = mlp(x_ph, hidden_sizes=[hidden_size]*layers + [10], activation=tf.nn.relu) 29 | predict = tf.argmax(logits, axis=1, output_type=tf.int32) 30 | 31 | # Define loss function, accuracy, and training op 32 | y = tf.one_hot(y_ph, 10) 33 | loss = tf.losses.softmax_cross_entropy(y, logits) 34 | acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) 35 | train_op = tf.train.AdamOptimizer().minimize(loss) 36 | 37 | # Prepare session 38 | sess = tf.Session() 39 | sess.run(tf.global_variables_initializer()) 40 | 41 | # Setup model saving 42 | logger.setup_tf_saver(sess, inputs={'x': x_ph}, 43 | outputs={'logits': logits, 'predict': predict}) 44 | 45 | start_time = time.time() 46 | 47 | # Run main training loop 48 | for epoch in range(epochs): 49 | for t in range(steps_per_epoch): 50 | idxs = np.random.randint(0, len(x_train), 32) 51 | feed_dict = {x_ph: x_train[idxs], 52 | y_ph: y_train[idxs]} 53 | outs = sess.run([loss, acc, train_op], feed_dict=feed_dict) 54 | logger.store(Loss=outs[0], Acc=outs[1]) 55 | 56 | # Save model 57 | if (epoch % save_freq == 0) or (epoch == epochs-1): 58 | logger.save_state(state_dict=dict(), itr=None) 59 | 60 | # Log info about epoch 61 | logger.log_tabular('Epoch', epoch) 62 | logger.log_tabular('Acc', with_min_and_max=True) 63 | logger.log_tabular('Loss', average_only=True) 64 | logger.log_tabular('TotalGradientSteps', (epoch+1)*steps_per_epoch) 65 | logger.log_tabular('Time', time.time()-start_time) 66 | logger.dump_tabular() 67 | 68 | if __name__ == '__main__': 69 | train_mnist() -------------------------------------------------------------------------------- /spinningup/spinup/exercises/common.py: -------------------------------------------------------------------------------- 1 | def print_result(correct=False): 2 | print('\n'*5 + '='*50 + '\n'*3) 3 | if correct: 4 | print("Congratulations! Your answer is correct.") 5 | else: 6 | print("Your answer appears to be incorrect. Try again!") 7 | print('\n'*3 + '='*50) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1/exercise1_1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | """ 5 | 6 | Exercise 1.1: Diagonal Gaussian Likelihood 7 | 8 | Write a function that takes in PyTorch Tensors for the means and 9 | log stds of a batch of diagonal Gaussian distributions, along with a 10 | PyTorch Tensor for (previously-generated) samples from those 11 | distributions, and returns a Tensor containing the log 12 | likelihoods of those samples. 
13 | 14 | """ 15 | 16 | def gaussian_likelihood(x, mu, log_std): 17 | """ 18 | Args: 19 | x: Tensor with shape [batch, dim] 20 | mu: Tensor with shape [batch, dim] 21 | log_std: Tensor with shape [batch, dim] or [dim] 22 | 23 | Returns: 24 | Tensor with shape [batch] 25 | """ 26 | ####################### 27 | # # 28 | # YOUR CODE HERE # 29 | # # 30 | ####################### 31 | return torch.zeros(1) 32 | 33 | 34 | if __name__ == '__main__': 35 | """ 36 | Run this file to verify your solution. 37 | """ 38 | from spinup.exercises.pytorch.problem_set_1_solutions import exercise1_1_soln 39 | from spinup.exercises.common import print_result 40 | 41 | batch_size = 32 42 | dim = 10 43 | 44 | x = torch.rand(batch_size, dim) 45 | mu = torch.rand(batch_size, dim) 46 | log_std = torch.rand(dim) 47 | 48 | your_gaussian_likelihood = gaussian_likelihood(x, mu, log_std) 49 | true_gaussian_likelihood = exercise1_1_soln.gaussian_likelihood(x, mu, log_std) 50 | 51 | your_result = your_gaussian_likelihood.detach().numpy() 52 | true_result = true_gaussian_likelihood.detach().numpy() 53 | 54 | correct = np.allclose(your_result, true_result) 55 | print_result(correct) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1/exercise1_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from spinup.exercises.pytorch.problem_set_1 import exercise1_1 5 | from spinup.exercises.pytorch.problem_set_1 import exercise1_2_auxiliary 6 | 7 | """ 8 | 9 | Exercise 1.2: PPO Gaussian Policy 10 | 11 | You will implement an MLP diagonal Gaussian policy for PPO by 12 | writing an MLP-builder, and a few other key functions. 13 | 14 | Log-likelihoods will be computed using your answer to Exercise 1.1, 15 | so make sure to complete that exercise before beginning this one. 16 | 17 | """ 18 | 19 | def mlp(sizes, activation, output_activation=nn.Identity): 20 | """ 21 | Build a multi-layer perceptron in PyTorch. 22 | 23 | Args: 24 | sizes: Tuple, list, or other iterable giving the number of units 25 | for each layer of the MLP. 26 | 27 | activation: Activation function for all layers except last. 28 | 29 | output_activation: Activation function for last layer. 30 | 31 | Returns: 32 | A PyTorch module that can be called to give the output of the MLP. 33 | (Use an nn.Sequential module.) 34 | 35 | """ 36 | ####################### 37 | # # 38 | # YOUR CODE HERE # 39 | # # 40 | ####################### 41 | pass 42 | 43 | class DiagonalGaussianDistribution: 44 | 45 | def __init__(self, mu, log_std): 46 | self.mu = mu 47 | self.log_std = log_std 48 | 49 | def sample(self): 50 | """ 51 | Returns: 52 | A PyTorch Tensor of samples from the diagonal Gaussian distribution with 53 | mean and log_std given by self.mu and self.log_std. 
54 | """ 55 | ####################### 56 | # # 57 | # YOUR CODE HERE # 58 | # # 59 | ####################### 60 | pass 61 | 62 | #================================(Given, ignore)==========================================# 63 | def log_prob(self, value): 64 | return exercise1_1.gaussian_likelihood(value, self.mu, self.log_std) 65 | 66 | def entropy(self): 67 | return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1) 68 | #=========================================================================================# 69 | 70 | 71 | class MLPGaussianActor(nn.Module): 72 | 73 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 74 | super().__init__() 75 | """ 76 | Initialize an MLP Gaussian Actor by making a PyTorch module for computing the 77 | mean of the distribution given a batch of observations, and a log_std parameter. 78 | 79 | Make log_std a PyTorch Parameter with the same shape as the action vector, 80 | independent of observations, initialized to [-0.5, -0.5, ..., -0.5]. 81 | (Make sure it's trainable!) 82 | """ 83 | ####################### 84 | # # 85 | # YOUR CODE HERE # 86 | # # 87 | ####################### 88 | # self.log_std = 89 | # self.mu_net = 90 | pass 91 | 92 | #================================(Given, ignore)==========================================# 93 | def forward(self, obs, act=None): 94 | mu = self.mu_net(obs) 95 | pi = DiagonalGaussianDistribution(mu, self.log_std) 96 | logp_a = None 97 | if act is not None: 98 | logp_a = pi.log_prob(act) 99 | return pi, logp_a 100 | #=========================================================================================# 101 | 102 | 103 | 104 | if __name__ == '__main__': 105 | """ 106 | Run this file to verify your solution. 107 | """ 108 | 109 | from spinup import ppo_pytorch as ppo 110 | from spinup.exercises.common import print_result 111 | from functools import partial 112 | import gym 113 | import os 114 | import pandas as pd 115 | import psutil 116 | import time 117 | 118 | logdir = "/tmp/experiments/%i"%int(time.time()) 119 | 120 | ActorCritic = partial(exercise1_2_auxiliary.ExerciseActorCritic, actor=MLPGaussianActor) 121 | 122 | ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'), 123 | actor_critic=ActorCritic, 124 | ac_kwargs=dict(hidden_sizes=(64,)), 125 | steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) 126 | 127 | # Get scores from last five epochs to evaluate success. 128 | data = pd.read_table(os.path.join(logdir,'progress.txt')) 129 | last_scores = data['AverageEpRet'][-5:] 130 | 131 | # Your implementation is probably correct if the agent has a score >500, 132 | # or if it reaches the top possible score of 1000, in the last five epochs. 133 | correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 134 | print_result(correct) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1/exercise1_2_auxiliary.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | """ 6 | 7 | Auxiliary code for Exercise 1.2. No part of the exercise requires you to 8 | look into or modify this file (and since it contains an mlp function, 9 | it has spoilers for the answer). Removed from the main file to avoid 10 | cluttering it up. 11 | 12 | In other words, nothing to see here, move along, these are not the 13 | droids you're looking for, and all that... 
14 | 15 | """ 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | class MLPCritic(nn.Module): 26 | 27 | def __init__(self, obs_dim, hidden_sizes, activation): 28 | super().__init__() 29 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 30 | 31 | def forward(self, obs): 32 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 33 | 34 | 35 | class ExerciseActorCritic(nn.Module): 36 | 37 | def __init__(self, observation_space, action_space, 38 | hidden_sizes=(64,64), activation=nn.Tanh, 39 | actor=None): 40 | super().__init__() 41 | obs_dim = observation_space.shape[0] 42 | self.pi = actor(obs_dim, action_space.shape[0], hidden_sizes, activation) 43 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 44 | 45 | def step(self, obs): 46 | with torch.no_grad(): 47 | pi, _ = self.pi(obs) 48 | a = pi.sample() 49 | logp_a = pi.log_prob(a) 50 | v = self.v(obs) 51 | return a.numpy(), v.numpy(), logp_a.numpy() 52 | 53 | def act(self, obs): 54 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1_solutions/exercise1_1_soln.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | EPS=1e-8 5 | 6 | def gaussian_likelihood(x, mu, log_std): 7 | pre_sum = -0.5 * (((x-mu)/(torch.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 8 | return pre_sum.sum(axis=-1) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1_solutions/exercise1_2_soln.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | EPS=1e-8 6 | 7 | def mlp(sizes, activation, output_activation=nn.Identity): 8 | layers = [] 9 | for j in range(len(sizes)-1): 10 | act = activation if j < len(sizes)-2 else output_activation 11 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 12 | return nn.Sequential(*layers) 13 | 14 | def gaussian_likelihood(x, mu, log_std): 15 | pre_sum = -0.5 * (((x-mu)/(torch.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 16 | return pre_sum.sum(axis=-1) 17 | 18 | 19 | class DiagonalGaussianDistribution: 20 | 21 | def __init__(self, mu, log_std): 22 | self.mu = mu 23 | self.log_std = log_std 24 | 25 | def sample(self): 26 | return self.mu + torch.exp(self.log_std) * torch.randn_like(self.mu) 27 | 28 | def log_prob(self, value): 29 | return gaussian_likelihood(value, self.mu, self.log_std) 30 | 31 | def entropy(self): 32 | return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1) 33 | 34 | 35 | class MLPGaussianActor(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 40 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 41 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 42 | 43 | def forward(self, obs, act=None): 44 | mu = self.mu_net(obs) 45 | pi = DiagonalGaussianDistribution(mu, self.log_std) 46 | logp_a = None 47 | if act is not None: 48 | logp_a = pi.log_prob(act) 49 | return pi, logp_a 
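# Added usage sketch (not part of the original repository file): a minimal smoke test for
# the reference actor above, assuming a hypothetical 3-dim observation space and 2-dim
# action space; it samples a batch of actions and scores them under the policy.
if __name__ == '__main__':
    actor = MLPGaussianActor(obs_dim=3, act_dim=2, hidden_sizes=(64,), activation=nn.Tanh)
    obs = torch.randn(8, 3)        # batch of 8 fake observations
    pi, _ = actor(obs)             # DiagonalGaussianDistribution over the actions
    act = pi.sample()              # -> shape [8, 2]
    logp = pi.log_prob(act)        # -> shape [8]
    print(act.shape, logp.shape)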
-------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_2/exercise2_2.py: -------------------------------------------------------------------------------- 1 | from spinup.algos.pytorch.ddpg.core import mlp, MLPActorCritic 2 | from spinup.utils.run_utils import ExperimentGrid 3 | from spinup import ddpg_pytorch as ddpg 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | """ 9 | 10 | Exercise 2.2: Silent Bug in DDPG (PyTorch Version) 11 | 12 | In this exercise, you will run DDPG with a bugged actor critic. Your goal is 13 | to determine whether or not there is any performance degredation, and if so, 14 | figure out what's going wrong. 15 | 16 | You do NOT need to write code for this exercise. 17 | 18 | """ 19 | 20 | """ 21 | Bugged Actor-Critic 22 | """ 23 | 24 | class BuggedMLPActor(nn.Module): 25 | 26 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 27 | super().__init__() 28 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 29 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 30 | self.act_limit = act_limit 31 | 32 | def forward(self, obs): 33 | # Return output from network scaled to action space limits. 34 | return self.act_limit * self.pi(obs) 35 | 36 | class BuggedMLPQFunction(nn.Module): 37 | 38 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 39 | super().__init__() 40 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 41 | 42 | def forward(self, obs, act): 43 | return self.q(torch.cat([obs, act], dim=-1)) 44 | 45 | class BuggedMLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = BuggedMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = BuggedMLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | 63 | 64 | if __name__ == '__main__': 65 | import argparse 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 68 | parser.add_argument('--h', type=int, default=300) 69 | parser.add_argument('--l', type=int, default=1) 70 | parser.add_argument('--num_runs', '-n', type=int, default=3) 71 | parser.add_argument('--steps_per_epoch', '-s', type=int, default=5000) 72 | parser.add_argument('--total_steps', '-t', type=int, default=int(5e4)) 73 | args = parser.parse_args() 74 | 75 | def ddpg_with_actor_critic(bugged, **kwargs): 76 | from spinup.exercises.pytorch.problem_set_2.exercise2_2 import BuggedMLPActorCritic 77 | actor_critic = BuggedMLPActorCritic if bugged else MLPActorCritic 78 | return ddpg(actor_critic=actor_critic, 79 | ac_kwargs=dict(hidden_sizes=[args.h]*args.l), 80 | start_steps=5000, 81 | max_ep_len=150, 82 | batch_size=64, 83 | polyak=0.95, 84 | **kwargs) 85 | 86 | eg = ExperimentGrid(name='ex2-2_ddpg') 87 | eg.add('replay_size', int(args.total_steps)) 88 | eg.add('env_name', args.env, '', True) 89 | eg.add('seed', [10*i for i in range(args.num_runs)]) 90 | eg.add('epochs', int(args.total_steps / args.steps_per_epoch)) 91 | eg.add('steps_per_epoch', args.steps_per_epoch) 92 | eg.add('bugged', [False, True]) 93 | eg.run(ddpg_with_actor_critic, 
datestamp=True) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_1/exercise1_1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | """ 5 | 6 | Exercise 1.1: Diagonal Gaussian Likelihood 7 | 8 | Write a function which takes in Tensorflow symbols for the means and 9 | log stds of a batch of diagonal Gaussian distributions, along with a 10 | Tensorflow placeholder for (previously-generated) samples from those 11 | distributions, and returns a Tensorflow symbol for computing the log 12 | likelihoods of those samples. 13 | 14 | """ 15 | 16 | def gaussian_likelihood(x, mu, log_std): 17 | """ 18 | Args: 19 | x: Tensor with shape [batch, dim] 20 | mu: Tensor with shape [batch, dim] 21 | log_std: Tensor with shape [batch, dim] or [dim] 22 | 23 | Returns: 24 | Tensor with shape [batch] 25 | """ 26 | ####################### 27 | # # 28 | # YOUR CODE HERE # 29 | # # 30 | ####################### 31 | return tf.constant(0) 32 | 33 | 34 | if __name__ == '__main__': 35 | """ 36 | Run this file to verify your solution. 37 | """ 38 | from spinup.exercises.tf1.problem_set_1_solutions import exercise1_1_soln 39 | from spinup.exercises.common import print_result 40 | 41 | sess = tf.Session() 42 | 43 | dim = 10 44 | x = tf.placeholder(tf.float32, shape=(None, dim)) 45 | mu = tf.placeholder(tf.float32, shape=(None, dim)) 46 | log_std = tf.placeholder(tf.float32, shape=(dim,)) 47 | 48 | your_gaussian_likelihood = gaussian_likelihood(x, mu, log_std) 49 | true_gaussian_likelihood = exercise1_1_soln.gaussian_likelihood(x, mu, log_std) 50 | 51 | batch_size = 32 52 | feed_dict = {x: np.random.rand(batch_size, dim), 53 | mu: np.random.rand(batch_size, dim), 54 | log_std: np.random.rand(dim)} 55 | 56 | your_result, true_result = sess.run([your_gaussian_likelihood, true_gaussian_likelihood], 57 | feed_dict=feed_dict) 58 | 59 | correct = np.allclose(your_result, true_result) 60 | print_result(correct) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_1/exercise1_2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from spinup.exercises.tf1.problem_set_1 import exercise1_1 4 | 5 | """ 6 | 7 | Exercise 1.2: PPO Gaussian Policy 8 | 9 | Implement an MLP diagonal Gaussian policy for PPO. 10 | 11 | Log-likelihoods will be computed using your answer to Exercise 1.1, 12 | so make sure to complete that exercise before beginning this one. 13 | 14 | """ 15 | 16 | EPS = 1e-8 17 | 18 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 19 | """ 20 | Builds a multi-layer perceptron in Tensorflow. 21 | 22 | Args: 23 | x: Input tensor. 24 | 25 | hidden_sizes: Tuple, list, or other iterable giving the number of units 26 | for each hidden layer of the MLP. 27 | 28 | activation: Activation function for all layers except last. 29 | 30 | output_activation: Activation function for last layer. 31 | 32 | Returns: 33 | A TF symbol for the output of an MLP that takes x as an input. 34 | 35 | """ 36 | ####################### 37 | # # 38 | # YOUR CODE HERE # 39 | # # 40 | ####################### 41 | pass 42 | 43 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 44 | """ 45 | Builds symbols to sample actions and compute log-probs of actions. 
46 | 47 | Special instructions: Make log_std a tf variable with the same shape as 48 | the action vector, independent of x, initialized to [-0.5, -0.5, ..., -0.5]. 49 | 50 | Args: 51 | x: Input tensor of states. Shape [batch, obs_dim]. 52 | 53 | a: Input tensor of actions. Shape [batch, act_dim]. 54 | 55 | hidden_sizes: Sizes of hidden layers for action network MLP. 56 | 57 | activation: Activation function for all layers except last. 58 | 59 | output_activation: Activation function for last layer (action layer). 60 | 61 | action_space: A gym.spaces object describing the action space of the 62 | environment this agent will interact with. 63 | 64 | Returns: 65 | pi: A symbol for sampling stochastic actions from a Gaussian 66 | distribution. 67 | 68 | logp: A symbol for computing log-likelihoods of actions from a Gaussian 69 | distribution. 70 | 71 | logp_pi: A symbol for computing log-likelihoods of actions in pi from a 72 | Gaussian distribution. 73 | 74 | """ 75 | ####################### 76 | # # 77 | # YOUR CODE HERE # 78 | # # 79 | ####################### 80 | # mu = 81 | # log_std = 82 | # pi = 83 | 84 | logp = exercise1_1.gaussian_likelihood(a, mu, log_std) 85 | logp_pi = exercise1_1.gaussian_likelihood(pi, mu, log_std) 86 | return pi, logp, logp_pi 87 | 88 | 89 | if __name__ == '__main__': 90 | """ 91 | Run this file to verify your solution. 92 | """ 93 | 94 | from spinup import ppo_tf1 as ppo 95 | from spinup.exercises.common import print_result 96 | import gym 97 | import os 98 | import pandas as pd 99 | import psutil 100 | import time 101 | 102 | logdir = "/tmp/experiments/%i"%int(time.time()) 103 | ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'), 104 | ac_kwargs=dict(policy=mlp_gaussian_policy, hidden_sizes=(64,)), 105 | steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) 106 | 107 | # Get scores from last five epochs to evaluate success. 108 | data = pd.read_table(os.path.join(logdir,'progress.txt')) 109 | last_scores = data['AverageEpRet'][-5:] 110 | 111 | # Your implementation is probably correct if the agent has a score >500, 112 | # or if it reaches the top possible score of 1000, in the last five epochs. 
113 | correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 114 | print_result(correct) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_1_solutions/exercise1_1_soln.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | EPS=1e-8 5 | 6 | def gaussian_likelihood(x, mu, log_std): 7 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 8 | return tf.reduce_sum(pre_sum, axis=1) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_1_solutions/exercise1_2_soln.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | EPS = 1e-8 6 | 7 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 8 | for h in hidden_sizes[:-1]: 9 | x = tf.layers.dense(x, units=h, activation=activation) 10 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 11 | 12 | def gaussian_likelihood(x, mu, log_std): 13 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 14 | return tf.reduce_sum(pre_sum, axis=1) 15 | 16 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 17 | act_dim = a.shape.as_list()[-1] 18 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 19 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 20 | std = tf.exp(log_std) 21 | pi = mu + tf.random_normal(tf.shape(mu)) * std 22 | logp = gaussian_likelihood(a, mu, log_std) 23 | logp_pi = gaussian_likelihood(pi, mu, log_std) 24 | return pi, logp, logp_pi -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_2/exercise2_2.py: -------------------------------------------------------------------------------- 1 | from spinup.algos.tf1.ddpg.core import mlp, mlp_actor_critic 2 | from spinup.utils.run_utils import ExperimentGrid 3 | from spinup import ddpg_tf1 as ddpg 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | """ 8 | 9 | Exercise 2.2: Silent Bug in DDPG 10 | 11 | In this exercise, you will run DDPG with a bugged actor critic. Your goal is 12 | to determine whether or not there is any performance degredation, and if so, 13 | figure out what's going wrong. 14 | 15 | You do NOT need to write code for this exercise. 
16 | 17 | """ 18 | 19 | """ 20 | Bugged Actor-Critic 21 | """ 22 | def bugged_mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 23 | output_activation=tf.tanh, action_space=None): 24 | act_dim = a.shape.as_list()[-1] 25 | act_limit = action_space.high[0] 26 | with tf.variable_scope('pi'): 27 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 28 | with tf.variable_scope('q'): 29 | q = mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None) 30 | with tf.variable_scope('q', reuse=True): 31 | q_pi = mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None) 32 | return pi, q, q_pi 33 | 34 | 35 | if __name__ == '__main__': 36 | import argparse 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 39 | parser.add_argument('--h', type=int, default=300) 40 | parser.add_argument('--l', type=int, default=1) 41 | parser.add_argument('--num_runs', '-n', type=int, default=3) 42 | parser.add_argument('--steps_per_epoch', '-s', type=int, default=5000) 43 | parser.add_argument('--total_steps', '-t', type=int, default=int(5e4)) 44 | args = parser.parse_args() 45 | 46 | def ddpg_with_actor_critic(bugged, **kwargs): 47 | actor_critic = bugged_mlp_actor_critic if bugged else mlp_actor_critic 48 | return ddpg(actor_critic=actor_critic, 49 | ac_kwargs=dict(hidden_sizes=[args.h]*args.l), 50 | start_steps=5000, 51 | max_ep_len=150, 52 | batch_size=64, 53 | polyak=0.95, 54 | **kwargs) 55 | 56 | eg = ExperimentGrid(name='ex2-2_ddpg') 57 | eg.add('replay_size', int(args.total_steps)) 58 | eg.add('env_name', args.env, '', True) 59 | eg.add('seed', [10*i for i in range(args.num_runs)]) 60 | eg.add('epochs', int(args.total_steps / args.steps_per_epoch)) 61 | eg.add('steps_per_epoch', args.steps_per_epoch) 62 | eg.add('bugged', [False, True]) 63 | eg.run(ddpg_with_actor_critic, datestamp=True) -------------------------------------------------------------------------------- /spinningup/spinup/run.py: -------------------------------------------------------------------------------- 1 | import spinup 2 | from spinup.user_config import DEFAULT_BACKEND 3 | from spinup.utils.run_utils import ExperimentGrid 4 | from spinup.utils.serialization_utils import convert_json 5 | import argparse 6 | import gym 7 | import json 8 | import os, subprocess, sys 9 | import os.path as osp 10 | import string 11 | import tensorflow as tf 12 | import torch 13 | from copy import deepcopy 14 | from textwrap import dedent 15 | 16 | 17 | # Command line args that will go to ExperimentGrid.run, and must possess unique 18 | # values (therefore must be treated separately). 19 | RUN_KEYS = ['num_cpu', 'data_dir', 'datestamp'] 20 | 21 | # Command line sweetener, allowing short-form flags for common, longer flags. 
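# Illustrative example (added comment, not in the original file): with these substitutions,
#   python -m spinup.run ppo --env HalfCheetah-v2 --hid "(64,64)"
# is treated the same as spelling out --env_name and --ac_kwargs:hidden_sizes in full.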
22 | SUBSTITUTIONS = {'env': 'env_name', 23 | 'hid': 'ac_kwargs:hidden_sizes', 24 | 'act': 'ac_kwargs:activation', 25 | 'cpu': 'num_cpu', 26 | 'dt': 'datestamp'} 27 | 28 | # Only some algorithms can be parallelized (have num_cpu > 1): 29 | MPI_COMPATIBLE_ALGOS = ['vpg', 'trpo', 'ppo'] 30 | 31 | # Algo names (used in a few places) 32 | BASE_ALGO_NAMES = ['vpg', 'trpo', 'ppo', 'ddpg', 'td3', 'sac'] 33 | 34 | 35 | def add_with_backends(algo_list): 36 | # helper function to build lists with backend-specific function names 37 | algo_list_with_backends = deepcopy(algo_list) 38 | for algo in algo_list: 39 | algo_list_with_backends += [algo + '_tf1', algo + '_pytorch'] 40 | return algo_list_with_backends 41 | 42 | 43 | def friendly_err(err_msg): 44 | # add whitespace to error message to make it more readable 45 | return '\n\n' + err_msg + '\n\n' 46 | 47 | 48 | def parse_and_execute_grid_search(cmd, args): 49 | """Interprets algorithm name and cmd line args into an ExperimentGrid.""" 50 | 51 | if cmd in BASE_ALGO_NAMES: 52 | backend = DEFAULT_BACKEND[cmd] 53 | print('\n\nUsing default backend (%s) for %s.\n'%(backend, cmd)) 54 | cmd = cmd + '_' + backend 55 | 56 | algo = eval('spinup.'+cmd) 57 | 58 | # Before all else, check to see if any of the flags is 'help'. 59 | valid_help = ['--help', '-h', 'help'] 60 | if any([arg in valid_help for arg in args]): 61 | print('\n\nShowing docstring for spinup.'+cmd+':\n') 62 | print(algo.__doc__) 63 | sys.exit() 64 | 65 | def process(arg): 66 | # Process an arg by eval-ing it, so users can specify more 67 | # than just strings at the command line (eg allows for 68 | # users to give functions as args). 69 | try: 70 | return eval(arg) 71 | except: 72 | return arg 73 | 74 | # Make first pass through args to build base arg_dict. Anything 75 | # with a '--' in front of it is an argument flag and everything after, 76 | # until the next flag, is a possible value. 77 | arg_dict = dict() 78 | for i, arg in enumerate(args): 79 | assert i > 0 or '--' in arg, \ 80 | friendly_err("You didn't specify a first flag.") 81 | if '--' in arg: 82 | arg_key = arg.lstrip('-') 83 | arg_dict[arg_key] = [] 84 | else: 85 | arg_dict[arg_key].append(process(arg)) 86 | 87 | # Make second pass through, to catch flags that have no vals. 88 | # Assume such flags indicate that a boolean parameter should have 89 | # value True. 90 | for k,v in arg_dict.items(): 91 | if len(v) == 0: 92 | v.append(True) 93 | 94 | # Third pass: check for user-supplied shorthands, where a key has 95 | # the form --keyname[kn]. The thing in brackets, 'kn', is the 96 | # shorthand. NOTE: modifying a dict while looping through its 97 | # contents is dangerous, and breaks in 3.6+. We loop over a fixed list 98 | # of keys to avoid this issue. 99 | given_shorthands = dict() 100 | fixed_keys = list(arg_dict.keys()) 101 | for k in fixed_keys: 102 | p1, p2 = k.find('['), k.find(']') 103 | if p1 >= 0 and p2 >= 0: 104 | # Both '[' and ']' found, so shorthand has been given 105 | k_new = k[:p1] 106 | shorthand = k[p1+1:p2] 107 | given_shorthands[k_new] = shorthand 108 | arg_dict[k_new] = arg_dict[k] 109 | del arg_dict[k] 110 | 111 | # Penultimate pass: sugar. Allow some special shortcuts in arg naming, 112 | # eg treat "env" the same as "env_name". This is super specific 113 | # to Spinning Up implementations, and may be hard to maintain. 114 | # These special shortcuts are described by SUBSTITUTIONS. 
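    # Added worked example (not in the original file): flags like
    #   --hid[h] (32,) (64,64) --env HalfCheetah-v2
    # have by now produced arg_dict = {'hid': [(32,), (64,64)], 'env': ['HalfCheetah-v2']}
    # and given_shorthands = {'hid': 'h'}; the loop below renames 'hid' to
    # 'ac_kwargs:hidden_sizes' and 'env' to 'env_name' (carrying the 'h' shorthand along).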
115 | for special_name, true_name in SUBSTITUTIONS.items(): 116 | if special_name in arg_dict: 117 | # swap it in arg dict 118 | arg_dict[true_name] = arg_dict[special_name] 119 | del arg_dict[special_name] 120 | 121 | if special_name in given_shorthands: 122 | # point the shortcut to the right name 123 | given_shorthands[true_name] = given_shorthands[special_name] 124 | del given_shorthands[special_name] 125 | 126 | # Final pass: check for the special args that go to the 'run' command 127 | # for an experiment grid, separate them from the arg dict, and make sure 128 | # that they have unique values. The special args are given by RUN_KEYS. 129 | run_kwargs = dict() 130 | for k in RUN_KEYS: 131 | if k in arg_dict: 132 | val = arg_dict[k] 133 | assert len(val) == 1, \ 134 | friendly_err("You can only provide one value for %s."%k) 135 | run_kwargs[k] = val[0] 136 | del arg_dict[k] 137 | 138 | # Determine experiment name. If not given by user, will be determined 139 | # by the algorithm name. 140 | if 'exp_name' in arg_dict: 141 | assert len(arg_dict['exp_name']) == 1, \ 142 | friendly_err("You can only provide one value for exp_name.") 143 | exp_name = arg_dict['exp_name'][0] 144 | del arg_dict['exp_name'] 145 | else: 146 | exp_name = 'cmd_' + cmd 147 | 148 | # Make sure that if num_cpu > 1, the algorithm being used is compatible 149 | # with MPI. 150 | if 'num_cpu' in run_kwargs and not(run_kwargs['num_cpu'] == 1): 151 | assert cmd in add_with_backends(MPI_COMPATIBLE_ALGOS), \ 152 | friendly_err("This algorithm can't be run with num_cpu > 1.") 153 | 154 | # Special handling for environment: make sure that env_name is a real, 155 | # registered gym environment. 156 | valid_envs = [e.id for e in list(gym.envs.registry.all())] 157 | assert 'env_name' in arg_dict, \ 158 | friendly_err("You did not give a value for --env_name! Add one and try again.") 159 | for env_name in arg_dict['env_name']: 160 | err_msg = dedent(""" 161 | 162 | %s is not registered with Gym. 163 | 164 | Recommendations: 165 | 166 | * Check for a typo (did you include the version tag?) 167 | 168 | * View the complete list of valid Gym environments at 169 | 170 | https://gym.openai.com/envs/ 171 | 172 | """%env_name) 173 | assert env_name in valid_envs, err_msg 174 | 175 | 176 | # Construct and execute the experiment grid. 177 | eg = ExperimentGrid(name=exp_name) 178 | for k,v in arg_dict.items(): 179 | eg.add(k, v, shorthand=given_shorthands.get(k)) 180 | eg.run(algo, **run_kwargs) 181 | 182 | 183 | if __name__ == '__main__': 184 | """ 185 | This is a wrapper allowing command-line interfaces to individual 186 | algorithms and the plot / test_policy utilities. 187 | 188 | For utilities, it only checks which thing to run, and calls the 189 | appropriate file, passing all arguments through. 190 | 191 | For algorithms, it sets up an ExperimentGrid object and uses the 192 | ExperimentGrid run routine to execute each possible experiment. 193 | """ 194 | 195 | cmd = sys.argv[1] if len(sys.argv) > 1 else 'help' 196 | valid_algos = add_with_backends(BASE_ALGO_NAMES) 197 | valid_utils = ['plot', 'test_policy'] 198 | valid_help = ['--help', '-h', 'help'] 199 | valid_cmds = valid_algos + valid_utils + valid_help 200 | assert cmd in valid_cmds, \ 201 | "Select an algorithm or utility which is implemented in Spinning Up." 202 | 203 | if cmd in valid_help: 204 | # Before all else, check to see if any of the flags is 'help'. 205 | 206 | # List commands that are available. 
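        # (The listing covers each base algorithm plus its backend-specific
        # variants, e.g. 'ppo', 'ppo_tf1', 'ppo_pytorch', followed by the
        # 'plot' and 'test_policy' utilities.)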
207 | str_valid_cmds = '\n\t' + '\n\t'.join(valid_algos+valid_utils) 208 | help_msg = dedent(""" 209 | Experiment in Spinning Up from the command line with 210 | 211 | \tpython -m spinup.run CMD [ARGS...] 212 | 213 | where CMD is a valid command. Current valid commands are: 214 | """) + str_valid_cmds 215 | print(help_msg) 216 | 217 | # Provide some useful details for algorithm running. 218 | subs_list = ['--' + k.ljust(10) + 'for'.ljust(10) + '--' + v \ 219 | for k,v in SUBSTITUTIONS.items()] 220 | str_valid_subs = '\n\t' + '\n\t'.join(subs_list) 221 | special_info = dedent(""" 222 | FYI: When running an algorithm, any keyword argument to the 223 | algorithm function can be used as a flag, eg 224 | 225 | \tpython -m spinup.run ppo --env HalfCheetah-v2 --clip_ratio 0.1 226 | 227 | If you need a quick refresher on valid kwargs, get the docstring 228 | with 229 | 230 | \tpython -m spinup.run [algo] --help 231 | 232 | See the "Running Experiments" docs page for more details. 233 | 234 | Also: Some common but long flags can be substituted for shorter 235 | ones. Valid substitutions are: 236 | """) + str_valid_subs 237 | print(special_info) 238 | 239 | elif cmd in valid_utils: 240 | # Execute the correct utility file. 241 | runfile = osp.join(osp.abspath(osp.dirname(__file__)), 'utils', cmd +'.py') 242 | args = [sys.executable if sys.executable else 'python', runfile] + sys.argv[2:] 243 | subprocess.check_call(args, env=os.environ) 244 | else: 245 | # Assume that the user plans to execute an algorithm. Run custom 246 | # parsing on the arguments and build a grid search to execute. 247 | args = sys.argv[2:] 248 | parse_and_execute_grid_search(cmd, args) 249 | -------------------------------------------------------------------------------- /spinningup/spinup/user_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | 4 | # Default neural network backend for each algo 5 | # (Must be either 'tf1' or 'pytorch') 6 | DEFAULT_BACKEND = { 7 | 'vpg': 'pytorch', 8 | 'trpo': 'tf1', 9 | 'ppo': 'pytorch', 10 | 'ddpg': 'pytorch', 11 | 'td3': 'pytorch', 12 | 'sac': 'pytorch' 13 | } 14 | 15 | # Where experiment outputs are saved by default: 16 | DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data') 17 | 18 | # Whether to automatically insert a date and time stamp into the names of 19 | # save directories: 20 | FORCE_DATESTAMP = False 21 | 22 | # Whether GridSearch provides automatically-generated default shorthands: 23 | DEFAULT_SHORTHAND = True 24 | 25 | # Tells the GridSearch how many seconds to pause for before launching 26 | # experiments. 
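# (Presumably a grace period so a mis-specified grid can still be cancelled
# before any runs start; this is an inference from the name, not documented
# behaviour.)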
27 | WAIT_BEFORE_LAUNCH = 5 -------------------------------------------------------------------------------- /spinningup/spinup/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/utils/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/utils/mpi_pytorch.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import numpy as np 3 | import os 4 | import torch 5 | from mpi4py import MPI 6 | from spinup.utils.mpi_tools import broadcast, mpi_avg, num_procs, proc_id 7 | 8 | def setup_pytorch_for_mpi(): 9 | """ 10 | Avoid slowdowns caused by each separate process's PyTorch using 11 | more than its fair share of CPU resources. 12 | """ 13 | #print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 14 | if torch.get_num_threads()==1: 15 | return 16 | fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1) 17 | torch.set_num_threads(fair_num_threads) 18 | #print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 19 | 20 | def mpi_avg_grads(module): 21 | """ Average contents of gradient buffers across MPI processes. """ 22 | if num_procs()==1: 23 | return 24 | for p in module.parameters(): 25 | p_grad_numpy = p.grad.cpu().numpy() # numpy view of tensor data 26 | avg_p_grad = mpi_avg(p.grad.cpu()) 27 | p_grad_numpy[:] = avg_p_grad[:] 28 | 29 | def sync_params(module): 30 | """ Sync all parameters of module across all MPI processes. """ 31 | if num_procs()==1: 32 | return 33 | for p in module.parameters(): 34 | p_numpy = p.data.cpu().numpy() 35 | broadcast(p_numpy) -------------------------------------------------------------------------------- /spinningup/spinup/utils/mpi_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | from spinup.utils.mpi_tools import broadcast 5 | 6 | 7 | def flat_concat(xs): 8 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 9 | 10 | def assign_params_from_flat(x, params): 11 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 12 | splits = tf.split(x, [flat_size(p) for p in params]) 13 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 14 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 15 | 16 | def sync_params(params): 17 | get_params = flat_concat(params) 18 | def _broadcast(x): 19 | broadcast(x) 20 | return x 21 | synced_params = tf.py_func(_broadcast, [get_params], tf.float32) 22 | return assign_params_from_flat(synced_params, params) 23 | 24 | def sync_all_params(): 25 | """Sync all tf variables across MPI processes.""" 26 | return sync_params(tf.global_variables()) 27 | 28 | 29 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 30 | """ 31 | Adam optimizer that averages gradients across MPI processes. 32 | 33 | The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. 34 | For documentation on method arguments, see the Tensorflow docs page for 35 | the base `AdamOptimizer`_. 36 | 37 | .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py 38 | .. 
_`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 39 | """ 40 | 41 | def __init__(self, **kwargs): 42 | self.comm = MPI.COMM_WORLD 43 | tf.train.AdamOptimizer.__init__(self, **kwargs) 44 | 45 | def compute_gradients(self, loss, var_list, **kwargs): 46 | """ 47 | Same as normal compute_gradients, except average grads over processes. 48 | """ 49 | grads_and_vars = super().compute_gradients(loss, var_list, **kwargs) 50 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 51 | flat_grad = flat_concat([g for g, v in grads_and_vars]) 52 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 53 | sizes = [int(np.prod(s)) for s in shapes] 54 | 55 | num_tasks = self.comm.Get_size() 56 | buf = np.zeros(flat_grad.shape, np.float32) 57 | 58 | def _collect_grads(flat_grad): 59 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 60 | np.divide(buf, float(num_tasks), out=buf) 61 | return buf 62 | 63 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 64 | avg_flat_grad.set_shape(flat_grad.shape) 65 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 66 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 67 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 68 | 69 | return avg_grads_and_vars 70 | 71 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 72 | """ 73 | Same as normal apply_gradients, except sync params after update. 74 | """ 75 | opt = super().apply_gradients(grads_and_vars, global_step, name) 76 | with tf.control_dependencies([opt]): 77 | sync = sync_params([v for g,v in grads_and_vars]) 78 | return tf.group([opt, sync]) -------------------------------------------------------------------------------- /spinningup/spinup/utils/mpi_tools.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import os, subprocess, sys 3 | import numpy as np 4 | 5 | 6 | def mpi_fork(n, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers linked by MPI. 9 | 10 | Also, terminates the original process that launched it. 11 | 12 | Taken almost without modification from the Baselines function of the 13 | `same name`_. 14 | 15 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 16 | 17 | Args: 18 | n (int): Number of process to split into. 19 | 20 | bind_to_core (bool): Bind each MPI process to a core. 
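    Example (a sketch of the effect, with a hypothetical script name):
    calling ``mpi_fork(4)`` from ``python my_script.py`` re-launches it as
    ``mpirun -np 4 python my_script.py`` with IN_MPI set in the environment,
    then exits the original process.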
21 | """ 22 | if n<=1: 23 | return 24 | if os.getenv("IN_MPI") is None: 25 | env = os.environ.copy() 26 | env.update( 27 | MKL_NUM_THREADS="1", 28 | OMP_NUM_THREADS="1", 29 | IN_MPI="1" 30 | ) 31 | args = ["mpirun", "-np", str(n)] 32 | if bind_to_core: 33 | args += ["-bind-to", "core"] 34 | args += [sys.executable] + sys.argv 35 | subprocess.check_call(args, env=env) 36 | sys.exit() 37 | 38 | 39 | def msg(m, string=''): 40 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 41 | 42 | def proc_id(): 43 | """Get rank of calling process.""" 44 | return MPI.COMM_WORLD.Get_rank() 45 | 46 | def allreduce(*args, **kwargs): 47 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 48 | 49 | def num_procs(): 50 | """Count active MPI processes.""" 51 | return MPI.COMM_WORLD.Get_size() 52 | 53 | def broadcast(x, root=0): 54 | MPI.COMM_WORLD.Bcast(x, root=root) 55 | 56 | def mpi_op(x, op): 57 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 58 | x = np.asarray(x, dtype=np.float32) 59 | buff = np.zeros_like(x, dtype=np.float32) 60 | allreduce(x, buff, op=op) 61 | return buff[0] if scalar else buff 62 | 63 | def mpi_sum(x): 64 | return mpi_op(x, MPI.SUM) 65 | 66 | def mpi_avg(x): 67 | """Average a scalar or vector over MPI processes.""" 68 | return mpi_sum(x) / num_procs() 69 | 70 | def mpi_statistics_scalar(x, with_min_and_max=False): 71 | """ 72 | Get mean/std and optional min/max of scalar x across MPI processes. 73 | 74 | Args: 75 | x: An array containing samples of the scalar to produce statistics 76 | for. 77 | 78 | with_min_and_max (bool): If true, return min and max of x in 79 | addition to mean and std. 80 | """ 81 | x = np.array(x, dtype=np.float32) 82 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 83 | mean = global_sum / global_n 84 | 85 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 86 | std = np.sqrt(global_sum_sq / global_n) # compute global std 87 | 88 | if with_min_and_max: 89 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 90 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 91 | return mean, std, global_min, global_max 92 | return mean, std -------------------------------------------------------------------------------- /spinningup/spinup/utils/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | 9 | DIV_LINE_WIDTH = 50 10 | 11 | # Global vars for tracking and labeling data at load time. 12 | exp_idx = 0 13 | units = dict() 14 | 15 | def plot_data(data, xaxis='Epoch', value="AverageEpRet", condition="Condition1", smooth=1, **kwargs): 16 | if smooth > 1: 17 | """ 18 | smooth data with moving window average. 
19 | that is, 20 | smoothed_y[t] = average(y[t-k], y[t-k+1], ..., y[t+k-1], y[t+k]) 21 | where the "smooth" param is width of that window (2k+1) 22 | """ 23 | y = np.ones(smooth) 24 | for datum in data: 25 | x = np.asarray(datum[value]) 26 | z = np.ones(len(x)) 27 | smoothed_x = np.convolve(x,y,'same') / np.convolve(z,y,'same') 28 | datum[value] = smoothed_x 29 | 30 | if isinstance(data, list): 31 | data = pd.concat(data, ignore_index=True) 32 | sns.set(style="darkgrid", font_scale=1.5) 33 | sns.tsplot(data=data, time=xaxis, value=value, unit="Unit", condition=condition, ci='sd', **kwargs) 34 | """ 35 | If you upgrade to any version of Seaborn greater than 0.8.1, switch from 36 | tsplot to lineplot replacing L29 with: 37 | 38 | sns.lineplot(data=data, x=xaxis, y=value, hue=condition, ci='sd', **kwargs) 39 | 40 | Changes the colorscheme and the default legend style, though. 41 | """ 42 | plt.legend(loc='best').set_draggable(True) 43 | #plt.legend(loc='upper center', ncol=3, handlelength=1, 44 | # borderaxespad=0., prop={'size': 13}) 45 | 46 | """ 47 | For the version of the legend used in the Spinning Up benchmarking page, 48 | swap L38 with: 49 | 50 | plt.legend(loc='upper center', ncol=6, handlelength=1, 51 | mode="expand", borderaxespad=0., prop={'size': 13}) 52 | """ 53 | 54 | xscale = np.max(np.asarray(data[xaxis])) > 5e3 55 | if xscale: 56 | # Just some formatting niceness: x-axis scale in scientific notation if max x is large 57 | plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 58 | 59 | plt.tight_layout(pad=0.5) 60 | 61 | def get_datasets(logdir, condition=None): 62 | """ 63 | Recursively look through logdir for output files produced by 64 | spinup.logx.Logger. 65 | 66 | Assumes that any file "progress.txt" is a valid hit. 67 | """ 68 | global exp_idx 69 | global units 70 | datasets = [] 71 | for root, _, files in os.walk(logdir): 72 | if 'progress.txt' in files: 73 | exp_name = None 74 | try: 75 | config_path = open(os.path.join(root,'config.json')) 76 | config = json.load(config_path) 77 | if 'exp_name' in config: 78 | exp_name = config['exp_name'] 79 | except: 80 | print('No file named config.json') 81 | condition1 = condition or exp_name or 'exp' 82 | condition2 = condition1 + '-' + str(exp_idx) 83 | exp_idx += 1 84 | if condition1 not in units: 85 | units[condition1] = 0 86 | unit = units[condition1] 87 | units[condition1] += 1 88 | 89 | try: 90 | exp_data = pd.read_table(os.path.join(root,'progress.txt')) 91 | except: 92 | print('Could not read from %s'%os.path.join(root,'progress.txt')) 93 | continue 94 | performance = 'AverageTestEpRet' if 'AverageTestEpRet' in exp_data else 'AverageEpRet' 95 | exp_data.insert(len(exp_data.columns),'Unit',unit) 96 | exp_data.insert(len(exp_data.columns),'Condition1',condition1) 97 | exp_data.insert(len(exp_data.columns),'Condition2',condition2) 98 | exp_data.insert(len(exp_data.columns),'Performance',exp_data[performance]) 99 | datasets.append(exp_data) 100 | return datasets 101 | 102 | 103 | def get_all_datasets(all_logdirs, legend=None, select=None, exclude=None): 104 | """ 105 | For every entry in all_logdirs, 106 | 1) check if the entry is a real directory and if it is, 107 | pull data from it; 108 | 109 | 2) if not, check to see if the entry is a prefix for a 110 | real directory, and pull data from that. 
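    For example (hypothetical paths): given ``data/ex2-2_ddpg``, the plotter
    would also pull in ``data/ex2-2_ddpg_s0`` and ``data/ex2-2_ddpg_s10``,
    since every entry of ``data/`` whose name contains the prefix is matched.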
111 | """ 112 | logdirs = [] 113 | for logdir in all_logdirs: 114 | if osp.isdir(logdir) and logdir[-1]==os.sep: 115 | logdirs += [logdir] 116 | else: 117 | basedir = osp.dirname(logdir) 118 | fulldir = lambda x : osp.join(basedir, x) 119 | prefix = logdir.split(os.sep)[-1] 120 | listdir= os.listdir(basedir) 121 | logdirs += sorted([fulldir(x) for x in listdir if prefix in x]) 122 | 123 | """ 124 | Enforce selection rules, which check logdirs for certain substrings. 125 | Makes it easier to look at graphs from particular ablations, if you 126 | launch many jobs at once with similar names. 127 | """ 128 | if select is not None: 129 | logdirs = [log for log in logdirs if all(x in log for x in select)] 130 | if exclude is not None: 131 | logdirs = [log for log in logdirs if all(not(x in log) for x in exclude)] 132 | 133 | # Verify logdirs 134 | print('Plotting from...\n' + '='*DIV_LINE_WIDTH + '\n') 135 | for logdir in logdirs: 136 | print(logdir) 137 | print('\n' + '='*DIV_LINE_WIDTH) 138 | 139 | # Make sure the legend is compatible with the logdirs 140 | assert not(legend) or (len(legend) == len(logdirs)), \ 141 | "Must give a legend title for each set of experiments." 142 | 143 | # Load data from logdirs 144 | data = [] 145 | if legend: 146 | for log, leg in zip(logdirs, legend): 147 | data += get_datasets(log, leg) 148 | else: 149 | for log in logdirs: 150 | data += get_datasets(log) 151 | return data 152 | 153 | 154 | def make_plots(all_logdirs, legend=None, xaxis=None, values=None, count=False, 155 | font_scale=1.5, smooth=1, select=None, exclude=None, estimator='mean'): 156 | data = get_all_datasets(all_logdirs, legend, select, exclude) 157 | values = values if isinstance(values, list) else [values] 158 | condition = 'Condition2' if count else 'Condition1' 159 | estimator = getattr(np, estimator) # choose what to show on main curve: mean? max? min? 160 | for value in values: 161 | plt.figure() 162 | plot_data(data, xaxis=xaxis, value=value, condition=condition, smooth=smooth, estimator=estimator) 163 | plt.show() 164 | 165 | 166 | def main(): 167 | import argparse 168 | parser = argparse.ArgumentParser() 169 | parser.add_argument('logdir', nargs='*') 170 | parser.add_argument('--legend', '-l', nargs='*') 171 | parser.add_argument('--xaxis', '-x', default='TotalEnvInteracts') 172 | parser.add_argument('--value', '-y', default='Performance', nargs='*') 173 | parser.add_argument('--count', action='store_true') 174 | parser.add_argument('--smooth', '-s', type=int, default=1) 175 | parser.add_argument('--select', nargs='*') 176 | parser.add_argument('--exclude', nargs='*') 177 | parser.add_argument('--est', default='mean') 178 | args = parser.parse_args() 179 | """ 180 | 181 | Args: 182 | logdir (strings): As many log directories (or prefixes to log 183 | directories, which the plotter will autocomplete internally) as 184 | you'd like to plot from. 185 | 186 | legend (strings): Optional way to specify legend for the plot. The 187 | plotter legend will automatically use the ``exp_name`` from the 188 | config.json file, unless you tell it otherwise through this flag. 189 | This only works if you provide a name for each directory that 190 | will get plotted. (Note: this may not be the same as the number 191 | of logdir args you provide! 
Recall that the plotter looks for 192 | autocompletes of the logdir args: there may be more than one 193 | match for a given logdir prefix, and you will need to provide a 194 | legend string for each one of those matches---unless you have 195 | removed some of them as candidates via selection or exclusion 196 | rules (below).) 197 | 198 | xaxis (string): Pick what column from data is used for the x-axis. 199 | Defaults to ``TotalEnvInteracts``. 200 | 201 | value (strings): Pick what columns from data to graph on the y-axis. 202 | Submitting multiple values will produce multiple graphs. Defaults 203 | to ``Performance``, which is not an actual output of any algorithm. 204 | Instead, ``Performance`` refers to either ``AverageEpRet``, the 205 | correct performance measure for the on-policy algorithms, or 206 | ``AverageTestEpRet``, the correct performance measure for the 207 | off-policy algorithms. The plotter will automatically figure out 208 | which of ``AverageEpRet`` or ``AverageTestEpRet`` to report for 209 | each separate logdir. 210 | 211 | count: Optional flag. By default, the plotter shows y-values which 212 | are averaged across all results that share an ``exp_name``, 213 | which is typically a set of identical experiments that only vary 214 | in random seed. But if you'd like to see all of those curves 215 | separately, use the ``--count`` flag. 216 | 217 | smooth (int): Smooth data by averaging it over a fixed window. This 218 | parameter says how wide the averaging window will be. 219 | 220 | select (strings): Optional selection rule: the plotter will only show 221 | curves from logdirs that contain all of these substrings. 222 | 223 | exclude (strings): Optional exclusion rule: plotter will only show 224 | curves from logdirs that do not contain these substrings. 225 | 226 | """ 227 | 228 | make_plots(args.logdir, args.legend, args.xaxis, args.value, args.count, 229 | smooth=args.smooth, select=args.select, exclude=args.exclude, 230 | estimator=args.est) 231 | 232 | if __name__ == "__main__": 233 | main() -------------------------------------------------------------------------------- /spinningup/spinup/utils/run_entrypoint.py: -------------------------------------------------------------------------------- 1 | import zlib 2 | import pickle 3 | import base64 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('encoded_thunk') 9 | args = parser.parse_args() 10 | thunk = pickle.loads(zlib.decompress(base64.b64decode(args.encoded_thunk))) 11 | thunk() -------------------------------------------------------------------------------- /spinningup/spinup/utils/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convert_json(obj): 4 | """ Convert obj to a version which can be serialized with JSON. 
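    Dicts, lists and tuples are converted element-wise; objects exposing a
    ``__name__`` (e.g. functions) collapse to that name; objects with a
    ``__dict__`` become {str(obj): converted attribute dict}; anything else
    falls back to str(obj).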
""" 5 | if is_json_serializable(obj): 6 | return obj 7 | else: 8 | if isinstance(obj, dict): 9 | return {convert_json(k): convert_json(v) 10 | for k,v in obj.items()} 11 | 12 | elif isinstance(obj, tuple): 13 | return (convert_json(x) for x in obj) 14 | 15 | elif isinstance(obj, list): 16 | return [convert_json(x) for x in obj] 17 | 18 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 19 | return convert_json(obj.__name__) 20 | 21 | elif hasattr(obj,'__dict__') and obj.__dict__: 22 | obj_dict = {convert_json(k): convert_json(v) 23 | for k,v in obj.__dict__.items()} 24 | return {str(obj): obj_dict} 25 | 26 | return str(obj) 27 | 28 | def is_json_serializable(v): 29 | try: 30 | json.dumps(v) 31 | return True 32 | except: 33 | return False -------------------------------------------------------------------------------- /spinningup/spinup/utils/test_policy.py: -------------------------------------------------------------------------------- 1 | import time 2 | import joblib 3 | import os 4 | import os.path as osp 5 | import tensorflow as tf 6 | import torch 7 | from spinup import EpochLogger 8 | from spinup.utils.logx import restore_tf_graph 9 | 10 | 11 | def load_policy_and_env(fpath, itr='last', deterministic=False): 12 | """ 13 | Load a policy from save, whether it's TF or PyTorch, along with RL env. 14 | 15 | Not exceptionally future-proof, but it will suffice for basic uses of the 16 | Spinning Up implementations. 17 | 18 | Checks to see if there's a tf1_save folder. If yes, assumes the model 19 | is tensorflow and loads it that way. Otherwise, loads as if there's a 20 | PyTorch save. 21 | """ 22 | 23 | # determine if tf save or pytorch save 24 | if any(['tf1_save' in x for x in os.listdir(fpath)]): 25 | backend = 'tf1' 26 | else: 27 | backend = 'pytorch' 28 | 29 | # handle which epoch to load from 30 | if itr=='last': 31 | # check filenames for epoch (AKA iteration) numbers, find maximum value 32 | 33 | if backend == 'tf1': 34 | saves = [int(x[8:]) for x in os.listdir(fpath) if 'tf1_save' in x and len(x)>8] 35 | 36 | elif backend == 'pytorch': 37 | pytsave_path = osp.join(fpath, 'pyt_save') 38 | # Each file in this folder has naming convention 'modelXX.pt', where 39 | # 'XX' is either an integer or empty string. Empty string case 40 | # corresponds to len(x)==8, hence that case is excluded. 41 | saves = [int(x.split('.')[0][5:]) for x in os.listdir(pytsave_path) if len(x)>8 and 'model' in x] 42 | 43 | itr = '%d'%max(saves) if len(saves) > 0 else '' 44 | 45 | else: 46 | assert isinstance(itr, int), \ 47 | "Bad value provided for itr (needs to be int or 'last')." 48 | itr = '%d'%itr 49 | 50 | # load the get_action function 51 | if backend == 'tf1': 52 | get_action = load_tf_policy(fpath, itr, deterministic) 53 | else: 54 | get_action = load_pytorch_policy(fpath, itr, deterministic) 55 | 56 | # try to load environment from save 57 | # (sometimes this will fail because the environment could not be pickled) 58 | try: 59 | state = joblib.load(osp.join(fpath, 'vars'+itr+'.pkl')) 60 | env = state['env'] 61 | except: 62 | env = None 63 | 64 | return env, get_action 65 | 66 | 67 | def load_tf_policy(fpath, itr, deterministic=False): 68 | """ Load a tensorflow policy saved with Spinning Up Logger.""" 69 | 70 | fname = osp.join(fpath, 'tf1_save'+itr) 71 | print('\n\nLoading from %s.\n\n'%fname) 72 | 73 | # load the things! 
74 | sess = tf.Session() 75 | model = restore_tf_graph(sess, fname) 76 | 77 | # get the correct op for executing actions 78 | if deterministic and 'mu' in model.keys(): 79 | # 'deterministic' is only a valid option for SAC policies 80 | print('Using deterministic action op.') 81 | action_op = model['mu'] 82 | else: 83 | print('Using default action op.') 84 | action_op = model['pi'] 85 | 86 | # make function for producing an action given a single state 87 | get_action = lambda x : sess.run(action_op, feed_dict={model['x']: x[None,:]})[0] 88 | 89 | return get_action 90 | 91 | 92 | def load_pytorch_policy(fpath, itr, deterministic=False): 93 | """ Load a pytorch policy saved with Spinning Up Logger.""" 94 | 95 | fname = osp.join(fpath, 'pyt_save', 'model'+itr+'.pt') 96 | print('\n\nLoading from %s.\n\n'%fname) 97 | 98 | model = torch.load(fname) 99 | 100 | # make function for producing an action given a single state 101 | def get_action(x): 102 | with torch.no_grad(): 103 | x = torch.as_tensor(x, dtype=torch.float32) 104 | action = model.act(x) 105 | return action 106 | 107 | return get_action 108 | 109 | 110 | def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True): 111 | 112 | assert env is not None, \ 113 | "Environment not found!\n\n It looks like the environment wasn't saved, " + \ 114 | "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \ 115 | "page on Experiment Outputs for how to handle this situation." 116 | 117 | logger = EpochLogger() 118 | o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 119 | while n < num_episodes: 120 | if render: 121 | env.render() 122 | time.sleep(1e-3) 123 | 124 | a = get_action(o) 125 | o, r, d, _ = env.step(a) 126 | ep_ret += r 127 | ep_len += 1 128 | 129 | if d or (ep_len == max_ep_len): 130 | logger.store(EpRet=ep_ret, EpLen=ep_len) 131 | print('Episode %d \t EpRet %.3f \t EpLen %d'%(n, ep_ret, ep_len)) 132 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 133 | n += 1 134 | 135 | logger.log_tabular('EpRet', with_min_and_max=True) 136 | logger.log_tabular('EpLen', average_only=True) 137 | logger.dump_tabular() 138 | 139 | 140 | if __name__ == '__main__': 141 | import argparse 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('fpath', type=str) 144 | parser.add_argument('--len', '-l', type=int, default=0) 145 | parser.add_argument('--episodes', '-n', type=int, default=100) 146 | parser.add_argument('--norender', '-nr', action='store_true') 147 | parser.add_argument('--itr', '-i', type=int, default=-1) 148 | parser.add_argument('--deterministic', '-d', action='store_true') 149 | args = parser.parse_args() 150 | env, get_action = load_policy_and_env(args.fpath, 151 | args.itr if args.itr >=0 else 'last', 152 | args.deterministic) 153 | run_policy(env, get_action, args.len, args.episodes, not(args.norender)) -------------------------------------------------------------------------------- /spinningup/spinup/version.py: -------------------------------------------------------------------------------- 1 | version_info = (0, 2, 0) 2 | # format: 3 | # ('spinup_major', 'spinup_minor', 'spinup_patch') 4 | 5 | def get_version(): 6 | "Returns the version as a human-format string." 
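    # e.g. version_info = (0, 2, 0) -> '0.2.0'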
7 | return '%d.%d.%d' % version_info 8 | 9 | __version__ = get_version() -------------------------------------------------------------------------------- /spinningup/travis_setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | mkdir -p $HOME/.mujoco 6 | 7 | # Avoid using pyenv in travis, since it adds ~7 minutes to turnaround time 8 | if [ "$TRAVIS_OS_NAME" == "osx" ] 9 | then 10 | # https://github.com/travis-ci/travis-ci/issues/9640 11 | sudo softwareupdate --install "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.4" 12 | brew update 13 | brew install open-mpi 14 | brew install gcc 15 | brew link --overwrite gcc 16 | curl $MUJOCO_FOR_OSX | tar xz -C $HOME/.mujoco/ 17 | elif [ "$TRAVIS_OS_NAME" == "linux" ] 18 | then 19 | # Because this is flaky, try several times 20 | set +e 21 | COUNT=0 22 | while [ $COUNT -lt 5 ]; do 23 | sudo curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf 24 | if [ $? -eq 0 ];then 25 | break 26 | fi 27 | let COUNT=COUNT+1 28 | done 29 | if [ $COUNT -ge 5 ]; then 30 | echo "Failed to download patchelf" 31 | exit 1 32 | fi 33 | set -e 34 | 35 | sudo chmod +x /usr/local/bin/patchelf 36 | curl $MUJOCO_FOR_LINUX | tar xz -C $HOME/.mujoco/ 37 | 38 | sudo apt-get update 39 | sudo apt-get install -y openmpi-bin libopenmpi-dev libosmesa6-dev libglew-dev 40 | fi 41 | --------------------------------------------------------------------------------