├── .gitignore ├── LICENSE ├── README.md ├── gurobi.sh ├── source ├── c_solver │ ├── compile.sh │ └── gurobi_c.cpp ├── gurobi.env ├── planning │ └── ilp.py ├── rl │ ├── ac.py │ ├── plan_env.py │ └── rl.py ├── simulate │ ├── flow.py │ ├── spof.py │ ├── spofs.py │ └── traffic_matrix.py ├── test.py └── topology │ ├── ip │ ├── link.py │ ├── network.py │ └── router.py │ ├── optical │ ├── optic_fiber.py │ ├── optic_network.py │ ├── optic_node.py │ └── optic_path.py │ ├── topology.py │ └── utils │ └── node.py └── spinningup ├── .gitignore ├── .travis.yml ├── LICENSE ├── readme.md ├── readthedocs.yml ├── setup.py ├── spinup ├── __init__.py ├── algos │ ├── __init__.py │ ├── pytorch │ │ ├── ddpg │ │ │ ├── core.py │ │ │ └── ddpg.py │ │ ├── ppo │ │ │ ├── core.py │ │ │ └── ppo.py │ │ ├── sac │ │ │ ├── core.py │ │ │ └── sac.py │ │ ├── td3 │ │ │ ├── core.py │ │ │ └── td3.py │ │ ├── trpo │ │ │ └── trpo.py │ │ └── vpg │ │ │ ├── core.py │ │ │ └── vpg.py │ └── tf1 │ │ ├── ddpg │ │ ├── __init__.py │ │ ├── core.py │ │ └── ddpg.py │ │ ├── ppo │ │ ├── __init__.py │ │ ├── core.py │ │ └── ppo.py │ │ ├── sac │ │ ├── __init__.py │ │ ├── core.py │ │ └── sac.py │ │ ├── td3 │ │ ├── __init__.py │ │ ├── core.py │ │ └── td3.py │ │ ├── trpo │ │ ├── __init__.py │ │ ├── core.py │ │ └── trpo.py │ │ └── vpg │ │ ├── __init__.py │ │ ├── core.py │ │ └── vpg.py ├── examples │ ├── pytorch │ │ ├── bench_ppo_cartpole.py │ │ └── pg_math │ │ │ ├── 1_simple_pg.py │ │ │ └── 2_rtg_pg.py │ └── tf1 │ │ ├── bench_ppo_cartpole.py │ │ ├── pg_math │ │ ├── 1_simple_pg.py │ │ └── 2_rtg_pg.py │ │ └── train_mnist.py ├── exercises │ ├── common.py │ ├── pytorch │ │ ├── problem_set_1 │ │ │ ├── exercise1_1.py │ │ │ ├── exercise1_2.py │ │ │ ├── exercise1_2_auxiliary.py │ │ │ └── exercise1_3.py │ │ ├── problem_set_1_solutions │ │ │ ├── exercise1_1_soln.py │ │ │ └── exercise1_2_soln.py │ │ └── problem_set_2 │ │ │ └── exercise2_2.py │ └── tf1 │ │ ├── problem_set_1 │ │ ├── exercise1_1.py │ │ ├── exercise1_2.py │ │ └── exercise1_3.py │ │ ├── problem_set_1_solutions │ │ ├── exercise1_1_soln.py │ │ └── exercise1_2_soln.py │ │ └── problem_set_2 │ │ └── exercise2_2.py ├── run.py ├── user_config.py ├── utils │ ├── __init__.py │ ├── logx.py │ ├── mpi_pytorch.py │ ├── mpi_tf.py │ ├── mpi_tools.py │ ├── plot.py │ ├── run_entrypoint.py │ ├── run_utils.py │ ├── serialization_utils.py │ └── test_policy.py └── version.py └── travis_setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Python byte code 2 | *.pyc 3 | 4 | # Vim 5 | *.swp 6 | 7 | # Mac 8 | *.DS_Store 9 | 10 | # Configuration files 11 | .env 12 | .vscode 13 | .VSCodeCounter 14 | 15 | # results 16 | source/results/* 17 | 18 | # Gurobi solver related 19 | *.log 20 | *.so 21 | *.lp 22 | 23 | # data and model 24 | fb_data_anon* 25 | results 26 | 27 | # Others 28 | source/config.py 29 | sync.py 30 | test_bak.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- 204 | 205 | Code in python/ray/rllib/{evolution_strategies, dqn} adapted from 206 | https://github.com/openai (MIT License) 207 | 208 | Copyright (c) 2016 OpenAI (http://openai.com) 209 | 210 | Permission is hereby granted, free of charge, to any person obtaining a copy 211 | of this software and associated documentation files (the "Software"), to deal 212 | in the Software without restriction, including without limitation the rights 213 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 214 | copies of the Software, and to permit persons to whom the Software is 215 | furnished to do so, subject to the following conditions: 216 | 217 | The above copyright notice and this permission notice shall be included in 218 | all copies or substantial portions of the Software. 219 | 220 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 221 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 222 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 223 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 224 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 225 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 226 | THE SOFTWARE. 227 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 0. Introduction 2 | This repository contains the source code for our SIGCOMM'21 paper "Network Planning with Deep Reinforcement Learning". 3 | ### Notes 4 | The network topologies and the trained models used in the paper are not open-sourced. One can create synthetic topologies according to the problem formulation in the paper or modify the code for their own use case. 5 |
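For readers who want to experiment without the original inputs, the Python sketch below shows (with entirely made-up names, lengths and bandwidths) how the building blocks under `source/topology` and `source/simulate` fit together. It only illustrates the class APIs visible in this repo, not the authors' data format; wiring these objects into a full `Topology` instance still goes through the loaders in `topology/topology.py`. Run it from the `source/` directory with the dependencies from Step 4 installed.

```
# Hypothetical example: every name and number below is invented.
from topology.optical.optic_network import OpticNetwork
from topology.ip.network import Network
from simulate.traffic_matrix import TrafficMatrix
from simulate.spofs import Spofs

# optical layer: two nodes connected by one owned (non-leased) fiber
optic = OpticNetwork()
for n in ["O1", "O2"]:
    optic.register_node(n)
optic.register_fiber("O1-O2", optic.get_node_by_name("O1"), optic.get_node_by_name("O2"),
                     length=100, lease_flag=False, max_fp=4, lighted_fp=1, spectrum=4000)

# IP layer: two routers on top of the optical nodes, one IP link riding on the fiber
ip = Network()
ip.register_router("R1", optic.get_node_by_name("O1"), stub=False)
ip.register_router("R2", optic.get_node_by_name("O2"), stub=True)
ip.register_link("R1-R2", frozenset(["O1-O2"]), ip.get_router_by_name("R1"),
                 ip.get_router_by_name("R2"), idx=0, initial_bw=100, max_bw=4000,
                 fiber_map_spectrum={"O1-O2": 1}, cost=1)

# traffic matrix and failure scenarios
tm = TrafficMatrix()
tm.register_flow("f1", ip.get_router_by_name("R1"), ip.get_router_by_name("R2"), 50, "GOLD-ICP")
spofs = Spofs()
spofs.register_spof("spof_1", ["O1-O2"], ["GOLD-ICP", "SILVER"])
```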
6 | ## 1. Environment config 7 | ### AWS instance configurations 8 | - AMI image: "Deep Learning AMI (Ubuntu 16.04) Version 43.0 - ami-0774e48892bd5f116" 9 | - for First-stage: g4dn.4xlarge; `Threads 16` in `gurobi.env` 10 | - for others (ILP, ILP-heur, Second-stage): m5zn.12xlarge; `Threads 8` in `gurobi.env` 11 | 12 | ### Step 0: download the git repo 13 | ### Step 1: install Linux dependencies 14 | ``` 15 | sudo apt-get update 16 | sudo apt-get install build-essential libopenmpi-dev libboost-all-dev 17 | ``` 18 | ### Step 2: install Gurobi 19 | ``` 20 | cd <path_to_this_repo> 21 | ./gurobi.sh 22 | source ~/.bashrc 23 | ``` 24 | - Obtain and install a Gurobi license: https://www.gurobi.com/downloads/free-academic-license/ 25 | - Make sure your Gurobi solver works: `gurobi_cl /opt/gurobi902/linux64/examples/data/coins.lp` 26 | ### Step 3: set up and start a conda environment with Python 3.7.7 27 | If you use the AWS Deep Learning AMI, conda is preinstalled. 28 | ``` 29 | conda create --name <env_name> python=3.7.7 30 | conda activate <env_name> 31 | ``` 32 | ### Step 4: install python dependencies in the conda env 33 | ``` 34 | cd <path_to_this_repo>/spinningup 35 | pip install -e . 36 | pip install networkx pulp pybind11 xlrd==1.2.0 37 | ``` 38 | ### Step 5: compile C++ program with pybind11 39 | ``` 40 | cd <path_to_this_repo>/source/c_solver 41 | ./compile.sh 42 | ``` 43 | ## 2. Content 44 | - source 45 | - c_solver: C++ implementation with Gurobi APIs for ILP solver and network plan evaluator 46 | - planning: `ILP` and `ILP-heur` implementation 47 | - results: store the provided trained models and solutions, and the training log 48 | - rl: the implementations of Actor-Critic, RL environment and RL solver 49 | - simulate: python classes of flow, spof, and traffic matrix 50 | - topology: python classes of network topology (both optical layer and IP layer) 51 | - `test.py`: the main script used to reproduce results 52 | - spinningup 53 | - adapted from [OpenAI Spinning Up](https://github.com/openai/spinningup) 54 | - `gurobi.sh` 55 | - used to install the Gurobi solver 56 | ## 3. Reproduce results (for SIGCOMM'21 artifact evaluation) 57 | ### Notes 58 | - Some data points are time-consuming to get (i.e., First-stage for A-0, A-0.25, A-0.5, A-0.75 in Figure 8 and B, C, D, E in Figure 9). We provide pretrained models in `/source/results/trained/<topo_name>/`, which will be loaded by default. 59 | - We recommend distributing different data points and different experiments on multiple AWS instances to run simultaneously. 60 | - The default `epoch_num` for Figures 10, 11 and 12 is set to 1024 to guarantee convergence. The training process can be terminated manually if convergence is observed. 61 | ### How to reproduce 62 | - `cd <path_to_this_repo>/source` 63 | - Figure 7: `python test.py fig_7 <epoch_num>`; `epoch_num` can be set smaller than 10 (e.g. 2) to get results faster. 64 | - Figure 8: `python test.py single_dp_fig8 <alg> <adjust_factor>` produces one data point at a time, where `<alg>` is `ILP` or `NeuroPlan` (the default adjust_factor is 1). 65 | - For example, `python test.py single_dp_fig8 ILP 0.0` runs the ILP algorithm for `A-0`. 66 | - Pretrained models will be loaded by default if provided in `source/results/trained/`. To train from scratch, which is **NOT RECOMMENDED**, run `python test.py single_dp_fig8 <alg> <adjust_factor> False` 67 | - Figure 9&13: `python test.py single_dp_fig9 <topo_name> <alg>` produces one data point at a time, where `<alg>` is `ILP`, `ILP-heur` or `NeuroPlan`. 68 | - For example, `python test.py single_dp_fig9 E NeuroPlan` runs NeuroPlan (First-stage) for topology E with the pretrained model. To train from scratch, which is **NOT RECOMMENDED**, run `python test.py single_dp_fig9 E NeuroPlan False`. 69 | - `python test.py second_stage <topo_name> <sol_path> <rf>` loads the solution from the first stage in `<sol_path>` and runs the second stage with `relax_factor=<rf>` on topo `<topo_name>`. For example, `python test.py second_stage D "results/<log_dir>/opt_topo/***.txt" 1.5` (the solution file is a JSON dict; see the sketch below). 70 | - we also provide our results of First-stage in `results/trained/<topo_name>/<topo_name>.txt`, which can be used to run second-stage directly. For example, `python test.py second_stage C "results/trained/C/C.txt" 1.5` 71 | - Figure 10: `python test.py fig_10 <adjust_factor> <num_gnn_layer>`. 72 | - `adjust_factor={0.0, 0.5, 1.0}, num_gnn_layer={0, 2, 4}` 73 | - For example, `python test.py fig_10 0.5 2` runs NeuroPlan with `2`-layer GNNs for topology `A-0.5` 74 | - Figure 11: `python test.py fig_11 <adjust_factor> <mlp_hidden_size>`. 75 | - `adjust_factor={0.0, 0.5, 1.0}, mlp_hidden_size={64, 256, 512}` 76 | - For example, `python test.py fig_11 0.0 512` runs NeuroPlan with hidden_size=`512` for topology `A-0` 77 | - Figure 12: `python test.py fig_12 <adjust_factor> <max_unit_per_step>`. 78 | - `adjust_factor={0.0, 0.5, 1.0}, max_unit_per_step={1, 4, 16}` 79 | - For example, `python test.py fig_12 1.0 4` runs NeuroPlan with max_unit_per_step=`4` for topology `A-1` 80 |
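The first-stage solution file consumed by `second_stage` is the JSON dictionary that `rl/plan_env.py` writes under `results/<log_dir>/opt_topo/`: keys are IP link indices and values are the number of `delta_bw` capacity steps added on that link. A minimal sketch of loading and inspecting one, using the provided `results/trained/C/C.txt` as the example path:

```
import json

sol_path = "results/trained/C/C.txt"  # any first-stage solution file works here
with open(sol_path) as f:
    # keys are stored as strings in JSON; test.py converts them back to int link indices
    subopt_sol = {int(k): v for k, v in json.load(f).items()}

# links with the most added capacity steps first
print(sorted(subopt_sol.items(), key=lambda kv: kv[1], reverse=True))
```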
81 | ## 4. Contact 82 | For any question, please contact `hzhu at jhu dot edu`. 83 | -------------------------------------------------------------------------------- /gurobi.sh: -------------------------------------------------------------------------------- 1 | wget https://packages.gurobi.com/9.0/gurobi9.0.2_linux64.tar.gz 2 | sudo mv gurobi9.0.2_linux64.tar.gz /opt 3 | cd /opt;sudo tar xvfz gurobi9.0.2_linux64.tar.gz 4 | cd /opt/gurobi902/linux64/src/build/ 5 | sudo make 6 | sudo cp libgurobi_c++.a ../../lib/ 7 | 8 | # set env var 9 | cat <<EOT >> ~/.bashrc 10 | export GUROBI_HOME="/opt/gurobi902/linux64" 11 | export PATH="\${PATH}:\${GUROBI_HOME}/bin" 12 | export LD_LIBRARY_PATH="\${LD_LIBRARY_PATH}:\${GUROBI_HOME}/lib" 13 | EOT -------------------------------------------------------------------------------- /source/c_solver/compile.sh: -------------------------------------------------------------------------------- 1 | c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` gurobi_c.cpp -o ../gurobi_c`python3-config --extension-suffix` -I/opt/gurobi902/linux64/include -L/opt/gurobi902/linux64/lib -lgurobi_c++ -lgurobi90 2 | -------------------------------------------------------------------------------- /source/gurobi.env: -------------------------------------------------------------------------------- 1 | LogToConsole 0 2 | Threads 16 3 | 4 | -------------------------------------------------------------------------------- /source/planning/ilp.py: -------------------------------------------------------------------------------- 1 | #from planning.algorithm import PlanAlg 2 | from simulate.traffic_matrix import TrafficMatrix 3 | from topology.ip.router import Router 4 | from topology.ip.link import Link 5 | from topology.optical.optic_node import OpticNode 6 | from topology.optical.optic_fiber import OpticFiber 7 | from topology.topology import Topology 8 | 9 | import matplotlib.pyplot as plt 10 | import pdb, time, sys 11 | 12 | # pybind11, c++ impl 13 | import gurobi_c 14 | 15 | class ILP(object): 16 | def __init__(self, topo): 17 | self.topo = topo 18 | self.cost_opt = None 19 | 20 | def run_ilp(self, subopt_sol=None, delta_bw=100, relax_factor=1, mipgapabs=5e-2): 21 | ilp_solve_limit = -1 22 | 23 | non_direct_graph, init_cost = self.topo.ip.generate_non_direction_graph(1, subopt_sol, relax_factor) 24 | fiber_info = {} 25 | for fiber_name, fiber_inst in self.topo.optic.fibers.items(): 26 | if fiber_inst.lease_flag: 27 | max_spectrum = 0 28 | max_capa = fiber_inst.max_bw 29 | else: 30 | max_spectrum = fiber_inst.max_fp*fiber_inst.spectrum 31 | max_capa = 0 32 | 33 | fiber_info[fiber_name] = (fiber_inst.lease_flag, max_capa, max_spectrum) 34 | 35 | failed_links_for_spof_list = self.topo.failed_links_for_spof_list[:-1] 36 | print("start ilp_solve_c...", flush=True) 37 | start_time = time.time() 38 | (cost_opt, delta_capa_sum, opt_sol) = gurobi_c.ilp_solve_c(non_direct_graph, failed_links_for_spof_list, \ 39 | self.topo.tm.data['all'], self.topo.tm.data['no-bronze'], fiber_info, self.topo.l3node_map_stub, self.topo.load_factor, \ 40 | delta_bw, ilp_solve_limit, mipgapabs) 41 | print("ilp_solve result, running time: {} \nfinal_cost:{}, init_cost:{}, delta_cost:{}, delta_capa:{}".format( 42 | int(time.time()-start_time), cost_opt+init_cost, init_cost, cost_opt, delta_capa_sum), flush=True) 43 | 44 | print("opt_cost:{}".format(cost_opt), flush=True) 45 | self.cost_opt = cost_opt 46 | print(dict(sorted(opt_sol.items(), key=lambda item: item[1], reverse=True)), flush=True) 47 | 48 | def
run_ilp_heuristic(self, subopt_sol=None, delta_bw=1600, relax_factor=1, spof_group_size=10): 49 | ilp_solve_limit = -1 50 | mipgapabs = 0.05 51 | fiber_info = {} 52 | for fiber_name, fiber_inst in self.topo.optic.fibers.items(): 53 | if fiber_inst.lease_flag: 54 | max_spectrum = 0 55 | max_capa = fiber_inst.max_bw 56 | else: 57 | max_spectrum = fiber_inst.max_fp*fiber_inst.spectrum 58 | max_capa = 0 59 | 60 | fiber_info[fiber_name] = (fiber_inst.lease_flag, max_capa, max_spectrum) 61 | 62 | failed_links_for_spof_list = self.topo.failed_links_for_spof_list[:-1] 63 | 64 | spof_group = failed_links_for_spof_list[:spof_group_size] 65 | group_idx = 0 66 | total_cost = 0 67 | total_sol = {} 68 | total_start_time = time.time() 69 | while len(spof_group) > 0: 70 | non_direct_graph, init_cost = self.topo.ip.generate_non_direction_graph(1, subopt_sol, relax_factor) 71 | start_time = time.time() 72 | (cost_opt, delta_capa_sum, opt_sol) = gurobi_c.ilp_solve_c(non_direct_graph, spof_group, \ 73 | self.topo.tm.data['all'], self.topo.tm.data['no-bronze'], fiber_info, self.topo.l3node_map_stub, \ 74 | self.topo.load_factor, delta_bw, ilp_solve_limit, mipgapabs, 0) 75 | print("spof_group_idx:{}, opt_slo:{}, running time:{}".format(group_idx, opt_sol, time.time()-start_time)) 76 | 77 | for link_idx, step_size in opt_sol.items(): 78 | self.topo.ip.links[self.topo.ip.idx_map_link_name[link_idx]].incr_bw(step_size*delta_bw) 79 | try: 80 | total_sol[link_idx] += step_size 81 | except: 82 | total_sol[link_idx] = step_size 83 | 84 | total_cost += cost_opt 85 | group_idx += 1 86 | spof_group = failed_links_for_spof_list[spof_group_size*group_idx:spof_group_size*(group_idx+1)] 87 | 88 | print("heuristic total time:{}".format(time.time()-total_start_time)) 89 | print("opt_cost:{}".format(total_cost), flush=True) 90 | self.cost_opt = total_cost 91 | print(dict(sorted(total_sol.items(), key=lambda item: item[1], reverse=True)), flush=True) 92 | 93 | -------------------------------------------------------------------------------- /source/rl/ac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal, math 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.distributions.normal import Normal 9 | from torch.distributions.categorical import Categorical 10 | from torch.nn.parameter import Parameter 11 | 12 | import pdb, functools 13 | 14 | 15 | def mlp(sizes, activation, dropout_flag=False, dropout=0.5, output_activation=nn.Identity): 16 | layers = [] 17 | for j in range(len(sizes)-1): 18 | act = activation if j < len(sizes)-2 else output_activation 19 | if dropout_flag: 20 | layers += [nn.Linear(sizes[j], sizes[j+1]), act(), nn.Dropout(dropout)] 21 | else: 22 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 23 | return nn.Sequential(*layers) 24 | 25 | 26 | class SimpleGCN(nn.Module): 27 | """ 28 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 29 | """ 30 | def __init__(self, in_features, out_features): 31 | super(SimpleGCN, self).__init__() 32 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 33 | self.reset_parameters() 34 | 35 | def reset_parameters(self): 36 | stdv = 1. 
/ math.sqrt(self.weight.size(1)) 37 | self.weight.data.uniform_(-stdv, stdv) 38 | 39 | # adj_adjust is D^(-0.5)*(adj+I)*D^(0.5) 40 | def forward(self, h_0, adj_adjust): 41 | support = torch.matmul(h_0, self.weight) 42 | output = torch.matmul(adj_adjust, support) 43 | return output 44 | 45 | """ 46 | batch: return (batch_size, one-hot vector encoding for the graph) 47 | one sample: return one-hot vector encoding for the graph 48 | """ 49 | class GCN(nn.Module): 50 | def __init__(self, feature_num, ip_node_num, n_hidden, num_layer): 51 | super(GCN, self).__init__() 52 | self.ip_node_num = ip_node_num 53 | self.feature_num = feature_num 54 | 55 | self.gcn_list = [] 56 | for i in range(num_layer): 57 | if i == 0: 58 | self.gcn_list.append(SimpleGCN(feature_num, n_hidden)) 59 | elif i == num_layer-1: 60 | self.gcn_list.append(SimpleGCN(n_hidden, feature_num)) 61 | else: 62 | self.gcn_list.append(SimpleGCN(n_hidden, n_hidden)) 63 | print("num of gcn layer:{}".format(len(self.gcn_list))) 64 | self.gcn_list = nn.ModuleList(self.gcn_list) 65 | 66 | # node_num: n 67 | # state_node: batch_size*n*feature_num 68 | # state_adj: batch_size*n*n 69 | # obs: batch_size*n*(feature_num+n) 70 | def forward(self, obs): 71 | # reconstruct state_node and state_adj from flatten_obs 72 | if (len(obs.size())==3): 73 | # batch 74 | adj_adjust, h_0 = torch.split(obs,[self.ip_node_num, self.feature_num],dim=2) 75 | else: 76 | adj_adjust, h_0 = torch.split(obs,[self.ip_node_num, self.feature_num],dim=1) 77 | 78 | for gcn in self.gcn_list: 79 | h_0 = F.relu(gcn(h_0, adj_adjust)) 80 | 81 | if (len(h_0.size())==3): 82 | # batch 83 | bn_emb = torch.flatten(h_0,1) 84 | else: 85 | bn_emb = torch.flatten(h_0) 86 | return bn_emb 87 | 88 | class Actor(nn.Module): 89 | 90 | def _distribution(self, obs): 91 | raise NotImplementedError 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | raise NotImplementedError 95 | 96 | def forward(self, obs, act=None): 97 | # Produce action distributions for given observations, and 98 | # optionally compute the log likelihood of given actions under 99 | # those distributions. 100 | pi = self._distribution(obs) 101 | logp_a = None 102 | if act is not None: 103 | logp_a = self._log_prob_from_distribution(pi, act) 104 | return pi, logp_a 105 | 106 | 107 | class GCNCategoricalActor(Actor): 108 | 109 | def __init__(self, feature_num, ip_node_num, gcn, hidden_sizes, act_num, activation): 110 | super().__init__() 111 | self.GCN = gcn 112 | self.logits_net = mlp([feature_num*ip_node_num] + list(hidden_sizes) + [act_num], activation) 113 | 114 | # logits is the log probability, log_p = ln(p) 115 | def _distribution(self, obs): 116 | obs_emb = self.GCN(obs) 117 | logits = self.logits_net(obs_emb) 118 | return Categorical(logits=logits) 119 | 120 | def _get_logits(self, obs): 121 | obs_emb = self.GCN(obs) 122 | logits = self.logits_net(obs_emb) 123 | return logits 124 | 125 | def _log_prob_from_distribution(self, pi, act): 126 | return pi.log_prob(act) 127 | 128 | class GCNCritic(nn.Module): 129 | 130 | def __init__(self, feature_num, ip_node_num, gcn, hidden_sizes, activation): 131 | super().__init__() 132 | self.GCN = gcn 133 | self.v_net = mlp([feature_num*ip_node_num] + list(hidden_sizes) + [1], activation) 134 | 135 | def forward(self, obs): 136 | return torch.squeeze(self.v_net(self.GCN(obs)), -1) # Critical to ensure v has right shape. 
137 | 138 | 139 | 140 | class GCNActorCritic(nn.Module): 141 | def __init__(self, observation_space, action_space, graph_encoder_hidden=256, num_gnn_layer=2, 142 | hidden_sizes=(64,64), activation=nn.ReLU): 143 | super().__init__() 144 | 145 | ip_node_num = observation_space.shape[0] 146 | feature_num = observation_space.shape[1] - ip_node_num 147 | 148 | act_num = action_space.n 149 | self.GCN = GCN(feature_num, ip_node_num, graph_encoder_hidden, num_gnn_layer) 150 | self.pi = GCNCategoricalActor(feature_num, ip_node_num, self.GCN, hidden_sizes, act_num, activation) 151 | 152 | # build value function 153 | self.v = GCNCritic(feature_num, ip_node_num, self.GCN, hidden_sizes, activation) 154 | params_num = sum(functools.reduce( lambda a, b: a*b, x.size()) for x in self.parameters()) 155 | print("# of trainable params:{}".format(params_num)) 156 | 157 | def step(self, obs, mask): 158 | with torch.no_grad(): 159 | pi = self.pi._distribution(obs) 160 | 161 | pi_logits = self.pi._get_logits(obs) 162 | pi_logits_delta = torch.zeros(mask.size()).to(mask.device) 163 | pi_logits_delta[mask == 0] = float("-Inf") 164 | pi_logits += pi_logits_delta 165 | pi_mask = Categorical(logits=pi_logits) 166 | 167 | a = pi_mask.sample() 168 | logp_a = self.pi._log_prob_from_distribution(pi, a) 169 | 170 | v = self.v(obs) 171 | return a.cpu().numpy(), v.cpu().numpy(), logp_a.cpu().numpy() -------------------------------------------------------------------------------- /source/rl/plan_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from topology.topology import Topology 3 | from copy import deepcopy 4 | import numpy as np 5 | import pdb, os, time, json 6 | import networkx as nx 7 | import matplotlib.pyplot as plt 8 | import sys, math 9 | 10 | class PlanEnv(gym.Env): 11 | def __init__(self, topo: Topology, log_dir, graph_encoder, max_n_delta_bw, max_action=512,\ 12 | steps_per_epoch=2048, delta_bw=100, checker_mode="all"): 13 | self.max_action = max_action 14 | self.steps_per_epoch = steps_per_epoch 15 | self.checker_mode = checker_mode 16 | self.delta_bw = delta_bw 17 | self.max_rwd = None 18 | self.opt_target = None 19 | self.action_cnt = 0 20 | 21 | self.w1 = 2 22 | self.w2 = 2 23 | self.norm_param=1e-7 24 | self.graph_encoder = graph_encoder 25 | self.max_n_delta_bw = max_n_delta_bw 26 | self.max_ip_node = len(topo.ip.routers) 27 | 28 | # related to utils 29 | self.topo = topo 30 | self.topo_preprocess() 31 | self.original_topo = deepcopy(topo) 32 | 33 | obs, _ = self.get_observation() 34 | 35 | self.observation_space = gym.Space(shape=list(obs.shape)) 36 | print("obv_space size: {}".format(self.observation_space.shape)) 37 | 38 | self.action_space = gym.spaces.Discrete(len(self.topo.ip.links)*max_n_delta_bw) 39 | print("act_space size: {}".format(self.action_space.n)) 40 | 41 | self.cum_rwd = 0 42 | self.complete_cnt = 0 43 | 44 | self.max_rwd = None 45 | self.opt_target = None 46 | self.cost = 0 47 | self.optm_topo = self.topo 48 | self.optm_ob = None 49 | self.opt_action_list = [] 50 | self.optm_epoch_idx = 0 51 | self.action_list = [] 52 | self.epoch_idx = 0 53 | 54 | self.start_sec_ts = int(time.time()) 55 | 56 | action_path = "results/{}/actions.txt".format(log_dir) 57 | self.topo_path = "results/{}/opt_topo".format(log_dir) 58 | if not os.path.exists(self.topo_path): 59 | os.makedirs(self.topo_path) 60 | 61 | self.action_fpr = open(action_path,"w") 62 | 63 | self.action_cnt_cum = 0 # to record the epoch num 64 | self.traj_set = set() # traj set 
stores the vars (the set of l3 link candidates) that decide the ilp 65 | self.main_epoch_traj_stats_list= [] 66 | traj_path = "results/{}/traj.txt".format(log_dir) 67 | self.traj_fpr = open(traj_path,"w") 68 | 69 | self.main_epoch_traj_num = 0 70 | self.main_epoch_traj_num_visited = 0 71 | self.main_epoch_cache_hit_num = 0 72 | 73 | # cache for accelerating check_sf 74 | # each state is represented as a frozenset((l3_link_idx1, delta_bw), (l3_link_idx2, delta_bw),...) 75 | self.state_map_fp_cache = {} 76 | self.cache_max_entry = 1e6 77 | self.cache_path = "results/{}/cache".format(log_dir) 78 | if not os.path.exists(self.cache_path): 79 | os.makedirs(self.cache_path) 80 | 81 | def topo_preprocess(self): 82 | self.topo.get_edge2node_adj() 83 | 84 | def step(self, action): 85 | obs, reward, done, info = None, None, False, None 86 | 87 | violate_max_cstr_flag, visited_flag, cache_hit_flag = False, False, False 88 | adapt_tm = -1 89 | 90 | action_int, act_type = int(action), 0 91 | 92 | l3_link_idx = int(action_int/self.max_n_delta_bw) 93 | delta_bw_act = self.delta_bw*(int(action_int%self.max_n_delta_bw)+1) 94 | 95 | if act_type == 0: 96 | # add IP link capacity 97 | cost = self.topo.aug_l3_link_by_idx_nocheck_max(l3_link_idx, delta_bw=delta_bw_act) 98 | assert(cost >= 0) 99 | else: 100 | # remove IP link capacity 101 | cost = self.topo.aug_l3_link_by_idx_nocheck_max(l3_link_idx, delta_bw=-delta_bw_act) 102 | 103 | self.action_list.append((action_int, self.topo.spof_failed_point)) 104 | self.action_cnt += 1 105 | self.action_cnt_cum += 1 106 | obs, mask = self.get_observation() 107 | 108 | if cost >= 0: 109 | # check the spof constraints further 110 | sat_flag, cache_hit_flag, self.state_map_fp_cache = self.topo.check_spof(l3_link_idx, delta_bw_act, self.state_map_fp_cache, self.cache_max_entry, self.checker_mode) 111 | reward = -round(cost*self.norm_param, 10) 112 | self.cost += cost 113 | else: 114 | # cost < 0 means violating the max_cstrs 115 | sat_flag = False 116 | violate_max_cstr_flag = True 117 | reward = -400 118 | 119 | 120 | if sum(mask)==0: 121 | # no feasible action 122 | violate_max_cstr_flag = True 123 | 124 | if cache_hit_flag: 125 | self.main_epoch_cache_hit_num += 1 126 | 127 | if sat_flag or self.action_cnt >= self.max_action or violate_max_cstr_flag: 128 | done = 1 129 | else: 130 | done = 0 131 | 132 | if done or (self.action_cnt_cum%self.steps_per_epoch == 0): 133 | visited_flag = self.is_visited_sol() 134 | 135 | if done == 1: 136 | if sat_flag: 137 | reward += 0 138 | else: 139 | reward -= 1 140 | 141 | self.cum_rwd += reward 142 | 143 | action_idx_map_extra_rwd = None 144 | self.opt_sol_on_rl = -1 145 | if sat_flag: 146 | self.save_if_best() 147 | 148 | if done or (self.action_cnt_cum%self.steps_per_epoch == 0): 149 | # save trajectory and plan results 150 | self.save_trajectory(visited_flag, adapt_tm, self.cost, self.cum_rwd, violate_max_cstr_flag, sat_flag) 151 | info = {"log_ptr": self.traj_fpr, "extra_rwd": action_idx_map_extra_rwd} 152 | return obs, mask, reward, done, info 153 | 154 | def reset(self): 155 | self.action_cnt = 0 156 | self.cum_rwd = 0 157 | self.cost = 0 158 | 159 | self.topo.reset() 160 | self.epoch_idx += 1 161 | self.action_list = [] 162 | sys.stdout.flush() 163 | 164 | return self.get_observation() 165 | 166 | def get_observation(self): 167 | """ 168 | ob['ip_adj']:n*n --- E 169 | ob['ip_node']:n*d_n ---- F (longitute, latitude, in_traffic, out_traffic) 170 | """ 171 | E_origin = self.topo.edge2node_adj 172 | E_hat = E_origin + 
np.eye(E_origin.shape[0]) 173 | 174 | D = np.diag(np.sum(E_hat, axis=1)) 175 | 176 | # https://towardsdatascience.com/how-to-do-deep-learning-on-graphs-with-graph-convolutional-networks-62acf5b143d0 177 | D_spectral = np.sqrt(np.linalg.inv(D)) 178 | E = np.matmul(np.matmul(D_spectral, E_hat),D_spectral) 179 | 180 | F = self.topo.get_edge_feature() 181 | ob = np.concatenate((E,F), axis=1) 182 | 183 | mask = np.asarray(self.topo.get_feasible_action(self.max_n_delta_bw,self.delta_bw)) 184 | return ob, mask 185 | 186 | def terminate(self): 187 | self.action_fpr.write("epoch_cnt:{}, ip_node_num:{}\n".format(self.epoch_idx, self.max_ip_node)) 188 | self.action_fpr.write("total_time(sec):{}\n".format(int(time.time())-self.start_sec_ts)) 189 | self.action_fpr.close() 190 | 191 | def ilp_opt_on_rl(self, action_list): 192 | l3_link_idx_map_cnt = {} 193 | for (l3_link_idx,_) in action_list: 194 | try: 195 | l3_link_idx_map_cnt[l3_link_idx] += 1 196 | except: 197 | l3_link_idx_map_cnt[l3_link_idx] = 1 198 | 199 | return self.original_topo.ilp_solve(self.delta_bw, l3_link_idx_map_cnt, relax_factor=1) 200 | 201 | def save_if_best(self): 202 | self.complete_cnt += 1 203 | 204 | if self.opt_target == None or self.opt_target >= self.cost: 205 | self.max_rwd = self.cum_rwd 206 | self.opt_target = self.cost 207 | self.optm_topo = self.topo 208 | self.optm_ob = self.get_observation() 209 | self.optm_epoch_idx = self.epoch_idx 210 | self.opt_action_list = self.action_list[:] 211 | self.ip_idx_map_num_step = {} 212 | for i in self.opt_action_list: 213 | link_idx = int(i[0]/self.max_n_delta_bw) 214 | delta_bw_act = int(i[0]%self.max_n_delta_bw)+1 215 | try: 216 | self.ip_idx_map_num_step[link_idx] += delta_bw_act 217 | except: 218 | self.ip_idx_map_num_step[link_idx] = delta_bw_act 219 | 220 | main_epoch_idx = int((self.action_cnt_cum-1)/self.steps_per_epoch) 221 | self.action_fpr.write("local_opt:{} {} {} ilp_opt_ob_rl:{} {} {}\nip_idx_list:{}\n".format(self.epoch_idx, self.max_rwd, self.opt_target, self.opt_sol_on_rl, \ 222 | len(self.opt_action_list), self.opt_action_list, self.ip_idx_map_num_step)) 223 | self.action_fpr.flush() 224 | 225 | opt_topo_path = '{}/{}_main_epoch{}_cost{}.txt'.format(self.topo_path, int(time.time())-self.start_sec_ts, main_epoch_idx, self.cost) 226 | opt_topo_fpr = open(opt_topo_path,"w") 227 | opt_topo_fpr.write(json.dumps(self.ip_idx_map_num_step)) 228 | opt_topo_fpr.close() 229 | 230 | def is_visited_sol(self): 231 | link_cand_list = [int(action) for (action, cost) in self.action_list] 232 | link_cand_tuple = tuple(sorted(link_cand_list)) 233 | visited_flag = (tuple(link_cand_tuple) in self.traj_set) 234 | if visited_flag: 235 | self.main_epoch_traj_num_visited += 1 236 | self.main_epoch_traj_num += 1 237 | self.traj_set.add(link_cand_tuple) 238 | 239 | return visited_flag 240 | 241 | def save_trajectory(self, visited_flag, adapt_tm, cost, reward, violate_max_cstr_flag, sat_flag): 242 | main_epoch_idx = int((self.action_cnt_cum-1)/self.steps_per_epoch) 243 | 244 | self.traj_fpr.write("main epoch idx:{}, visited_flag:{}, adapt_tm:{}, cost:{}, rwd:{}, violate_max_cstr_flag:{}, sat_flag:{}\n action_list:{} {}\n ".\ 245 | format(main_epoch_idx, visited_flag, adapt_tm, cost, reward, violate_max_cstr_flag, sat_flag, len(self.action_list), self.action_list)) 246 | if self.action_cnt_cum%self.max_action == 0: 247 | # current epoch terminate 248 | self.traj_fpr.write("main epoch idx:{}, # of traj:{}, # of visited traj:{}, visited_ratio:{}, cache_hit_num:{}, cache_num:{}\n". 
249 | format(main_epoch_idx, self.main_epoch_traj_num, self.main_epoch_traj_num_visited, \ 250 | round(self.main_epoch_traj_num_visited/self.main_epoch_traj_num, 4), self.main_epoch_cache_hit_num, len(self.state_map_fp_cache))) 251 | self.main_epoch_traj_num_visited = 0 252 | self.main_epoch_traj_num = 0 253 | self.main_epoch_cache_hit_num = 0 254 | 255 | self.traj_fpr.flush() 256 | -------------------------------------------------------------------------------- /source/rl/rl.py: -------------------------------------------------------------------------------- 1 | import pdb, time, sys,torch 2 | 3 | from rl.plan_env import PlanEnv 4 | from rl.ac import GCNActorCritic 5 | sys.path.insert(0 ,"../spinningup/") 6 | from spinup import vpg_pytorch 7 | 8 | class RL(object): 9 | def __init__(self, topo, graph_encoder="GCN", num_gnn_layer=2, \ 10 | max_n_delta_bw=1, hidden_sizes=(256, 256), \ 11 | epoch_num=1024, max_action=512,steps_per_epoch=1024,\ 12 | delta_bw=100, checker_mode="all", model_path=None): 13 | 14 | self.topo = topo 15 | 16 | self.graph_encoder = graph_encoder 17 | self.num_gnn_layer = num_gnn_layer 18 | self.hidden_sizes = hidden_sizes 19 | 20 | self.epoch_num = epoch_num 21 | self.max_action = max_action 22 | self.steps_per_epoch = steps_per_epoch 23 | self.delta_bw = delta_bw 24 | self.max_n_delta_bw = max_n_delta_bw 25 | 26 | self.checker_mode = checker_mode 27 | self.model_path = model_path 28 | 29 | log_dir_name_list = [int(time.time()), len(self.topo.ip.links), self.graph_encoder, \ 30 | self.max_n_delta_bw, self.steps_per_epoch, self.delta_bw] 31 | self.log_dir = '_'.join([str(i) for i in log_dir_name_list]) 32 | 33 | def get_env(self): 34 | self.env = PlanEnv(self.topo, log_dir=self.log_dir, graph_encoder=self.graph_encoder, \ 35 | max_n_delta_bw=self.max_n_delta_bw, max_action=self.max_action, steps_per_epoch=self.steps_per_epoch, delta_bw=self.delta_bw, checker_mode=self.checker_mode) 36 | return self.env 37 | 38 | def run_training(self): 39 | logger_kwargs = dict(output_dir="results/{}".format(self.log_dir), exp_name="test") 40 | ac_kwargs = dict(graph_encoder_hidden=256,hidden_sizes=self.hidden_sizes, num_gnn_layer=self.num_gnn_layer) 41 | 42 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 43 | ac = GCNActorCritic 44 | 45 | vpg_pytorch(self.get_env, enable_mpi=False, non_blocking=False, gamma=1,actor_critic=ac,\ 46 | max_ep_len=self.max_action, seed=8, device=device, \ 47 | model_path=self.model_path, \ 48 | ac_kwargs=ac_kwargs,epochs=self.epoch_num,steps_per_epoch=self.steps_per_epoch,logger_kwargs=logger_kwargs) 49 | 50 | self.env.terminate() -------------------------------------------------------------------------------- /source/simulate/flow.py: -------------------------------------------------------------------------------- 1 | from topology.ip.router import Router 2 | 3 | class Flow: 4 | # cos represents the priority (GOLD-ICP, SILVER, BRONZE) 5 | def __init__(self, name, src:Router, dst:Router, capacity, cos): 6 | self.name = name 7 | self.src = src 8 | self.dst = dst 9 | self.capacity = capacity 10 | self.cos = cos 11 | -------------------------------------------------------------------------------- /source/simulate/spof.py: -------------------------------------------------------------------------------- 1 | class Spof: 2 | def __init__(self, name, fiber_name_list, cos_protect_list): 3 | self.name = name 4 | self.fiber_name_list = fiber_name_list 5 | self.cos_protect_list = cos_protect_list 6 | 
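# A Spof ("single point of failure") groups the fibers that fail together in one
# failure scenario, plus the classes of service protected under that scenario.
# A hypothetical instance (names invented for illustration) would look like:
#   Spof("spof_1", ["fiber_A_B", "fiber_B_C"], ["GOLD-ICP", "SILVER"])
# Instances are normally created via Spofs.register_spof() in spofs.py.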
-------------------------------------------------------------------------------- /source/simulate/spofs.py: -------------------------------------------------------------------------------- 1 | from simulate.spof import Spof 2 | class Spofs: 3 | def __init__(self): 4 | self.spof_list = [] 5 | 6 | def register_spof(self, name, fiber_name_list, cos_protect_list): 7 | self.spof_list.append(Spof(name, fiber_name_list, cos_protect_list)) 8 | 9 | -------------------------------------------------------------------------------- /source/simulate/traffic_matrix.py: -------------------------------------------------------------------------------- 1 | from simulate.flow import Flow 2 | 3 | class TrafficMatrix: 4 | def __init__(self): 5 | self.data = {"all":{},"no-bronze":{}} 6 | self.flows = {} 7 | 8 | def add_data(self, src, dst, traffic, type): 9 | if src not in self.data[type]: 10 | self.data[type][src] = {} 11 | 12 | # aggregate traffic with the same OD pair for different priorities(GOLD, SILVER, BRONZE) 13 | try: 14 | self.data[type][src][dst] += traffic 15 | except: 16 | self.data[type][src][dst] = traffic 17 | 18 | def register_flow(self, flow_name, src, dst, traffic, cos): 19 | assert(flow_name not in self.flows) 20 | self.flows[flow_name] = Flow(flow_name, src, dst, traffic, cos) 21 | if cos!="BRONZE": 22 | self.add_data(src.name, dst.name, traffic, "no-bronze") 23 | self.add_data(src.name, dst.name, traffic, "all") 24 | -------------------------------------------------------------------------------- /source/test.py: -------------------------------------------------------------------------------- 1 | import argparse, sys, time, collections 2 | import os, signal, socket, json, pdb 3 | import torch 4 | from rl.rl import RL 5 | from planning.ilp import ILP 6 | from topology.topology import Topology 7 | 8 | def read_topo(topo_name, adjust_factor_in=None): 9 | assert(topo_name in ["A", "B", "C", "D", "E"]) 10 | topo_name_map_file_path = {} 11 | 12 | file_path = topo_name_map_file_path[topo_name] 13 | 14 | topo = Topology(adjust_factor=adjust_factor) 15 | topo.import_fiber_from_file(file_path) 16 | topo.import_lease_from_file(file_path) 17 | topo.import_l3_node_from_file(file_path) 18 | topo.import_l3_link_from_file(file_path) 19 | topo.import_tm_from_file(file_path) 20 | topo.import_spof_from_file(file_path) 21 | 22 | topo.gen_failed_ip_link_and_spof_map() 23 | topo.generate_delta_bw_matrix_from_spof_list() 24 | 25 | return topo 26 | 27 | # implementation efficiency 28 | def fig_7(epoch_num=10): 29 | checker_mode_list = ["all", "sa", "vanilla"] 30 | topo_name_list = ["A", "B", "C", "D", "E"] 31 | 32 | result_log = collections.defaultdict(dict) 33 | for topo_name in topo_name_list: 34 | if topo_name == "A": 35 | checker_mode_list = ["all", "sa", "vanilla"] 36 | else: 37 | checker_mode_list = ["all", "sa"] 38 | 39 | for checker_mode in checker_mode_list: 40 | print(f'\n========== checker_mode:{checker_mode} topo_name:{topo_name} ==========\n') 41 | rl_solver = RL(topo=read_topo(topo_name), num_gnn_layer=2, max_n_delta_bw=1, checker_mode=checker_mode) 42 | rl_solver.run_training() 43 | 44 | # read the last line of the log file and calculate the avg time per epoch 45 | file_path = "results/{}/progress.txt".format(rl_solver.log_dir) 46 | with open(file_path, 'r') as f: 47 | last_line = f.readlines()[-1] 48 | ele_list = last_line.strip().split('\t') 49 | avg_rt = round(float(ele_list[-1])/(int(ele_list[0])+1), 2) 50 | print(f'========== average running time: {avg_rt} seconds ========') 51 | 
result_log[topo_name][checker_mode] = avg_rt 52 | print(f'==== before normalization:{result_log}') 53 | for topo_name, d in result_log.items(): 54 | norm_val = d["all"] 55 | print(f'\n======== final results of topo: {topo_name} ========') 56 | for k in d.keys(): 57 | result_log[topo_name][k] = round(result_log[topo_name][k]/norm_val, 5) 58 | print(f'{k}, {result_log[topo_name][k]}') 59 | 60 | # single data point, used for Figure 8 61 | # support ILP, First-stage and Second-stage 62 | def single_dp_fig8(alg, adjust_factor_in=1, load_trained=True): 63 | print(f'\n========== Fig8 start, A-{adjust_factor_in}, alg:{alg} ==========\n') 64 | 65 | if alg == "ILP": 66 | ilp_solver = ILP(topo=read_topo("A", adjust_factor_in=adjust_factor_in)) 67 | ilp_solver.run_ilp() 68 | print(f'========== Topo: A-{adjust_factor_in}, result: {ilp_solver.cost_opt} =========\n') 69 | elif alg == "NeuroPlan": 70 | if load_trained: 71 | if int(adjust_factor_in) == adjust_factor_in: 72 | af_file_name = int(adjust_factor_in) 73 | else: 74 | af_file_name = adjust_factor_in 75 | model_path = f'results/trained/A-{af_file_name}/' 76 | if af_file_name == 1: 77 | model_path = f'results/trained/A/' 78 | if os.path.exists(model_path + "pyt_save/model.pt") == False: 79 | model_path = None 80 | else: 81 | model_path = None 82 | print(f'\n========== Fig8, RL: Topo: A-{adjust_factor_in}, load pre-trained model: {model_path} ==========\n') 83 | rl_solver = RL(topo=read_topo("A", adjust_factor_in=adjust_factor_in), model_path=model_path, num_gnn_layer=2, max_n_delta_bw=1) 84 | rl_solver.run_training() 85 | print(f'========== first stage result: {rl_solver.env.opt_target} =========\n') 86 | subopt_sol = rl_solver.env.ip_idx_map_num_step 87 | print(f'\n========== ILP on second stage: adjust_factor_in:{adjust_factor_in} ==========\n') 88 | ilp_solver = ILP(topo=read_topo("A", adjust_factor_in=adjust_factor_in)) 89 | ilp_solver.run_ilp(subopt_sol=subopt_sol, relax_factor=1.5) 90 | print(f'========== second stage, adjust_factor_in: {adjust_factor_in}, result: {ilp_solver.cost_opt} =========\n') 91 | else: 92 | print("Illegal args") 93 | 94 | # single data point, used for Figure 9 95 | # support ILP, ILP-huer and First-stage 96 | def single_dp_fig9(topo_name, alg, adjust_factor_in=1.0, load_trained=True): 97 | print(f'\n========== start: topo_name:{topo_name} alg:{alg} adjust_factor_in:{adjust_factor_in}==========\n') 98 | 99 | if alg == "ILP": 100 | ilp_solver = ILP(topo=read_topo(topo_name, adjust_factor_in=adjust_factor_in)) 101 | ilp_solver.run_ilp() 102 | print(f'========== result: {ilp_solver.cost_opt} =========\n') 103 | elif alg == "ILP-heur": 104 | ilp_solver = ILP(topo=read_topo(topo_name)) 105 | ilp_solver.run_ilp_heuristic() 106 | print(f'========== result: {ilp_solver.cost_opt} =========\n') 107 | elif alg == "NeuroPlan": 108 | if load_trained: 109 | model_path = f'results/trained/{topo_name}/' 110 | if os.path.exists(model_path + "pyt_save/model.pt") == False: 111 | model_path = None 112 | else: 113 | model_path = None 114 | print(f'\n========== RL: topo_name:{topo_name}, load pre-trained model: {model_path} ==========\n') 115 | rl_solver = RL(topo=read_topo(topo_name), model_path=model_path, num_gnn_layer=2, max_n_delta_bw=1) 116 | rl_solver.run_training() 117 | print(f'========== first stage result: {rl_solver.env.opt_target} =========\n') 118 | else: 119 | print("Illegal args") 120 | 121 | # given the path of the sol form the first stage, run second stage 122 | def second_stage(topo_name, sol_path, rf=1.0): 123 | 124 | 
with open(sol_path) as json_file: 125 | json_dict = json.load(json_file) 126 | subopt_sol = {} 127 | for k, v in json_dict.items(): 128 | subopt_sol[int(k)] = v 129 | ilp_solver = ILP(topo=read_topo(topo_name)) 130 | ilp_solver.run_ilp(subopt_sol=subopt_sol, relax_factor=rf) 131 | print(f'========== sol from the first stage: {subopt_sol} ============\n') 132 | print(f'========== second stage, topo_name: {topo_name}, rf: {rf}, result: {ilp_solver.cost_opt} =========\n') 133 | 134 | # single data point, used for Figure 10, 11, 12 135 | def params_rl(adjust_factor_in=1.0, num_gnn_layer=2, max_n_delta_bw=1, hidden_sizes=(256, 256)): 136 | print(f'\n========== start: adjust_factor_in:{adjust_factor_in} num_gnn_layer:{num_gnn_layer}, max_n_delta_bw:{max_n_delta_bw}, hidden_sizes:{hidden_sizes} ==========\n') 137 | 138 | rl_solver = RL(topo=read_topo("A", adjust_factor_in=adjust_factor_in), num_gnn_layer=num_gnn_layer, \ 139 | max_n_delta_bw=max_n_delta_bw,hidden_sizes=hidden_sizes) 140 | rl_solver.run_training() 141 | print(f'\n========== end: adjust_factor_in:{adjust_factor_in} num_gnn_layer:{num_gnn_layer}, max_n_delta_bw:{max_n_delta_bw}, hidden_sizes:{hidden_sizes} ==========') 142 | print(f'result: {rl_solver.env.opt_target}') 143 | 144 | if __name__ == "__main__": 145 | arg = sys.argv[1] 146 | if arg == 'fig_7': 147 | fig_7(int(sys.argv[2])) 148 | elif arg == 'fig_8': 149 | fig_8() 150 | elif arg == 'fig_9_13': 151 | fig_9_13() 152 | elif arg == "single_dp_fig8": 153 | if len(sys.argv)==5 and sys.argv[4]=="False": 154 | single_dp_fig8(sys.argv[2], float(sys.argv[3]), load_trained=False) 155 | else: 156 | single_dp_fig8(sys.argv[2], float(sys.argv[3]), load_trained=True) 157 | elif arg == "single_dp_fig9": 158 | if len(sys.argv)==5 and sys.argv[4]=="False": 159 | single_dp_fig9(sys.argv[2], sys.argv[3], load_trained=False) 160 | else: 161 | single_dp_fig9(sys.argv[2], sys.argv[3], load_trained=True) 162 | elif arg == "second_stage": 163 | second_stage(sys.argv[2], sys.argv[3], float(sys.argv[4])) 164 | elif arg == "fig_10": 165 | params_rl(adjust_factor_in=float(sys.argv[2]), num_gnn_layer=int(sys.argv[3])) 166 | elif arg == "fig_11": 167 | params_rl(adjust_factor_in=float(sys.argv[2]), hidden_sizes=(int(sys.argv[3]), int(sys.argv[3]))) 168 | elif arg == "fig_12": 169 | params_rl(adjust_factor_in=float(sys.argv[2]), max_n_delta_bw=int(sys.argv[3])) 170 | else: 171 | print("Illegal args") 172 | 173 | -------------------------------------------------------------------------------- /source/topology/ip/link.py: -------------------------------------------------------------------------------- 1 | from topology.utils.node import Node 2 | 3 | 4 | class Link: 5 | 6 | def __init__(self, name, optic_set: frozenset, src: Node, dst: Node, idx=-1, initial_bw=0, max_bw=None, igp=0, fiber_map_spectrum=None, cost=None): 7 | self.name = name 8 | self.optic_set = optic_set 9 | self.src = src 10 | self.dst = dst 11 | self.idx = idx 12 | self.initial_bw = initial_bw 13 | self.bandwidth = initial_bw 14 | self.max_bw = max_bw 15 | self.igp = igp 16 | self.fiber_map_spectrum = fiber_map_spectrum 17 | self.cost = cost 18 | 19 | 20 | def incr_bw(self, delta_bw): 21 | self.bandwidth += delta_bw 22 | 23 | def reset_bw(self): 24 | self.bandwidth = self.initial_bw 25 | -------------------------------------------------------------------------------- /source/topology/ip/network.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | import topology.ip as ip 4 
| from topology.ip.router import Router 5 | from topology.ip.link import Link 6 | import collections, pdb, sys 7 | from pulp import * 8 | import numpy as np 9 | 10 | 11 | class Network: 12 | def __init__(self): 13 | self.routers = {} # Set of Router objects 14 | self.links = {} # Set of Link objects 15 | 16 | self.idx_map_router_name = {} 17 | self.router_name_map_idx = {} 18 | self.router_idx_cnt = 0 19 | 20 | self.link_name_map_cost = {} 21 | self.idx_map_link_name = {} 22 | self.link_name_map_idx = {} 23 | self.link_name_map_od_pair = {} 24 | self.link_name_map_fiber_and_spectrum = {} 25 | 26 | # utils for multiple edge between two routers 27 | self.od_pair_map_dup_cnt = {} 28 | 29 | def reset_link_bw(self): 30 | for link in self.links.values(): 31 | link.reset_bw() 32 | 33 | def clear_links(self): 34 | self.links = {} 35 | self.idx_map_link_name = {} 36 | self.link_name_map_idx = {} 37 | 38 | def register_router(self, router_name, l1_node, stub): 39 | self.routers[router_name] = Router(router_name, l1_node, stub) 40 | self.idx_map_router_name[self.router_idx_cnt] = router_name 41 | self.router_name_map_idx[router_name] = self.router_idx_cnt 42 | self.router_idx_cnt += 1 43 | 44 | def get_router_by_name(self, router_name): 45 | return self.routers[router_name] 46 | 47 | def register_link(self, link_name, optic_set, src, dst, idx=-1, initial_bw=0, max_bw=0, igp=0, fiber_map_spectrum=None, cost=None): 48 | try: 49 | assert(link_name not in self.links) 50 | except: 51 | raise Exception("exist link_name:{}".format(link_name)) 52 | 53 | self.idx_map_link_name[idx] = link_name 54 | self.link_name_map_idx[link_name] = idx 55 | 56 | self.links[link_name] = Link(link_name, optic_set, src, dst, idx=idx, initial_bw=initial_bw,\ 57 | max_bw=max_bw, igp=igp, fiber_map_spectrum=fiber_map_spectrum,cost=cost) 58 | 59 | def add_link(self, src_router, dst_router, bandwidth): 60 | src, dst = src_router.name, dst_router.name 61 | link_name = '%s-%s' % (min(src, dst), max(src, dst)) 62 | new_link = Link(src_router, dst_router, bandwidth) 63 | self.links[link_name] = new_link 64 | 65 | def generate_graph_from_multi_edge(self, failed_link_name_list=[]): 66 | graph = nx.DiGraph() 67 | capa_matrix = {} 68 | # Add nodes 69 | for link_name, link in self.links.items(): 70 | src_name = link.src.name 71 | dst_name = link.dst.name 72 | 73 | if link_name not in failed_link_name_list: 74 | # parallel edges between two nodes 75 | try: 76 | capa_matrix[(src_name, dst_name)] += link.bandwidth 77 | except: 78 | capa_matrix[(src_name, dst_name)] = link.bandwidth 79 | 80 | # add nodes and edges 81 | for (src_name, dst_name), bw in capa_matrix.items(): 82 | graph.add_edge(src_name, dst_name, capacity=bw) 83 | graph.add_edge(dst_name, src_name, capacity=bw) 84 | return graph 85 | 86 | # generate non direction graph, used to perform a complete ILP approach 87 | # max_cstr_sol is used to take the second step for RL approach 88 | def generate_non_direction_graph(self, adjust_factor=1.0, max_cstr_sol=None, relax_factor=1): 89 | graph = nx.MultiGraph() 90 | init_cost = 0 91 | for link_name, link in self.links.items(): 92 | 93 | src_name = link.src.name 94 | dst_name = link.dst.name 95 | if max_cstr_sol == None: 96 | max_delta_step = -1 97 | else: 98 | max_delta_step = int(max_cstr_sol.get(link.idx, 0)*relax_factor) 99 | 100 | graph.add_edge(src_name, dst_name, capacity=int(link.bandwidth*adjust_factor), name=link_name, cost=link.cost, \ 101 | fiber_map_spectrum=link.fiber_map_spectrum, idx=link.idx, max_delta_step=max_delta_step) 
102 | init_cost += link.bandwidth*link.cost 103 | 104 | return graph, init_cost 105 | 106 | def generate_graph(self, failed_links=None): 107 | graph = nx.DiGraph() 108 | 109 | # Add nodes 110 | graph.add_nodes_from(list(self.routers.keys())) 111 | 112 | # Add edges 113 | for link in self.links.values(): 114 | if failed_links is None or link not in failed_links: 115 | # add bidirectional edge 116 | graph.add_edge(link.src.name, link.dst.name, capacity=link.bandwidth) 117 | graph.add_edge(link.dst.name, link.src.name, capacity=link.bandwidth) 118 | 119 | return graph 120 | -------------------------------------------------------------------------------- /source/topology/ip/router.py: -------------------------------------------------------------------------------- 1 | from topology.utils.node import Node 2 | 3 | class Router(Node): 4 | def __init__(self, name, l1_node:Node, stub:bool): 5 | super().__init__(name) 6 | self.l1_node = l1_node 7 | self.stub = stub 8 | 9 | 10 | -------------------------------------------------------------------------------- /source/topology/optical/optic_fiber.py: -------------------------------------------------------------------------------- 1 | from topology.optical.optic_node import OpticNode 2 | 3 | class OpticFiber: 4 | def __init__(self, name, src: OpticNode, dst: OpticNode, length,lease_flag=False,max_fp=None,lighted_fp=None,spectrum=None,\ 5 | min_bw=None,max_bw=None): 6 | self.name = name 7 | self.src = src 8 | self.dst = dst 9 | self.length = length 10 | self.lease_flag = lease_flag 11 | 12 | # attributes for fibers owned: lease_flag=False 13 | self.max_fp = max_fp 14 | self.lighted_fp = lighted_fp 15 | self.spectrum = spectrum 16 | 17 | # attributes for fibers leased: lease_flag=True 18 | self.min_bw = min_bw 19 | self.max_bw = max_bw 20 | 21 | if lease_flag: 22 | assert(self.min_bw!=None) 23 | assert(self.max_bw!=None) 24 | else: 25 | assert(self.max_fp!=None) 26 | assert(self.spectrum!=None) 27 | -------------------------------------------------------------------------------- /source/topology/optical/optic_network.py: -------------------------------------------------------------------------------- 1 | from topology.optical.optic_node import OpticNode 2 | from topology.optical.optic_fiber import OpticFiber 3 | import networkx as nx 4 | 5 | class OpticNetwork: 6 | def __init__(self): 7 | self.nodes = {} # Set of OpticNode objects 8 | self.fibers = {} # Set of OpticFiber objects 9 | 10 | def register_node(self, node_name): 11 | self.nodes[node_name] = OpticNode(node_name) 12 | 13 | def get_node_by_name(self, node_name): 14 | return self.nodes[node_name] 15 | 16 | def register_fiber(self, fiber_name, src: OpticNode, dst: OpticNode,length,lease_flag=False,max_fp=None,lighted_fp=None,spectrum=None,\ 17 | min_bw=None,max_bw=None): 18 | self.fibers[fiber_name] = OpticFiber(fiber_name, src, dst, length=length, lease_flag=lease_flag, \ 19 | max_fp=max_fp, lighted_fp=lighted_fp, spectrum=spectrum,min_bw=min_bw,max_bw=max_bw) 20 | 21 | def generate_non_direction_graph(self, od_pair_map_optic): 22 | graph = nx.Graph() 23 | for optic_name in od_pair_map_optic.values(): 24 | optic_inst = self.fibers[optic_name] 25 | src_name = optic_inst.src.name 26 | dst_name = optic_inst.dst.name 27 | graph.add_edge(src_name, dst_name, name=optic_name, length=optic_inst.length) 28 | 29 | return graph 30 | -------------------------------------------------------------------------------- /source/topology/optical/optic_node.py: 
-------------------------------------------------------------------------------- 1 | from topology.utils.node import Node 2 | 3 | class OpticNode(Node): 4 | def __init__(self, name): 5 | super().__init__(name) 6 | -------------------------------------------------------------------------------- /source/topology/optical/optic_path.py: -------------------------------------------------------------------------------- 1 | from topology.optical.optic_node import OpticNode 2 | from topology.optical.optic_fiber import OpticFiber 3 | 4 | class OpticPath: 5 | def __init__(self, node_name_set: set, fiber_name_list: list, length): 6 | self.node_name_set = node_name_set 7 | self.fiber_name_list = fiber_name_list 8 | self.length = length -------------------------------------------------------------------------------- /source/topology/utils/node.py: -------------------------------------------------------------------------------- 1 | class Node: 2 | def __init__(self, name): 3 | self.name = name 4 | 5 | -------------------------------------------------------------------------------- /spinningup/.gitignore: -------------------------------------------------------------------------------- 1 | *.*~ 2 | __pycache__/ 3 | *.pkl 4 | data/ 5 | **/*.egg-info 6 | .python-version 7 | .idea/ 8 | .vscode/ 9 | .DS_Store 10 | _build/ 11 | -------------------------------------------------------------------------------- /spinningup/.travis.yml: -------------------------------------------------------------------------------- 1 | env: 2 | global: 3 | - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/travis/.mujoco/mujoco200/bin 4 | 5 | matrix: 6 | include: 7 | - os: linux 8 | language: python 9 | python: "3.6" 10 | 11 | before_install: 12 | - ./travis_setup.sh 13 | 14 | script: 15 | - pip3 install --upgrade -e .[mujoco] 16 | - python3 -c "import mujoco_py" 17 | - python3 -c "import spinup" 18 | - python3 -m pytest 19 | -------------------------------------------------------------------------------- /spinningup/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2018 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /spinningup/readme.md: -------------------------------------------------------------------------------- 1 | **Status:** Maintenance (expect bug fixes and minor updates) 2 | 3 | Welcome to Spinning Up in Deep RL! 4 | ================================== 5 | 6 | This is an educational resource produced by OpenAI that makes it easier to learn about deep reinforcement learning (deep RL). 7 | 8 | For the unfamiliar: [reinforcement learning](https://en.wikipedia.org/wiki/Reinforcement_learning) (RL) is a machine learning approach for teaching agents how to solve tasks by trial and error. Deep RL refers to the combination of RL with [deep learning](http://ufldl.stanford.edu/tutorial/). 9 | 10 | This module contains a variety of helpful resources, including: 11 | 12 | - a short [introduction](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html) to RL terminology, kinds of algorithms, and basic theory, 13 | - an [essay](https://spinningup.openai.com/en/latest/spinningup/spinningup.html) about how to grow into an RL research role, 14 | - a [curated list](https://spinningup.openai.com/en/latest/spinningup/keypapers.html) of important papers organized by topic, 15 | - a well-documented [code repo](https://github.com/openai/spinningup) of short, standalone implementations of key algorithms, 16 | - and a few [exercises](https://spinningup.openai.com/en/latest/spinningup/exercises.html) to serve as warm-ups. 17 | 18 | Get started at [spinningup.openai.com](https://spinningup.openai.com)! 19 | 20 | 21 | Citing Spinning Up 22 | ------------------ 23 | 24 | If you reference or use Spinning Up in your research, please cite: 25 | 26 | ``` 27 | @article{SpinningUp2018, 28 | author = {Achiam, Joshua}, 29 | title = {{Spinning Up in Deep Reinforcement Learning}}, 30 | year = {2018} 31 | } 32 | ``` -------------------------------------------------------------------------------- /spinningup/readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | 4 | python: 5 | version: 3.6 -------------------------------------------------------------------------------- /spinningup/setup.py: -------------------------------------------------------------------------------- 1 | from os.path import join, dirname, realpath 2 | from setuptools import setup 3 | import sys 4 | 5 | assert sys.version_info.major == 3 and sys.version_info.minor >= 6, \ 6 | "The Spinning Up repo is designed to work with Python 3.6 and greater." \ 7 | + "Please install it before proceeding." 8 | 9 | with open(join("spinup", "version.py")) as version_file: 10 | exec(version_file.read()) 11 | 12 | setup( 13 | name='spinup', 14 | py_modules=['spinup'], 15 | version=__version__,#'0.1', 16 | install_requires=[ 17 | 'cloudpickle==1.2.1', 18 | 'gym[atari,box2d,classic_control]~=0.15.3', 19 | 'ipython', 20 | 'joblib', 21 | 'matplotlib==3.1.1', 22 | 'mpi4py', 23 | 'numpy', 24 | 'pandas', 25 | 'pytest', 26 | 'psutil', 27 | 'scipy', 28 | 'seaborn==0.8.1', 29 | 'tensorflow>=1.8.0,<2.0', 30 | 'torch==1.3.1', 31 | 'tqdm' 32 | ], 33 | description="Teaching tools for introducing people to deep RL.", 34 | author="Joshua Achiam", 35 | ) 36 | -------------------------------------------------------------------------------- /spinningup/spinup/__init__.py: -------------------------------------------------------------------------------- 1 | # Disable TF deprecation warnings. 
2 | # Syntax from tf1 is not expected to be compatible with tf2. 3 | import tensorflow as tf 4 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 5 | 6 | # Algorithms 7 | from spinup.algos.tf1.ddpg.ddpg import ddpg as ddpg_tf1 8 | from spinup.algos.tf1.ppo.ppo import ppo as ppo_tf1 9 | from spinup.algos.tf1.sac.sac import sac as sac_tf1 10 | from spinup.algos.tf1.td3.td3 import td3 as td3_tf1 11 | from spinup.algos.tf1.trpo.trpo import trpo as trpo_tf1 12 | from spinup.algos.tf1.vpg.vpg import vpg as vpg_tf1 13 | 14 | from spinup.algos.pytorch.ddpg.ddpg import ddpg as ddpg_pytorch 15 | from spinup.algos.pytorch.ppo.ppo import ppo as ppo_pytorch 16 | from spinup.algos.pytorch.sac.sac import sac as sac_pytorch 17 | from spinup.algos.pytorch.td3.td3 import td3 as td3_pytorch 18 | from spinup.algos.pytorch.trpo.trpo import trpo as trpo_pytorch 19 | from spinup.algos.pytorch.vpg.vpg import vpg as vpg_pytorch 20 | 21 | # Loggers 22 | from spinup.utils.logx import Logger, EpochLogger 23 | 24 | # Version 25 | from spinup.version import __version__ 26 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/ddpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 
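        # Why the squeeze matters: self.q ends in nn.Linear(..., 1), so q has shape (batch, 1),
        # while the Bellman backup used in the MSE loss has shape (batch,). Without the squeeze,
        # (q - backup) would broadcast to a (batch, batch) matrix and silently corrupt the loss,
        # e.g. ((torch.zeros(32, 1) - torch.zeros(32)) ** 2).shape == torch.Size([32, 32]).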
44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/ppo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 
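        # (pi is a torch.distributions object; logp_a stays None unless act is given,
        # which the PPO update does when re-evaluating stored actions.)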
59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation) 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation) 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.numpy(), v.numpy(), logp_a.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/sac/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.distributions.normal import Normal 8 | 9 | 10 | def combined_shape(length, shape=None): 11 | if shape is None: 12 | return (length,) 13 | return (length, shape) if np.isscalar(shape) else (length, *shape) 14 | 15 | def mlp(sizes, activation, output_activation=nn.Identity): 16 | layers = [] 17 | for j in range(len(sizes)-1): 18 | act = activation if j < len(sizes)-2 else output_activation 19 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 20 | return nn.Sequential(*layers) 21 | 22 | def count_vars(module): 23 | return sum([np.prod(p.shape) for p in module.parameters()]) 24 | 25 | 26 | LOG_STD_MAX = 2 27 | LOG_STD_MIN = -20 28 | 29 | class 
SquashedGaussianMLPActor(nn.Module): 30 | 31 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 32 | super().__init__() 33 | self.net = mlp([obs_dim] + list(hidden_sizes), activation, activation) 34 | self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) 35 | self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim) 36 | self.act_limit = act_limit 37 | 38 | def forward(self, obs, deterministic=False, with_logprob=True): 39 | net_out = self.net(obs) 40 | mu = self.mu_layer(net_out) 41 | log_std = self.log_std_layer(net_out) 42 | log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) 43 | std = torch.exp(log_std) 44 | 45 | # Pre-squash distribution and sample 46 | pi_distribution = Normal(mu, std) 47 | if deterministic: 48 | # Only used for evaluating policy at test time. 49 | pi_action = mu 50 | else: 51 | pi_action = pi_distribution.rsample() 52 | 53 | if with_logprob: 54 | # Compute logprob from Gaussian, and then apply correction for Tanh squashing. 55 | # NOTE: The correction formula is a little bit magic. To get an understanding 56 | # of where it comes from, check out the original SAC paper (arXiv 1801.01290) 57 | # and look in appendix C. This is a more numerically-stable equivalent to Eq 21. 58 | # Try deriving it yourself as a (very difficult) exercise. :) 59 | logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1) 60 | logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=1) 61 | else: 62 | logp_pi = None 63 | 64 | pi_action = torch.tanh(pi_action) 65 | pi_action = self.act_limit * pi_action 66 | 67 | return pi_action, logp_pi 68 | 69 | 70 | class MLPQFunction(nn.Module): 71 | 72 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 73 | super().__init__() 74 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 75 | 76 | def forward(self, obs, act): 77 | q = self.q(torch.cat([obs, act], dim=-1)) 78 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 
79 | 80 | class MLPActorCritic(nn.Module): 81 | 82 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 83 | activation=nn.ReLU): 84 | super().__init__() 85 | 86 | obs_dim = observation_space.shape[0] 87 | act_dim = action_space.shape[0] 88 | act_limit = action_space.high[0] 89 | 90 | # build policy and value functions 91 | self.pi = SquashedGaussianMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 92 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 93 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 94 | 95 | def act(self, obs, deterministic=False): 96 | with torch.no_grad(): 97 | a, _ = self.pi(obs, deterministic, False) 98 | return a.numpy() 99 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/td3/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 59 | 60 | def act(self, obs): 61 | with torch.no_grad(): 62 | return self.pi(obs).numpy() 63 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/trpo/trpo.py: -------------------------------------------------------------------------------- 1 | def trpo(*args, **kwargs): 2 | print('\n\nUnfortunately, TRPO has not yet been implemented in PyTorch '\ 3 | + 'for Spinning Up. 
TRPO will migrate some time in the future.\n\n') 4 | raise NotImplementedError -------------------------------------------------------------------------------- /spinningup/spinup/algos/pytorch/vpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 
105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation) 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation) 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.numpy(), v.numpy(), logp_a.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ddpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/ddpg/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ddpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | def placeholders(*args): 9 | return [placeholder(dim) for dim in args] 10 | 11 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 12 | for h in hidden_sizes[:-1]: 13 | x = tf.layers.dense(x, units=h, activation=activation) 14 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 15 | 16 | def get_vars(scope): 17 | return [x for x in tf.global_variables() if scope in x.name] 18 | 19 | def count_vars(scope): 20 | v = get_vars(scope) 21 | return sum([np.prod(var.shape.as_list()) for var in v]) 22 | 23 | """ 24 | Actor-Critics 25 | """ 26 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 27 | output_activation=tf.tanh, action_space=None): 28 | act_dim = a.shape.as_list()[-1] 29 | act_limit = action_space.high[0] 30 | with tf.variable_scope('pi'): 31 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 32 | with tf.variable_scope('q'): 33 | q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 34 | with tf.variable_scope('q', reuse=True): 35 | q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 36 | return pi, q, q_pi 37 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | from spinup.algos.tf1.ddpg import core 6 | from spinup.algos.tf1.ddpg.core import get_vars 7 | from spinup.utils.logx import EpochLogger 8 | 9 | 10 | class ReplayBuffer: 11 | """ 12 | A simple FIFO experience replay buffer for DDPG agents. 
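    Transitions are stored as (obs, act, rew, next_obs, done) arrays; once the buffer
    is full, the oldest entries are overwritten (ptr wraps around modulo max_size).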
13 | """ 14 | 15 | def __init__(self, obs_dim, act_dim, size): 16 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 17 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 19 | self.rews_buf = np.zeros(size, dtype=np.float32) 20 | self.done_buf = np.zeros(size, dtype=np.float32) 21 | self.ptr, self.size, self.max_size = 0, 0, size 22 | 23 | def store(self, obs, act, rew, next_obs, done): 24 | self.obs1_buf[self.ptr] = obs 25 | self.obs2_buf[self.ptr] = next_obs 26 | self.acts_buf[self.ptr] = act 27 | self.rews_buf[self.ptr] = rew 28 | self.done_buf[self.ptr] = done 29 | self.ptr = (self.ptr+1) % self.max_size 30 | self.size = min(self.size+1, self.max_size) 31 | 32 | def sample_batch(self, batch_size=32): 33 | idxs = np.random.randint(0, self.size, size=batch_size) 34 | return dict(obs1=self.obs1_buf[idxs], 35 | obs2=self.obs2_buf[idxs], 36 | acts=self.acts_buf[idxs], 37 | rews=self.rews_buf[idxs], 38 | done=self.done_buf[idxs]) 39 | 40 | 41 | 42 | def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 43 | steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 44 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 45 | update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 46 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 47 | """ 48 | Deep Deterministic Policy Gradient (DDPG) 49 | 50 | 51 | Args: 52 | env_fn : A function which creates a copy of the environment. 53 | The environment must satisfy the OpenAI Gym API. 54 | 55 | actor_critic: A function which takes in placeholder symbols 56 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 57 | outputs from the agent's Tensorflow computation graph: 58 | 59 | =========== ================ ====================================== 60 | Symbol Shape Description 61 | =========== ================ ====================================== 62 | ``pi`` (batch, act_dim) | Deterministically computes actions 63 | | from policy given states. 64 | ``q`` (batch,) | Gives the current estimate of Q* for 65 | | states in ``x_ph`` and actions in 66 | | ``a_ph``. 67 | ``q_pi`` (batch,) | Gives the composition of ``q`` and 68 | | ``pi`` for states in ``x_ph``: 69 | | q(x, pi(x)). 70 | =========== ================ ====================================== 71 | 72 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 73 | function you provided to DDPG. 74 | 75 | seed (int): Seed for random number generators. 76 | 77 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 78 | for the agent and the environment in each epoch. 79 | 80 | epochs (int): Number of epochs to run and train agent. 81 | 82 | replay_size (int): Maximum length of replay buffer. 83 | 84 | gamma (float): Discount factor. (Always between 0 and 1.) 85 | 86 | polyak (float): Interpolation factor in polyak averaging for target 87 | networks. Target networks are updated towards main networks 88 | according to: 89 | 90 | .. math:: \\theta_{\\text{targ}} \\leftarrow 91 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 92 | 93 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 94 | close to 1.) 95 | 96 | pi_lr (float): Learning rate for policy. 97 | 98 | q_lr (float): Learning rate for Q-networks. 99 | 100 | batch_size (int): Minibatch size for SGD. 101 | 102 | start_steps (int): Number of steps for uniform-random action selection, 103 | before running real policy. Helps exploration. 
104 | 105 | update_after (int): Number of env interactions to collect before 106 | starting to do gradient descent updates. Ensures replay buffer 107 | is full enough for useful updates. 108 | 109 | update_every (int): Number of env interactions that should elapse 110 | between gradient descent updates. Note: Regardless of how long 111 | you wait between updates, the ratio of env steps to gradient steps 112 | is locked to 1. 113 | 114 | act_noise (float): Stddev for Gaussian exploration noise added to 115 | policy at training time. (At test time, no noise is added.) 116 | 117 | num_test_episodes (int): Number of episodes to test the deterministic 118 | policy at the end of each epoch. 119 | 120 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 121 | 122 | logger_kwargs (dict): Keyword args for EpochLogger. 123 | 124 | save_freq (int): How often (in terms of gap between epochs) to save 125 | the current policy and value function. 126 | 127 | """ 128 | 129 | logger = EpochLogger(**logger_kwargs) 130 | logger.save_config(locals()) 131 | 132 | tf.set_random_seed(seed) 133 | np.random.seed(seed) 134 | 135 | env, test_env = env_fn(), env_fn() 136 | obs_dim = env.observation_space.shape[0] 137 | act_dim = env.action_space.shape[0] 138 | 139 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 140 | act_limit = env.action_space.high[0] 141 | 142 | # Share information about action space with policy architecture 143 | ac_kwargs['action_space'] = env.action_space 144 | 145 | # Inputs to computation graph 146 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 147 | 148 | # Main outputs from computation graph 149 | with tf.variable_scope('main'): 150 | pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) 151 | 152 | # Target networks 153 | with tf.variable_scope('target'): 154 | # Note that the action placeholder going to actor_critic here is 155 | # irrelevant, because we only need q_targ(s, pi_targ(s)). 
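        # (a_ph is passed only to satisfy the actor_critic signature; the q output
        # evaluated at a_ph under the target scope is discarded via the underscore below.)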
156 | pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 157 | 158 | # Experience buffer 159 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 160 | 161 | # Count variables 162 | var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) 163 | print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts) 164 | 165 | # Bellman backup for Q function 166 | backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ) 167 | 168 | # DDPG losses 169 | pi_loss = -tf.reduce_mean(q_pi) 170 | q_loss = tf.reduce_mean((q-backup)**2) 171 | 172 | # Separate train ops for pi, q 173 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 174 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 175 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 176 | train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) 177 | 178 | # Polyak averaging for target variables 179 | target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) 180 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 181 | 182 | # Initializing targets to match main variables 183 | target_init = tf.group([tf.assign(v_targ, v_main) 184 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 185 | 186 | sess = tf.Session() 187 | sess.run(tf.global_variables_initializer()) 188 | sess.run(target_init) 189 | 190 | # Setup model saving 191 | logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q}) 192 | 193 | def get_action(o, noise_scale): 194 | a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0] 195 | a += noise_scale * np.random.randn(act_dim) 196 | return np.clip(a, -act_limit, act_limit) 197 | 198 | def test_agent(): 199 | for j in range(num_test_episodes): 200 | o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 201 | while not(d or (ep_len == max_ep_len)): 202 | # Take deterministic actions at test time (noise_scale=0) 203 | o, r, d, _ = test_env.step(get_action(o, 0)) 204 | ep_ret += r 205 | ep_len += 1 206 | logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 207 | 208 | # Prepare for interaction with environment 209 | total_steps = steps_per_epoch * epochs 210 | start_time = time.time() 211 | o, ep_ret, ep_len = env.reset(), 0, 0 212 | 213 | # Main loop: collect experience in env and update/log each epoch 214 | for t in range(total_steps): 215 | 216 | # Until start_steps have elapsed, randomly sample actions 217 | # from a uniform distribution for better exploration. Afterwards, 218 | # use the learned policy (with some noise, via act_noise). 219 | if t > start_steps: 220 | a = get_action(o, act_noise) 221 | else: 222 | a = env.action_space.sample() 223 | 224 | # Step the env 225 | o2, r, d, _ = env.step(a) 226 | ep_ret += r 227 | ep_len += 1 228 | 229 | # Ignore the "done" signal if it comes from hitting the time 230 | # horizon (that is, when it's an artificial terminal signal 231 | # that isn't based on the agent's state) 232 | d = False if ep_len==max_ep_len else d 233 | 234 | # Store experience to replay buffer 235 | replay_buffer.store(o, a, r, o2, d) 236 | 237 | # Super critical, easy to overlook step: make sure to update 238 | # most recent observation! 
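        # (otherwise the next stored transition would pair the stale observation with
        # the new action, breaking the (o, a, r, o2, d) alignment in the replay buffer.)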
239 | o = o2 240 | 241 | # End of trajectory handling 242 | if d or (ep_len == max_ep_len): 243 | logger.store(EpRet=ep_ret, EpLen=ep_len) 244 | o, ep_ret, ep_len = env.reset(), 0, 0 245 | 246 | # Update handling 247 | if t >= update_after and t % update_every == 0: 248 | for _ in range(update_every): 249 | batch = replay_buffer.sample_batch(batch_size) 250 | feed_dict = {x_ph: batch['obs1'], 251 | x2_ph: batch['obs2'], 252 | a_ph: batch['acts'], 253 | r_ph: batch['rews'], 254 | d_ph: batch['done'] 255 | } 256 | 257 | # Q-learning update 258 | outs = sess.run([q_loss, q, train_q_op], feed_dict) 259 | logger.store(LossQ=outs[0], QVals=outs[1]) 260 | 261 | # Policy update 262 | outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) 263 | logger.store(LossPi=outs[0]) 264 | 265 | # End of epoch wrap-up 266 | if (t+1) % steps_per_epoch == 0: 267 | epoch = (t+1) // steps_per_epoch 268 | 269 | # Save model 270 | if (epoch % save_freq == 0) or (epoch == epochs): 271 | logger.save_state({'env': env}, None) 272 | 273 | # Test the performance of the deterministic version of the agent. 274 | test_agent() 275 | 276 | # Log info about epoch 277 | logger.log_tabular('Epoch', epoch) 278 | logger.log_tabular('EpRet', with_min_and_max=True) 279 | logger.log_tabular('TestEpRet', with_min_and_max=True) 280 | logger.log_tabular('EpLen', average_only=True) 281 | logger.log_tabular('TestEpLen', average_only=True) 282 | logger.log_tabular('TotalEnvInteracts', t) 283 | logger.log_tabular('QVals', with_min_and_max=True) 284 | logger.log_tabular('LossPi', average_only=True) 285 | logger.log_tabular('LossQ', average_only=True) 286 | logger.log_tabular('Time', time.time()-start_time) 287 | logger.dump_tabular() 288 | 289 | if __name__ == '__main__': 290 | import argparse 291 | parser = argparse.ArgumentParser() 292 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 293 | parser.add_argument('--hid', type=int, default=256) 294 | parser.add_argument('--l', type=int, default=2) 295 | parser.add_argument('--gamma', type=float, default=0.99) 296 | parser.add_argument('--seed', '-s', type=int, default=0) 297 | parser.add_argument('--epochs', type=int, default=50) 298 | parser.add_argument('--exp_name', type=str, default='ddpg') 299 | args = parser.parse_args() 300 | 301 | from spinup.utils.run_utils import setup_logger_kwargs 302 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 303 | 304 | ddpg(lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic, 305 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 306 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 307 | logger_kwargs=logger_kwargs) 308 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ppo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/ppo/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/ppo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def placeholder(dim=None): 14 
| return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 15 | 16 | def placeholders(*args): 17 | return [placeholder(dim) for dim in args] 18 | 19 | def placeholder_from_space(space): 20 | if isinstance(space, Box): 21 | return placeholder(space.shape) 22 | elif isinstance(space, Discrete): 23 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 24 | raise NotImplementedError 25 | 26 | def placeholders_from_spaces(*args): 27 | return [placeholder_from_space(space) for space in args] 28 | 29 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 30 | for h in hidden_sizes[:-1]: 31 | x = tf.layers.dense(x, units=h, activation=activation) 32 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 33 | 34 | def get_vars(scope=''): 35 | return [x for x in tf.trainable_variables() if scope in x.name] 36 | 37 | def count_vars(scope=''): 38 | v = get_vars(scope) 39 | return sum([np.prod(var.shape.as_list()) for var in v]) 40 | 41 | def gaussian_likelihood(x, mu, log_std): 42 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 43 | return tf.reduce_sum(pre_sum, axis=1) 44 | 45 | def discount_cumsum(x, discount): 46 | """ 47 | magic from rllab for computing discounted cumulative sums of vectors. 48 | 49 | input: 50 | vector x, 51 | [x0, 52 | x1, 53 | x2] 54 | 55 | output: 56 | [x0 + discount * x1 + discount^2 * x2, 57 | x1 + discount * x2, 58 | x2] 59 | """ 60 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 61 | 62 | 63 | """ 64 | Policies 65 | """ 66 | 67 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 68 | act_dim = action_space.n 69 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 70 | logp_all = tf.nn.log_softmax(logits) 71 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 72 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 73 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 74 | return pi, logp, logp_pi 75 | 76 | 77 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 78 | act_dim = a.shape.as_list()[-1] 79 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 80 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 81 | std = tf.exp(log_std) 82 | pi = mu + tf.random_normal(tf.shape(mu)) * std 83 | logp = gaussian_likelihood(a, mu, log_std) 84 | logp_pi = gaussian_likelihood(pi, mu, log_std) 85 | return pi, logp, logp_pi 86 | 87 | 88 | """ 89 | Actor-Critics 90 | """ 91 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 92 | output_activation=None, policy=None, action_space=None): 93 | 94 | # default policy builder depends on action space 95 | if policy is None and isinstance(action_space, Box): 96 | policy = mlp_gaussian_policy 97 | elif policy is None and isinstance(action_space, Discrete): 98 | policy = mlp_categorical_policy 99 | 100 | with tf.variable_scope('pi'): 101 | pi, logp, logp_pi = policy(x, a, hidden_sizes, activation, output_activation, action_space) 102 | with tf.variable_scope('v'): 103 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 104 | return pi, logp, logp_pi, v 105 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/sac/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/sac/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/sac/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | def placeholder(dim=None): 7 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 13 | for h in hidden_sizes[:-1]: 14 | x = tf.layers.dense(x, units=h, activation=activation) 15 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 16 | 17 | def get_vars(scope): 18 | return [x for x in tf.global_variables() if scope in x.name] 19 | 20 | def count_vars(scope): 21 | v = get_vars(scope) 22 | return sum([np.prod(var.shape.as_list()) for var in v]) 23 | 24 | def gaussian_likelihood(x, mu, log_std): 25 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 26 | return tf.reduce_sum(pre_sum, axis=1) 27 | 28 | 29 | """ 30 | Policies 31 | """ 32 | 33 | LOG_STD_MAX = 2 34 | LOG_STD_MIN = -20 35 | 36 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 37 | act_dim = a.shape.as_list()[-1] 38 | net = mlp(x, list(hidden_sizes), activation, activation) 39 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 40 | log_std = tf.layers.dense(net, act_dim, activation=None) 41 | log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) 42 | 43 | std = tf.exp(log_std) 44 | pi = mu + tf.random_normal(tf.shape(mu)) * std 45 | logp_pi = gaussian_likelihood(pi, mu, log_std) 46 | return mu, pi, logp_pi 47 | 48 | def apply_squashing_func(mu, pi, logp_pi): 49 | # Adjustment to log prob 50 | # NOTE: This formula is a little bit magic. To get an understanding of where it 51 | # comes from, check out the original SAC paper (arXiv 1801.01290) and look in 52 | # appendix C. This is a more numerically-stable equivalent to Eq 21. 53 | # Try deriving it yourself as a (very difficult) exercise. :) 54 | logp_pi -= tf.reduce_sum(2*(np.log(2) - pi - tf.nn.softplus(-2*pi)), axis=1) 55 | 56 | # Squash those unbounded actions! 
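    # (tanh maps the unbounded Gaussian sample into (-1, 1); the caller in
    # mlp_actor_critic then rescales by action_space.high[0].)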
57 | mu = tf.tanh(mu) 58 | pi = tf.tanh(pi) 59 | return mu, pi, logp_pi 60 | 61 | """ 62 | Actor-Critics 63 | """ 64 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 65 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 66 | # policy 67 | with tf.variable_scope('pi'): 68 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 69 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 70 | 71 | # make sure actions are in correct range 72 | action_scale = action_space.high[0] 73 | mu *= action_scale 74 | pi *= action_scale 75 | 76 | # vfs 77 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 78 | with tf.variable_scope('q1'): 79 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 80 | with tf.variable_scope('q2'): 81 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 82 | return mu, pi, logp_pi, q1, q2 83 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/td3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/td3/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/td3/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | def placeholders(*args): 9 | return [placeholder(dim) for dim in args] 10 | 11 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 12 | for h in hidden_sizes[:-1]: 13 | x = tf.layers.dense(x, units=h, activation=activation) 14 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 15 | 16 | def get_vars(scope): 17 | return [x for x in tf.global_variables() if scope in x.name] 18 | 19 | def count_vars(scope): 20 | v = get_vars(scope) 21 | return sum([np.prod(var.shape.as_list()) for var in v]) 22 | 23 | """ 24 | Actor-Critics 25 | """ 26 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 27 | output_activation=tf.tanh, action_space=None): 28 | act_dim = a.shape.as_list()[-1] 29 | act_limit = action_space.high[0] 30 | with tf.variable_scope('pi'): 31 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 32 | with tf.variable_scope('q1'): 33 | q1 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 34 | with tf.variable_scope('q2'): 35 | q2 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 36 | with tf.variable_scope('q1', reuse=True): 37 | q1_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 38 | return pi, q1, q2, q1_pi 39 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/trpo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/trpo/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/trpo/core.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def keys_as_sorted_list(dict): 14 | return sorted(list(dict.keys())) 15 | 16 | def values_as_sorted_list(dict): 17 | return [dict[k] for k in keys_as_sorted_list(dict)] 18 | 19 | def placeholder(dim=None): 20 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 21 | 22 | def placeholders(*args): 23 | return [placeholder(dim) for dim in args] 24 | 25 | def placeholder_from_space(space): 26 | if isinstance(space, Box): 27 | return placeholder(space.shape) 28 | elif isinstance(space, Discrete): 29 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 30 | raise NotImplementedError 31 | 32 | def placeholders_from_spaces(*args): 33 | return [placeholder_from_space(space) for space in args] 34 | 35 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 36 | for h in hidden_sizes[:-1]: 37 | x = tf.layers.dense(x, units=h, activation=activation) 38 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 39 | 40 | def get_vars(scope=''): 41 | return [x for x in tf.trainable_variables() if scope in x.name] 42 | 43 | def count_vars(scope=''): 44 | v = get_vars(scope) 45 | return sum([np.prod(var.shape.as_list()) for var in v]) 46 | 47 | def gaussian_likelihood(x, mu, log_std): 48 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 49 | return tf.reduce_sum(pre_sum, axis=1) 50 | 51 | def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): 52 | """ 53 | tf symbol for mean KL divergence between two batches of diagonal gaussian distributions, 54 | where distributions are specified by means and log stds. 55 | (https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions) 56 | """ 57 | var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) 58 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1 + EPS) - 1) + log_std1 - log_std0 59 | all_kls = tf.reduce_sum(pre_sum, axis=1) 60 | return tf.reduce_mean(all_kls) 61 | 62 | def categorical_kl(logp0, logp1): 63 | """ 64 | tf symbol for mean KL divergence between two batches of categorical probability distributions, 65 | where the distributions are input as log probs. 
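    Concretely, this is the batch mean of sum_a exp(logp1) * (logp1 - logp0), i.e. KL(P1 || P0).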
66 | """ 67 | all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1) 68 | return tf.reduce_mean(all_kls) 69 | 70 | def flat_concat(xs): 71 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 72 | 73 | def flat_grad(f, params): 74 | return flat_concat(tf.gradients(xs=params, ys=f)) 75 | 76 | def hessian_vector_product(f, params): 77 | # for H = grad**2 f, compute Hx 78 | g = flat_grad(f, params) 79 | x = tf.placeholder(tf.float32, shape=g.shape) 80 | return x, flat_grad(tf.reduce_sum(g*x), params) 81 | 82 | def assign_params_from_flat(x, params): 83 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 84 | splits = tf.split(x, [flat_size(p) for p in params]) 85 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 86 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 87 | 88 | def discount_cumsum(x, discount): 89 | """ 90 | magic from rllab for computing discounted cumulative sums of vectors. 91 | 92 | input: 93 | vector x, 94 | [x0, 95 | x1, 96 | x2] 97 | 98 | output: 99 | [x0 + discount * x1 + discount^2 * x2, 100 | x1 + discount * x2, 101 | x2] 102 | """ 103 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 104 | 105 | """ 106 | Policies 107 | """ 108 | 109 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 110 | act_dim = action_space.n 111 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 112 | logp_all = tf.nn.log_softmax(logits) 113 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 114 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 115 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 116 | 117 | old_logp_all = placeholder(act_dim) 118 | d_kl = categorical_kl(logp_all, old_logp_all) 119 | 120 | info = {'logp_all': logp_all} 121 | info_phs = {'logp_all': old_logp_all} 122 | 123 | return pi, logp, logp_pi, info, info_phs, d_kl 124 | 125 | 126 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 127 | act_dim = a.shape.as_list()[-1] 128 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 129 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 130 | std = tf.exp(log_std) 131 | pi = mu + tf.random_normal(tf.shape(mu)) * std 132 | logp = gaussian_likelihood(a, mu, log_std) 133 | logp_pi = gaussian_likelihood(pi, mu, log_std) 134 | 135 | old_mu_ph, old_log_std_ph = placeholders(act_dim, act_dim) 136 | d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) 137 | 138 | info = {'mu': mu, 'log_std': log_std} 139 | info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph} 140 | 141 | return pi, logp, logp_pi, info, info_phs, d_kl 142 | 143 | 144 | """ 145 | Actor-Critics 146 | """ 147 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 148 | output_activation=None, policy=None, action_space=None): 149 | 150 | # default policy builder depends on action space 151 | if policy is None and isinstance(action_space, Box): 152 | policy = mlp_gaussian_policy 153 | elif policy is None and isinstance(action_space, Discrete): 154 | policy = mlp_categorical_policy 155 | 156 | with tf.variable_scope('pi'): 157 | policy_outs = policy(x, a, hidden_sizes, activation, output_activation, action_space) 158 | pi, logp, logp_pi, info, info_phs, d_kl = policy_outs 159 | with tf.variable_scope('v'): 160 | v = tf.squeeze(mlp(x, 
list(hidden_sizes)+[1], activation, None), axis=1) 161 | return pi, logp, logp_pi, info, info_phs, d_kl, v 162 | -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/vpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/algos/tf1/vpg/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/algos/tf1/vpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def placeholder(dim=None): 14 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 15 | 16 | def placeholders(*args): 17 | return [placeholder(dim) for dim in args] 18 | 19 | def placeholder_from_space(space): 20 | if isinstance(space, Box): 21 | return placeholder(space.shape) 22 | elif isinstance(space, Discrete): 23 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 24 | raise NotImplementedError 25 | 26 | def placeholders_from_spaces(*args): 27 | return [placeholder_from_space(space) for space in args] 28 | 29 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 30 | for h in hidden_sizes[:-1]: 31 | x = tf.layers.dense(x, units=h, activation=activation) 32 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 33 | 34 | def get_vars(scope=''): 35 | return [x for x in tf.trainable_variables() if scope in x.name] 36 | 37 | def count_vars(scope=''): 38 | v = get_vars(scope) 39 | return sum([np.prod(var.shape.as_list()) for var in v]) 40 | 41 | def gaussian_likelihood(x, mu, log_std): 42 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 43 | return tf.reduce_sum(pre_sum, axis=1) 44 | 45 | def discount_cumsum(x, discount): 46 | """ 47 | magic from rllab for computing discounted cumulative sums of vectors. 
48 | 49 | input: 50 | vector x, 51 | [x0, 52 | x1, 53 | x2] 54 | 55 | output: 56 | [x0 + discount * x1 + discount^2 * x2, 57 | x1 + discount * x2, 58 | x2] 59 | """ 60 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 61 | 62 | 63 | """ 64 | Policies 65 | """ 66 | 67 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 68 | act_dim = action_space.n 69 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 70 | logp_all = tf.nn.log_softmax(logits) 71 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 72 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 73 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 74 | return pi, logp, logp_pi 75 | 76 | 77 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 78 | act_dim = a.shape.as_list()[-1] 79 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 80 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 81 | std = tf.exp(log_std) 82 | pi = mu + tf.random_normal(tf.shape(mu)) * std 83 | logp = gaussian_likelihood(a, mu, log_std) 84 | logp_pi = gaussian_likelihood(pi, mu, log_std) 85 | return pi, logp, logp_pi 86 | 87 | 88 | """ 89 | Actor-Critics 90 | """ 91 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 92 | output_activation=None, policy=None, action_space=None): 93 | 94 | # default policy builder depends on action space 95 | if policy is None and isinstance(action_space, Box): 96 | policy = mlp_gaussian_policy 97 | elif policy is None and isinstance(action_space, Discrete): 98 | policy = mlp_categorical_policy 99 | 100 | with tf.variable_scope('pi'): 101 | pi, logp, logp_pi = policy(x, a, hidden_sizes, activation, output_activation, action_space) 102 | with tf.variable_scope('v'): 103 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 104 | return pi, logp, logp_pi, v 105 | -------------------------------------------------------------------------------- /spinningup/spinup/examples/pytorch/bench_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | from spinup.utils.run_utils import ExperimentGrid 2 | from spinup import ppo_pytorch 3 | import torch 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--cpu', type=int, default=4) 9 | parser.add_argument('--num_runs', type=int, default=3) 10 | args = parser.parse_args() 11 | 12 | eg = ExperimentGrid(name='ppo-pyt-bench') 13 | eg.add('env_name', 'CartPole-v0', '', True) 14 | eg.add('seed', [10*i for i in range(args.num_runs)]) 15 | eg.add('epochs', 10) 16 | eg.add('steps_per_epoch', 4000) 17 | eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid') 18 | eg.add('ac_kwargs:activation', [torch.nn.Tanh, torch.nn.ReLU], '') 19 | eg.run(ppo_pytorch, num_cpu=args.cpu) -------------------------------------------------------------------------------- /spinningup/spinup/examples/pytorch/pg_math/1_simple_pg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions.categorical import Categorical 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | from gym.spaces import Discrete, Box 8 | 9 | def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity): 10 | # Build a feedforward neural network. 
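    # Added note (not part of the original file): for sizes=[4, 32, 2] with the default
    # activations, the loop below assembles
    #     nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2), nn.Identity())
    # i.e. Tanh after the hidden layer and Identity (raw logits) after the output layer.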
11 | layers = [] 12 | for j in range(len(sizes)-1): 13 | act = activation if j < len(sizes)-2 else output_activation 14 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 15 | return nn.Sequential(*layers) 16 | 17 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 18 | epochs=50, batch_size=5000, render=False): 19 | 20 | # make environment, check spaces, get obs / act dims 21 | env = gym.make(env_name) 22 | assert isinstance(env.observation_space, Box), \ 23 | "This example only works for envs with continuous state spaces." 24 | assert isinstance(env.action_space, Discrete), \ 25 | "This example only works for envs with discrete action spaces." 26 | 27 | obs_dim = env.observation_space.shape[0] 28 | n_acts = env.action_space.n 29 | 30 | # make core of policy network 31 | logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]) 32 | 33 | # make function to compute action distribution 34 | def get_policy(obs): 35 | logits = logits_net(obs) 36 | return Categorical(logits=logits) 37 | 38 | # make action selection function (outputs int actions, sampled from policy) 39 | def get_action(obs): 40 | return get_policy(obs).sample().item() 41 | 42 | # make loss function whose gradient, for the right data, is policy gradient 43 | def compute_loss(obs, act, weights): 44 | logp = get_policy(obs).log_prob(act) 45 | return -(logp * weights).mean() 46 | 47 | # make optimizer 48 | optimizer = Adam(logits_net.parameters(), lr=lr) 49 | 50 | # for training policy 51 | def train_one_epoch(): 52 | # make some empty lists for logging. 53 | batch_obs = [] # for observations 54 | batch_acts = [] # for actions 55 | batch_weights = [] # for R(tau) weighting in policy gradient 56 | batch_rets = [] # for measuring episode returns 57 | batch_lens = [] # for measuring episode lengths 58 | 59 | # reset episode-specific variables 60 | obs = env.reset() # first obs comes from starting distribution 61 | done = False # signal from environment that episode is over 62 | ep_rews = [] # list for rewards accrued throughout ep 63 | 64 | # render first episode of each epoch 65 | finished_rendering_this_epoch = False 66 | 67 | # collect experience by acting in the environment with current policy 68 | while True: 69 | 70 | # rendering 71 | if (not finished_rendering_this_epoch) and render: 72 | env.render() 73 | 74 | # save obs 75 | batch_obs.append(obs.copy()) 76 | 77 | # act in the environment 78 | act = get_action(torch.as_tensor(obs, dtype=torch.float32)) 79 | obs, rew, done, _ = env.step(act) 80 | 81 | # save action, reward 82 | batch_acts.append(act) 83 | ep_rews.append(rew) 84 | 85 | if done: 86 | # if episode is over, record info about episode 87 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 88 | batch_rets.append(ep_ret) 89 | batch_lens.append(ep_len) 90 | 91 | # the weight for each logprob(a|s) is R(tau) 92 | batch_weights += [ep_ret] * ep_len 93 | 94 | # reset episode-specific variables 95 | obs, done, ep_rews = env.reset(), False, [] 96 | 97 | # won't render again this epoch 98 | finished_rendering_this_epoch = True 99 | 100 | # end experience loop if we have enough of it 101 | if len(batch_obs) > batch_size: 102 | break 103 | 104 | # take a single policy gradient update step 105 | optimizer.zero_grad() 106 | batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32), 107 | act=torch.as_tensor(batch_acts, dtype=torch.int32), 108 | weights=torch.as_tensor(batch_weights, dtype=torch.float32) 109 | ) 110 | batch_loss.backward() 111 | optimizer.step() 112 | return batch_loss, batch_rets, 
batch_lens 113 | 114 | # training loop 115 | for i in range(epochs): 116 | batch_loss, batch_rets, batch_lens = train_one_epoch() 117 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 118 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 119 | 120 | if __name__ == '__main__': 121 | import argparse 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 124 | parser.add_argument('--render', action='store_true') 125 | parser.add_argument('--lr', type=float, default=1e-2) 126 | args = parser.parse_args() 127 | print('\nUsing simplest formulation of policy gradient.\n') 128 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinningup/spinup/examples/pytorch/pg_math/2_rtg_pg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions.categorical import Categorical 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | from gym.spaces import Discrete, Box 8 | 9 | def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity): 10 | # Build a feedforward neural network. 11 | layers = [] 12 | for j in range(len(sizes)-1): 13 | act = activation if j < len(sizes)-2 else output_activation 14 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 15 | return nn.Sequential(*layers) 16 | 17 | def reward_to_go(rews): 18 | n = len(rews) 19 | rtgs = np.zeros_like(rews) 20 | for i in reversed(range(n)): 21 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 22 | return rtgs 23 | 24 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 25 | epochs=50, batch_size=5000, render=False): 26 | 27 | # make environment, check spaces, get obs / act dims 28 | env = gym.make(env_name) 29 | assert isinstance(env.observation_space, Box), \ 30 | "This example only works for envs with continuous state spaces." 31 | assert isinstance(env.action_space, Discrete), \ 32 | "This example only works for envs with discrete action spaces." 33 | 34 | obs_dim = env.observation_space.shape[0] 35 | n_acts = env.action_space.n 36 | 37 | # make core of policy network 38 | logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]) 39 | 40 | # make function to compute action distribution 41 | def get_policy(obs): 42 | logits = logits_net(obs) 43 | return Categorical(logits=logits) 44 | 45 | # make action selection function (outputs int actions, sampled from policy) 46 | def get_action(obs): 47 | return get_policy(obs).sample().item() 48 | 49 | # make loss function whose gradient, for the right data, is policy gradient 50 | def compute_loss(obs, act, weights): 51 | logp = get_policy(obs).log_prob(act) 52 | return -(logp * weights).mean() 53 | 54 | # make optimizer 55 | optimizer = Adam(logits_net.parameters(), lr=lr) 56 | 57 | # for training policy 58 | def train_one_epoch(): 59 | # make some empty lists for logging. 
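        # Added note (not part of the original file): unlike 1_simple_pg.py, batch_weights
        # below holds the reward-to-go from each timestep rather than the whole-episode
        # return. For example, an episode with rewards [1., 1., 1.] contributes weights
        # [3., 2., 1.] here, versus [3., 3., 3.] in the simpler formulation.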
60 | batch_obs = [] # for observations 61 | batch_acts = [] # for actions 62 | batch_weights = [] # for reward-to-go weighting in policy gradient 63 | batch_rets = [] # for measuring episode returns 64 | batch_lens = [] # for measuring episode lengths 65 | 66 | # reset episode-specific variables 67 | obs = env.reset() # first obs comes from starting distribution 68 | done = False # signal from environment that episode is over 69 | ep_rews = [] # list for rewards accrued throughout ep 70 | 71 | # render first episode of each epoch 72 | finished_rendering_this_epoch = False 73 | 74 | # collect experience by acting in the environment with current policy 75 | while True: 76 | 77 | # rendering 78 | if (not finished_rendering_this_epoch) and render: 79 | env.render() 80 | 81 | # save obs 82 | batch_obs.append(obs.copy()) 83 | 84 | # act in the environment 85 | act = get_action(torch.as_tensor(obs, dtype=torch.float32)) 86 | obs, rew, done, _ = env.step(act) 87 | 88 | # save action, reward 89 | batch_acts.append(act) 90 | ep_rews.append(rew) 91 | 92 | if done: 93 | # if episode is over, record info about episode 94 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 95 | batch_rets.append(ep_ret) 96 | batch_lens.append(ep_len) 97 | 98 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 99 | batch_weights += list(reward_to_go(ep_rews)) 100 | 101 | # reset episode-specific variables 102 | obs, done, ep_rews = env.reset(), False, [] 103 | 104 | # won't render again this epoch 105 | finished_rendering_this_epoch = True 106 | 107 | # end experience loop if we have enough of it 108 | if len(batch_obs) > batch_size: 109 | break 110 | 111 | # take a single policy gradient update step 112 | optimizer.zero_grad() 113 | batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32), 114 | act=torch.as_tensor(batch_acts, dtype=torch.int32), 115 | weights=torch.as_tensor(batch_weights, dtype=torch.float32) 116 | ) 117 | batch_loss.backward() 118 | optimizer.step() 119 | return batch_loss, batch_rets, batch_lens 120 | 121 | # training loop 122 | for i in range(epochs): 123 | batch_loss, batch_rets, batch_lens = train_one_epoch() 124 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 125 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 126 | 127 | if __name__ == '__main__': 128 | import argparse 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 131 | parser.add_argument('--render', action='store_true') 132 | parser.add_argument('--lr', type=float, default=1e-2) 133 | args = parser.parse_args() 134 | print('\nUsing reward-to-go formulation of policy gradient.\n') 135 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinningup/spinup/examples/tf1/bench_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | from spinup.utils.run_utils import ExperimentGrid 2 | from spinup import ppo_tf1 3 | import tensorflow as tf 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--cpu', type=int, default=4) 9 | parser.add_argument('--num_runs', type=int, default=3) 10 | args = parser.parse_args() 11 | 12 | eg = ExperimentGrid(name='ppo-tf1-bench') 13 | eg.add('env_name', 'CartPole-v0', '', True) 14 | eg.add('seed', [10*i for i in range(args.num_runs)]) 15 | eg.add('epochs', 10) 16 | 
eg.add('steps_per_epoch', 4000) 17 | eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid') 18 | eg.add('ac_kwargs:activation', [tf.tanh, tf.nn.relu], '') 19 | eg.run(ppo_tf1, num_cpu=args.cpu) -------------------------------------------------------------------------------- /spinningup/spinup/examples/tf1/pg_math/1_simple_pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.spaces import Discrete, Box 5 | 6 | def mlp(x, sizes, activation=tf.tanh, output_activation=None): 7 | # Build a feedforward neural network. 8 | for size in sizes[:-1]: 9 | x = tf.layers.dense(x, units=size, activation=activation) 10 | return tf.layers.dense(x, units=sizes[-1], activation=output_activation) 11 | 12 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 13 | epochs=50, batch_size=5000, render=False): 14 | 15 | # make environment, check spaces, get obs / act dims 16 | env = gym.make(env_name) 17 | assert isinstance(env.observation_space, Box), \ 18 | "This example only works for envs with continuous state spaces." 19 | assert isinstance(env.action_space, Discrete), \ 20 | "This example only works for envs with discrete action spaces." 21 | 22 | obs_dim = env.observation_space.shape[0] 23 | n_acts = env.action_space.n 24 | 25 | # make core of policy network 26 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) 27 | logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts]) 28 | 29 | # make action selection op (outputs int actions, sampled from policy) 30 | actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1) 31 | 32 | # make loss function whose gradient, for the right data, is policy gradient 33 | weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32) 34 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32) 35 | action_masks = tf.one_hot(act_ph, n_acts) 36 | log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1) 37 | loss = -tf.reduce_mean(weights_ph * log_probs) 38 | 39 | # make train op 40 | train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) 41 | 42 | sess = tf.InteractiveSession() 43 | sess.run(tf.global_variables_initializer()) 44 | 45 | # for training policy 46 | def train_one_epoch(): 47 | # make some empty lists for logging. 
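        # Added note (not part of the original file): each call to this function gathers at
        # least batch_size environment steps and then takes a single gradient step on
        #     loss = -mean(weights * log pi(a|s)),
        # where every step of an episode is weighted by that episode's total return R(tau).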
48 | batch_obs = [] # for observations 49 | batch_acts = [] # for actions 50 | batch_weights = [] # for R(tau) weighting in policy gradient 51 | batch_rets = [] # for measuring episode returns 52 | batch_lens = [] # for measuring episode lengths 53 | 54 | # reset episode-specific variables 55 | obs = env.reset() # first obs comes from starting distribution 56 | done = False # signal from environment that episode is over 57 | ep_rews = [] # list for rewards accrued throughout ep 58 | 59 | # render first episode of each epoch 60 | finished_rendering_this_epoch = False 61 | 62 | # collect experience by acting in the environment with current policy 63 | while True: 64 | 65 | # rendering 66 | if (not finished_rendering_this_epoch) and render: 67 | env.render() 68 | 69 | # save obs 70 | batch_obs.append(obs.copy()) 71 | 72 | # act in the environment 73 | act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0] 74 | obs, rew, done, _ = env.step(act) 75 | 76 | # save action, reward 77 | batch_acts.append(act) 78 | ep_rews.append(rew) 79 | 80 | if done: 81 | # if episode is over, record info about episode 82 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 83 | batch_rets.append(ep_ret) 84 | batch_lens.append(ep_len) 85 | 86 | # the weight for each logprob(a|s) is R(tau) 87 | batch_weights += [ep_ret] * ep_len 88 | 89 | # reset episode-specific variables 90 | obs, done, ep_rews = env.reset(), False, [] 91 | 92 | # won't render again this epoch 93 | finished_rendering_this_epoch = True 94 | 95 | # end experience loop if we have enough of it 96 | if len(batch_obs) > batch_size: 97 | break 98 | 99 | # take a single policy gradient update step 100 | batch_loss, _ = sess.run([loss, train_op], 101 | feed_dict={ 102 | obs_ph: np.array(batch_obs), 103 | act_ph: np.array(batch_acts), 104 | weights_ph: np.array(batch_weights) 105 | }) 106 | return batch_loss, batch_rets, batch_lens 107 | 108 | # training loop 109 | for i in range(epochs): 110 | batch_loss, batch_rets, batch_lens = train_one_epoch() 111 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 112 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 113 | 114 | if __name__ == '__main__': 115 | import argparse 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 118 | parser.add_argument('--render', action='store_true') 119 | parser.add_argument('--lr', type=float, default=1e-2) 120 | args = parser.parse_args() 121 | print('\nUsing simplest formulation of policy gradient.\n') 122 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinningup/spinup/examples/tf1/pg_math/2_rtg_pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.spaces import Discrete, Box 5 | 6 | def mlp(x, sizes, activation=tf.tanh, output_activation=None): 7 | # Build a feedforward neural network. 
8 | for size in sizes[:-1]: 9 | x = tf.layers.dense(x, units=size, activation=activation) 10 | return tf.layers.dense(x, units=sizes[-1], activation=output_activation) 11 | 12 | def reward_to_go(rews): 13 | n = len(rews) 14 | rtgs = np.zeros_like(rews) 15 | for i in reversed(range(n)): 16 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 17 | return rtgs 18 | 19 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 20 | epochs=50, batch_size=5000, render=False): 21 | 22 | # make environment, check spaces, get obs / act dims 23 | env = gym.make(env_name) 24 | assert isinstance(env.observation_space, Box), \ 25 | "This example only works for envs with continuous state spaces." 26 | assert isinstance(env.action_space, Discrete), \ 27 | "This example only works for envs with discrete action spaces." 28 | 29 | obs_dim = env.observation_space.shape[0] 30 | n_acts = env.action_space.n 31 | 32 | # make core of policy network 33 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) 34 | logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts]) 35 | 36 | # make action selection op (outputs int actions, sampled from policy) 37 | actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1) 38 | 39 | # make loss function whose gradient, for the right data, is policy gradient 40 | weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32) 41 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32) 42 | action_masks = tf.one_hot(act_ph, n_acts) 43 | log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1) 44 | loss = -tf.reduce_mean(weights_ph * log_probs) 45 | 46 | # make train op 47 | train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) 48 | 49 | sess = tf.InteractiveSession() 50 | sess.run(tf.global_variables_initializer()) 51 | 52 | # for training policy 53 | def train_one_epoch(): 54 | # make some empty lists for logging. 
55 | batch_obs = [] # for observations 56 | batch_acts = [] # for actions 57 | batch_weights = [] # for reward-to-go weighting in policy gradient 58 | batch_rets = [] # for measuring episode returns 59 | batch_lens = [] # for measuring episode lengths 60 | 61 | # reset episode-specific variables 62 | obs = env.reset() # first obs comes from starting distribution 63 | done = False # signal from environment that episode is over 64 | ep_rews = [] # list for rewards accrued throughout ep 65 | 66 | # render first episode of each epoch 67 | finished_rendering_this_epoch = False 68 | 69 | # collect experience by acting in the environment with current policy 70 | while True: 71 | 72 | # rendering 73 | if (not finished_rendering_this_epoch) and render: 74 | env.render() 75 | 76 | # save obs 77 | batch_obs.append(obs.copy()) 78 | 79 | # act in the environment 80 | act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0] 81 | obs, rew, done, _ = env.step(act) 82 | 83 | # save action, reward 84 | batch_acts.append(act) 85 | ep_rews.append(rew) 86 | 87 | if done: 88 | # if episode is over, record info about episode 89 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 90 | batch_rets.append(ep_ret) 91 | batch_lens.append(ep_len) 92 | 93 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 94 | batch_weights += list(reward_to_go(ep_rews)) 95 | 96 | # reset episode-specific variables 97 | obs, done, ep_rews = env.reset(), False, [] 98 | 99 | # won't render again this epoch 100 | finished_rendering_this_epoch = True 101 | 102 | # end experience loop if we have enough of it 103 | if len(batch_obs) > batch_size: 104 | break 105 | 106 | # take a single policy gradient update step 107 | batch_loss, _ = sess.run([loss, train_op], 108 | feed_dict={ 109 | obs_ph: np.array(batch_obs), 110 | act_ph: np.array(batch_acts), 111 | weights_ph: np.array(batch_weights) 112 | }) 113 | return batch_loss, batch_rets, batch_lens 114 | 115 | # training loop 116 | for i in range(epochs): 117 | batch_loss, batch_rets, batch_lens = train_one_epoch() 118 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 119 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 120 | 121 | if __name__ == '__main__': 122 | import argparse 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 125 | parser.add_argument('--render', action='store_true') 126 | parser.add_argument('--lr', type=float, default=1e-2) 127 | args = parser.parse_args() 128 | print('\nUsing reward-to-go formulation of policy gradient.\n') 129 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinningup/spinup/examples/tf1/train_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from spinup.utils.logx import EpochLogger 5 | 6 | 7 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 8 | for h in hidden_sizes[:-1]: 9 | x = tf.layers.dense(x, units=h, activation=activation) 10 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 11 | 12 | 13 | # Simple script for training an MLP on MNIST. 
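# Added note (not part of the original file): besides the MLP itself, this script mainly
# demonstrates the EpochLogger workflow used by the RL algorithms -- store() metrics inside
# the inner loop, then log_tabular()/dump_tabular() once per epoch.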
14 | def train_mnist(steps_per_epoch=100, epochs=5, 15 | lr=1e-3, layers=2, hidden_size=64, 16 | logger_kwargs=dict(), save_freq=1): 17 | 18 | logger = EpochLogger(**logger_kwargs) 19 | logger.save_config(locals()) 20 | 21 | # Load and preprocess MNIST data 22 | (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() 23 | x_train = x_train.reshape(-1, 28*28) / 255.0 24 | 25 | # Define inputs & main outputs from computation graph 26 | x_ph = tf.placeholder(tf.float32, shape=(None, 28*28)) 27 | y_ph = tf.placeholder(tf.int32, shape=(None,)) 28 | logits = mlp(x_ph, hidden_sizes=[hidden_size]*layers + [10], activation=tf.nn.relu) 29 | predict = tf.argmax(logits, axis=1, output_type=tf.int32) 30 | 31 | # Define loss function, accuracy, and training op 32 | y = tf.one_hot(y_ph, 10) 33 | loss = tf.losses.softmax_cross_entropy(y, logits) 34 | acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) 35 | train_op = tf.train.AdamOptimizer().minimize(loss) 36 | 37 | # Prepare session 38 | sess = tf.Session() 39 | sess.run(tf.global_variables_initializer()) 40 | 41 | # Setup model saving 42 | logger.setup_tf_saver(sess, inputs={'x': x_ph}, 43 | outputs={'logits': logits, 'predict': predict}) 44 | 45 | start_time = time.time() 46 | 47 | # Run main training loop 48 | for epoch in range(epochs): 49 | for t in range(steps_per_epoch): 50 | idxs = np.random.randint(0, len(x_train), 32) 51 | feed_dict = {x_ph: x_train[idxs], 52 | y_ph: y_train[idxs]} 53 | outs = sess.run([loss, acc, train_op], feed_dict=feed_dict) 54 | logger.store(Loss=outs[0], Acc=outs[1]) 55 | 56 | # Save model 57 | if (epoch % save_freq == 0) or (epoch == epochs-1): 58 | logger.save_state(state_dict=dict(), itr=None) 59 | 60 | # Log info about epoch 61 | logger.log_tabular('Epoch', epoch) 62 | logger.log_tabular('Acc', with_min_and_max=True) 63 | logger.log_tabular('Loss', average_only=True) 64 | logger.log_tabular('TotalGradientSteps', (epoch+1)*steps_per_epoch) 65 | logger.log_tabular('Time', time.time()-start_time) 66 | logger.dump_tabular() 67 | 68 | if __name__ == '__main__': 69 | train_mnist() -------------------------------------------------------------------------------- /spinningup/spinup/exercises/common.py: -------------------------------------------------------------------------------- 1 | def print_result(correct=False): 2 | print('\n'*5 + '='*50 + '\n'*3) 3 | if correct: 4 | print("Congratulations! Your answer is correct.") 5 | else: 6 | print("Your answer appears to be incorrect. Try again!") 7 | print('\n'*3 + '='*50) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1/exercise1_1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | """ 5 | 6 | Exercise 1.1: Diagonal Gaussian Likelihood 7 | 8 | Write a function that takes in PyTorch Tensors for the means and 9 | log stds of a batch of diagonal Gaussian distributions, along with a 10 | PyTorch Tensor for (previously-generated) samples from those 11 | distributions, and returns a Tensor containing the log 12 | likelihoods of those samples. 
13 | 14 | """ 15 | 16 | def gaussian_likelihood(x, mu, log_std): 17 | """ 18 | Args: 19 | x: Tensor with shape [batch, dim] 20 | mu: Tensor with shape [batch, dim] 21 | log_std: Tensor with shape [batch, dim] or [dim] 22 | 23 | Returns: 24 | Tensor with shape [batch] 25 | """ 26 | ####################### 27 | # # 28 | # YOUR CODE HERE # 29 | # # 30 | ####################### 31 | return torch.zeros(1) 32 | 33 | 34 | if __name__ == '__main__': 35 | """ 36 | Run this file to verify your solution. 37 | """ 38 | from spinup.exercises.pytorch.problem_set_1_solutions import exercise1_1_soln 39 | from spinup.exercises.common import print_result 40 | 41 | batch_size = 32 42 | dim = 10 43 | 44 | x = torch.rand(batch_size, dim) 45 | mu = torch.rand(batch_size, dim) 46 | log_std = torch.rand(dim) 47 | 48 | your_gaussian_likelihood = gaussian_likelihood(x, mu, log_std) 49 | true_gaussian_likelihood = exercise1_1_soln.gaussian_likelihood(x, mu, log_std) 50 | 51 | your_result = your_gaussian_likelihood.detach().numpy() 52 | true_result = true_gaussian_likelihood.detach().numpy() 53 | 54 | correct = np.allclose(your_result, true_result) 55 | print_result(correct) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1/exercise1_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from spinup.exercises.pytorch.problem_set_1 import exercise1_1 5 | from spinup.exercises.pytorch.problem_set_1 import exercise1_2_auxiliary 6 | 7 | """ 8 | 9 | Exercise 1.2: PPO Gaussian Policy 10 | 11 | You will implement an MLP diagonal Gaussian policy for PPO by 12 | writing an MLP-builder, and a few other key functions. 13 | 14 | Log-likelihoods will be computed using your answer to Exercise 1.1, 15 | so make sure to complete that exercise before beginning this one. 16 | 17 | """ 18 | 19 | def mlp(sizes, activation, output_activation=nn.Identity): 20 | """ 21 | Build a multi-layer perceptron in PyTorch. 22 | 23 | Args: 24 | sizes: Tuple, list, or other iterable giving the number of units 25 | for each layer of the MLP. 26 | 27 | activation: Activation function for all layers except last. 28 | 29 | output_activation: Activation function for last layer. 30 | 31 | Returns: 32 | A PyTorch module that can be called to give the output of the MLP. 33 | (Use an nn.Sequential module.) 34 | 35 | """ 36 | ####################### 37 | # # 38 | # YOUR CODE HERE # 39 | # # 40 | ####################### 41 | pass 42 | 43 | class DiagonalGaussianDistribution: 44 | 45 | def __init__(self, mu, log_std): 46 | self.mu = mu 47 | self.log_std = log_std 48 | 49 | def sample(self): 50 | """ 51 | Returns: 52 | A PyTorch Tensor of samples from the diagonal Gaussian distribution with 53 | mean and log_std given by self.mu and self.log_std. 
54 | """ 55 | ####################### 56 | # # 57 | # YOUR CODE HERE # 58 | # # 59 | ####################### 60 | pass 61 | 62 | #================================(Given, ignore)==========================================# 63 | def log_prob(self, value): 64 | return exercise1_1.gaussian_likelihood(value, self.mu, self.log_std) 65 | 66 | def entropy(self): 67 | return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1) 68 | #=========================================================================================# 69 | 70 | 71 | class MLPGaussianActor(nn.Module): 72 | 73 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 74 | super().__init__() 75 | """ 76 | Initialize an MLP Gaussian Actor by making a PyTorch module for computing the 77 | mean of the distribution given a batch of observations, and a log_std parameter. 78 | 79 | Make log_std a PyTorch Parameter with the same shape as the action vector, 80 | independent of observations, initialized to [-0.5, -0.5, ..., -0.5]. 81 | (Make sure it's trainable!) 82 | """ 83 | ####################### 84 | # # 85 | # YOUR CODE HERE # 86 | # # 87 | ####################### 88 | # self.log_std = 89 | # self.mu_net = 90 | pass 91 | 92 | #================================(Given, ignore)==========================================# 93 | def forward(self, obs, act=None): 94 | mu = self.mu_net(obs) 95 | pi = DiagonalGaussianDistribution(mu, self.log_std) 96 | logp_a = None 97 | if act is not None: 98 | logp_a = pi.log_prob(act) 99 | return pi, logp_a 100 | #=========================================================================================# 101 | 102 | 103 | 104 | if __name__ == '__main__': 105 | """ 106 | Run this file to verify your solution. 107 | """ 108 | 109 | from spinup import ppo_pytorch as ppo 110 | from spinup.exercises.common import print_result 111 | from functools import partial 112 | import gym 113 | import os 114 | import pandas as pd 115 | import psutil 116 | import time 117 | 118 | logdir = "/tmp/experiments/%i"%int(time.time()) 119 | 120 | ActorCritic = partial(exercise1_2_auxiliary.ExerciseActorCritic, actor=MLPGaussianActor) 121 | 122 | ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'), 123 | actor_critic=ActorCritic, 124 | ac_kwargs=dict(hidden_sizes=(64,)), 125 | steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) 126 | 127 | # Get scores from last five epochs to evaluate success. 128 | data = pd.read_table(os.path.join(logdir,'progress.txt')) 129 | last_scores = data['AverageEpRet'][-5:] 130 | 131 | # Your implementation is probably correct if the agent has a score >500, 132 | # or if it reaches the top possible score of 1000, in the last five epochs. 133 | correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 134 | print_result(correct) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1/exercise1_2_auxiliary.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | """ 6 | 7 | Auxiliary code for Exercise 1.2. No part of the exercise requires you to 8 | look into or modify this file (and since it contains an mlp function, 9 | it has spoilers for the answer). Removed from the main file to avoid 10 | cluttering it up. 11 | 12 | In other words, nothing to see here, move along, these are not the 13 | droids you're looking for, and all that... 
14 | 15 | """ 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | class MLPCritic(nn.Module): 26 | 27 | def __init__(self, obs_dim, hidden_sizes, activation): 28 | super().__init__() 29 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 30 | 31 | def forward(self, obs): 32 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 33 | 34 | 35 | class ExerciseActorCritic(nn.Module): 36 | 37 | def __init__(self, observation_space, action_space, 38 | hidden_sizes=(64,64), activation=nn.Tanh, 39 | actor=None): 40 | super().__init__() 41 | obs_dim = observation_space.shape[0] 42 | self.pi = actor(obs_dim, action_space.shape[0], hidden_sizes, activation) 43 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 44 | 45 | def step(self, obs): 46 | with torch.no_grad(): 47 | pi, _ = self.pi(obs) 48 | a = pi.sample() 49 | logp_a = pi.log_prob(a) 50 | v = self.v(obs) 51 | return a.numpy(), v.numpy(), logp_a.numpy() 52 | 53 | def act(self, obs): 54 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1_solutions/exercise1_1_soln.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | EPS=1e-8 5 | 6 | def gaussian_likelihood(x, mu, log_std): 7 | pre_sum = -0.5 * (((x-mu)/(torch.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 8 | return pre_sum.sum(axis=-1) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_1_solutions/exercise1_2_soln.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | EPS=1e-8 6 | 7 | def mlp(sizes, activation, output_activation=nn.Identity): 8 | layers = [] 9 | for j in range(len(sizes)-1): 10 | act = activation if j < len(sizes)-2 else output_activation 11 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 12 | return nn.Sequential(*layers) 13 | 14 | def gaussian_likelihood(x, mu, log_std): 15 | pre_sum = -0.5 * (((x-mu)/(torch.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 16 | return pre_sum.sum(axis=-1) 17 | 18 | 19 | class DiagonalGaussianDistribution: 20 | 21 | def __init__(self, mu, log_std): 22 | self.mu = mu 23 | self.log_std = log_std 24 | 25 | def sample(self): 26 | return self.mu + torch.exp(self.log_std) * torch.randn_like(self.mu) 27 | 28 | def log_prob(self, value): 29 | return gaussian_likelihood(value, self.mu, self.log_std) 30 | 31 | def entropy(self): 32 | return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1) 33 | 34 | 35 | class MLPGaussianActor(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 40 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 41 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 42 | 43 | def forward(self, obs, act=None): 44 | mu = self.mu_net(obs) 45 | pi = DiagonalGaussianDistribution(mu, self.log_std) 46 | logp_a = None 47 | if act is not None: 48 | logp_a = pi.log_prob(act) 49 | return pi, logp_a 
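# Added usage sketch (not part of the original repository file): a minimal smoke test for
# the reference actor above, assuming a hypothetical 3-dim observation space and 2-dim
# action space; it samples a batch of actions and scores them under the policy.
if __name__ == '__main__':
    actor = MLPGaussianActor(obs_dim=3, act_dim=2, hidden_sizes=(64,), activation=nn.Tanh)
    obs = torch.randn(8, 3)        # batch of 8 fake observations
    pi, _ = actor(obs)             # DiagonalGaussianDistribution over the actions
    act = pi.sample()              # -> shape [8, 2]
    logp = pi.log_prob(act)        # -> shape [8]
    print(act.shape, logp.shape)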
-------------------------------------------------------------------------------- /spinningup/spinup/exercises/pytorch/problem_set_2/exercise2_2.py: -------------------------------------------------------------------------------- 1 | from spinup.algos.pytorch.ddpg.core import mlp, MLPActorCritic 2 | from spinup.utils.run_utils import ExperimentGrid 3 | from spinup import ddpg_pytorch as ddpg 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | """ 9 | 10 | Exercise 2.2: Silent Bug in DDPG (PyTorch Version) 11 | 12 | In this exercise, you will run DDPG with a bugged actor critic. Your goal is 13 | to determine whether or not there is any performance degredation, and if so, 14 | figure out what's going wrong. 15 | 16 | You do NOT need to write code for this exercise. 17 | 18 | """ 19 | 20 | """ 21 | Bugged Actor-Critic 22 | """ 23 | 24 | class BuggedMLPActor(nn.Module): 25 | 26 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 27 | super().__init__() 28 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 29 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 30 | self.act_limit = act_limit 31 | 32 | def forward(self, obs): 33 | # Return output from network scaled to action space limits. 34 | return self.act_limit * self.pi(obs) 35 | 36 | class BuggedMLPQFunction(nn.Module): 37 | 38 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 39 | super().__init__() 40 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 41 | 42 | def forward(self, obs, act): 43 | return self.q(torch.cat([obs, act], dim=-1)) 44 | 45 | class BuggedMLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = BuggedMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = BuggedMLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | 63 | 64 | if __name__ == '__main__': 65 | import argparse 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 68 | parser.add_argument('--h', type=int, default=300) 69 | parser.add_argument('--l', type=int, default=1) 70 | parser.add_argument('--num_runs', '-n', type=int, default=3) 71 | parser.add_argument('--steps_per_epoch', '-s', type=int, default=5000) 72 | parser.add_argument('--total_steps', '-t', type=int, default=int(5e4)) 73 | args = parser.parse_args() 74 | 75 | def ddpg_with_actor_critic(bugged, **kwargs): 76 | from spinup.exercises.pytorch.problem_set_2.exercise2_2 import BuggedMLPActorCritic 77 | actor_critic = BuggedMLPActorCritic if bugged else MLPActorCritic 78 | return ddpg(actor_critic=actor_critic, 79 | ac_kwargs=dict(hidden_sizes=[args.h]*args.l), 80 | start_steps=5000, 81 | max_ep_len=150, 82 | batch_size=64, 83 | polyak=0.95, 84 | **kwargs) 85 | 86 | eg = ExperimentGrid(name='ex2-2_ddpg') 87 | eg.add('replay_size', int(args.total_steps)) 88 | eg.add('env_name', args.env, '', True) 89 | eg.add('seed', [10*i for i in range(args.num_runs)]) 90 | eg.add('epochs', int(args.total_steps / args.steps_per_epoch)) 91 | eg.add('steps_per_epoch', args.steps_per_epoch) 92 | eg.add('bugged', [False, True]) 93 | eg.run(ddpg_with_actor_critic, 
datestamp=True) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_1/exercise1_1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | """ 5 | 6 | Exercise 1.1: Diagonal Gaussian Likelihood 7 | 8 | Write a function which takes in Tensorflow symbols for the means and 9 | log stds of a batch of diagonal Gaussian distributions, along with a 10 | Tensorflow placeholder for (previously-generated) samples from those 11 | distributions, and returns a Tensorflow symbol for computing the log 12 | likelihoods of those samples. 13 | 14 | """ 15 | 16 | def gaussian_likelihood(x, mu, log_std): 17 | """ 18 | Args: 19 | x: Tensor with shape [batch, dim] 20 | mu: Tensor with shape [batch, dim] 21 | log_std: Tensor with shape [batch, dim] or [dim] 22 | 23 | Returns: 24 | Tensor with shape [batch] 25 | """ 26 | ####################### 27 | # # 28 | # YOUR CODE HERE # 29 | # # 30 | ####################### 31 | return tf.constant(0) 32 | 33 | 34 | if __name__ == '__main__': 35 | """ 36 | Run this file to verify your solution. 37 | """ 38 | from spinup.exercises.tf1.problem_set_1_solutions import exercise1_1_soln 39 | from spinup.exercises.common import print_result 40 | 41 | sess = tf.Session() 42 | 43 | dim = 10 44 | x = tf.placeholder(tf.float32, shape=(None, dim)) 45 | mu = tf.placeholder(tf.float32, shape=(None, dim)) 46 | log_std = tf.placeholder(tf.float32, shape=(dim,)) 47 | 48 | your_gaussian_likelihood = gaussian_likelihood(x, mu, log_std) 49 | true_gaussian_likelihood = exercise1_1_soln.gaussian_likelihood(x, mu, log_std) 50 | 51 | batch_size = 32 52 | feed_dict = {x: np.random.rand(batch_size, dim), 53 | mu: np.random.rand(batch_size, dim), 54 | log_std: np.random.rand(dim)} 55 | 56 | your_result, true_result = sess.run([your_gaussian_likelihood, true_gaussian_likelihood], 57 | feed_dict=feed_dict) 58 | 59 | correct = np.allclose(your_result, true_result) 60 | print_result(correct) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_1/exercise1_2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from spinup.exercises.tf1.problem_set_1 import exercise1_1 4 | 5 | """ 6 | 7 | Exercise 1.2: PPO Gaussian Policy 8 | 9 | Implement an MLP diagonal Gaussian policy for PPO. 10 | 11 | Log-likelihoods will be computed using your answer to Exercise 1.1, 12 | so make sure to complete that exercise before beginning this one. 13 | 14 | """ 15 | 16 | EPS = 1e-8 17 | 18 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 19 | """ 20 | Builds a multi-layer perceptron in Tensorflow. 21 | 22 | Args: 23 | x: Input tensor. 24 | 25 | hidden_sizes: Tuple, list, or other iterable giving the number of units 26 | for each hidden layer of the MLP. 27 | 28 | activation: Activation function for all layers except last. 29 | 30 | output_activation: Activation function for last layer. 31 | 32 | Returns: 33 | A TF symbol for the output of an MLP that takes x as an input. 34 | 35 | """ 36 | ####################### 37 | # # 38 | # YOUR CODE HERE # 39 | # # 40 | ####################### 41 | pass 42 | 43 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 44 | """ 45 | Builds symbols to sample actions and compute log-probs of actions. 
46 | 47 | Special instructions: Make log_std a tf variable with the same shape as 48 | the action vector, independent of x, initialized to [-0.5, -0.5, ..., -0.5]. 49 | 50 | Args: 51 | x: Input tensor of states. Shape [batch, obs_dim]. 52 | 53 | a: Input tensor of actions. Shape [batch, act_dim]. 54 | 55 | hidden_sizes: Sizes of hidden layers for action network MLP. 56 | 57 | activation: Activation function for all layers except last. 58 | 59 | output_activation: Activation function for last layer (action layer). 60 | 61 | action_space: A gym.spaces object describing the action space of the 62 | environment this agent will interact with. 63 | 64 | Returns: 65 | pi: A symbol for sampling stochastic actions from a Gaussian 66 | distribution. 67 | 68 | logp: A symbol for computing log-likelihoods of actions from a Gaussian 69 | distribution. 70 | 71 | logp_pi: A symbol for computing log-likelihoods of actions in pi from a 72 | Gaussian distribution. 73 | 74 | """ 75 | ####################### 76 | # # 77 | # YOUR CODE HERE # 78 | # # 79 | ####################### 80 | # mu = 81 | # log_std = 82 | # pi = 83 | 84 | logp = exercise1_1.gaussian_likelihood(a, mu, log_std) 85 | logp_pi = exercise1_1.gaussian_likelihood(pi, mu, log_std) 86 | return pi, logp, logp_pi 87 | 88 | 89 | if __name__ == '__main__': 90 | """ 91 | Run this file to verify your solution. 92 | """ 93 | 94 | from spinup import ppo_tf1 as ppo 95 | from spinup.exercises.common import print_result 96 | import gym 97 | import os 98 | import pandas as pd 99 | import psutil 100 | import time 101 | 102 | logdir = "/tmp/experiments/%i"%int(time.time()) 103 | ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'), 104 | ac_kwargs=dict(policy=mlp_gaussian_policy, hidden_sizes=(64,)), 105 | steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) 106 | 107 | # Get scores from last five epochs to evaluate success. 108 | data = pd.read_table(os.path.join(logdir,'progress.txt')) 109 | last_scores = data['AverageEpRet'][-5:] 110 | 111 | # Your implementation is probably correct if the agent has a score >500, 112 | # or if it reaches the top possible score of 1000, in the last five epochs. 
113 | correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 114 | print_result(correct) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_1_solutions/exercise1_1_soln.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | EPS=1e-8 5 | 6 | def gaussian_likelihood(x, mu, log_std): 7 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 8 | return tf.reduce_sum(pre_sum, axis=1) -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_1_solutions/exercise1_2_soln.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | EPS = 1e-8 6 | 7 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 8 | for h in hidden_sizes[:-1]: 9 | x = tf.layers.dense(x, units=h, activation=activation) 10 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 11 | 12 | def gaussian_likelihood(x, mu, log_std): 13 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 14 | return tf.reduce_sum(pre_sum, axis=1) 15 | 16 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 17 | act_dim = a.shape.as_list()[-1] 18 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 19 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 20 | std = tf.exp(log_std) 21 | pi = mu + tf.random_normal(tf.shape(mu)) * std 22 | logp = gaussian_likelihood(a, mu, log_std) 23 | logp_pi = gaussian_likelihood(pi, mu, log_std) 24 | return pi, logp, logp_pi -------------------------------------------------------------------------------- /spinningup/spinup/exercises/tf1/problem_set_2/exercise2_2.py: -------------------------------------------------------------------------------- 1 | from spinup.algos.tf1.ddpg.core import mlp, mlp_actor_critic 2 | from spinup.utils.run_utils import ExperimentGrid 3 | from spinup import ddpg_tf1 as ddpg 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | """ 8 | 9 | Exercise 2.2: Silent Bug in DDPG 10 | 11 | In this exercise, you will run DDPG with a bugged actor critic. Your goal is 12 | to determine whether or not there is any performance degredation, and if so, 13 | figure out what's going wrong. 14 | 15 | You do NOT need to write code for this exercise. 
16 | 17 | """ 18 | 19 | """ 20 | Bugged Actor-Critic 21 | """ 22 | def bugged_mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 23 | output_activation=tf.tanh, action_space=None): 24 | act_dim = a.shape.as_list()[-1] 25 | act_limit = action_space.high[0] 26 | with tf.variable_scope('pi'): 27 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 28 | with tf.variable_scope('q'): 29 | q = mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None) 30 | with tf.variable_scope('q', reuse=True): 31 | q_pi = mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None) 32 | return pi, q, q_pi 33 | 34 | 35 | if __name__ == '__main__': 36 | import argparse 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 39 | parser.add_argument('--h', type=int, default=300) 40 | parser.add_argument('--l', type=int, default=1) 41 | parser.add_argument('--num_runs', '-n', type=int, default=3) 42 | parser.add_argument('--steps_per_epoch', '-s', type=int, default=5000) 43 | parser.add_argument('--total_steps', '-t', type=int, default=int(5e4)) 44 | args = parser.parse_args() 45 | 46 | def ddpg_with_actor_critic(bugged, **kwargs): 47 | actor_critic = bugged_mlp_actor_critic if bugged else mlp_actor_critic 48 | return ddpg(actor_critic=actor_critic, 49 | ac_kwargs=dict(hidden_sizes=[args.h]*args.l), 50 | start_steps=5000, 51 | max_ep_len=150, 52 | batch_size=64, 53 | polyak=0.95, 54 | **kwargs) 55 | 56 | eg = ExperimentGrid(name='ex2-2_ddpg') 57 | eg.add('replay_size', int(args.total_steps)) 58 | eg.add('env_name', args.env, '', True) 59 | eg.add('seed', [10*i for i in range(args.num_runs)]) 60 | eg.add('epochs', int(args.total_steps / args.steps_per_epoch)) 61 | eg.add('steps_per_epoch', args.steps_per_epoch) 62 | eg.add('bugged', [False, True]) 63 | eg.run(ddpg_with_actor_critic, datestamp=True) -------------------------------------------------------------------------------- /spinningup/spinup/run.py: -------------------------------------------------------------------------------- 1 | import spinup 2 | from spinup.user_config import DEFAULT_BACKEND 3 | from spinup.utils.run_utils import ExperimentGrid 4 | from spinup.utils.serialization_utils import convert_json 5 | import argparse 6 | import gym 7 | import json 8 | import os, subprocess, sys 9 | import os.path as osp 10 | import string 11 | import tensorflow as tf 12 | import torch 13 | from copy import deepcopy 14 | from textwrap import dedent 15 | 16 | 17 | # Command line args that will go to ExperimentGrid.run, and must possess unique 18 | # values (therefore must be treated separately). 19 | RUN_KEYS = ['num_cpu', 'data_dir', 'datestamp'] 20 | 21 | # Command line sweetener, allowing short-form flags for common, longer flags. 
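# Illustrative example (added comment, not in the original file): with these substitutions,
#   python -m spinup.run ppo --env HalfCheetah-v2 --hid "(64,64)"
# is treated the same as spelling out --env_name and --ac_kwargs:hidden_sizes in full.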
22 | SUBSTITUTIONS = {'env': 'env_name', 23 | 'hid': 'ac_kwargs:hidden_sizes', 24 | 'act': 'ac_kwargs:activation', 25 | 'cpu': 'num_cpu', 26 | 'dt': 'datestamp'} 27 | 28 | # Only some algorithms can be parallelized (have num_cpu > 1): 29 | MPI_COMPATIBLE_ALGOS = ['vpg', 'trpo', 'ppo'] 30 | 31 | # Algo names (used in a few places) 32 | BASE_ALGO_NAMES = ['vpg', 'trpo', 'ppo', 'ddpg', 'td3', 'sac'] 33 | 34 | 35 | def add_with_backends(algo_list): 36 | # helper function to build lists with backend-specific function names 37 | algo_list_with_backends = deepcopy(algo_list) 38 | for algo in algo_list: 39 | algo_list_with_backends += [algo + '_tf1', algo + '_pytorch'] 40 | return algo_list_with_backends 41 | 42 | 43 | def friendly_err(err_msg): 44 | # add whitespace to error message to make it more readable 45 | return '\n\n' + err_msg + '\n\n' 46 | 47 | 48 | def parse_and_execute_grid_search(cmd, args): 49 | """Interprets algorithm name and cmd line args into an ExperimentGrid.""" 50 | 51 | if cmd in BASE_ALGO_NAMES: 52 | backend = DEFAULT_BACKEND[cmd] 53 | print('\n\nUsing default backend (%s) for %s.\n'%(backend, cmd)) 54 | cmd = cmd + '_' + backend 55 | 56 | algo = eval('spinup.'+cmd) 57 | 58 | # Before all else, check to see if any of the flags is 'help'. 59 | valid_help = ['--help', '-h', 'help'] 60 | if any([arg in valid_help for arg in args]): 61 | print('\n\nShowing docstring for spinup.'+cmd+':\n') 62 | print(algo.__doc__) 63 | sys.exit() 64 | 65 | def process(arg): 66 | # Process an arg by eval-ing it, so users can specify more 67 | # than just strings at the command line (eg allows for 68 | # users to give functions as args). 69 | try: 70 | return eval(arg) 71 | except: 72 | return arg 73 | 74 | # Make first pass through args to build base arg_dict. Anything 75 | # with a '--' in front of it is an argument flag and everything after, 76 | # until the next flag, is a possible value. 77 | arg_dict = dict() 78 | for i, arg in enumerate(args): 79 | assert i > 0 or '--' in arg, \ 80 | friendly_err("You didn't specify a first flag.") 81 | if '--' in arg: 82 | arg_key = arg.lstrip('-') 83 | arg_dict[arg_key] = [] 84 | else: 85 | arg_dict[arg_key].append(process(arg)) 86 | 87 | # Make second pass through, to catch flags that have no vals. 88 | # Assume such flags indicate that a boolean parameter should have 89 | # value True. 90 | for k,v in arg_dict.items(): 91 | if len(v) == 0: 92 | v.append(True) 93 | 94 | # Third pass: check for user-supplied shorthands, where a key has 95 | # the form --keyname[kn]. The thing in brackets, 'kn', is the 96 | # shorthand. NOTE: modifying a dict while looping through its 97 | # contents is dangerous, and breaks in 3.6+. We loop over a fixed list 98 | # of keys to avoid this issue. 99 | given_shorthands = dict() 100 | fixed_keys = list(arg_dict.keys()) 101 | for k in fixed_keys: 102 | p1, p2 = k.find('['), k.find(']') 103 | if p1 >= 0 and p2 >= 0: 104 | # Both '[' and ']' found, so shorthand has been given 105 | k_new = k[:p1] 106 | shorthand = k[p1+1:p2] 107 | given_shorthands[k_new] = shorthand 108 | arg_dict[k_new] = arg_dict[k] 109 | del arg_dict[k] 110 | 111 | # Penultimate pass: sugar. Allow some special shortcuts in arg naming, 112 | # eg treat "env" the same as "env_name". This is super specific 113 | # to Spinning Up implementations, and may be hard to maintain. 114 | # These special shortcuts are described by SUBSTITUTIONS. 
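    # Added worked example (not in the original file): flags like
    #   --hid[h] (32,) (64,64) --env HalfCheetah-v2
    # have by now produced arg_dict = {'hid': [(32,), (64,64)], 'env': ['HalfCheetah-v2']}
    # and given_shorthands = {'hid': 'h'}; the loop below renames 'hid' to
    # 'ac_kwargs:hidden_sizes' and 'env' to 'env_name' (carrying the 'h' shorthand along).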
115 | for special_name, true_name in SUBSTITUTIONS.items(): 116 | if special_name in arg_dict: 117 | # swap it in arg dict 118 | arg_dict[true_name] = arg_dict[special_name] 119 | del arg_dict[special_name] 120 | 121 | if special_name in given_shorthands: 122 | # point the shortcut to the right name 123 | given_shorthands[true_name] = given_shorthands[special_name] 124 | del given_shorthands[special_name] 125 | 126 | # Final pass: check for the special args that go to the 'run' command 127 | # for an experiment grid, separate them from the arg dict, and make sure 128 | # that they have unique values. The special args are given by RUN_KEYS. 129 | run_kwargs = dict() 130 | for k in RUN_KEYS: 131 | if k in arg_dict: 132 | val = arg_dict[k] 133 | assert len(val) == 1, \ 134 | friendly_err("You can only provide one value for %s."%k) 135 | run_kwargs[k] = val[0] 136 | del arg_dict[k] 137 | 138 | # Determine experiment name. If not given by user, will be determined 139 | # by the algorithm name. 140 | if 'exp_name' in arg_dict: 141 | assert len(arg_dict['exp_name']) == 1, \ 142 | friendly_err("You can only provide one value for exp_name.") 143 | exp_name = arg_dict['exp_name'][0] 144 | del arg_dict['exp_name'] 145 | else: 146 | exp_name = 'cmd_' + cmd 147 | 148 | # Make sure that if num_cpu > 1, the algorithm being used is compatible 149 | # with MPI. 150 | if 'num_cpu' in run_kwargs and not(run_kwargs['num_cpu'] == 1): 151 | assert cmd in add_with_backends(MPI_COMPATIBLE_ALGOS), \ 152 | friendly_err("This algorithm can't be run with num_cpu > 1.") 153 | 154 | # Special handling for environment: make sure that env_name is a real, 155 | # registered gym environment. 156 | valid_envs = [e.id for e in list(gym.envs.registry.all())] 157 | assert 'env_name' in arg_dict, \ 158 | friendly_err("You did not give a value for --env_name! Add one and try again.") 159 | for env_name in arg_dict['env_name']: 160 | err_msg = dedent(""" 161 | 162 | %s is not registered with Gym. 163 | 164 | Recommendations: 165 | 166 | * Check for a typo (did you include the version tag?) 167 | 168 | * View the complete list of valid Gym environments at 169 | 170 | https://gym.openai.com/envs/ 171 | 172 | """%env_name) 173 | assert env_name in valid_envs, err_msg 174 | 175 | 176 | # Construct and execute the experiment grid. 177 | eg = ExperimentGrid(name=exp_name) 178 | for k,v in arg_dict.items(): 179 | eg.add(k, v, shorthand=given_shorthands.get(k)) 180 | eg.run(algo, **run_kwargs) 181 | 182 | 183 | if __name__ == '__main__': 184 | """ 185 | This is a wrapper allowing command-line interfaces to individual 186 | algorithms and the plot / test_policy utilities. 187 | 188 | For utilities, it only checks which thing to run, and calls the 189 | appropriate file, passing all arguments through. 190 | 191 | For algorithms, it sets up an ExperimentGrid object and uses the 192 | ExperimentGrid run routine to execute each possible experiment. 193 | """ 194 | 195 | cmd = sys.argv[1] if len(sys.argv) > 1 else 'help' 196 | valid_algos = add_with_backends(BASE_ALGO_NAMES) 197 | valid_utils = ['plot', 'test_policy'] 198 | valid_help = ['--help', '-h', 'help'] 199 | valid_cmds = valid_algos + valid_utils + valid_help 200 | assert cmd in valid_cmds, \ 201 | "Select an algorithm or utility which is implemented in Spinning Up." 202 | 203 | if cmd in valid_help: 204 | # Before all else, check to see if any of the flags is 'help'. 205 | 206 | # List commands that are available. 
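        # (The listing covers each base algorithm plus its backend-specific
        # variants, e.g. 'ppo', 'ppo_tf1', 'ppo_pytorch', followed by the
        # 'plot' and 'test_policy' utilities.)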
207 | str_valid_cmds = '\n\t' + '\n\t'.join(valid_algos+valid_utils) 208 | help_msg = dedent(""" 209 | Experiment in Spinning Up from the command line with 210 | 211 | \tpython -m spinup.run CMD [ARGS...] 212 | 213 | where CMD is a valid command. Current valid commands are: 214 | """) + str_valid_cmds 215 | print(help_msg) 216 | 217 | # Provide some useful details for algorithm running. 218 | subs_list = ['--' + k.ljust(10) + 'for'.ljust(10) + '--' + v \ 219 | for k,v in SUBSTITUTIONS.items()] 220 | str_valid_subs = '\n\t' + '\n\t'.join(subs_list) 221 | special_info = dedent(""" 222 | FYI: When running an algorithm, any keyword argument to the 223 | algorithm function can be used as a flag, eg 224 | 225 | \tpython -m spinup.run ppo --env HalfCheetah-v2 --clip_ratio 0.1 226 | 227 | If you need a quick refresher on valid kwargs, get the docstring 228 | with 229 | 230 | \tpython -m spinup.run [algo] --help 231 | 232 | See the "Running Experiments" docs page for more details. 233 | 234 | Also: Some common but long flags can be substituted for shorter 235 | ones. Valid substitutions are: 236 | """) + str_valid_subs 237 | print(special_info) 238 | 239 | elif cmd in valid_utils: 240 | # Execute the correct utility file. 241 | runfile = osp.join(osp.abspath(osp.dirname(__file__)), 'utils', cmd +'.py') 242 | args = [sys.executable if sys.executable else 'python', runfile] + sys.argv[2:] 243 | subprocess.check_call(args, env=os.environ) 244 | else: 245 | # Assume that the user plans to execute an algorithm. Run custom 246 | # parsing on the arguments and build a grid search to execute. 247 | args = sys.argv[2:] 248 | parse_and_execute_grid_search(cmd, args) 249 | -------------------------------------------------------------------------------- /spinningup/spinup/user_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | 4 | # Default neural network backend for each algo 5 | # (Must be either 'tf1' or 'pytorch') 6 | DEFAULT_BACKEND = { 7 | 'vpg': 'pytorch', 8 | 'trpo': 'tf1', 9 | 'ppo': 'pytorch', 10 | 'ddpg': 'pytorch', 11 | 'td3': 'pytorch', 12 | 'sac': 'pytorch' 13 | } 14 | 15 | # Where experiment outputs are saved by default: 16 | DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data') 17 | 18 | # Whether to automatically insert a date and time stamp into the names of 19 | # save directories: 20 | FORCE_DATESTAMP = False 21 | 22 | # Whether GridSearch provides automatically-generated default shorthands: 23 | DEFAULT_SHORTHAND = True 24 | 25 | # Tells the GridSearch how many seconds to pause for before launching 26 | # experiments. 
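# (Presumably a grace period so a mis-specified grid can still be cancelled
# before any runs start; this is an inference from the name, not documented
# behaviour.)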
27 | WAIT_BEFORE_LAUNCH = 5 -------------------------------------------------------------------------------- /spinningup/spinup/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netx-repo/neuroplan/f4d73d71b67261d819494385b1a7e3270b7499e5/spinningup/spinup/utils/__init__.py -------------------------------------------------------------------------------- /spinningup/spinup/utils/mpi_pytorch.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import numpy as np 3 | import os 4 | import torch 5 | from mpi4py import MPI 6 | from spinup.utils.mpi_tools import broadcast, mpi_avg, num_procs, proc_id 7 | 8 | def setup_pytorch_for_mpi(): 9 | """ 10 | Avoid slowdowns caused by each separate process's PyTorch using 11 | more than its fair share of CPU resources. 12 | """ 13 | #print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 14 | if torch.get_num_threads()==1: 15 | return 16 | fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1) 17 | torch.set_num_threads(fair_num_threads) 18 | #print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 19 | 20 | def mpi_avg_grads(module): 21 | """ Average contents of gradient buffers across MPI processes. """ 22 | if num_procs()==1: 23 | return 24 | for p in module.parameters(): 25 | p_grad_numpy = p.grad.cpu().numpy() # numpy view of tensor data 26 | avg_p_grad = mpi_avg(p.grad.cpu()) 27 | p_grad_numpy[:] = avg_p_grad[:] 28 | 29 | def sync_params(module): 30 | """ Sync all parameters of module across all MPI processes. """ 31 | if num_procs()==1: 32 | return 33 | for p in module.parameters(): 34 | p_numpy = p.data.cpu().numpy() 35 | broadcast(p_numpy) -------------------------------------------------------------------------------- /spinningup/spinup/utils/mpi_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | from spinup.utils.mpi_tools import broadcast 5 | 6 | 7 | def flat_concat(xs): 8 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 9 | 10 | def assign_params_from_flat(x, params): 11 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 12 | splits = tf.split(x, [flat_size(p) for p in params]) 13 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 14 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 15 | 16 | def sync_params(params): 17 | get_params = flat_concat(params) 18 | def _broadcast(x): 19 | broadcast(x) 20 | return x 21 | synced_params = tf.py_func(_broadcast, [get_params], tf.float32) 22 | return assign_params_from_flat(synced_params, params) 23 | 24 | def sync_all_params(): 25 | """Sync all tf variables across MPI processes.""" 26 | return sync_params(tf.global_variables()) 27 | 28 | 29 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 30 | """ 31 | Adam optimizer that averages gradients across MPI processes. 32 | 33 | The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. 34 | For documentation on method arguments, see the Tensorflow docs page for 35 | the base `AdamOptimizer`_. 36 | 37 | .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py 38 | .. 
_`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 39 | """ 40 | 41 | def __init__(self, **kwargs): 42 | self.comm = MPI.COMM_WORLD 43 | tf.train.AdamOptimizer.__init__(self, **kwargs) 44 | 45 | def compute_gradients(self, loss, var_list, **kwargs): 46 | """ 47 | Same as normal compute_gradients, except average grads over processes. 48 | """ 49 | grads_and_vars = super().compute_gradients(loss, var_list, **kwargs) 50 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 51 | flat_grad = flat_concat([g for g, v in grads_and_vars]) 52 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 53 | sizes = [int(np.prod(s)) for s in shapes] 54 | 55 | num_tasks = self.comm.Get_size() 56 | buf = np.zeros(flat_grad.shape, np.float32) 57 | 58 | def _collect_grads(flat_grad): 59 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 60 | np.divide(buf, float(num_tasks), out=buf) 61 | return buf 62 | 63 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 64 | avg_flat_grad.set_shape(flat_grad.shape) 65 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 66 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 67 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 68 | 69 | return avg_grads_and_vars 70 | 71 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 72 | """ 73 | Same as normal apply_gradients, except sync params after update. 74 | """ 75 | opt = super().apply_gradients(grads_and_vars, global_step, name) 76 | with tf.control_dependencies([opt]): 77 | sync = sync_params([v for g,v in grads_and_vars]) 78 | return tf.group([opt, sync]) -------------------------------------------------------------------------------- /spinningup/spinup/utils/mpi_tools.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import os, subprocess, sys 3 | import numpy as np 4 | 5 | 6 | def mpi_fork(n, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers linked by MPI. 9 | 10 | Also, terminates the original process that launched it. 11 | 12 | Taken almost without modification from the Baselines function of the 13 | `same name`_. 14 | 15 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 16 | 17 | Args: 18 | n (int): Number of process to split into. 19 | 20 | bind_to_core (bool): Bind each MPI process to a core. 
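    Example (a sketch of the effect, with a hypothetical script name):
    calling ``mpi_fork(4)`` from ``python my_script.py`` re-launches it as
    ``mpirun -np 4 python my_script.py`` with IN_MPI set in the environment,
    then exits the original process.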
21 | """ 22 | if n<=1: 23 | return 24 | if os.getenv("IN_MPI") is None: 25 | env = os.environ.copy() 26 | env.update( 27 | MKL_NUM_THREADS="1", 28 | OMP_NUM_THREADS="1", 29 | IN_MPI="1" 30 | ) 31 | args = ["mpirun", "-np", str(n)] 32 | if bind_to_core: 33 | args += ["-bind-to", "core"] 34 | args += [sys.executable] + sys.argv 35 | subprocess.check_call(args, env=env) 36 | sys.exit() 37 | 38 | 39 | def msg(m, string=''): 40 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 41 | 42 | def proc_id(): 43 | """Get rank of calling process.""" 44 | return MPI.COMM_WORLD.Get_rank() 45 | 46 | def allreduce(*args, **kwargs): 47 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 48 | 49 | def num_procs(): 50 | """Count active MPI processes.""" 51 | return MPI.COMM_WORLD.Get_size() 52 | 53 | def broadcast(x, root=0): 54 | MPI.COMM_WORLD.Bcast(x, root=root) 55 | 56 | def mpi_op(x, op): 57 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 58 | x = np.asarray(x, dtype=np.float32) 59 | buff = np.zeros_like(x, dtype=np.float32) 60 | allreduce(x, buff, op=op) 61 | return buff[0] if scalar else buff 62 | 63 | def mpi_sum(x): 64 | return mpi_op(x, MPI.SUM) 65 | 66 | def mpi_avg(x): 67 | """Average a scalar or vector over MPI processes.""" 68 | return mpi_sum(x) / num_procs() 69 | 70 | def mpi_statistics_scalar(x, with_min_and_max=False): 71 | """ 72 | Get mean/std and optional min/max of scalar x across MPI processes. 73 | 74 | Args: 75 | x: An array containing samples of the scalar to produce statistics 76 | for. 77 | 78 | with_min_and_max (bool): If true, return min and max of x in 79 | addition to mean and std. 80 | """ 81 | x = np.array(x, dtype=np.float32) 82 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 83 | mean = global_sum / global_n 84 | 85 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 86 | std = np.sqrt(global_sum_sq / global_n) # compute global std 87 | 88 | if with_min_and_max: 89 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 90 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 91 | return mean, std, global_min, global_max 92 | return mean, std -------------------------------------------------------------------------------- /spinningup/spinup/utils/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | 9 | DIV_LINE_WIDTH = 50 10 | 11 | # Global vars for tracking and labeling data at load time. 12 | exp_idx = 0 13 | units = dict() 14 | 15 | def plot_data(data, xaxis='Epoch', value="AverageEpRet", condition="Condition1", smooth=1, **kwargs): 16 | if smooth > 1: 17 | """ 18 | smooth data with moving window average. 
19 | that is, 20 | smoothed_y[t] = average(y[t-k], y[t-k+1], ..., y[t+k-1], y[t+k]) 21 | where the "smooth" param is width of that window (2k+1) 22 | """ 23 | y = np.ones(smooth) 24 | for datum in data: 25 | x = np.asarray(datum[value]) 26 | z = np.ones(len(x)) 27 | smoothed_x = np.convolve(x,y,'same') / np.convolve(z,y,'same') 28 | datum[value] = smoothed_x 29 | 30 | if isinstance(data, list): 31 | data = pd.concat(data, ignore_index=True) 32 | sns.set(style="darkgrid", font_scale=1.5) 33 | sns.tsplot(data=data, time=xaxis, value=value, unit="Unit", condition=condition, ci='sd', **kwargs) 34 | """ 35 | If you upgrade to any version of Seaborn greater than 0.8.1, switch from 36 | tsplot to lineplot replacing L29 with: 37 | 38 | sns.lineplot(data=data, x=xaxis, y=value, hue=condition, ci='sd', **kwargs) 39 | 40 | Changes the colorscheme and the default legend style, though. 41 | """ 42 | plt.legend(loc='best').set_draggable(True) 43 | #plt.legend(loc='upper center', ncol=3, handlelength=1, 44 | # borderaxespad=0., prop={'size': 13}) 45 | 46 | """ 47 | For the version of the legend used in the Spinning Up benchmarking page, 48 | swap L38 with: 49 | 50 | plt.legend(loc='upper center', ncol=6, handlelength=1, 51 | mode="expand", borderaxespad=0., prop={'size': 13}) 52 | """ 53 | 54 | xscale = np.max(np.asarray(data[xaxis])) > 5e3 55 | if xscale: 56 | # Just some formatting niceness: x-axis scale in scientific notation if max x is large 57 | plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 58 | 59 | plt.tight_layout(pad=0.5) 60 | 61 | def get_datasets(logdir, condition=None): 62 | """ 63 | Recursively look through logdir for output files produced by 64 | spinup.logx.Logger. 65 | 66 | Assumes that any file "progress.txt" is a valid hit. 67 | """ 68 | global exp_idx 69 | global units 70 | datasets = [] 71 | for root, _, files in os.walk(logdir): 72 | if 'progress.txt' in files: 73 | exp_name = None 74 | try: 75 | config_path = open(os.path.join(root,'config.json')) 76 | config = json.load(config_path) 77 | if 'exp_name' in config: 78 | exp_name = config['exp_name'] 79 | except: 80 | print('No file named config.json') 81 | condition1 = condition or exp_name or 'exp' 82 | condition2 = condition1 + '-' + str(exp_idx) 83 | exp_idx += 1 84 | if condition1 not in units: 85 | units[condition1] = 0 86 | unit = units[condition1] 87 | units[condition1] += 1 88 | 89 | try: 90 | exp_data = pd.read_table(os.path.join(root,'progress.txt')) 91 | except: 92 | print('Could not read from %s'%os.path.join(root,'progress.txt')) 93 | continue 94 | performance = 'AverageTestEpRet' if 'AverageTestEpRet' in exp_data else 'AverageEpRet' 95 | exp_data.insert(len(exp_data.columns),'Unit',unit) 96 | exp_data.insert(len(exp_data.columns),'Condition1',condition1) 97 | exp_data.insert(len(exp_data.columns),'Condition2',condition2) 98 | exp_data.insert(len(exp_data.columns),'Performance',exp_data[performance]) 99 | datasets.append(exp_data) 100 | return datasets 101 | 102 | 103 | def get_all_datasets(all_logdirs, legend=None, select=None, exclude=None): 104 | """ 105 | For every entry in all_logdirs, 106 | 1) check if the entry is a real directory and if it is, 107 | pull data from it; 108 | 109 | 2) if not, check to see if the entry is a prefix for a 110 | real directory, and pull data from that. 
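    For example (hypothetical paths): given ``data/ex2-2_ddpg``, the plotter
    would also pull in ``data/ex2-2_ddpg_s0`` and ``data/ex2-2_ddpg_s10``,
    since every entry of ``data/`` whose name contains the prefix is matched.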
111 | """ 112 | logdirs = [] 113 | for logdir in all_logdirs: 114 | if osp.isdir(logdir) and logdir[-1]==os.sep: 115 | logdirs += [logdir] 116 | else: 117 | basedir = osp.dirname(logdir) 118 | fulldir = lambda x : osp.join(basedir, x) 119 | prefix = logdir.split(os.sep)[-1] 120 | listdir= os.listdir(basedir) 121 | logdirs += sorted([fulldir(x) for x in listdir if prefix in x]) 122 | 123 | """ 124 | Enforce selection rules, which check logdirs for certain substrings. 125 | Makes it easier to look at graphs from particular ablations, if you 126 | launch many jobs at once with similar names. 127 | """ 128 | if select is not None: 129 | logdirs = [log for log in logdirs if all(x in log for x in select)] 130 | if exclude is not None: 131 | logdirs = [log for log in logdirs if all(not(x in log) for x in exclude)] 132 | 133 | # Verify logdirs 134 | print('Plotting from...\n' + '='*DIV_LINE_WIDTH + '\n') 135 | for logdir in logdirs: 136 | print(logdir) 137 | print('\n' + '='*DIV_LINE_WIDTH) 138 | 139 | # Make sure the legend is compatible with the logdirs 140 | assert not(legend) or (len(legend) == len(logdirs)), \ 141 | "Must give a legend title for each set of experiments." 142 | 143 | # Load data from logdirs 144 | data = [] 145 | if legend: 146 | for log, leg in zip(logdirs, legend): 147 | data += get_datasets(log, leg) 148 | else: 149 | for log in logdirs: 150 | data += get_datasets(log) 151 | return data 152 | 153 | 154 | def make_plots(all_logdirs, legend=None, xaxis=None, values=None, count=False, 155 | font_scale=1.5, smooth=1, select=None, exclude=None, estimator='mean'): 156 | data = get_all_datasets(all_logdirs, legend, select, exclude) 157 | values = values if isinstance(values, list) else [values] 158 | condition = 'Condition2' if count else 'Condition1' 159 | estimator = getattr(np, estimator) # choose what to show on main curve: mean? max? min? 160 | for value in values: 161 | plt.figure() 162 | plot_data(data, xaxis=xaxis, value=value, condition=condition, smooth=smooth, estimator=estimator) 163 | plt.show() 164 | 165 | 166 | def main(): 167 | import argparse 168 | parser = argparse.ArgumentParser() 169 | parser.add_argument('logdir', nargs='*') 170 | parser.add_argument('--legend', '-l', nargs='*') 171 | parser.add_argument('--xaxis', '-x', default='TotalEnvInteracts') 172 | parser.add_argument('--value', '-y', default='Performance', nargs='*') 173 | parser.add_argument('--count', action='store_true') 174 | parser.add_argument('--smooth', '-s', type=int, default=1) 175 | parser.add_argument('--select', nargs='*') 176 | parser.add_argument('--exclude', nargs='*') 177 | parser.add_argument('--est', default='mean') 178 | args = parser.parse_args() 179 | """ 180 | 181 | Args: 182 | logdir (strings): As many log directories (or prefixes to log 183 | directories, which the plotter will autocomplete internally) as 184 | you'd like to plot from. 185 | 186 | legend (strings): Optional way to specify legend for the plot. The 187 | plotter legend will automatically use the ``exp_name`` from the 188 | config.json file, unless you tell it otherwise through this flag. 189 | This only works if you provide a name for each directory that 190 | will get plotted. (Note: this may not be the same as the number 191 | of logdir args you provide! 
Recall that the plotter looks for 192 | autocompletes of the logdir args: there may be more than one 193 | match for a given logdir prefix, and you will need to provide a 194 | legend string for each one of those matches---unless you have 195 | removed some of them as candidates via selection or exclusion 196 | rules (below).) 197 | 198 | xaxis (string): Pick what column from data is used for the x-axis. 199 | Defaults to ``TotalEnvInteracts``. 200 | 201 | value (strings): Pick what columns from data to graph on the y-axis. 202 | Submitting multiple values will produce multiple graphs. Defaults 203 | to ``Performance``, which is not an actual output of any algorithm. 204 | Instead, ``Performance`` refers to either ``AverageEpRet``, the 205 | correct performance measure for the on-policy algorithms, or 206 | ``AverageTestEpRet``, the correct performance measure for the 207 | off-policy algorithms. The plotter will automatically figure out 208 | which of ``AverageEpRet`` or ``AverageTestEpRet`` to report for 209 | each separate logdir. 210 | 211 | count: Optional flag. By default, the plotter shows y-values which 212 | are averaged across all results that share an ``exp_name``, 213 | which is typically a set of identical experiments that only vary 214 | in random seed. But if you'd like to see all of those curves 215 | separately, use the ``--count`` flag. 216 | 217 | smooth (int): Smooth data by averaging it over a fixed window. This 218 | parameter says how wide the averaging window will be. 219 | 220 | select (strings): Optional selection rule: the plotter will only show 221 | curves from logdirs that contain all of these substrings. 222 | 223 | exclude (strings): Optional exclusion rule: plotter will only show 224 | curves from logdirs that do not contain these substrings. 225 | 226 | """ 227 | 228 | make_plots(args.logdir, args.legend, args.xaxis, args.value, args.count, 229 | smooth=args.smooth, select=args.select, exclude=args.exclude, 230 | estimator=args.est) 231 | 232 | if __name__ == "__main__": 233 | main() -------------------------------------------------------------------------------- /spinningup/spinup/utils/run_entrypoint.py: -------------------------------------------------------------------------------- 1 | import zlib 2 | import pickle 3 | import base64 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('encoded_thunk') 9 | args = parser.parse_args() 10 | thunk = pickle.loads(zlib.decompress(base64.b64decode(args.encoded_thunk))) 11 | thunk() -------------------------------------------------------------------------------- /spinningup/spinup/utils/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convert_json(obj): 4 | """ Convert obj to a version which can be serialized with JSON. 
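    Dicts, lists and tuples are converted element-wise; objects exposing a
    ``__name__`` (e.g. functions) collapse to that name; objects with a
    ``__dict__`` become {str(obj): converted attribute dict}; anything else
    falls back to str(obj).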
""" 5 | if is_json_serializable(obj): 6 | return obj 7 | else: 8 | if isinstance(obj, dict): 9 | return {convert_json(k): convert_json(v) 10 | for k,v in obj.items()} 11 | 12 | elif isinstance(obj, tuple): 13 | return (convert_json(x) for x in obj) 14 | 15 | elif isinstance(obj, list): 16 | return [convert_json(x) for x in obj] 17 | 18 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 19 | return convert_json(obj.__name__) 20 | 21 | elif hasattr(obj,'__dict__') and obj.__dict__: 22 | obj_dict = {convert_json(k): convert_json(v) 23 | for k,v in obj.__dict__.items()} 24 | return {str(obj): obj_dict} 25 | 26 | return str(obj) 27 | 28 | def is_json_serializable(v): 29 | try: 30 | json.dumps(v) 31 | return True 32 | except: 33 | return False -------------------------------------------------------------------------------- /spinningup/spinup/utils/test_policy.py: -------------------------------------------------------------------------------- 1 | import time 2 | import joblib 3 | import os 4 | import os.path as osp 5 | import tensorflow as tf 6 | import torch 7 | from spinup import EpochLogger 8 | from spinup.utils.logx import restore_tf_graph 9 | 10 | 11 | def load_policy_and_env(fpath, itr='last', deterministic=False): 12 | """ 13 | Load a policy from save, whether it's TF or PyTorch, along with RL env. 14 | 15 | Not exceptionally future-proof, but it will suffice for basic uses of the 16 | Spinning Up implementations. 17 | 18 | Checks to see if there's a tf1_save folder. If yes, assumes the model 19 | is tensorflow and loads it that way. Otherwise, loads as if there's a 20 | PyTorch save. 21 | """ 22 | 23 | # determine if tf save or pytorch save 24 | if any(['tf1_save' in x for x in os.listdir(fpath)]): 25 | backend = 'tf1' 26 | else: 27 | backend = 'pytorch' 28 | 29 | # handle which epoch to load from 30 | if itr=='last': 31 | # check filenames for epoch (AKA iteration) numbers, find maximum value 32 | 33 | if backend == 'tf1': 34 | saves = [int(x[8:]) for x in os.listdir(fpath) if 'tf1_save' in x and len(x)>8] 35 | 36 | elif backend == 'pytorch': 37 | pytsave_path = osp.join(fpath, 'pyt_save') 38 | # Each file in this folder has naming convention 'modelXX.pt', where 39 | # 'XX' is either an integer or empty string. Empty string case 40 | # corresponds to len(x)==8, hence that case is excluded. 41 | saves = [int(x.split('.')[0][5:]) for x in os.listdir(pytsave_path) if len(x)>8 and 'model' in x] 42 | 43 | itr = '%d'%max(saves) if len(saves) > 0 else '' 44 | 45 | else: 46 | assert isinstance(itr, int), \ 47 | "Bad value provided for itr (needs to be int or 'last')." 48 | itr = '%d'%itr 49 | 50 | # load the get_action function 51 | if backend == 'tf1': 52 | get_action = load_tf_policy(fpath, itr, deterministic) 53 | else: 54 | get_action = load_pytorch_policy(fpath, itr, deterministic) 55 | 56 | # try to load environment from save 57 | # (sometimes this will fail because the environment could not be pickled) 58 | try: 59 | state = joblib.load(osp.join(fpath, 'vars'+itr+'.pkl')) 60 | env = state['env'] 61 | except: 62 | env = None 63 | 64 | return env, get_action 65 | 66 | 67 | def load_tf_policy(fpath, itr, deterministic=False): 68 | """ Load a tensorflow policy saved with Spinning Up Logger.""" 69 | 70 | fname = osp.join(fpath, 'tf1_save'+itr) 71 | print('\n\nLoading from %s.\n\n'%fname) 72 | 73 | # load the things! 
74 | sess = tf.Session() 75 | model = restore_tf_graph(sess, fname) 76 | 77 | # get the correct op for executing actions 78 | if deterministic and 'mu' in model.keys(): 79 | # 'deterministic' is only a valid option for SAC policies 80 | print('Using deterministic action op.') 81 | action_op = model['mu'] 82 | else: 83 | print('Using default action op.') 84 | action_op = model['pi'] 85 | 86 | # make function for producing an action given a single state 87 | get_action = lambda x : sess.run(action_op, feed_dict={model['x']: x[None,:]})[0] 88 | 89 | return get_action 90 | 91 | 92 | def load_pytorch_policy(fpath, itr, deterministic=False): 93 | """ Load a pytorch policy saved with Spinning Up Logger.""" 94 | 95 | fname = osp.join(fpath, 'pyt_save', 'model'+itr+'.pt') 96 | print('\n\nLoading from %s.\n\n'%fname) 97 | 98 | model = torch.load(fname) 99 | 100 | # make function for producing an action given a single state 101 | def get_action(x): 102 | with torch.no_grad(): 103 | x = torch.as_tensor(x, dtype=torch.float32) 104 | action = model.act(x) 105 | return action 106 | 107 | return get_action 108 | 109 | 110 | def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True): 111 | 112 | assert env is not None, \ 113 | "Environment not found!\n\n It looks like the environment wasn't saved, " + \ 114 | "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \ 115 | "page on Experiment Outputs for how to handle this situation." 116 | 117 | logger = EpochLogger() 118 | o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 119 | while n < num_episodes: 120 | if render: 121 | env.render() 122 | time.sleep(1e-3) 123 | 124 | a = get_action(o) 125 | o, r, d, _ = env.step(a) 126 | ep_ret += r 127 | ep_len += 1 128 | 129 | if d or (ep_len == max_ep_len): 130 | logger.store(EpRet=ep_ret, EpLen=ep_len) 131 | print('Episode %d \t EpRet %.3f \t EpLen %d'%(n, ep_ret, ep_len)) 132 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 133 | n += 1 134 | 135 | logger.log_tabular('EpRet', with_min_and_max=True) 136 | logger.log_tabular('EpLen', average_only=True) 137 | logger.dump_tabular() 138 | 139 | 140 | if __name__ == '__main__': 141 | import argparse 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('fpath', type=str) 144 | parser.add_argument('--len', '-l', type=int, default=0) 145 | parser.add_argument('--episodes', '-n', type=int, default=100) 146 | parser.add_argument('--norender', '-nr', action='store_true') 147 | parser.add_argument('--itr', '-i', type=int, default=-1) 148 | parser.add_argument('--deterministic', '-d', action='store_true') 149 | args = parser.parse_args() 150 | env, get_action = load_policy_and_env(args.fpath, 151 | args.itr if args.itr >=0 else 'last', 152 | args.deterministic) 153 | run_policy(env, get_action, args.len, args.episodes, not(args.norender)) -------------------------------------------------------------------------------- /spinningup/spinup/version.py: -------------------------------------------------------------------------------- 1 | version_info = (0, 2, 0) 2 | # format: 3 | # ('spinup_major', 'spinup_minor', 'spinup_patch') 4 | 5 | def get_version(): 6 | "Returns the version as a human-format string." 
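    # e.g. version_info = (0, 2, 0) -> '0.2.0'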
7 | return '%d.%d.%d' % version_info 8 | 9 | __version__ = get_version() -------------------------------------------------------------------------------- /spinningup/travis_setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | mkdir -p $HOME/.mujoco 6 | 7 | # Avoid using pyenv in travis, since it adds ~7 minutes to turnaround time 8 | if [ "$TRAVIS_OS_NAME" == "osx" ] 9 | then 10 | # https://github.com/travis-ci/travis-ci/issues/9640 11 | sudo softwareupdate --install "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.4" 12 | brew update 13 | brew install open-mpi 14 | brew install gcc 15 | brew link --overwrite gcc 16 | curl $MUJOCO_FOR_OSX | tar xz -C $HOME/.mujoco/ 17 | elif [ "$TRAVIS_OS_NAME" == "linux" ] 18 | then 19 | # Because this is flaky, try several times 20 | set +e 21 | COUNT=0 22 | while [ $COUNT -lt 5 ]; do 23 | sudo curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf 24 | if [ $? -eq 0 ];then 25 | break 26 | fi 27 | let COUNT=COUNT+1 28 | done 29 | if [ $COUNT -ge 5 ]; then 30 | echo "Failed to download patchelf" 31 | exit 1 32 | fi 33 | set -e 34 | 35 | sudo chmod +x /usr/local/bin/patchelf 36 | curl $MUJOCO_FOR_LINUX | tar xz -C $HOME/.mujoco/ 37 | 38 | sudo apt-get update 39 | sudo apt-get install -y openmpi-bin libopenmpi-dev libosmesa6-dev libglew-dev 40 | fi 41 | --------------------------------------------------------------------------------