├── .gitignore ├── LICENSE ├── README.md ├── RL_algorithms ├── Torch │ ├── .DS_Store │ ├── DDPG │ │ └── DDPG_ENV │ │ │ ├── core.py │ │ │ ├── ddpg.py │ │ │ ├── logger │ │ │ ├── events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0 │ │ │ ├── events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0 │ │ │ ├── events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0 │ │ │ ├── events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0 │ │ │ ├── events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0 │ │ │ ├── events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0 │ │ │ ├── events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0 │ │ │ └── events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0 │ │ │ ├── memory.py │ │ │ └── training_log_csv │ │ │ ├── Avg Reward (1).svg │ │ │ └── run-.-tag-Avg Reward (1).csv │ ├── PPO │ │ ├── Continious │ │ │ ├── PPO │ │ │ │ ├── __init__.py │ │ │ │ ├── actor.py │ │ │ │ ├── agent.py │ │ │ │ ├── critic.py │ │ │ │ ├── main.py │ │ │ │ ├── memory.py │ │ │ │ ├── models │ │ │ │ │ ├── space_robot_actor.pt │ │ │ │ │ └── space_robot_critic.pt │ │ │ │ ├── plots │ │ │ │ │ └── space_robot_performance.png │ │ │ │ └── utils.py │ │ │ ├── PPO_Two_heads │ │ │ │ ├── __init__.py │ │ │ │ ├── actor.py │ │ │ │ ├── agent.py │ │ │ │ ├── critic.py │ │ │ │ ├── main.py │ │ │ │ ├── memory.py │ │ │ │ ├── models │ │ │ │ │ ├── space_robot_actor.pt │ │ │ │ │ └── space_robot_critic.pt │ │ │ │ ├── plots │ │ │ │ │ └── space_robot_performance.png │ │ │ │ └── utils.py │ │ │ └── __init__.py │ │ ├── Discrete │ │ │ ├── PPO │ │ │ │ ├── __init__.py │ │ │ │ ├── actor.py │ │ │ │ ├── agent.py │ │ │ │ ├── critic.py │ │ │ │ ├── main.py │ │ │ │ ├── memory.py │ │ │ │ ├── models │ │ │ │ │ ├── space_robot_actor.pt │ │ │ │ │ └── space_robot_critic.pt │ │ │ │ ├── plots │ │ │ │ │ └── space_robot_performance.png │ │ │ │ ├── training_log │ │ │ │ └── utils.py │ │ │ └── PPOImage │ │ │ │ ├── __init__.py │ │ │ │ ├── actor.py │ │ │ │ ├── agent.py │ │ │ │ ├── critic.py │ │ │ │ ├── main.py │ │ │ │ ├── memory.py │ │ │ │ ├── models │ │ │ │ ├── .space_robot_actor.pt.icloud │ │ │ │ └── .space_robot_critic.pt.icloud │ │ │ │ ├── plots │ │ │ │ └── space_robot_performance.png │ │ │ │ └── utils.py │ │ └── __init__.py │ ├── SAC │ │ ├── SAC_ENV │ │ │ ├── core.py │ │ │ ├── logger │ │ │ │ ├── events.out.tfevents.1658847118.Tosins-Air.19214.0 │ │ │ │ ├── events.out.tfevents.1658847140.Tosins-Air.19431.0 │ │ │ │ ├── events.out.tfevents.1658847454.Tosins-Air.19535.0 │ │ │ │ ├── events.out.tfevents.1658847513.Tosins-Air.19931.0 │ │ │ │ ├── events.out.tfevents.1658847612.Tosins-Air.19979.0 │ │ │ │ ├── events.out.tfevents.1658847918.Tosins-Air.20089.0 │ │ │ │ ├── events.out.tfevents.1658848049.Tosins-Air.20232.0 │ │ │ │ ├── events.out.tfevents.1658848339.Tosins-Air.20384.0 │ │ │ │ ├── events.out.tfevents.1658848364.Tosins-Air.20423.0 │ │ │ │ ├── events.out.tfevents.1658848673.Tosins-Air.20649.0 │ │ │ │ ├── events.out.tfevents.1658848831.Tosins-Air.20793.0 │ │ │ │ ├── events.out.tfevents.1658849191.Tosins-Air.20924.0 │ │ │ │ ├── events.out.tfevents.1658849218.Tosins-Air.20984.0 │ │ │ │ ├── events.out.tfevents.1658849777.Tosins-Air.21229.0 │ │ │ │ ├── events.out.tfevents.1658849785.Tosins-Air.21269.0 │ │ │ │ ├── events.out.tfevents.1658849885.Tosins-Air.21429.0 │ │ │ │ ├── events.out.tfevents.1658849941.Tosins-Air.21521.0 │ │ │ │ └── events.out.tfevents.1658850278.Tosins-Air.21678.0 │ │ │ ├── memory.py │ │ │ ├── sac.py │ │ │ └── training_log_csv │ │ │ │ ├── run-.-tag-Avg Reward.csv │ │ │ 
│ └── run-.-tag-Loss_Pi.csv │ │ └── __init__.py │ └── __init__.py ├── __init__.py └── utils │ └── mpi_tools.py ├── Simulation.jpg ├── SpaceRobotEnv ├── .DS_Store ├── __init__.py ├── assets │ ├── .DS_Store │ ├── common │ │ ├── __init__.py │ │ ├── materials.xml │ │ ├── skybox.xml │ │ └── visual.xml │ └── spacerobot │ │ ├── arm_v3.xml │ │ ├── arm_v31.xml │ │ ├── asset.xml │ │ ├── sensor.xml │ │ ├── spacerobot_cost.xml │ │ ├── spacerobot_dualarm.xml │ │ ├── spacerobot_image.xml │ │ ├── spacerobot_state.xml │ │ ├── stls │ │ ├── R10.stl │ │ ├── cube.stl │ │ ├── v_base.stl │ │ ├── v_forearm.stl │ │ ├── v_shoulder.stl │ │ ├── v_upperarm.stl │ │ ├── v_wrist1.stl │ │ ├── v_wrist2.stl │ │ └── v_wrist3.stl │ │ └── subgoal.xml ├── envs │ ├── SpaceRobotCost.py │ ├── SpaceRobotDualArm.py │ ├── SpaceRobotImage.py │ ├── SpaceRobotPointCloud.py │ ├── SpaceRobotReorientation.py │ ├── SpaceRobotState.py │ └── __init__.py └── images │ ├── Simulation.jpg │ ├── ccc.png │ ├── iros.gif │ ├── ral.gif │ └── robot.png ├── requirements.txt ├── setup.py └── test_env.py /.gitignore: -------------------------------------------------------------------------------- 1 | 1/ 2 | 2/ 3 | 3/ 4 | SpaceRobotEnv.egg-info/ 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.pyc 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 
106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 165 | .idea/ 166 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SpaceRobotEnv 2 | 3 | > Note: our repo can be found in the OpenAI Gym Documentation now. Please see [SpaceRobotEnv](https://www.gymlibrary.dev/environments/third_party_environments/#spacerobotenv). 4 | 5 | **SpaceRobotEnv** is an open-sourced environments for trajectory planning of free-floating space robots. 
6 | Different from a traditional robot, the free-floating space robot is a dynamically coupled system because of its non-actuated base, as shown in the figure below. 7 | Therefore, model-based trajectory planning methods encounter many difficulties in modeling and computation. 8 | 9 | 10 | Accordingly, research focuses on how to utilize model-free methods, such as reinforcement learning algorithms, to obtain the trajectory directly. 11 | However, reaching high planning accuracy, bimanual coordination and end-to-end control remains an open challenge for space robotics researchers. 12 | To better help the community study this problem, SpaceRobotEnv is developed with the following key features: 13 | * **Real Space Environment**: we construct environments that resemble the space setting, with the free-floating space robot placed in a low-gravity condition. 14 | * **Dynamic coupling control**: compared with robots on the ground, the joint torques have a significant impact on the posture of the base. The movement of the base disturbs the positions of the end-effectors, leading to a more complex trajectory planning task. 15 | * **Image input**: we provide the ability to use images as observations, and we also demonstrate that our environment is effective; please see [our paper](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9550509). 16 | 17 | - **Quick Demos** 18 | 19 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9636681) 20 |
21 | 22 |
23 | 24 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9636681) 25 |
26 | 27 |
28 | 29 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9550509) 30 |
31 | 32 |
33 | 34 | The environments of this repo are as follows: 35 | * **SpaceRobotState-v0** 36 | * The state vector contains the angular positions and velocities of the joints, the positions and velocities of the end-effectors, and the positions of the goals. The core goal is to make the end-effector reach a goal randomly selected within a large space. 37 | * **SpaceRobotCost-v0** 38 | * The task is to make the end-effector reach a random goal while avoiding obvious movement of the base, especially of its orientation, because rotation of the base would interrupt communication with the Earth. 39 | * **SpaceRobotImage-v0** 40 | * The state vector only contains image information. The core goal is the same as that of the `SpaceRobotState-v0` environment. 41 | * **SpaceRobotDualArm-v0** 42 | * The free-floating space robot has two robotic arms attached to the base, so the two end-effectors correspond to two goal positions. 43 | The task is finished when both end-effectors reach their goals. 44 | * **SpaceRobotReorientation-v0** 45 | * The free-floating space robot has two robotic arms attached to the base. The initial orientation of the base is sampled randomly in each episode. 46 | The task is finished when the two arms bring the base to the target orientation. 47 | * **SpaceRobotPointCloud-v0** 48 | * The state vector contains point cloud information. The core goal is the same as that of the `SpaceRobotState-v0` environment. 49 | 50 | ## Installation 51 | 52 | Our environments are built on the [MuJoCo simulator](https://github.com/deepmind/mujoco), so before using this repo, please make sure you install the [MuJoCo](https://github.com/deepmind/mujoco) platform. 53 | Additionally, our framework is based on [Gym](https://github.com/openai/gym). 54 | Details regarding the installation of Gym can be found [here](https://github.com/openai/gym). 55 | 56 | After you finish installing MuJoCo and Gym and test some toy examples with them, you can install this repo from source: 57 | 58 | ```bash 59 | pip install -e . 60 | ``` 61 | 62 | ## Quick Start 63 | 64 | We provide a Gym-like API for interacting with the environments; `test_env.py` shows a toy example that verifies them. 65 | This Gym-like API also makes it easy to apply popular RL libraries, such as [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3), to our environments. 66 | ```python 67 | import gym 68 | 69 | import SpaceRobotEnv 70 | import numpy as np 71 | 72 | env = gym.make("SpaceRobotState-v0") 73 | 74 | dim_u = env.action_space.shape[0] 75 | print(dim_u) 76 | dim_o = env.observation_space["observation"].shape[0] 77 | print(dim_o) 78 | 79 | 80 | observation = env.reset() 81 | max_action = env.action_space.high 82 | print("max_action:", max_action) 83 | print("min_action:", env.action_space.low) 84 | for e_step in range(20): 85 | observation = env.reset() 86 | for i_step in range(50): 87 | env.render() 88 | action = np.random.uniform(low=-1.0, high=1.0, size=(dim_u,)) 89 | observation, reward, done, info = env.step(max_action * action) 90 | 91 | env.close() 92 | ``` 93 | 94 | ## Introduction to the free-floating space robot 95 | 96 | The free-floating space robot consists of two parts, a robotic arm and a base satellite. The robot arm is rigidly connected to the base, and the whole space robot remains in a low-gravity condition.
97 | The 6-DoF UR5 model is chosen as the robot arm, and for simplicity the base is modeled as a cubic structure. The specific structure is shown below. 98 | 99 |
100 | 101 |
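As noted in the Quick Start above, the Gym-like API means off-the-shelf RL libraries can be pointed at these environments with little glue code. The snippet below is a minimal, untested sketch of how that might look with [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3); it assumes an SB3 release that still accepts classic Gym environments, and the algorithm choice (SAC), the `MultiInputPolicy` for the dict observation, and all hyperparameters are illustrative placeholders rather than settings shipped with this repo.

```python
import gym

import SpaceRobotEnv  # importing the package registers the SpaceRobot environments with Gym
from stable_baselines3 import SAC

env = gym.make("SpaceRobotState-v0")

# The observation space is a Dict (observation / goal entries), so we use
# SB3's MultiInputPolicy, which builds a feature extractor per dict key.
model = SAC("MultiInputPolicy", env, verbose=1, buffer_size=100_000)
model.learn(total_timesteps=50_000)

# Roll out the trained policy for a few steps.
obs = env.reset()
for _ in range(200):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
env.close()
```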
102 | 103 | 104 | ## Future plan 105 | 106 | 107 | ### Tasks under development: 108 | - [x] Point cloud inputs 109 | - [ ] Add new torque controllers, such as an impedance controller 110 | - [ ] Build new environments 111 | 112 | ### Algorithms: 113 | - [x] PPO 114 | - [ ] TRPO 115 | - [x] DDPG 116 | - [ ] TD3 117 | - [x] SAC 118 | - [ ] HER 119 | - [ ] [HDO](https://ieeexplore.ieee.org/abstract/document/9718193) 120 | 121 | ## Citing SpaceRobotEnv 122 | 123 | If you find SpaceRobotEnv useful, please cite our recent work in your publications. 124 | 125 | ``` 126 | @article{wang2022collision, 127 | title={Collision-Free Trajectory Planning for a 6-DoF Free-Floating Space Robot via Hierarchical Decoupling Optimization}, 128 | author={Wang, Shengjie and Cao, Yuxue and Zheng, Xiang and Zhang, Tao}, 129 | journal={IEEE Robotics and Automation Letters}, 130 | volume={7}, 131 | number={2}, 132 | pages={4953--4960}, 133 | year={2022}, 134 | publisher={IEEE} 135 | } 136 | 137 | @inproceedings{wang2021multi, 138 | title={A Multi-Target Trajectory Planning of a 6-DoF Free-Floating Space Robot via Reinforcement Learning}, 139 | author={Wang, Shengjie and Zheng, Xiang and Cao, Yuxue and Zhang, Tao}, 140 | booktitle={2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, 141 | pages={3724--3730}, 142 | year={2021}, organization={IEEE} 143 | } 144 | 145 | @inproceedings{wang2021end, 146 | title={An End-to-End Trajectory Planning Strategy for Free-floating Space Robots}, 147 | author={Wang, Shengjie and Cao, Yuxue and Zheng, Xiang and Zhang, Tao}, 148 | booktitle={2021 40th Chinese Control Conference (CCC)}, 149 | pages={4236--4241}, 150 | year={2021}, 151 | organization={IEEE} 152 | } 153 | 154 | @article{cao2022reinforcement, 155 | title={Reinforcement Learning with Prior Policy Guidance for Motion Planning of Dual-Arm Free-Floating Space Robot}, 156 | author={Cao, Yuxue and Wang, Shengjie and Zheng, Xiang and Ma, Wenke and Xie, Xinru and Liu, Lei}, 157 | journal={arXiv preprint arXiv:2209.01434}, 158 | year={2022} 159 | } 160 | 161 | ``` 162 | 163 | ## The Team 164 | 165 | SpaceRobotEnv is a project maintained by 166 | [Shengjie Wang](https://github.com/Shengjie-bob), [Xiang Zheng](https://github.com/x-zheng16), [Yuxue Cao](https://github.com/ShenGe123000), [Fengbo Lan](https://github.com/lanrobot) at Tsinghua University. Many thanks to [Tosin](https://github.com/tohsin) for his great contributions. 167 | 168 | 169 | ## License 170 | 171 | SpaceRobotEnv is released under the Apache 2.0 license, as found in the [LICENSE](LICENSE) file.
172 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/.DS_Store -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/ddpg.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import torch 4 | from torch.optim import Adam 5 | import gym 6 | import time 7 | import SpaceRobotEnv 8 | import core 9 | from torch.utils.tensorboard import SummaryWriter 10 | # run tensor board tensorboard --logdir = /Users/emma/dev/SpaceRobotEnv/RL_algorithms/Torch/DDPG/DDPG_ENV/logger 11 | #tensorboard --logdir=/Users/emma/dev/SpaceRobotEnv/RL_algorithms/Torch/DDPG/DDPG_ENV/logger 12 | class ReplayBuffer: 13 | """ 14 | A simple FIFO experience replay buffer for DDPG agents. 
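    Transitions (obs, act, rew, next_obs, done) are kept in pre-allocated numpy arrays; once the buffer is full the oldest entries are overwritten, and `sample_batch` draws a uniform random minibatch (with replacement) from whatever has been stored so far.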
15 | """ 16 | 17 | def __init__(self, obs_dim, act_dim, size): 18 | self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 19 | self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 20 | self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32) 21 | self.rew_buf = np.zeros(size, dtype=np.float32) 22 | self.done_buf = np.zeros(size, dtype=np.float32) 23 | self.ptr, self.size, self.max_size = 0, 0, size 24 | 25 | def store(self, obs, act, rew, next_obs, done): 26 | self.obs_buf[self.ptr] = obs 27 | self.obs2_buf[self.ptr] = next_obs 28 | self.act_buf[self.ptr] = act 29 | self.rew_buf[self.ptr] = rew 30 | self.done_buf[self.ptr] = done 31 | self.ptr = (self.ptr+1) % self.max_size 32 | self.size = min(self.size+1, self.max_size) 33 | 34 | def sample_batch(self, batch_size=32): 35 | idxs = np.random.randint(0, self.size, size=batch_size) 36 | batch = dict(obs=self.obs_buf[idxs], 37 | obs2=self.obs2_buf[idxs], 38 | act=self.act_buf[idxs], 39 | rew=self.rew_buf[idxs], 40 | done=self.done_buf[idxs]) 41 | return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()} 42 | 43 | 44 | 45 | def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 46 | steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 47 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 48 | update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 49 | max_ep_len=1000, save_freq=1): 50 | """ 51 | Deep Deterministic Policy Gradient (DDPG) 52 | 53 | 54 | Args: 55 | env_fn : A function which creates a copy of the environment. 56 | The environment must satisfy the OpenAI Gym API. 57 | 58 | actor_critic: The constructor method for a PyTorch Module with an ``act`` 59 | method, a ``pi`` module, and a ``q`` module. The ``act`` method and 60 | ``pi`` module should accept batches of observations as inputs, 61 | and ``q`` should accept a batch of observations and a batch of 62 | actions as inputs. When called, these should return: 63 | 64 | =========== ================ ====================================== 65 | Call Output Shape Description 66 | =========== ================ ====================================== 67 | ``act`` (batch, act_dim) | Numpy array of actions for each 68 | | observation. 69 | ``pi`` (batch, act_dim) | Tensor containing actions from policy 70 | | given observations. 71 | ``q`` (batch,) | Tensor containing the current estimate 72 | | of Q* for the provided observations 73 | | and actions. (Critical: make sure to 74 | | flatten this!) 75 | =========== ================ ====================================== 76 | 77 | ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 78 | you provided to DDPG. 79 | 80 | seed (int): Seed for random number generators. 81 | 82 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 83 | for the agent and the environment in each epoch. 84 | 85 | epochs (int): Number of epochs to run and train agent. 86 | 87 | replay_size (int): Maximum length of replay buffer. 88 | 89 | gamma (float): Discount factor. (Always between 0 and 1.) 90 | 91 | polyak (float): Interpolation factor in polyak averaging for target 92 | networks. Target networks are updated towards main networks 93 | according to: 94 | 95 | .. math:: \\theta_{\\text{targ}} \\leftarrow 96 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 97 | 98 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 99 | close to 1.) 
100 | 101 | pi_lr (float): Learning rate for policy. 102 | 103 | q_lr (float): Learning rate for Q-networks. 104 | 105 | batch_size (int): Minibatch size for SGD. 106 | 107 | start_steps (int): Number of steps for uniform-random action selection, 108 | before running real policy. Helps exploration. 109 | 110 | update_after (int): Number of env interactions to collect before 111 | starting to do gradient descent updates. Ensures replay buffer 112 | is full enough for useful updates. 113 | 114 | update_every (int): Number of env interactions that should elapse 115 | between gradient descent updates. Note: Regardless of how long 116 | you wait between updates, the ratio of env steps to gradient steps 117 | is locked to 1. 118 | 119 | act_noise (float): Stddev for Gaussian exploration noise added to 120 | policy at training time. (At test time, no noise is added.) 121 | 122 | num_test_episodes (int): Number of episodes to test the deterministic 123 | policy at the end of each epoch. 124 | 125 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 126 | 127 | logger_kwargs (dict): Keyword args for EpochLogger. 128 | 129 | save_freq (int): How often (in terms of gap between epochs) to save 130 | the current policy and value function. 131 | 132 | """ 133 | 134 | # logger = EpochLogger(**logger_kwargs) 135 | # logger.save_config(locals()) 136 | n_update_step = 0 137 | n_test_step = 0 138 | n_played_games = 0 139 | score_history = [] 140 | torch.manual_seed(seed) 141 | np.random.seed(seed) 142 | 143 | env, test_env = env_fn(), env_fn() 144 | obs_dim = env.observation_space['observation'].shape[0] 145 | act_dim = env.action_space.shape[0] 146 | 147 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 148 | act_limit = env.action_space.high[0] 149 | 150 | # Create actor-critic module and target networks 151 | ac = actor_critic(env.observation_space['observation'], env.action_space, **ac_kwargs) 152 | ac_targ = deepcopy(ac) 153 | 154 | # Freeze target networks with respect to optimizers (only update via polyak averaging) 155 | for p in ac_targ.parameters(): 156 | p.requires_grad = False 157 | 158 | # Experience buffer 159 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 160 | 161 | # Count variables (protip: try to get a feel for how different size networks behave!) 
162 | var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) 163 | # logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts) 164 | 165 | # Set up function for computing DDPG Q-loss 166 | def compute_loss_q(data): 167 | o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] 168 | 169 | q = ac.q(o,a) 170 | 171 | # Bellman backup for Q function 172 | with torch.no_grad(): 173 | q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) 174 | backup = r + gamma * (1 - d) * q_pi_targ 175 | 176 | # MSE loss against Bellman backup 177 | loss_q = ((q - backup)**2).mean() 178 | 179 | # Useful info for logging 180 | loss_info = dict(QVals=q.detach().numpy()) 181 | 182 | return loss_q, loss_info 183 | 184 | # Set up function for computing DDPG pi loss 185 | def compute_loss_pi(data): 186 | o = data['obs'] 187 | q_pi = ac.q(o, ac.pi(o)) 188 | return -q_pi.mean() 189 | 190 | # Set up optimizers for policy and q-function 191 | pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) 192 | q_optimizer = Adam(ac.q.parameters(), lr=q_lr) 193 | 194 | # Set up model saving 195 | # logger.setup_pytorch_saver(ac) 196 | 197 | def update(data): 198 | # First run one gradient descent step for Q. 199 | q_optimizer.zero_grad() 200 | loss_q, loss_info = compute_loss_q(data) 201 | loss_q.backward() 202 | q_optimizer.step() 203 | writer.add_scalar("Loss_Q", loss_q.item(), n_update_step ) 204 | 205 | # Freeze Q-network so you don't waste computational effort 206 | # computing gradients for it during the policy learning step. 207 | for p in ac.q.parameters(): 208 | p.requires_grad = False 209 | 210 | # Next run one gradient descent step for pi. 211 | pi_optimizer.zero_grad() 212 | loss_pi = compute_loss_pi(data) 213 | loss_pi.backward() 214 | pi_optimizer.step() 215 | writer.add_scalar("Loss_Pi", loss_pi.item(), n_update_step) 216 | 217 | # Unfreeze Q-network so you can optimize it at next DDPG step. 218 | for p in ac.q.parameters(): 219 | p.requires_grad = True 220 | 221 | 222 | # Record things 223 | # logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) 224 | 225 | # Finally, update target networks by polyak averaging. 226 | with torch.no_grad(): 227 | for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): 228 | # NB: We use an in-place operations "mul_", "add_" to update target 229 | # params, as opposed to "mul" and "add", which would make new tensors. 
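                # In other words: p_targ <- polyak * p_targ + (1 - polyak) * p (the soft/Polyak target update).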
230 | p_targ.data.mul_(polyak) 231 | p_targ.data.add_((1 - polyak) * p.data) 232 | 233 | def get_action(o, noise_scale): 234 | a = ac.act(torch.as_tensor(o, dtype=torch.float32)) 235 | a += noise_scale * np.random.randn(act_dim) 236 | return np.clip(a, -act_limit, act_limit) 237 | 238 | def test_agent(): 239 | avg_score_test = [] 240 | for j in range(num_test_episodes): 241 | o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 242 | o = o['observation'] 243 | while not(d or (ep_len == max_ep_len)): 244 | # Take deterministic actions at test time (noise_2scale=0) 245 | o, r, d, _ = test_env.step(get_action(o, 0)) 246 | o = o['observation'] 247 | ep_ret += r 248 | ep_len += 1 249 | avg_score_test.append(ep_ret) 250 | writer.add_scalar("Test_score avg", np.mean(avg_score_test), n_test_step) 251 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 252 | 253 | # Prepare for interaction with environment 254 | total_steps = steps_per_epoch * epochs 255 | start_time = time.time() 256 | o, ep_ret, ep_len = env.reset(), 0, 0 257 | o = o["observation"] 258 | 259 | # Main loop: collect experience in env and update/log each epoch 260 | for t in range(total_steps): 261 | 262 | # Until start_steps have elapsed, randomly sample actions 263 | # from a uniform distribution for better exploration. Afterwards, 264 | # use the learned policy (with some noise, via act_noise). 265 | if t > start_steps: 266 | a = get_action(o, act_noise) 267 | else: 268 | a = env.action_space.sample() 269 | 270 | # Step the env 271 | o2, r, d, _ = env.step(a) 272 | o2 = o2["observation"] 273 | ep_ret += r 274 | ep_len += 1 275 | 276 | # Ignore the "done" signal if it comes from hitting the time 277 | # horizon (that is, when it's an artificial terminal signal 278 | # that isn't based on the agent's state) 279 | d = False if ep_len==max_ep_len else d 280 | 281 | # Store experience to replay buffer 282 | replay_buffer.store(o, a, r, o2, d) 283 | 284 | # Super critical, easy to overlook step: make sure to update 285 | # most recent observation! 286 | o = o2 287 | 288 | # End of trajectory handling 289 | if d or (ep_len == max_ep_len): 290 | # logger.store(EpRet=ep_ret, EpLen=ep_len) 291 | n_played_games += 1 292 | score_history.append(ep_ret) 293 | avg_score = np.mean(score_history[-100:]) 294 | writer.add_scalar("Avg Reward", avg_score, n_played_games ) 295 | print( 'score %.1f' %ep_ret, 'avg_score %.1f' %avg_score,'num_games', n_played_games, ) 296 | 297 | o, ep_ret, ep_len = env.reset(), 0, 0 298 | o= o["observation"] 299 | 300 | # Update handling 301 | if t >= update_after and t % update_every == 0: 302 | for _ in range(update_every): 303 | n_update_step += 1 304 | batch = replay_buffer.sample_batch(batch_size) 305 | update(data=batch) 306 | 307 | # End of epoch handling 308 | if (t+1) % steps_per_epoch == 0: 309 | epoch = (t+1) // steps_per_epoch 310 | 311 | # Test the performance of the deterministic version of the agent. 
312 | n_test_step +=1 313 | test_agent() 314 | 315 | # Log info about epoch 316 | 317 | 318 | if __name__ == '__main__': 319 | import argparse 320 | parser = argparse.ArgumentParser() 321 | parser.add_argument('--env', type=str, default='SpaceRobotState-v0') 322 | parser.add_argument('--hid', type=int, default=256) 323 | parser.add_argument('--l', type=int, default=2) 324 | parser.add_argument('--gamma', type=float, default=0.99) 325 | parser.add_argument('--seed', '-s', type=int, default=0) 326 | parser.add_argument('--epochs', type=int, default=50) 327 | parser.add_argument('--exp_name', type=str, default='ddpg') 328 | args = parser.parse_args() 329 | 330 | writer = SummaryWriter("RL_algorithms/Torch/DDPG/DDPG_ENV/logger") 331 | writer.add_text( 332 | "hyperparameters", 333 | "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), 334 | ) 335 | ddpg(lambda : gym.make(args.env), actor_critic=core.MLPActorCritic, 336 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 337 | gamma=args.gamma, seed=args.seed, epochs=args.epochs) 338 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/memory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/memory.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/actor.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.distributions.normal import Normal 9 | PATH = os.getcwd() 10 | 11 | class ActorNetwork(nn.Module): 12 | 13 | def __init__(self, n_actions, input_dims, alpha, model_name : str, 14 | fc1_dims=256, fc2_dims=256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO/models/'): 15 | super(ActorNetwork, self).__init__() 16 | self.n_actions = n_actions 17 | 18 | log_std = -0.5 * np.ones(n_actions, dtype=np.float32) 19 | self.log_std = T.nn.Parameter(T.as_tensor(log_std)) 20 | 21 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 22 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name) 23 | 24 | self.actor = 
nn.Sequential( 25 | nn.Linear(*input_dims, fc1_dims), 26 | nn.ReLU(), 27 | nn.Linear(fc1_dims, fc2_dims), 28 | nn.ReLU(), 29 | nn.Linear(fc2_dims, n_actions), 30 | nn.Tanh() 31 | ) 32 | 33 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 34 | 35 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 36 | 37 | self.to(self.device) 38 | 39 | def forward(self, obs, act = None): 40 | pi = self._distribution(obs) 41 | logp_a = None 42 | if act is not None: 43 | logp_a = self._log_prob_from_distribution(pi, act) 44 | return pi, logp_a 45 | 46 | def _distribution(self, state): 47 | mu = self.actor(state) 48 | std = T.exp(self.log_std) 49 | return Normal(mu, std) 50 | 51 | def _log_prob_from_distribution(self, pi, act): 52 | return pi.log_prob(act).sum(axis=-1) 53 | 54 | def save_checkpoint(self): 55 | T.save(self.state_dict(), self.checkpoint_file) 56 | 57 | def load_checkpoint(self): 58 | self.load_state_dict(T.load(self.checkpoint_file)) 59 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/agent.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from multiprocessing.context import BaseContext 3 | import os 4 | import copy 5 | from tqdm import tqdm 6 | import numpy as np 7 | import torch as T 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | 11 | from actor import ActorNetwork 12 | from critic import CriticNetwork 13 | from memory import PPOBuffer 14 | 15 | 16 | PATH = os.getcwd() 17 | # MODEL_XML_PATH = os.path.join( 18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml" 19 | # ) 20 | 21 | class Agent: 22 | def __init__(self, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \ 23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \ 24 | policy_clip = 0.2, n_epoch = 10, batch_size = 64): 25 | ''' 26 | parameter 27 | arguments: 28 | - model_name_actor : model name for actor to be used in model savind directory 29 | - model_name_critic :model name for critic to be used in model savind directory 30 | ''' 31 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10 32 | self.gamma = gamma 33 | self.gae_lambda = gae_lambda 34 | self.policy_clip = policy_clip 35 | self.n_epoch = n_epoch 36 | 37 | self.actor = ActorNetwork(n_actions, input_dims, alpha, model_name = model_name_actor) 38 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic) 39 | self.memory_handler = PPOBuffer( batch_size ) 40 | 41 | def remember(self, state, action, probs, vals, reward, done): 42 | self.memory_handler.store_memory(state, action, probs, vals, reward, done) 43 | 44 | def save_models(self): 45 | print("Saving models now") 46 | self.actor.save_checkpoint() 47 | self.critic.save_checkpoint() 48 | 49 | def load_model(self): 50 | print("Load model") 51 | self.actor.load_checkpoint() 52 | self.critic.load_checkpoint() 53 | 54 | def play_optimal(self, observation): 55 | with T.no_grad(): 56 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 57 | dist = self.actor(state) 58 | # action shoulnt be sampe it should be arg max 59 | action = dist.sample() 60 | action =T.squeeze(action).item() 61 | return action 62 | 63 | def choose_action(self, observation): 64 | with T.no_grad(): 65 | observation = T.tensor([observation], dtype=T.float).to(self.actor.device) 66 | policy = self.actor._distribution(observation) 67 | 
action = policy.sample() 68 | logp_a = self.actor._log_prob_from_distribution(policy, action) 69 | value = self.critic(observation) 70 | 71 | return action.numpy(), logp_a.numpy(), value.numpy() 72 | 73 | def learn(self): 74 | for _ in range(self.n_epoch): 75 | 76 | state_arr, action_arr, old_prob_arr, vals_arr,\ 77 | reward_arr, dones_arr, batches = \ 78 | self.memory_handler.generate_batches() 79 | 80 | values = vals_arr.copy() 81 | advantage = np.zeros(len(reward_arr), dtype=np.float32) 82 | # calculate advantage = sigma_t + (gamma * lamda) * sigma_t+1 + (gamma * lamda) ^ 2 * sigma_t+2..... 83 | # sigma_t = reward_t + gamma * Value(s_ t+1 ) - Value(s_t) 84 | for t in range(len(reward_arr)-1): 85 | discount = 1 86 | a_t = 0 87 | for k in range(t, len(reward_arr)-1): 88 | 89 | a_t += discount * (reward_arr[k] + self.gamma*values[k+1]*\ 90 | (1-int(dones_arr[k])) - values[k]) 91 | 92 | # discount term gamma * gae_lamda (y*lamda) 93 | discount *= self.gamma * self.gae_lambda 94 | advantage[t] = a_t 95 | advantage = T.tensor(advantage).to(self.actor.device) 96 | 97 | values = T.tensor(values).to(self.actor.device) 98 | 99 | for batch in batches: 100 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device) 101 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device) 102 | 103 | actions = T.tensor(action_arr[batch]).to(self.actor.device) 104 | 105 | pi, new_probs = self.actor(states, actions) 106 | 107 | critic_value = self.critic(states) 108 | 109 | critic_value = T.squeeze(critic_value) 110 | 111 | # new_probs = dist.log_prob(actions) 112 | 113 | 114 | # prob_ratio = new_probs.exp() / old_probs.exp() 115 | prob_ratio = T.exp(new_probs - old_probs) 116 | weighted_probs = advantage[batch] * prob_ratio 117 | 118 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip, 119 | 1 + self.policy_clip) * advantage[batch] 120 | 121 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean() 122 | 123 | returns = advantage[batch] + values[batch] 124 | critic_loss = (returns-critic_value)**2 125 | critic_loss = critic_loss.mean() 126 | 127 | total_loss = actor_loss + 0.5* critic_loss 128 | self.actor.optimizer.zero_grad() 129 | self.critic.optimiser.zero_grad() 130 | # print("total loss", total_loss.item()) 131 | total_loss.backward() 132 | self.actor.optimizer.step() 133 | self.critic.optimiser.step() 134 | 135 | self.memory_handler.clear_memory() 136 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/critic.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\ 11 | fc2_dims = 256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO/models') -> None: 12 | super(CriticNetwork, self).__init__() 13 | 14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 15 | self.check_point_file = os.path.join(check_point_base_dir, model_name) 16 | self.critic = nn.Sequential( 17 | nn.Linear(*input_dims , fc1_dims), 18 | nn.ReLU(), 19 | nn.Linear(fc1_dims , fc2_dims), 20 | nn.ReLU(), 21 | nn.Linear(fc2_dims , 1), 22 | 23 | ) 24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha) 
25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 26 | self.to(self.device) 27 | 28 | def forward(self, state): 29 | value = self.critic(state) 30 | return value 31 | 32 | def save_checkpoint(self): 33 | T.save(self.state_dict(), self.check_point_file) 34 | 35 | def load_checkpoint(self): 36 | self.load_state_dict(T.load(self.check_point_file)) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/main.py: -------------------------------------------------------------------------------- 1 | from turtle import shape 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | 6 | from utils import plot_learning_curve 7 | import gym 8 | 9 | import SpaceRobotEnv 10 | import numpy as np 11 | 12 | 13 | 14 | if __name__ == '__main__': 15 | env = gym.make("SpaceRobotState-v0") 16 | N = 30 17 | batch_size = 16 18 | n_epochs = 3 19 | alpha = 0.0003 20 | action_space = env.action_space.shape[0] 21 | obs_shape = env.observation_space["observation"].shape 22 | 23 | 24 | 25 | agent = Agent( n_actions = action_space, 26 | batch_size=batch_size, 27 | alpha = alpha, 28 | n_epoch = n_epochs, 29 | input_dims = obs_shape, 30 | model_name_actor = "space_robot_actor.pt", 31 | model_name_critic = "space_robot_critic.pt") 32 | n_iter = 3000 33 | figure_file = 'RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png' 34 | best_score = env.reward_range[0] 35 | score_history = [] 36 | n_steps = 0 37 | learn_iters = 0 38 | avg_score = 0 39 | 40 | for i in range(n_iter): 41 | obs = env.reset() 42 | observation = obs["observation"] 43 | done = False 44 | score = 0 45 | while not done: 46 | action, prob, val = agent.choose_action(observation) 47 | v = prob 48 | # a = action 49 | a = action.reshape(6,) 50 | observation_, reward, done, info = env.step(a) 51 | n_steps+=1 52 | score += reward 53 | 54 | agent.remember(observation, action, prob, val, reward, done) 55 | #steps before we begin learning 20 56 | if n_steps % N ==0: 57 | agent.learn() 58 | learn_iters += 1 59 | observation = observation_["observation"] 60 | score_history.append(score) 61 | avg_score = np.mean(score_history[-100:]) 62 | 63 | if avg_score>best_score: 64 | best_score= avg_score 65 | agent.save_models() 66 | print('episode', i , 'score %.1f', 'avg_score %.1f' %avg_score, 67 | 'time_steps',n_steps, 'learning_steps', learn_iters) 68 | 69 | x = [i+1 for i in range(len(score_history))] 70 | plot_learning_curve(x, score_history,figure_file) 71 | env.close() 72 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/memory.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | 9 | 10 | class PPOBuffer: 11 | def __init__(self, batch_size): 12 | self.states = [] 13 | self.probs = [] 14 | self.vals = [] 15 | self.actions = [] 16 | self.rewards = [] 17 | self.dones = [] 18 | 19 | self.batch_size = batch_size 20 | 21 | def generate_batches(self): 22 | n_states = len(self.states) 23 | batch_start = np.arange(0, n_states, self.batch_size) 24 | indices = np.arange(n_states, dtype=np.int64) 25 | np.random.shuffle(indices) 26 | batches = [indices[i:i+self.batch_size] for i in batch_start] 27 | 28 | return np.array(self.states),\ 29 | np.array(self.actions),\ 30 | 
np.array(self.probs),\ 31 | np.array(self.vals),\ 32 | np.array(self.rewards),\ 33 | np.array(self.dones),\ 34 | batches 35 | 36 | def store_memory(self, state, action, probs, vals, reward, done): 37 | self.states.append(state) 38 | self.actions.append(action) 39 | self.probs.append(probs) 40 | self.vals.append(vals) 41 | self.rewards.append(reward) 42 | self.dones.append(done) 43 | 44 | def clear_memory(self): 45 | self.states = [] 46 | self.probs = [] 47 | self.actions = [] 48 | self.rewards = [] 49 | self.dones = [] 50 | self.vals = [] 51 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_actor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_actor.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_critic.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_critic.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/actor.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn.functional as F 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.distributions.normal import Normal 9 | PATH = os.getcwd() 10 | 11 | class ActorNetwork(nn.Module): 12 | 13 | def __init__(self, max_actions, n_actions, input_dims, alpha, model_name : str, 14 | fc1_dims=256, fc2_dims=256, check_point_base_dir = 
'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models'): 15 | super(ActorNetwork, self).__init__() 16 | self.n_actions = n_actions 17 | self.max_actions = max_actions 18 | 19 | 20 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 21 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name) 22 | self.base_model = nn.Sequential( 23 | nn.Linear(*input_dims, fc1_dims), 24 | nn.ReLU(), 25 | nn.Linear(fc1_dims, fc2_dims), 26 | nn.ReLU(), 27 | ) 28 | fc = [nn.Linear(fc2_dims, 2*n_actions)] 29 | self.fc = nn.Sequential(*fc) 30 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 31 | 32 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 33 | 34 | self.to(self.device) 35 | 36 | def forward(self, state): 37 | x = self.base_model(state) 38 | x = self.fc(x) 39 | mean, std = T.chunk(x, chunks=2, dim=-1) 40 | mean, std = self.max_actions * T.tanh(mean), F.softplus(std) 41 | return mean, std 42 | 43 | def get_logprob(self, state, action): 44 | mean, std = self.forward(state) 45 | dist = Normal(mean, std) 46 | log_prob = dist.log_prob(action).sum(axis=-1) 47 | return log_prob 48 | 49 | 50 | 51 | def save_checkpoint(self): 52 | T.save(self.state_dict(), self.checkpoint_file) 53 | 54 | def load_checkpoint(self): 55 | self.load_state_dict(T.load(self.checkpoint_file)) 56 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/agent.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from multiprocessing.context import BaseContext 3 | import os 4 | import copy 5 | from tqdm import tqdm 6 | import numpy as np 7 | import torch as T 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from torch.distributions import Normal 11 | 12 | from actor import ActorNetwork 13 | from critic import CriticNetwork 14 | from memory import PPOBuffer 15 | 16 | 17 | PATH = os.getcwd() 18 | # MODEL_XML_PATH = os.path.join( 19 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml" 20 | # ) 21 | 22 | class Agent: 23 | def __init__(self, env_max_action, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \ 24 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \ 25 | policy_clip = 0.2, n_epoch = 3, batch_size = 64): 26 | ''' 27 | parameter 28 | arguments: 29 | - model_name_actor : model name for actor to be used in model savind directory 30 | - model_name_critic :model name for critic to be used in model savind directory 31 | ''' 32 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10 33 | self.gamma = gamma 34 | self.gae_lambda = gae_lambda 35 | self.policy_clip = policy_clip 36 | self.n_epoch = n_epoch 37 | 38 | self.actor = ActorNetwork( env_max_action , n_actions, input_dims, alpha, model_name = model_name_actor) 39 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic) 40 | self.memory_handler = PPOBuffer( batch_size ) 41 | 42 | def remember(self, state, action, probs, vals, reward, done): 43 | self.memory_handler.store_memory(state, action, probs, vals, reward, done) 44 | 45 | def save_models(self): 46 | print("Saving models now") 47 | self.actor.save_checkpoint() 48 | self.critic.save_checkpoint() 49 | 50 | def load_model(self): 51 | print("Load model") 52 | self.actor.load_checkpoint() 53 | self.critic.load_checkpoint() 54 | 55 | 56 | def choose_action(self, state): 57 | # state = 
T.as_tensor(state, dtype=T.float, device=device) 58 | state = T.tensor([state], dtype=T.float).to(self.actor.device) 59 | 60 | mean, std = self.actor.forward(state) 61 | 62 | dist = Normal(mean, std) 63 | 64 | 65 | action = dist.sample() 66 | action_logprob = dist.log_prob(action).sum(axis=-1) 67 | value = self.critic(state) 68 | 69 | return action, action_logprob, value 70 | 71 | # def choose_action(self, observation): 72 | # with T.no_grad(): 73 | # observation = T.tensor([observation], dtype=T.float).to(self.actor.device) 74 | # action , logp_a = self.actor.sample_normal(observation) 75 | # value = self.critic(observation) 76 | # return action.numpy(), logp_a.numpy(), value.numpy() 77 | def learn(self): 78 | for _ in range(self.n_epoch): 79 | 80 | state_arr, action_arr, old_prob_arr, vals_arr,\ 81 | reward_arr, dones_arr, batches = \ 82 | self.memory_handler.generate_batches() 83 | 84 | values = vals_arr.copy() 85 | advantage = np.zeros(len(reward_arr), dtype=np.float32) 86 | # calculate advantage = sigma_t + (gamma * lamda) * sigma_t+1 + (gamma * lamda) ^ 2 * sigma_t+2..... 87 | # sigma_t = reward_t + gamma * Value(s_ t+1 ) - Value(s_t) 88 | for t in range(len(reward_arr)-1): 89 | discount = 1 90 | a_t = 0 91 | for k in range(t, len(reward_arr)-1): 92 | 93 | a_t += discount * (reward_arr[k] + self.gamma*values[k+1]*\ 94 | (1-int(dones_arr[k])) - values[k]) 95 | 96 | # discount term gamma * gae_lamda (y*lamda) 97 | discount *= self.gamma * self.gae_lambda 98 | advantage[t] = a_t 99 | advantage = T.tensor(advantage).to(self.actor.device) 100 | 101 | values = T.tensor(values).to(self.actor.device) 102 | 103 | for batch in batches: 104 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device) 105 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device) 106 | 107 | actions = T.tensor(action_arr[batch]).to(self.actor.device) 108 | 109 | new_probs = self.actor.get_logprob(states, actions) 110 | 111 | critic_value = self.critic(states) 112 | 113 | critic_value = T.squeeze(critic_value) 114 | 115 | prob_ratio = T.exp(new_probs - old_probs) 116 | 117 | weighted_probs = advantage[batch] * prob_ratio 118 | 119 | 120 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip, 121 | 1 + self.policy_clip)*advantage[batch] 122 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean() 123 | 124 | returns = advantage[batch] + values[batch] 125 | critic_loss = (returns-critic_value)**2 126 | critic_loss = critic_loss.mean() 127 | 128 | total_loss = actor_loss + 0.5* critic_loss 129 | self.actor.optimizer.zero_grad() 130 | self.critic.optimiser.zero_grad() 131 | # print("total loss", total_loss.item()) 132 | total_loss.backward() 133 | self.actor.optimizer.step() 134 | self.critic.optimiser.step() 135 | 136 | self.memory_handler.clear_memory() 137 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/critic.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\ 11 | fc2_dims = 256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models') -> None: 12 | 
super(CriticNetwork, self).__init__() 13 | 14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 15 | self.check_point_file = os.path.join(check_point_base_dir, model_name) 16 | self.critic = nn.Sequential( 17 | nn.Linear(*input_dims , fc1_dims), 18 | nn.ReLU(), 19 | nn.Linear(fc1_dims , fc2_dims), 20 | nn.ReLU(), 21 | nn.Linear(fc2_dims , 1), 22 | 23 | ) 24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha) 25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 26 | self.to(self.device) 27 | 28 | def forward(self, state): 29 | value = self.critic(state) 30 | return value 31 | 32 | def save_checkpoint(self): 33 | T.save(self.state_dict(), self.check_point_file) 34 | 35 | def load_checkpoint(self): 36 | self.load_state_dict(T.load(self.check_point_file)) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/main.py: -------------------------------------------------------------------------------- 1 | from turtle import shape 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | 6 | from utils import plot_learning_curve 7 | import gym 8 | 9 | import SpaceRobotEnv 10 | import numpy as np 11 | 12 | 13 | 14 | if __name__ == '__main__': 15 | env = gym.make("SpaceRobotState-v0") 16 | N = 30 17 | batch_size = 16 18 | n_epochs = 3 19 | alpha = 0.0003 20 | action_space = env.action_space.shape[0] 21 | obs_shape = env.observation_space["observation"].shape 22 | env_max_action = float(env.action_space.high[0]) 23 | 24 | agent = Agent( env_max_action = env_max_action, 25 | n_actions = action_space, 26 | batch_size = batch_size, 27 | alpha = alpha, 28 | n_epoch = n_epochs, 29 | input_dims = obs_shape, 30 | model_name_actor = "space_robot_actor.pt", 31 | model_name_critic = "space_robot_critic.pt") 32 | n_iter = 300 33 | figure_file = 'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png' 34 | best_score = env.reward_range[0] 35 | score_history = [] 36 | n_steps = 0 37 | learn_iters = 0 38 | avg_score = 0 39 | 40 | for i in range(n_iter): 41 | obs = env.reset() 42 | observation = obs["observation"] 43 | 44 | done = False 45 | score = 0 46 | while not done: 47 | action, prob, val = agent.choose_action(observation) 48 | 49 | action = action.detach().cpu().numpy().flatten() 50 | action = action.clip(env.action_space.low, env.action_space.high) 51 | 52 | action_logprob = prob.detach().cpu().numpy().flatten() 53 | val = val.detach().cpu().numpy().flatten() 54 | 55 | observation_, reward, done, info = env.step(action) 56 | n_steps+=1 57 | score += reward 58 | 59 | agent.remember(observation, action, action_logprob, val, reward, done) 60 | #steps before we begin learning 20 61 | if n_steps % N ==0: 62 | agent.learn() 63 | learn_iters += 1 64 | observation = observation_["observation"] 65 | score_history.append(score) 66 | avg_score = np.mean(score_history[-100:]) 67 | 68 | if avg_score>best_score: 69 | best_score= avg_score 70 | agent.save_models() 71 | print('episode', i , 'score %.1f', 'avg_score %.1f' %avg_score, 72 | 'time_steps',n_steps, 'learning_steps', learn_iters) 73 | 74 | x = [i+1 for i in range(len(score_history))] 75 | plot_learning_curve(x, score_history,figure_file) 76 | env.close() 77 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/memory.py: -------------------------------------------------------------------------------- 1 | from 
multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | 9 | 10 | class PPOBuffer: 11 | def __init__(self, batch_size): 12 | self.states = [] 13 | self.probs = [] 14 | self.vals = [] 15 | self.actions = [] 16 | self.rewards = [] 17 | self.dones = [] 18 | 19 | self.batch_size = batch_size 20 | 21 | def generate_batches(self): 22 | n_states = len(self.states) 23 | batch_start = np.arange(0, n_states, self.batch_size) 24 | indices = np.arange(n_states, dtype=np.int64) 25 | np.random.shuffle(indices) 26 | batches = [indices[i:i+self.batch_size] for i in batch_start] 27 | 28 | return np.array(self.states),\ 29 | np.array(self.actions),\ 30 | np.array(self.probs),\ 31 | np.array(self.vals),\ 32 | np.array(self.rewards),\ 33 | np.array(self.dones),\ 34 | batches 35 | 36 | def store_memory(self, state, action, probs, vals, reward, done): 37 | self.states.append(state) 38 | self.actions.append(action) 39 | self.probs.append(probs) 40 | self.vals.append(vals) 41 | self.rewards.append(reward) 42 | self.dones.append(done) 43 | 44 | def clear_memory(self): 45 | self.states = [] 46 | self.probs = [] 47 | self.actions = [] 48 | self.rewards = [] 49 | self.dones = [] 50 | self.vals = [] 51 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_actor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_actor.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_critic.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_critic.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/actor.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | 10 | class ActorNetwork(nn.Module): 11 | 12 | def __init__(self, n_actions, input_dims, alpha, model_name : str, 13 | fc1_dims=256, fc2_dims=256, check_point_base_dir = 'Learning_algorithm/Torch/PPO/models/'): 14 | super(ActorNetwork, self).__init__() 15 | 16 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 17 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name) 18 | self.actor = nn.Sequential( 19 | nn.Linear(*input_dims, fc1_dims), 20 | nn.ReLU(), 21 | nn.Linear(fc1_dims, fc2_dims), 22 | nn.ReLU(), 23 | nn.Linear(fc2_dims, n_actions), 24 | nn.Softmax(dim=-1) 25 | ) 26 | 27 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 28 | 29 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 30 | 31 | self.to(self.device) 32 | 33 | def forward(self, state): 34 | dist = self.actor(state) 35 | dist = Categorical(dist) 36 | return dist 37 | 38 | def save_checkpoint(self): 39 | T.save(self.state_dict(), self.checkpoint_file) 40 | 41 | def load_checkpoint(self): 42 | self.load_state_dict(T.load(self.checkpoint_file)) 43 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/agent.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from multiprocessing.context import BaseContext 3 | import os 4 | import copy 5 | from tqdm import tqdm 6 | import numpy as np 7 | import torch as T 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from torch.distributions.categorical import Categorical 11 | from actor import ActorNetwork 12 | from critic import CriticNetwork 13 | from memory import PPOMemory 14 | 15 | 16 | PATH = os.getcwd() 17 | # MODEL_XML_PATH = os.path.join( 18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml" 19 | # ) 20 | 21 | class Agent: 22 | def __init__(self, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \ 23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \ 24 | policy_clip = 0.1, n_epoch = 10, batch_size = 64): 25 | ''' 26 | parameter 27 | arguments: 28 | - model_name_actor : model name for actor to be used in model savind directory 29 | - model_name_critic :model name for critic to be used in model savind directory 30 | ''' 31 | seed = 10000 32 | T.manual_seed(seed) 33 | np.random.seed(seed) 34 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10 35 | self.gamma = gamma 
36 | self.gae_lambda = gae_lambda 37 | self.policy_clip = policy_clip 38 | self.n_epoch = n_epoch 39 | 40 | self.actor = ActorNetwork(n_actions, input_dims, alpha, model_name = model_name_actor) 41 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic) 42 | self.memory_handler = PPOMemory( batch_size ) 43 | 44 | def remember(self, state, action, probs, vals, reward, done): 45 | self.memory_handler.store_memory(state, action, probs, vals, reward, done) 46 | 47 | def save_models(self): 48 | print("Saving models now") 49 | self.actor.save_checkpoint() 50 | self.critic.save_checkpoint() 51 | 52 | def load_model(self): 53 | print("Load model") 54 | self.actor.load_checkpoint() 55 | self.critic.load_checkpoint() 56 | 57 | def play_optimal(self, observation): 58 | with T.no_grad(): 59 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 60 | dist = self.actor(state) 61 | # action shoulnt be sampe it should be arg max 62 | action = dist.sample() 63 | action =T.squeeze(action).item() 64 | return action 65 | 66 | def choose_action(self, observation): 67 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 68 | dist = self.actor(state) 69 | value = self.critic(state) 70 | 71 | action = dist.sample() 72 | 73 | # this is equivalent to the reinforce algorithm of probablity distribition 74 | probs = T.squeeze(dist.log_prob(action)).item() 75 | 76 | action =T.squeeze(action).item() 77 | value =T.squeeze(value).item() 78 | 79 | return action, probs , value 80 | 81 | def learn(self): 82 | for _ in range(self.n_epoch): 83 | 84 | state_arr, action_arr, old_prob_arr, vals_arr,\ 85 | reward_arr, dones_arr, batches = \ 86 | self.memory_handler.generate_batches() 87 | 88 | values = vals_arr.copy() 89 | advantage = np.zeros(len(reward_arr), dtype=np.float32) 90 | 91 | for t in range(len(reward_arr)-1): 92 | discount = 1 93 | a_t = 0 94 | for k in range(t, len(reward_arr)-1): 95 | a_t += discount*(reward_arr[k] + self.gamma*values[k+1]*\ 96 | ( 1 - int(dones_arr[k]) ) - values[k]) 97 | discount *= self.gamma*self.gae_lambda 98 | advantage[t] = a_t 99 | advantage = T.tensor(advantage).to(self.actor.device) 100 | 101 | values = T.tensor(values).to(self.actor.device) 102 | for batch in batches: 103 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device) 104 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device) 105 | actions = T.tensor(action_arr[batch]).to(self.actor.device) 106 | 107 | dist = self.actor(states) 108 | critic_value = self.critic(states) 109 | 110 | critic_value = T.squeeze(critic_value) 111 | 112 | new_probs = dist.log_prob(actions) 113 | prob_ratio = new_probs.exp() / old_probs.exp() 114 | #prob_ratio = (new_probs - old_probs).exp() 115 | weighted_probs = advantage[batch] * prob_ratio 116 | weighted_clipped_probs = T.clamp(prob_ratio, 1 - self.policy_clip, 117 | 1 + self.policy_clip ) * advantage[batch] 118 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean() 119 | 120 | returns = advantage[batch] + values[batch] 121 | critic_loss = (returns-critic_value) ** 2 122 | critic_loss = critic_loss.mean() 123 | 124 | total_loss = actor_loss + 0.5 * critic_loss 125 | self.actor.optimizer.zero_grad() 126 | self.critic.optimiser.zero_grad() 127 | # print("total loss", total_loss.item()) 128 | total_loss.backward() 129 | self.actor.optimizer.step() 130 | self.critic.optimiser.step() 131 | 132 | self.memory_handler.clear_memory() 133 | 
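
Note on the advantage computation: the learn() methods above (and the continuous variants earlier) estimate GAE with a nested loop over (t, k), i.e. A_t = delta_t + (gamma*lambda)*delta_{t+1} + (gamma*lambda)^2*delta_{t+2} + ..., with delta_t = r_t + gamma*V(s_{t+1})*(1 - done_t) - V(s_t), which is quadratic in rollout length. The same numbers fall out of a single backward sweep. The sketch below is illustrative only; the function and argument names are not from this repository, and it assumes the same flat rewards/values/dones arrays that the buffers above return.

import numpy as np

def gae_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """One backward pass equivalent to the nested loop in learn().

    Note: like the loop above, the carried term is not reset at episode
    boundaries; many PPO implementations additionally multiply the carry
    by (1 - done_t) to cut it at terminal states.
    """
    T = len(rewards)
    adv = np.zeros(T, dtype=np.float32)
    last = 0.0
    for t in reversed(range(T - 1)):
        delta = rewards[t] + gamma * values[t + 1] * (1 - int(dones[t])) - values[t]
        last = delta + gamma * gae_lambda * last
        adv[t] = last
    return adv  # adv[-1] stays 0, exactly as in the loop above
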
-------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/critic.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\ 11 | fc2_dims = 256, check_point_base_dir = 'Learning_algorithm/Torch/PPO/models/') -> None: 12 | super(CriticNetwork, self).__init__() 13 | 14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 15 | self.check_point_file = os.path.join(check_point_base_dir, model_name) 16 | self.critic = nn.Sequential( 17 | nn.Linear(*input_dims , fc1_dims), 18 | nn.ReLU(), 19 | nn.Linear(fc1_dims , fc2_dims), 20 | nn.ReLU(), 21 | nn.Linear(fc2_dims , 1), 22 | 23 | ) 24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha) 25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 26 | self.to(self.device) 27 | 28 | def forward(self, state): 29 | value = self.critic(state) 30 | return value 31 | 32 | def save_checkpoint(self): 33 | T.save(self.state_dict(), self.check_point_file) 34 | 35 | def load_checkpoint(self): 36 | self.load_state_dict(T.load(self.check_point_file)) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/main.py: -------------------------------------------------------------------------------- 1 | from turtle import shape 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | 6 | from utils import plot_learning_curve 7 | import gym 8 | 9 | import SpaceRobotEnv 10 | import numpy as np 11 | 12 | 13 | 14 | if __name__ == '__main__': 15 | env = gym.make("SpaceRobotState-v0") 16 | N = 20 17 | batch_size = 5 18 | n_epochs = 4 19 | alpha = 0.0003 20 | action_space = env.action_space.shape[0] 21 | obs_shape = env.observation_space["observation"].shape 22 | 23 | 24 | 25 | agent = Agent( n_actions = action_space, 26 | batch_size=batch_size, 27 | alpha = alpha, 28 | n_epoch = n_epochs, 29 | input_dims = obs_shape, 30 | model_name_actor = "space_robot_actor.pt", 31 | model_name_critic = "space_robot_critic.pt") 32 | n_iter = 300 33 | figure_file = 'Learning_algorithm/Torch/PPO/plots/space_robot_performance.png' 34 | best_score = env.reward_range[0] 35 | score_history = [] 36 | n_steps = 0 37 | learn_iters = 0 38 | avg_score = 0 39 | 40 | for i in range(n_iter): 41 | obs = env.reset() 42 | observation = obs["observation"] 43 | done = False 44 | score = 0 45 | while not done: 46 | action, prob, val = agent.choose_action(observation) 47 | act = action 48 | pr = prob 49 | observation_, reward, done, info = env.step(action) 50 | n_steps+=1 51 | score += reward 52 | 53 | agent.remember(observation, action, prob, val, reward, done) 54 | #steps before we begin learning 20 55 | if n_steps % N ==0: 56 | agent.learn() 57 | learn_iters += 1 58 | observation = observation_["observation"] 59 | score_history.append(score) 60 | avg_score = np.mean(score_history[-100:]) 61 | 62 | if avg_score>best_score: 63 | best_score= avg_score 64 | agent.save_models() 65 | print('episode', i , 'score %.1f', 'avg_score %.1f' %avg_score, 66 | 'time_steps',n_steps, 'learning_steps', learn_iters) 67 | 68 | x = [i+1 for i in 
range(len(score_history))] 69 | plot_learning_curve(x, score_history,figure_file) 70 | env.close() 71 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/memory.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | class Mem: 9 | def __init__(self, state , prob, val ,action, reward, done) -> None: 10 | self.state = state 11 | self.prob = prob 12 | self.val = val 13 | self.action = action 14 | self.reward = reward 15 | self.done = done 16 | 17 | class PPOMemory: 18 | def __init__(self, batch_size): 19 | self.states = [] 20 | self.probs = [] 21 | self.vals = [] 22 | self.actions = [] 23 | self.rewards = [] 24 | self.dones = [] 25 | 26 | self.batch_size = batch_size 27 | 28 | def generate_batches(self): 29 | n_states = len(self.states) 30 | batch_start = np.arange(0, n_states, self.batch_size) 31 | indices = np.arange(n_states, dtype=np.int64) 32 | np.random.shuffle(indices) 33 | batches = [indices[i:i+self.batch_size] for i in batch_start] 34 | 35 | return np.array(self.states),\ 36 | np.array(self.actions),\ 37 | np.array(self.probs),\ 38 | np.array(self.vals),\ 39 | np.array(self.rewards),\ 40 | np.array(self.dones),\ 41 | batches 42 | 43 | def store_memory(self, state, action, probs, vals, reward, done): 44 | self.states.append(state) 45 | self.actions.append(action) 46 | self.probs.append(probs) 47 | self.vals.append(vals) 48 | self.rewards.append(reward) 49 | self.dones.append(done) 50 | 51 | def clear_memory(self): 52 | self.states = [] 53 | self.probs = [] 54 | self.actions = [] 55 | self.rewards = [] 56 | self.dones = [] 57 | self.vals = [] 58 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_actor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_actor.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_critic.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_critic.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/plots/space_robot_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/plots/space_robot_performance.png -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = 
np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/actor.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | 10 | class ActorNetwork(nn.Module): 11 | 12 | def __init__(self, n_actions, alpha, model_name : str, 13 | check_point_base_dir = 'RL_algorithms/Torch/PPOImage/models'): 14 | super(ActorNetwork, self).__init__() 15 | 16 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 17 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name) 18 | 19 | self.actor = nn.Sequential( 20 | nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size=5, stride=1), 21 | nn.ReLU(), 22 | nn.BatchNorm2d(32), 23 | nn.ReLU(), 24 | nn.MaxPool2d(2,2), 25 | 26 | nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size=5, stride=1), 27 | nn.ReLU(), 28 | nn.BatchNorm2d(64), 29 | nn.ReLU(), 30 | nn.MaxPool2d(2,2), 31 | 32 | nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size=5, stride=1), 33 | nn.ReLU(), 34 | nn.BatchNorm2d(64), 35 | nn.ReLU(), 36 | nn.MaxPool2d(2,2), 37 | 38 | nn.Flatten(), 39 | nn.Linear(1024, 4096), 40 | nn.ReLU(), 41 | nn.Linear(4096, 256), 42 | nn.ReLU(), 43 | nn.Linear(256, 64), 44 | nn.ReLU(), 45 | nn.Linear(64, n_actions), 46 | 47 | nn.Softmax(dim=-1) 48 | ) 49 | 50 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 51 | 52 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 53 | 54 | self.to(self.device) 55 | 56 | def forward(self, state): 57 | dist = self.actor(state) 58 | dist = Categorical(dist) 59 | return dist 60 | 61 | def save_checkpoint(self): 62 | T.save(self.state_dict(), self.checkpoint_file) 63 | 64 | def load_checkpoint(self): 65 | self.load_state_dict(T.load(self.checkpoint_file)) 66 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/agent.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from multiprocessing.context import BaseContext 3 | import os 4 | import copy 5 | from tqdm import tqdm 6 | import numpy as np 7 | import torch as T 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from torch.distributions.categorical import Categorical 11 | from actor import ActorNetwork 12 | from critic import CriticNetwork 13 | from memory import PPOMemory 14 | 15 | 16 | PATH = os.getcwd() 17 | # MODEL_XML_PATH = os.path.join( 18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml" 19 | # ) 20 | 21 | class Agent: 22 | def __init__(self, n_actions, model_name_actor : str, model_name_critic : str, \ 23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \ 24 | policy_clip 
= 0.1, n_epoch = 10, batch_size = 64): 25 | ''' 26 | parameter 27 | arguments: 28 | - model_name_actor : model name for actor to be used in model savind directory 29 | - model_name_critic :model name for critic to be used in model savind directory 30 | ''' 31 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10 32 | self.gamma = gamma 33 | self.gae_lambda = gae_lambda 34 | self.policy_clip = policy_clip 35 | self.n_epoch = n_epoch 36 | 37 | self.actor = ActorNetwork(n_actions, alpha, model_name = model_name_actor) 38 | self.critic = CriticNetwork(alpha, model_name = model_name_critic) 39 | self.memory_handler = PPOMemory( batch_size ) 40 | 41 | def remember(self, state, action, probs, vals, reward, done): 42 | self.memory_handler.store_memory(state, action, probs, vals, reward, done) 43 | 44 | def save_models(self): 45 | print("Saving models now") 46 | self.actor.save_checkpoint() 47 | self.critic.save_checkpoint() 48 | 49 | def load_model(self): 50 | print("Load model") 51 | self.actor.load_checkpoint() 52 | self.critic.load_checkpoint() 53 | 54 | def play_optimal(self, observation): 55 | with T.no_grad(): 56 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 57 | dist = self.actor(state) 58 | # action shoulnt be sampe it should be arg max 59 | action = dist.sample() 60 | action =T.squeeze(action).item() 61 | return action 62 | 63 | def choose_action(self, observation): 64 | observation = np.array(observation) 65 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 66 | dist = self.actor(state) 67 | value = self.critic(state) 68 | 69 | action = dist.sample() 70 | 71 | # this is equivalent to the reinforce algorithm of probablity distribition 72 | probs = T.squeeze(dist.log_prob(action)).item() 73 | 74 | action =T.squeeze(action).item() 75 | value =T.squeeze(value).item() 76 | 77 | return action, probs , value 78 | 79 | def learn(self): 80 | for _ in range(self.n_epoch): 81 | 82 | state_arr, action_arr, old_prob_arr, vals_arr,\ 83 | reward_arr, dones_arr, batches = \ 84 | self.memory_handler.generate_batches() 85 | 86 | values = vals_arr.copy() 87 | advantage = np.zeros(len(reward_arr), dtype=np.float32) 88 | 89 | for t in range(len(reward_arr)-1): 90 | discount = 0.95 91 | a_t = 0 92 | for k in range(t, len(reward_arr)-1): 93 | a_t += discount*(reward_arr[k] + self.gamma*values[k+1]*\ 94 | (1-int(dones_arr[k])) - values[k]) 95 | discount *= self.gamma*self.gae_lambda 96 | advantage[t] = a_t 97 | advantage = T.tensor(advantage).to(self.actor.device) 98 | 99 | values = T.tensor(values).to(self.actor.device) 100 | for batch in batches: 101 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device) 102 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device) 103 | actions = T.tensor(action_arr[batch]).to(self.actor.device) 104 | 105 | dist = self.actor(states) 106 | critic_value = self.critic(states) 107 | 108 | critic_value = T.squeeze(critic_value) 109 | 110 | new_probs = dist.log_prob(actions) 111 | prob_ratio = new_probs.exp() / old_probs.exp() 112 | #prob_ratio = (new_probs - old_probs).exp() 113 | weighted_probs = advantage[batch] * prob_ratio 114 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip, 115 | 1+self.policy_clip)*advantage[batch] 116 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean() 117 | 118 | returns = advantage[batch] + values[batch] 119 | critic_loss = (returns-critic_value)**2 120 | critic_loss = 
critic_loss.mean() 121 | 122 | total_loss = actor_loss + 0.5*critic_loss 123 | self.actor.optimizer.zero_grad() 124 | self.critic.optimiser.zero_grad() 125 | # print("total loss", total_loss.item()) 126 | total_loss.backward() 127 | self.actor.optimizer.step() 128 | self.critic.optimiser.step() 129 | 130 | self.memory_handler.clear_memory() 131 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/critic.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, alpha, model_name : str ,\ 11 | check_point_base_dir = 'RL_algorithms/Torch/PPOImage/models/') -> None: 12 | super(CriticNetwork, self).__init__() 13 | 14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 15 | self.check_point_file = os.path.join(check_point_base_dir, model_name) 16 | self.critic = nn.Sequential( 17 | nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size=5, stride=1), 18 | nn.ReLU(), 19 | nn.BatchNorm2d(32), 20 | nn.ReLU(), 21 | nn.MaxPool2d(2,2), 22 | 23 | # nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size=5, stride=1), 24 | # nn.ReLU(), 25 | # nn.BatchNorm2d(64), 26 | # nn.ReLU(), 27 | # nn.MaxPool2d(2,2), 28 | 29 | # nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size=5, stride=1), 30 | # nn.ReLU(), 31 | # nn.BatchNorm2d(64), 32 | # nn.ReLU(), 33 | # nn.MaxPool2d(2,2), 34 | 35 | nn.Flatten(), 36 | nn.Linear(28800, 512), 37 | nn.ReLU(), 38 | nn.Linear(512, 64), 39 | nn.ReLU(), 40 | nn.Linear(64, 1), 41 | 42 | ) 43 | self.optimiser = optim.Adam(self.parameters(), lr = alpha) 44 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 45 | self.to(self.device) 46 | 47 | def forward(self, state): 48 | value = self.critic(state) 49 | return value 50 | 51 | def save_checkpoint(self): 52 | T.save(self.state_dict(), self.check_point_file) 53 | 54 | def load_checkpoint(self): 55 | self.load_state_dict(T.load(self.check_point_file)) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/main.py: -------------------------------------------------------------------------------- 1 | from turtle import shape 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | 6 | from utils import plot_learning_curve 7 | 8 | from SpaceRobotEnv.envs import SpaceRobotImage 9 | 10 | import numpy as np 11 | 12 | 13 | 14 | if __name__ == '__main__': 15 | # env = SpaceRobotImage() 16 | env = SpaceRobotImage() 17 | #N = 20 18 | N = 20 19 | batch_size = 5 20 | n_epochs = 4 21 | alpha = 0.0003 22 | action_space = env.action_space.shape[0] 23 | agent = Agent( n_actions = action_space, 24 | batch_size=batch_size, 25 | alpha = alpha, 26 | n_epoch = n_epochs, 27 | model_name_actor = "space_robot_actor.pt", 28 | model_name_critic = "space_robot_critic.pt") 29 | n_iter = 300 30 | figure_file = 'RL_algorithms/Torch/PPOImage/plots/space_robot_performance.png' 31 | best_score = env.reward_range[0] 32 | score_history = [] 33 | n_steps = 0 34 | learn_iters = 0 35 | avg_score = 0 36 | 37 | for i in range(n_iter): 38 | obs = env.reset() 39 | observation = obs["rawimage"].reshape(3, 64, 64) 40 | done = False 41 | score = 0 42 
| while not done: 43 | action, prob, val = agent.choose_action(observation) 44 | observation_, reward, done, info = env.step(action) 45 | n_steps += 1 46 | score += reward 47 | 48 | agent.remember(observation, action, prob, val, reward, done) 49 | #steps before we begin learning 20 50 | if n_steps % N ==0: 51 | agent.learn() 52 | learn_iters += 1 53 | observation = observation_["rawimage"].reshape(3, 64, 64) 54 | 55 | print("done") 56 | score_history.append(score) 57 | avg_score = np.mean(score_history[-100:]) 58 | 59 | if avg_score>best_score: 60 | best_score= avg_score 61 | agent.save_models() 62 | print('episode', i , 'score %.1f',score, 'avg_score %.1f' %avg_score, 63 | 'time_steps',n_steps, 'learning_steps', learn_iters) 64 | 65 | x = [i+1 for i in range(len(score_history))] 66 | plot_learning_curve(x, score_history,figure_file) 67 | env.close() 68 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/memory.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | class Mem: 9 | def __init__(self, state , prob, val ,action, reward, done) -> None: 10 | self.state = state 11 | self.prob = prob 12 | self.val = val 13 | self.action = action 14 | self.reward = reward 15 | self.done = done 16 | 17 | class PPOMemory: 18 | def __init__(self, batch_size): 19 | self.states = [] 20 | self.probs = [] 21 | self.vals = [] 22 | self.actions = [] 23 | self.rewards = [] 24 | self.dones = [] 25 | 26 | self.batch_size = batch_size 27 | 28 | def generate_batches(self): 29 | n_states = len(self.states) 30 | batch_start = np.arange(0, n_states, self.batch_size) 31 | indices = np.arange(n_states, dtype=np.int64) 32 | np.random.shuffle(indices) 33 | batches = [indices[i:i+self.batch_size] for i in batch_start] 34 | 35 | return np.array(self.states),\ 36 | np.array(self.actions),\ 37 | np.array(self.probs),\ 38 | np.array(self.vals),\ 39 | np.array(self.rewards),\ 40 | np.array(self.dones),\ 41 | batches 42 | 43 | def store_memory(self, state, action, probs, vals, reward, done): 44 | self.states.append(state) 45 | self.actions.append(action) 46 | self.probs.append(probs) 47 | self.vals.append(vals) 48 | self.rewards.append(reward) 49 | self.dones.append(done) 50 | 51 | def clear_memory(self): 52 | self.states = [] 53 | self.probs = [] 54 | self.actions = [] 55 | self.rewards = [] 56 | self.dones = [] 57 | self.vals = [] 58 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_actor.pt.icloud: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_actor.pt.icloud -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_critic.pt.icloud: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_critic.pt.icloud 
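
Usage note: the PPOMemory buffer above hands learn() the whole rollout as flat NumPy arrays plus a list of shuffled minibatch index arrays. A small standalone sketch of how it is consumed (toy transition values, not repository code; the plain `from memory import ...` path mirrors the import style used by agent.py above):

import numpy as np
from memory import PPOMemory  # same import style as agent.py above

buf = PPOMemory(batch_size=5)
for step in range(12):
    # toy transition; in main.py these values come from env.step()
    buf.store_memory(state=np.zeros(4), action=0, probs=-0.7,
                     vals=0.1, reward=1.0, done=False)

states, actions, probs, vals, rewards, dones, batches = buf.generate_batches()
# `batches` is a list of shuffled index arrays covering every stored step;
# with 12 transitions and batch_size=5 the batch sizes are 5, 5 and 2.
for idx in batches:
    print(idx, states[idx].shape)   # e.g. (5, 4)

buf.clear_memory()  # learn() empties the buffer after each update phase
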
-------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/plots/space_robot_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/plots/space_robot_performance.png -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.distributions.normal import Normal 8 | 9 | 10 | def combined_shape(length, shape=None): 11 | if shape is None: 12 | return (length,) 13 | return (length, shape) if np.isscalar(shape) else (length, *shape) 14 | 15 | def mlp(sizes, activation, output_activation=nn.Identity): 16 | # converts array of layer shape to neural net 17 | layers = [] 18 | for j in range(len(sizes)-1): 19 | act = activation if j < len(sizes) -2 else output_activation 20 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 21 | return nn.Sequential(*layers) 22 | 23 | def count_vars(module): 24 | return sum([np.prod(p.shape) for p in module.parameters()]) 25 | 26 | 27 | LOG_STD_MAX = 2 28 | LOG_STD_MIN = -20 29 | 30 | class SquashedGaussianMLPActor(nn.Module): 31 | 32 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 33 | super().__init__() 34 | self.net = mlp([obs_dim] + list(hidden_sizes), activation, activation) 35 | self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) 36 | self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim) 37 | self.act_limit = act_limit 38 | 39 | def forward(self, obs, deterministic=False, with_logprob=True): 40 | net_out = self.net(obs) 41 | mu = self.mu_layer(net_out) 42 | log_std = self.log_std_layer(net_out) 43 | log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) 44 | std = torch.exp(log_std) 45 | 46 | # Pre-squash distribution and sample 47 | pi_distribution = Normal(mu, std) 48 | if deterministic: 49 | # Only used for evaluating policy at test time. 50 | pi_action = mu 51 | else: 52 | pi_action = pi_distribution.rsample() 53 | 54 | if with_logprob: 55 | # Compute logprob from Gaussian, and then apply correction for Tanh squashing. 56 | # NOTE: The correction formula is a little bit magic. 
To get an understanding 57 | # of where it comes from, check out the original SAC paper (arXiv 1801.01290) 58 | # and look in appendix C. This is a more numerically-stable equivalent to Eq 21. 59 | # Try deriving it yourself as a (very difficult) exercise. :) 60 | logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1) 61 | logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=1) 62 | else: 63 | logp_pi = None 64 | 65 | pi_action = torch.tanh(pi_action) 66 | pi_action = self.act_limit * pi_action 67 | 68 | return pi_action, logp_pi 69 | 70 | 71 | class MLPQFunction(nn.Module): 72 | 73 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 74 | super().__init__() 75 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 76 | 77 | def forward(self, obs, act): 78 | q = self.q(torch.cat([obs, act], dim=-1)) 79 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 80 | 81 | class MLPActorCritic(nn.Module): 82 | 83 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 84 | activation=nn.ReLU): 85 | super().__init__() 86 | 87 | obs_dim = observation_space.shape[0] 88 | act_dim = action_space.shape[0] 89 | act_limit = action_space.high[0] 90 | 91 | # build policy and value functions 92 | self.pi = SquashedGaussianMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 93 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 94 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 95 | 96 | def act(self, obs, deterministic=False): 97 | with torch.no_grad(): 98 | a, _ = self.pi(obs, deterministic, False) 99 | return a.numpy() 100 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847118.Tosins-Air.19214.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847118.Tosins-Air.19214.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847140.Tosins-Air.19431.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847140.Tosins-Air.19431.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847454.Tosins-Air.19535.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847454.Tosins-Air.19535.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847513.Tosins-Air.19931.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847513.Tosins-Air.19931.0 
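
The "a little bit magic" correction in SquashedGaussianMLPActor above is the change-of-variables term for the tanh squashing, written in a numerically stable form: log(1 - tanh(a)^2) = 2*(log 2 - a - softplus(-2a)). A short, illustrative check of that identity (not part of the repository):

import math
import torch
import torch.nn.functional as F

# For moderate |a| both sides agree to float precision; for large |a|
# the naive form underflows to log(0) = -inf, the stable one stays finite.
a = torch.linspace(-20.0, 20.0, steps=9)
naive = torch.log(1.0 - torch.tanh(a) ** 2)
stable = 2.0 * (math.log(2.0) - a - F.softplus(-2.0 * a))
print(torch.stack([a, naive, stable], dim=1))
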
-------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847612.Tosins-Air.19979.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847612.Tosins-Air.19979.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847918.Tosins-Air.20089.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847918.Tosins-Air.20089.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848049.Tosins-Air.20232.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848049.Tosins-Air.20232.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848339.Tosins-Air.20384.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848339.Tosins-Air.20384.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848364.Tosins-Air.20423.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848364.Tosins-Air.20423.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848673.Tosins-Air.20649.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848673.Tosins-Air.20649.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848831.Tosins-Air.20793.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848831.Tosins-Air.20793.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849191.Tosins-Air.20924.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849191.Tosins-Air.20924.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849218.Tosins-Air.20984.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849218.Tosins-Air.20984.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849777.Tosins-Air.21229.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849777.Tosins-Air.21229.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849785.Tosins-Air.21269.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849785.Tosins-Air.21269.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849885.Tosins-Air.21429.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849885.Tosins-Air.21429.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849941.Tosins-Air.21521.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849941.Tosins-Air.21521.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658850278.Tosins-Air.21678.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658850278.Tosins-Air.21678.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import core 3 | import torch 4 | 5 | class ReplayBuffer: 6 | """ 7 | A simple FIFO experience replay buffer for SAC agents. 
8 |     """
9 | 
10 |     def __init__(self, obs_dim, act_dim, size):
11 |         self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
12 |         self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
13 |         self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
14 |         self.rew_buf = np.zeros(size, dtype=np.float32)
15 |         self.done_buf = np.zeros(size, dtype=np.float32)
16 |         self.ptr, self.size, self.max_size = 0, 0, size
17 | 
18 |     def store(self, obs, act, rew, next_obs, done):
19 |         self.obs_buf[self.ptr] = obs
20 |         self.obs2_buf[self.ptr] = next_obs
21 |         self.act_buf[self.ptr] = act
22 |         self.rew_buf[self.ptr] = rew
23 |         self.done_buf[self.ptr] = done
24 |         self.ptr = (self.ptr+1) % self.max_size
25 |         self.size = min(self.size+1, self.max_size)
26 | 
27 |     def sample_batch(self, batch_size=32):
28 |         idxs = np.random.randint(0, self.size, size=batch_size)
29 |         batch = dict(obs=self.obs_buf[idxs],
30 |                      obs2=self.obs2_buf[idxs],
31 |                      act=self.act_buf[idxs],
32 |                      rew=self.rew_buf[idxs],
33 |                      done=self.done_buf[idxs])
34 |         return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/utils/mpi_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mpi4py import MPI  # MPI.COMM_WORLD and MPI.SUM/MIN/MAX are used below
3 | def allreduce(*args, **kwargs):
4 |     return MPI.COMM_WORLD.Allreduce(*args, **kwargs)
5 | 
6 | def mpi_op(x, op):
7 |     # Apply an MPI reduction op to a scalar or array across all processes.
8 |     x, scalar = ([x], True) if np.isscalar(x) else (x, False)
9 |     x = np.asarray(x, dtype=np.float32)
10 |     buff = np.zeros_like(x, dtype=np.float32)
11 |     allreduce(x, buff, op=op)
12 |     return buff[0] if scalar else buff
13 | 
14 | def mpi_sum(x):
15 |     return mpi_op(x, MPI.SUM)
16 | def mpi_statistics_scalar(x, with_min_and_max=False):
17 |     """
18 |     Get mean/std and optional min/max of scalar x across MPI processes.
19 |     Args: x: an array of scalar samples; with_min_and_max (bool): if true, also return min and max of x.
20 | """ 21 | x = np.array(x, dtype=np.float32) 22 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 23 | mean = global_sum / global_n 24 | 25 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 26 | std = np.sqrt(global_sum_sq / global_n) # compute global std 27 | 28 | if with_min_and_max: 29 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 30 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 31 | return mean, std, global_min, global_max 32 | return mean, std -------------------------------------------------------------------------------- /Simulation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/Simulation.jpg -------------------------------------------------------------------------------- /SpaceRobotEnv/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/.DS_Store -------------------------------------------------------------------------------- /SpaceRobotEnv/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from gym.envs.registration import register 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | register( 8 | id="SpaceRobotState-v0", 9 | entry_point="SpaceRobotEnv.envs:SpaceRobotState", 10 | max_episode_steps=512, 11 | ) 12 | 13 | register( 14 | id="SpaceRobotImage-v0", 15 | entry_point="SpaceRobotEnv.envs:SpaceRobotImage", 16 | max_episode_steps=512, 17 | ) 18 | 19 | register( 20 | id="SpaceRobotDualArm-v0", 21 | entry_point="SpaceRobotEnv.envs:SpaceRobotDualArm", 22 | max_episode_steps=512, 23 | ) 24 | 25 | register( 26 | id="SpaceRobotPointCloud-v0", 27 | entry_point="SpaceRobotEnv.envs:SpaceRobotPointCloud", 28 | max_episode_steps=512, 29 | ) 30 | 31 | register( 32 | id="SpaceRobotCost-v0", 33 | entry_point="SpaceRobotEnv.envs:SpaceRobotCost", 34 | max_episode_steps=512, 35 | ) 36 | 37 | register( 38 | id="SpaceRobotReorientation-v0", 39 | entry_point="SpaceRobotEnv.envs:SpaceRobotReorientation", 40 | max_episode_steps=512, 41 | ) -------------------------------------------------------------------------------- /SpaceRobotEnv/assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/.DS_Store -------------------------------------------------------------------------------- /SpaceRobotEnv/assets/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The dm_control Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================
15 | 
16 | """Functions to manage the common assets for domains."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import os
23 | from dm_control.utils import resources
24 | 
25 | _SUITE_DIR = os.path.dirname(os.path.dirname(__file__))
26 | _FILENAMES = [
27 |     "common/materials.xml",
28 |     "common/skybox.xml",
29 |     "common/visual.xml",
30 | ]
31 | 
32 | ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename))
33 |           for filename in _FILENAMES}
34 | 
35 | 
36 | def read_model(model_filename):
37 |     """Reads a model XML file and returns its contents as a string."""
38 |     return resources.GetResource(os.path.join(_SUITE_DIR, model_filename))
39 | 
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/materials.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/skybox.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/visual.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/arm_v3.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/arm_v31.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/asset.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/sensor.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_cost.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_dualarm.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_image.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_state.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/R10.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/R10.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/cube.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/cube.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_base.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_base.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_forearm.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_forearm.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_shoulder.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_shoulder.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_upperarm.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_upperarm.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist1.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist1.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist2.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist2.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist3.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist3.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/subgoal.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/envs/SpaceRobotReorientation.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import copy
4 | import numpy as np
5 | 
6 | import gym
7 | from gym import spaces
8 | from gym.utils import seeding
9 | 
10 | from gym.envs.robotics import utils
11 | from gym.envs.robotics import rotations
12 | 
13 | import mujoco_py
14 | 
15 | 
16 | PATH = os.getcwd()
17 | MODEL_XML_PATH = os.path.join(PATH, 'SpaceRobotEnv', 'assets', 'spacerobot', 'spacerobot_dualarm.xml')
18 | DEFAULT_SIZE = 500
19 | 
20 | 
21 | class RobotEnv(gym.GoalEnv):
22 |     def __init__(self, model_path, initial_qpos, n_substeps):
23 | 
24 |         # load model and simulator
25 |         self.model = mujoco_py.load_model_from_path(model_path)
26 |         self.sim = mujoco_py.MjSim(self.model, nsubsteps=n_substeps)
27 | 
28 |         # render setting
29 |         self.viewer = None
30 |         self._viewers = {}
31 |         self.metadata = {
32 |             "render.modes": ["human", "rgb_array"],
33 |             "video.frames_per_second": int(np.round(1.0 / self.dt)),
34 |         }
35 | 
36 |         # seed
37 |         self.seed()
38 | 
39 |         # initialization
40 |         self._env_setup(initial_qpos=initial_qpos)
41 |         self.initial_state = copy.deepcopy(self.sim.get_state())
42 |         self.goal = self._sample_goal()
43 | 
44 |         # set action_space and observation_space
45 |         obs = self._get_obs()
46 |         self._set_action_space()
47 |         self.observation_space = spaces.Dict(
48 |             dict(
49 |                 desired_goal=spaces.Box(
50 |                     -np.inf, np.inf, shape=obs["desired_goal"].shape, dtype="float32"
51 |                 ),
52 |                 achieved_goal=spaces.Box(
53 |                     -np.inf, np.inf, shape=obs["achieved_goal"].shape, dtype="float32"
54 |                 ),
55 |                 observation=spaces.Box(
56 |                     -np.inf, np.inf, shape=obs["observation"].shape, dtype="float32"
57 |                 ),
58 |             )
59 |         )
60 | 
61 |     def _set_action_space(self):
62 |         bounds = self.model.actuator_ctrlrange.copy()
63 |         low, high = bounds.T
64 |         self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)
65 |         return self.action_space
66 | 
67 |     @property
68 |     def dt(self):
69 |         return
self.sim.model.opt.timestep * self.sim.nsubsteps 70 | 71 | def _detecte_collision(self): 72 | self.collision = self.sim.data.ncon 73 | return self.collision 74 | 75 | def _sensor_torque(self): 76 | self.sensor_data = self.sim.data.sensordata 77 | return self.sensor_data 78 | 79 | def seed(self, seed=None): 80 | self.np_random, seed = seeding.np_random(seed) 81 | return [seed] 82 | 83 | def step(self, action): 84 | action = np.clip(action, self.action_space.low, self.action_space.high) 85 | self._set_action(action) # do one step simulation here 86 | self._step_callback() 87 | obs = self._get_obs() 88 | done = False 89 | info = { 90 | "is_success": self._is_success(obs["achieved_goal"], self.goal) 91 | } 92 | reward = self.compute_reward(obs["achieved_goal"], self.goal, info) 93 | # reward = self.compute_reward(obs['achieved_goal'], self.goal, info) + self.compute_reward(obs['achieved_goal1'], self.goal1, info) 94 | return obs, reward, done, info 95 | 96 | def reset(self): 97 | """Attempt to reset the simulator. Since we randomize initial conditions, it 98 | is possible to get into a state with numerical issues (e.g. due to penetration or 99 | Gimbel lock) or we may not achieve an initial condition (e.g. an object is within the hand). 100 | In this case, we just keep randomizing until we eventually achieve a valid initial 101 | configuration. 102 | """ 103 | super(RobotEnv, self).reset() 104 | did_reset_sim = False 105 | while not did_reset_sim: 106 | did_reset_sim = self._reset_sim() 107 | 108 | self.goal = self._sample_goal() 109 | obs = self._get_obs() 110 | 111 | # TODO: set the position of cube 112 | # body_id = self.sim.model.geom_name2id("cube") 113 | # self.sim.model.geom_pos[body_id] = np.array([0, 0, 6]) 114 | return obs 115 | 116 | def close(self): 117 | if self.viewer is not None: 118 | # self.viewer.finish() 119 | self.viewer = None 120 | self._viewers = {} 121 | 122 | def render(self, mode="human", width=DEFAULT_SIZE, height=DEFAULT_SIZE): 123 | # self._render_callback() 124 | if mode == "rgb_array": 125 | self._get_viewer(mode).render(width, height) 126 | # window size used for old mujoco-py: 127 | datargb, datadepth = self._get_viewer(mode).read_pixels( 128 | width, height, depth=True 129 | ) 130 | # original image is upside-down, so flip it 131 | return datargb[::-1, :, :], datadepth[::-1] 132 | elif mode == "human": 133 | self._get_viewer(mode).render() 134 | 135 | def _get_viewer(self, mode): 136 | self.viewer = self._viewers.get(mode) 137 | 138 | if self.viewer is None: 139 | if mode == "human": 140 | self.viewer = mujoco_py.MjViewer(self.sim) 141 | self._viewer_setup() 142 | 143 | elif mode == "rgb_array": 144 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, device_id=-1) 145 | self._viewer_setup() 146 | # self.viewer.cam.trackbodyid = 0 147 | # latest modification 148 | cam_pos = np.array([0.5, 0, 5, 0.3, -30, 0]) 149 | for i in range(3): 150 | self.viewer.cam.lookat[i] = cam_pos[i] 151 | self.viewer.cam.distance = cam_pos[3] 152 | self.viewer.cam.elevation = cam_pos[4] 153 | self.viewer.cam.azimuth = cam_pos[5] 154 | # self.viewer.cam.trackbodyid = -1 155 | 156 | self._viewers[mode] = self.viewer 157 | return self.viewer 158 | 159 | def _reset_sim(self): 160 | """Resets a simulation and indicates whether or not it is successful. 161 | If a reset is unsuccessful (e.g. if a randomized state caused an error in the 162 | simulation), this method should indicate such a failure by returning False. 
163 |         In such a case, this method will be called again to attempt the reset again.
164 |         """
165 |         self.sim.set_state(self.initial_state)
166 |         self.sim.forward()
167 |         return True
168 | 
169 |     def _get_obs(self):
170 |         """Returns the observation."""
171 |         raise NotImplementedError()
172 | 
173 |     def _set_action(self, action):
174 |         """Applies the given action to the simulation."""
175 |         raise NotImplementedError()
176 | 
177 |     def _is_success(self, achieved_goal, desired_goal):
178 |         """Indicates whether or not the achieved goal successfully achieved the desired goal."""
179 |         raise NotImplementedError()
180 | 
181 |     def _sample_goal(self):
182 |         """Samples a new goal and returns it."""
183 |         raise NotImplementedError()
184 | 
185 |     def _env_setup(self, initial_qpos):
186 |         """Initial configuration of the environment. Can be used to configure initial state
187 |         and extract information from the simulation.
188 |         """
189 |         pass
190 | 
191 |     def _viewer_setup(self):
192 |         """Initial configuration of the viewer. Can be used to set the camera position,
193 |         for example.
194 |         """
195 |         pass
196 | 
197 |     def _render_callback(self):
198 |         """A custom callback that is called before rendering. Can be used
199 |         to implement custom visualizations.
200 |         """
201 |         pass
202 | 
203 |     def _step_callback(self):
204 |         """A custom callback that is called after stepping the simulation. Can be used
205 |         to enforce additional constraints on the simulation state.
206 |         """
207 |         pass
208 | 
209 | 
210 | def goal_distance(goal_a, goal_b):
211 |     assert goal_a.shape == goal_b.shape
212 |     return np.linalg.norm(goal_a - goal_b, axis=-1)
213 | 
214 | 
215 | class SpacerobotEnv(RobotEnv):
216 |     """Superclass for all SpaceRobot environments."""
217 | 
218 |     def __init__(
219 |         self,
220 |         model_path,
221 |         n_substeps,
222 |         distance_threshold,
223 |         initial_qpos,
224 |         reward_type,
225 |         pro_type,
226 |         c_coeff,
227 |     ):
228 |         """Initializes a new SpaceRobot environment.
229 |         Args:
230 |             model_path (string): path to the environment's XML file
231 |             n_substeps (int): number of substeps the simulation runs on every call to step
232 |             distance_threshold (float): the threshold after which a goal is considered achieved
233 |             initial_qpos (dict): a dictionary of joint names and values that define the initial configuration
234 |             reward_type ('sparse' or 'dense'): the reward type, i.e. sparse or dense
235 |             pro_type ('MDP' or 'CMDP'): whether the problem setting includes a cost term (CMDP) or not (MDP)
236 |             c_coeff: cost coefficient
237 |         """
238 |         self.n_substeps = n_substeps
239 |         # self.target_range = target_range
240 |         self.distance_threshold = distance_threshold
241 |         self.reward_type = reward_type
242 |         self.c_coeff = c_coeff
243 |         self.pro_type = pro_type
244 | 
245 |         super(SpacerobotEnv, self).__init__(
246 |             model_path=model_path,
247 |             n_substeps=n_substeps,
248 |             initial_qpos=initial_qpos,
249 |         )
250 | 
251 |     def compute_reward(self, achieved_goal, desired_goal, info):
252 |         # Compute distance between goal and the achieved goal.
253 |         d = goal_distance(achieved_goal, desired_goal)
254 | 
255 |         reward = {
256 |             "sparse": -(d > self.distance_threshold).astype(np.float32),
257 |             "dense": -(0.001 * d ** 2 + np.log10(d ** 2 + 1e-6)),
258 |         }
259 | 
260 |         return reward[self.reward_type]  # return the reward matching the configured reward_type
261 | 
262 | 
263 |     def _set_action(self, action):
264 |         """
265 |         Apply the commanded joint velocities for one control step.
266 |         :param action: angular velocity command for the 12 arm joints
267 |         :return: None (the simulation is stepped in place)
268 |         """
269 |         assert action.shape == (12,)
270 |         self.sim.data.ctrl[:] = action * 0.5
271 |         for _ in range(self.n_substeps):
272 |             self.sim.step()
273 | 
274 |     def _get_obs(self):
275 |         # positions
276 |         # grip_pos = self.sim.data.get_body_xpos("tip_frame")
277 |         # grip_pos1 = self.sim.data.get_body_xpos("tip_frame1")
278 |         """
279 |         # get the rotation angle of the target
280 |         grip_rot = self.sim.data.get_body_xquat('tip_frame')
281 |         grip_rot = rotations.quat2euler(grip_rot)
282 |         grip_rot1 = self.sim.data.get_body_xquat('tip_frame1')
283 |         grip_rot1 = rotations.quat2euler(grip_rot1)
284 |         """
285 |         # dt = self.sim.nsubsteps * self.sim.model.opt.timestep
286 |         # grip_velp = self.sim.data.get_body_xvelp("tip_frame") * dt
287 |         # grip_velp1 = self.sim.data.get_body_xvelp("tip_frame1") * dt
288 |         """
289 |         achieved_goal = np.concatenate([grip_pos.copy(),grip_rot.copy()])
290 |         achieved_goal1 = np.concatenate([grip_pos1.copy(),grip_rot1.copy()])
291 |         """
292 |         post_base_att = self.sim.data.get_body_xquat('chasersat')
293 | 
294 |         obs = np.concatenate(
295 |             [
296 |                 self.sim.data.qpos[:].copy(),
297 |                 self.sim.data.qvel[:].copy(),
298 |                 self.goal.copy(),
299 |             ]
300 |         )
301 | 
302 |         return {
303 |             "observation": obs.copy(),
304 |             "achieved_goal": post_base_att.copy(),
305 |             "desired_goal": self.goal.copy(),
306 |         }
307 | 
308 |     def _viewer_setup(self):
309 |         # body_id = self.sim.model.body_name2id('forearm_link')
310 |         body_id = self.sim.model.body_name2id("wrist_3_link")
311 |         lookat = self.sim.data.body_xpos[body_id]
312 |         for idx, value in enumerate(lookat):
313 |             self.viewer.cam.lookat[idx] = value
314 |         self.viewer.cam.distance = 2.5
315 |         self.viewer.cam.azimuth = 132.0
316 |         self.viewer.cam.elevation = -14.0
317 | 
318 |     def _reset_sim(self):
319 |         self.sim.set_state(self.initial_state)
320 |         self.sim.forward()
321 |         return True
322 | 
323 |     def _sample_goal(self):
324 |         goal = self.initial_base_att
325 | 
326 |         return goal.copy()
327 | 
328 |     def _is_success(self, achieved_goal, desired_goal):
329 |         d = goal_distance(achieved_goal, desired_goal)
330 |         return (d < self.distance_threshold).astype(np.float32)
331 |         # return d
332 | 
333 |     def _env_setup(self, initial_qpos):
334 | 
335 |         # set qpos of chasersat
336 |         chasersat_pos = [0., 0., 4.]  # initial position of the base
337 |         chasersat_ori = np.random.rand(3) * 0.5  # initial base attitude, each Euler angle in [0, 0.5) rad
338 |         chasersat_quat = rotations.euler2quat(chasersat_ori)
339 |         initial_qpos['chasersat:joint'] = list(chasersat_pos) + list(chasersat_quat)
340 |         # print('initial qpos of base is {}'.format(initial_qpos['chasersat:joint']))
341 | 
342 |         for name, value in initial_qpos.items():
343 |             self.sim.data.set_joint_qpos(name, value)
344 |         utils.reset_mocap_welds(self.sim)
345 | 
346 |         # Extract information for sampling goals.
347 | self.initial_gripper_xpos = self.sim.data.get_body_xpos("tip_frame").copy() 348 | self.initial_gripper_xpos1 = self.sim.data.get_body_xpos("tip_frame1").copy() 349 | 350 | # get the initial base attitude 351 | self.initial_base_att = self.sim.data.get_body_xquat("chasersat").copy() 352 | 353 | # get the initial base position 354 | self.initial_base_pos = self.sim.data.get_body_xpos("chasersat").copy() 355 | # print('initial base att is {}'.format(self.initial_base_att)) 356 | # print('initial base pos is {}'.format(self.initial_base_pos)) 357 | # print('initial pos is {}'.format(self.sim.data.qpos[:])) 358 | 359 | def render(self, mode="human", width=500, height=500): 360 | return super(SpacerobotEnv, self).render(mode, width, height) 361 | 362 | 363 | class SpaceRobotReorientation(SpacerobotEnv, gym.utils.EzPickle): 364 | def __init__(self, reward_type="sparse", pro_type="MDP"): 365 | initial_qpos = { 366 | "arm:shoulder_pan_joint": 0.0, 367 | "arm:shoulder_lift_joint": 0.0, 368 | "arm:elbow_joint": 0.0, 369 | "arm:wrist_1_joint": 0.0, 370 | "arm:wrist_2_joint": 0.0, 371 | "arm:wrist_3_joint": 0.0, 372 | "arm:shoulder_pan_joint1": 0.0, 373 | "arm:shoulder_lift_joint1": 0.0, 374 | "arm:elbow_joint1": 0.0, 375 | "arm:wrist_1_joint1": 0.0, 376 | "arm:wrist_2_joint1": 0.0, 377 | "arm:wrist_3_joint1": 0.0, 378 | } 379 | SpacerobotEnv.__init__( 380 | self, 381 | MODEL_XML_PATH, 382 | n_substeps=20, 383 | distance_threshold=0.05, 384 | initial_qpos=initial_qpos, 385 | reward_type=reward_type, 386 | pro_type=pro_type, 387 | c_coeff=0.1, 388 | ) 389 | gym.utils.EzPickle.__init__(self) 390 | -------------------------------------------------------------------------------- /SpaceRobotEnv/envs/SpaceRobotState.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import copy 4 | import numpy as np 5 | 6 | import gym 7 | from gym import spaces 8 | from gym.utils import seeding 9 | 10 | from gym.envs.robotics import utils 11 | from gym.envs.robotics import rotations 12 | 13 | import mujoco_py 14 | 15 | PATH = os.getcwd() 16 | 17 | MODEL_XML_PATH = os.path.join( 18 | PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_state.xml" 19 | ) 20 | DEFAULT_SIZE = 500 21 | 22 | 23 | class RobotEnv(gym.GoalEnv): 24 | def __init__(self, model_path, initial_qpos, n_substeps): 25 | 26 | # load model and simulator 27 | self.model = mujoco_py.load_model_from_path(model_path) 28 | self.sim = mujoco_py.MjSim(self.model, nsubsteps=n_substeps) 29 | 30 | # render setting 31 | self.viewer = None 32 | self._viewers = {} 33 | self.metadata = { 34 | "render.modes": ["human", "rgb_array"], 35 | "video.frames_per_second": int(np.round(1.0 / self.dt)), 36 | } 37 | 38 | # seed 39 | self.seed() 40 | 41 | # initalization 42 | self._env_setup(initial_qpos=initial_qpos) 43 | self.initial_state = copy.deepcopy(self.sim.get_state()) 44 | self.goal = self._sample_goal() 45 | 46 | # set action_space and observation_space 47 | obs = self._get_obs() 48 | self._set_action_space() 49 | self.observation_space = spaces.Dict( 50 | dict( 51 | desired_goal=spaces.Box( 52 | -np.inf, np.inf, shape=obs["desired_goal"].shape, dtype="float32" 53 | ), 54 | achieved_goal=spaces.Box( 55 | -np.inf, np.inf, shape=obs["achieved_goal"].shape, dtype="float32" 56 | ), 57 | observation=spaces.Box( 58 | -np.inf, np.inf, shape=obs["observation"].shape, dtype="float32" 59 | ), 60 | ) 61 | ) 62 | 63 | def _set_action_space(self): 64 | bounds = self.model.actuator_ctrlrange.copy() 65 | low, high = 
bounds.T 66 | self.action_space = spaces.Box( low = low, high = high, dtype = np.float32) 67 | return self.action_space 68 | 69 | @property 70 | def dt(self): 71 | return self.sim.model.opt.timestep * self.sim.nsubsteps 72 | 73 | def _detecte_collision(self): 74 | self.collision = self.sim.data.ncon 75 | return self.collision 76 | 77 | def _sensor_torque(self): 78 | self.sensor_data = self.sim.data.sensordata 79 | return self.sensor_data 80 | 81 | def seed(self, seed=None): 82 | self.np_random, seed = seeding.np_random(seed) 83 | return [seed] 84 | 85 | def step(self, action): 86 | old_action = self.sim.data.ctrl.copy() * (1 / 0.5) 87 | action = np.clip(action, self.action_space.low, self.action_space.high) 88 | self._set_action(action) 89 | self._step_callback() 90 | obs = self._get_obs() 91 | done = False 92 | info = { 93 | "is_success": self._is_success(obs["achieved_goal"], self.goal), 94 | "act": action, 95 | "old_act": old_action, 96 | } 97 | reward = self.compute_reward( 98 | obs["achieved_goal"], self.goal, action, old_action, info 99 | ) 100 | return obs, reward, done, info 101 | 102 | def reset(self): 103 | """Attempt to reset the simulator. Since we randomize initial conditions, it 104 | is possible to get into a state with numerical issues (e.g. due to penetration or 105 | Gimbel lock) or we may not achieve an initial condition (e.g. an object is within the hand). 106 | In this case, we just keep randomizing until we eventually achieve a valid initial 107 | configuration. 108 | """ 109 | super(RobotEnv, self).reset() 110 | did_reset_sim = False 111 | while not did_reset_sim: 112 | did_reset_sim = self._reset_sim() 113 | 114 | self.goal = self._sample_goal() 115 | obs = self._get_obs() 116 | 117 | return obs 118 | 119 | def close(self): 120 | if self.viewer is not None: 121 | # self.viewer.finish() 122 | self.viewer = None 123 | self._viewers = {} 124 | 125 | def render(self, mode="human", width=DEFAULT_SIZE, height=DEFAULT_SIZE): 126 | # self._render_callback() 127 | if mode == "rgb_array": 128 | self._get_viewer(mode).render(width, height) 129 | # window size used for old mujoco-py: 130 | data = self._get_viewer(mode).read_pixels(width, height, depth=False) 131 | # original image is upside-down, so flip it 132 | return data[::-1, :, :] 133 | elif mode == "human": 134 | self._get_viewer(mode).render() 135 | 136 | def _get_viewer(self, mode): 137 | self.viewer = self._viewers.get(mode) 138 | if self.viewer is None: 139 | if mode == "human": 140 | self.viewer = mujoco_py.MjViewer(self.sim) 141 | elif mode == "rgb_array": 142 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, device_id=-1) 143 | self._viewer_setup() 144 | self._viewers[mode] = self.viewer 145 | return self.viewer 146 | 147 | def _reset_sim(self): 148 | """Resets a simulation and indicates whether or not it is successful. 149 | If a reset is unsuccessful (e.g. if a randomized state caused an error in the 150 | simulation), this method should indicate such a failure by returning False. 151 | In such a case, this method will be called again to attempt a the reset again. 
152 |         """
153 |         self.sim.set_state(self.initial_state)
154 |         self.sim.forward()
155 |         return True
156 | 
157 |     def _get_obs(self):
158 |         """Returns the observation."""
159 |         raise NotImplementedError()
160 | 
161 |     def _set_action(self, action):
162 |         """Applies the given action to the simulation."""
163 |         raise NotImplementedError()
164 | 
165 |     def _is_success(self, achieved_goal, desired_goal):
166 |         """Indicates whether or not the achieved goal successfully achieved the desired goal."""
167 |         raise NotImplementedError()
168 | 
169 |     def _sample_goal(self):
170 |         """Samples a new goal and returns it."""
171 |         raise NotImplementedError()
172 | 
173 |     def _env_setup(self, initial_qpos):
174 |         """Initial configuration of the environment. Can be used to configure initial state
175 |         and extract information from the simulation.
176 |         """
177 |         pass
178 | 
179 |     def _viewer_setup(self):
180 |         """Initial configuration of the viewer. Can be used to set the camera position,
181 |         for example.
182 |         """
183 |         pass
184 | 
185 |     def _render_callback(self):
186 |         """A custom callback that is called before rendering. Can be used
187 |         to implement custom visualizations.
188 |         """
189 |         pass
190 | 
191 |     def _step_callback(self):
192 |         """A custom callback that is called after stepping the simulation. Can be used
193 |         to enforce additional constraints on the simulation state.
194 |         """
195 |         pass
196 | 
197 | 
198 | def goal_distance(goal_a, goal_b):
199 |     assert goal_a.shape == goal_b.shape
200 |     return np.linalg.norm(goal_a - goal_b, axis=-1)
201 | 
202 | 
203 | class SpacerobotEnv(RobotEnv):
204 |     """Superclass for all SpaceRobot environments."""
205 | 
206 |     def __init__(
207 |         self,
208 |         model_path,
209 |         n_substeps,
210 |         distance_threshold,
211 |         initial_qpos,
212 |         reward_type,
213 |     ):
214 |         """Initializes a new SpaceRobot environment.
215 |         Args:
216 |             model_path (string): path to the environment's XML file
217 |             n_substeps (int): number of substeps the simulation runs on every call to step
218 |             distance_threshold (float): the threshold after which a goal is considered achieved
219 |             initial_qpos (dict): a dictionary of joint names and values that define the initial configuration
220 |             reward_type ('sparse', 'distance', or any other value for the dense reward): the reward type
221 |         """
222 |         self.n_substeps = n_substeps
223 |         self.distance_threshold = distance_threshold
224 |         self.reward_type = reward_type
225 | 
226 |         super(SpacerobotEnv, self).__init__(
227 |             model_path=model_path,
228 |             n_substeps=n_substeps,
229 |             initial_qpos=initial_qpos,
230 |         )
231 | 
232 |     def compute_reward(self, achieved_goal, desired_goal, action, old_action, info):
233 | 
234 |         # Compute distance between goal and the achieved goal.
235 | d = goal_distance(achieved_goal, desired_goal) 236 | if self.reward_type == "sparse": 237 | return -(d > self.distance_threshold).astype(np.float32) 238 | elif self.reward_type == "distance": 239 | return d 240 | else: 241 | # dense reward 242 | return -( 243 | 0.001 * d ** 2 244 | + np.log10(d ** 2 + 1e-6) 245 | + 0.01 * np.linalg.norm(action - old_action) ** 2 246 | ) 247 | 248 | def _set_action(self, action): 249 | """ 250 | :param action: 3*None->6*None 251 | :return: 252 | """ 253 | assert action.shape == (6,) 254 | self.sim.data.ctrl[:] = action * 0.5 255 | for _ in range(self.n_substeps): 256 | self.sim.step() 257 | 258 | def _get_obs(self): 259 | # positions 260 | grip_pos = self.sim.data.get_body_xpos("tip_frame") 261 | grip_velp = self.sim.data.get_body_xvelp("tip_frame") * self.dt 262 | robot_qpos, robot_qvel = utils.robot_get_obs(self.sim) 263 | 264 | gripper_state = robot_qpos[-1:] 265 | gripper_vel = ( 266 | robot_qvel[-1:] * self.dt 267 | ) # change to a scalar if the gripper is made symmetric 268 | 269 | achieved_goal = grip_pos.copy() 270 | 271 | obs = np.concatenate( 272 | [ 273 | self.sim.data.qpos[7:13].copy(), 274 | self.sim.data.qvel[6:12].copy(), 275 | grip_pos, 276 | grip_velp, 277 | self.goal.copy(), 278 | ] 279 | ) 280 | 281 | return { 282 | "observation": obs.copy(), 283 | "achieved_goal": achieved_goal.copy(), 284 | "desired_goal": self.goal.copy(), 285 | } 286 | 287 | def _viewer_setup(self): 288 | body_id = self.sim.model.body_name2id("tip_frame") 289 | lookat = self.sim.data.body_xpos[body_id] 290 | for idx, value in enumerate(lookat): 291 | self.viewer.cam.lookat[idx] = value 292 | self.viewer.cam.distance = 2.5 293 | self.viewer.cam.azimuth = 132.0 294 | self.viewer.cam.elevation = -14.0 295 | 296 | def _reset_sim(self): 297 | self.sim.set_state(self.initial_state) 298 | self.sim.forward() 299 | return True 300 | 301 | def _sample_goal(self): 302 | 303 | goal = self.initial_gripper_xpos[:3].copy() 304 | d = goal_distance(self.sim.data.get_body_xpos("tip_frame").copy(), goal) 305 | 306 | goal[0] = self.initial_gripper_xpos[0] + np.random.uniform(-0.4, 0) 307 | goal[1] = self.initial_gripper_xpos[1] + np.random.uniform(-0.3, 0.3) 308 | goal[2] = self.initial_gripper_xpos[2] + np.random.uniform(0, 0.3) 309 | 310 | d = goal_distance(self.sim.data.get_body_xpos("tip_frame").copy(), goal) 311 | 312 | site_id = self.sim.model.site_name2id("target0") 313 | self.sim.model.site_pos[site_id] = goal 314 | self.sim.forward() 315 | 316 | return goal.copy() 317 | 318 | def _is_success(self, achieved_goal, desired_goal): 319 | d = goal_distance(achieved_goal, desired_goal) 320 | return (d < self.distance_threshold).astype(np.float32) 321 | # return d 322 | 323 | def _env_setup(self, initial_qpos): 324 | for name, value in initial_qpos.items(): 325 | self.sim.data.set_joint_qpos(name, value) 326 | utils.reset_mocap_welds(self.sim) 327 | 328 | # Extract information for sampling goals. 
329 | self.initial_gripper_xpos = self.sim.data.get_body_xpos("tip_frame").copy() 330 | 331 | def render(self, mode="human", width=500, height=500): 332 | return super(SpacerobotEnv, self).render(mode, width, height) 333 | 334 | 335 | class SpaceRobotState(SpacerobotEnv, gym.utils.EzPickle): 336 | def __init__(self, reward_type="nosparse"): 337 | initial_qpos = { 338 | "arm:shoulder_pan_joint": 0.0, 339 | "arm:shoulder_lift_joint": 0.0, 340 | "arm:elbow_joint": 0.0, 341 | "arm:wrist_1_joint": 0.0, 342 | "arm:wrist_2_joint": 0.0, 343 | "arm:wrist_3_joint": 0.0, 344 | } 345 | SpacerobotEnv.__init__( 346 | self, 347 | MODEL_XML_PATH, 348 | n_substeps=20, 349 | distance_threshold=0.05, 350 | initial_qpos=initial_qpos, 351 | reward_type=reward_type, 352 | ) 353 | gym.utils.EzPickle.__init__(self) 354 | -------------------------------------------------------------------------------- /SpaceRobotEnv/envs/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from .SpaceRobotDualArm import SpaceRobotDualArm 3 | from .SpaceRobotImage import SpaceRobotImage 4 | from .SpaceRobotState import SpaceRobotState 5 | from .SpaceRobotCost import SpaceRobotCost 6 | from .SpaceRobotReorientation import SpaceRobotReorientation 7 | -------------------------------------------------------------------------------- /SpaceRobotEnv/images/Simulation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/Simulation.jpg -------------------------------------------------------------------------------- /SpaceRobotEnv/images/ccc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/ccc.png -------------------------------------------------------------------------------- /SpaceRobotEnv/images/iros.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/iros.gif -------------------------------------------------------------------------------- /SpaceRobotEnv/images/ral.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/ral.gif -------------------------------------------------------------------------------- /SpaceRobotEnv/images/robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/robot.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym>=0.15.4 2 | mujoco-py>=1.15.1.0 3 | torch>=1.12.0 4 | torchvision>=0.13.0 5 | torchaudio>=0.12.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, realpath 2 | from 
setuptools import find_packages, setup 3 | 4 | def read_requirements_file(filename): 5 | req_file_path = '%s/%s' % (dirname(realpath(__file__)), filename) 6 | with open(req_file_path) as f: 7 | return [line.strip() for line in f] 8 | 9 | setup( 10 | name="SpaceRobotEnv", 11 | version="0.0.1", 12 | install_requires=read_requirements_file('requirements.txt'), 13 | packages=find_packages(exclude=("image",)), 14 | ) -------------------------------------------------------------------------------- /test_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | import SpaceRobotEnv 4 | import numpy as np 5 | 6 | env = gym.make("SpaceRobotReorientation-v0") 7 | 8 | dim_u = env.action_space.shape[0] 9 | print(dim_u) 10 | dim_o = env.observation_space["observation"].shape[0] 11 | print(dim_o) 12 | 13 | 14 | observation = env.reset() 15 | max_action = env.action_space.high 16 | print("max_action:", max_action) 17 | print("min_action", env.action_space.low) 18 | for e_step in range(20): 19 | observation = env.reset() 20 | for i_step in range(50): 21 | env.render() 22 | action = np.random.uniform(low=-1.0, high=1.0, size=(dim_u,)) 23 | observation, reward, done, info = env.step(max_action * action) 24 | 25 | env.close() 26 | --------------------------------------------------------------------------------
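test_env.py above exercises the environments with uniform random actions. As a complement, here is a minimal evaluation sketch under the same old-style Gym API used throughout this repository; the evaluate helper and policy_fn argument are illustrative names, not part of the package, and a trained agent's action function could be passed in place of the random baseline.

import gym
import numpy as np

import SpaceRobotEnv  # noqa: F401  (importing registers the SpaceRobot* environments)


def evaluate(policy_fn, env_id="SpaceRobotState-v0", episodes=10, horizon=50):
    """Return the fraction of rollouts whose final step reports info['is_success'] == 1."""
    env = gym.make(env_id)
    successes = 0
    for _ in range(episodes):
        obs = env.reset()  # dict with "observation", "achieved_goal", "desired_goal"
        info = {}
        for _ in range(horizon):
            action = np.clip(policy_fn(obs), env.action_space.low, env.action_space.high)
            obs, reward, done, info = env.step(action)
            if done:
                break
        successes += int(info.get("is_success", 0.0))
    env.close()
    return successes / episodes


if __name__ == "__main__":
    # Random-action baseline, mirroring test_env.py (6 joint-velocity commands for SpaceRobotState-v0).
    print("success rate:", evaluate(lambda obs: np.random.uniform(-1.0, 1.0, size=6)))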