├── .gitignore ├── LICENSE ├── README.md ├── RL_algorithms ├── Torch │ ├── .DS_Store │ ├── DDPG │ │ └── DDPG_ENV │ │ │ ├── core.py │ │ │ ├── ddpg.py │ │ │ ├── logger │ │ │ ├── events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0 │ │ │ ├── events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0 │ │ │ ├── events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0 │ │ │ ├── events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0 │ │ │ ├── events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0 │ │ │ ├── events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0 │ │ │ ├── events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0 │ │ │ └── events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0 │ │ │ ├── memory.py │ │ │ └── training_log_csv │ │ │ ├── Avg Reward (1).svg │ │ │ └── run-.-tag-Avg Reward (1).csv │ ├── PPO │ │ ├── Continious │ │ │ ├── PPO │ │ │ │ ├── __init__.py │ │ │ │ ├── actor.py │ │ │ │ ├── agent.py │ │ │ │ ├── critic.py │ │ │ │ ├── main.py │ │ │ │ ├── memory.py │ │ │ │ ├── models │ │ │ │ │ ├── space_robot_actor.pt │ │ │ │ │ └── space_robot_critic.pt │ │ │ │ ├── plots │ │ │ │ │ └── space_robot_performance.png │ │ │ │ └── utils.py │ │ │ ├── PPO_Two_heads │ │ │ │ ├── __init__.py │ │ │ │ ├── actor.py │ │ │ │ ├── agent.py │ │ │ │ ├── critic.py │ │ │ │ ├── main.py │ │ │ │ ├── memory.py │ │ │ │ ├── models │ │ │ │ │ ├── space_robot_actor.pt │ │ │ │ │ └── space_robot_critic.pt │ │ │ │ ├── plots │ │ │ │ │ └── space_robot_performance.png │ │ │ │ └── utils.py │ │ │ └── __init__.py │ │ ├── Discrete │ │ │ ├── PPO │ │ │ │ ├── __init__.py │ │ │ │ ├── actor.py │ │ │ │ ├── agent.py │ │ │ │ ├── critic.py │ │ │ │ ├── main.py │ │ │ │ ├── memory.py │ │ │ │ ├── models │ │ │ │ │ ├── space_robot_actor.pt │ │ │ │ │ └── space_robot_critic.pt │ │ │ │ ├── plots │ │ │ │ │ └── space_robot_performance.png │ │ │ │ ├── training_log │ │ │ │ └── utils.py │ │ │ └── PPOImage │ │ │ │ ├── __init__.py │ │ │ │ ├── actor.py │ │ │ │ ├── agent.py │ │ │ │ ├── critic.py │ │ │ │ ├── main.py │ │ │ │ ├── memory.py │ │ │ │ ├── models │ │ │ │ ├── .space_robot_actor.pt.icloud │ │ │ │ └── .space_robot_critic.pt.icloud │ │ │ │ ├── plots │ │ │ │ └── space_robot_performance.png │ │ │ │ └── utils.py │ │ └── __init__.py │ ├── SAC │ │ ├── SAC_ENV │ │ │ ├── core.py │ │ │ ├── logger │ │ │ │ ├── events.out.tfevents.1658847118.Tosins-Air.19214.0 │ │ │ │ ├── events.out.tfevents.1658847140.Tosins-Air.19431.0 │ │ │ │ ├── events.out.tfevents.1658847454.Tosins-Air.19535.0 │ │ │ │ ├── events.out.tfevents.1658847513.Tosins-Air.19931.0 │ │ │ │ ├── events.out.tfevents.1658847612.Tosins-Air.19979.0 │ │ │ │ ├── events.out.tfevents.1658847918.Tosins-Air.20089.0 │ │ │ │ ├── events.out.tfevents.1658848049.Tosins-Air.20232.0 │ │ │ │ ├── events.out.tfevents.1658848339.Tosins-Air.20384.0 │ │ │ │ ├── events.out.tfevents.1658848364.Tosins-Air.20423.0 │ │ │ │ ├── events.out.tfevents.1658848673.Tosins-Air.20649.0 │ │ │ │ ├── events.out.tfevents.1658848831.Tosins-Air.20793.0 │ │ │ │ ├── events.out.tfevents.1658849191.Tosins-Air.20924.0 │ │ │ │ ├── events.out.tfevents.1658849218.Tosins-Air.20984.0 │ │ │ │ ├── events.out.tfevents.1658849777.Tosins-Air.21229.0 │ │ │ │ ├── events.out.tfevents.1658849785.Tosins-Air.21269.0 │ │ │ │ ├── events.out.tfevents.1658849885.Tosins-Air.21429.0 │ │ │ │ ├── events.out.tfevents.1658849941.Tosins-Air.21521.0 │ │ │ │ └── events.out.tfevents.1658850278.Tosins-Air.21678.0 │ │ │ ├── memory.py │ │ │ ├── sac.py │ │ │ └── training_log_csv │ │ │ │ ├── run-.-tag-Avg Reward.csv │ │ │ 
│ └── run-.-tag-Loss_Pi.csv │ │ └── __init__.py │ └── __init__.py ├── __init__.py └── utils │ └── mpi_tools.py ├── Simulation.jpg ├── SpaceRobotEnv ├── .DS_Store ├── __init__.py ├── assets │ ├── .DS_Store │ ├── common │ │ ├── __init__.py │ │ ├── materials.xml │ │ ├── skybox.xml │ │ └── visual.xml │ └── spacerobot │ │ ├── arm_v3.xml │ │ ├── arm_v31.xml │ │ ├── asset.xml │ │ ├── sensor.xml │ │ ├── spacerobot_cost.xml │ │ ├── spacerobot_dualarm.xml │ │ ├── spacerobot_image.xml │ │ ├── spacerobot_state.xml │ │ ├── stls │ │ ├── R10.stl │ │ ├── cube.stl │ │ ├── v_base.stl │ │ ├── v_forearm.stl │ │ ├── v_shoulder.stl │ │ ├── v_upperarm.stl │ │ ├── v_wrist1.stl │ │ ├── v_wrist2.stl │ │ └── v_wrist3.stl │ │ └── subgoal.xml ├── envs │ ├── SpaceRobotCost.py │ ├── SpaceRobotDualArm.py │ ├── SpaceRobotImage.py │ ├── SpaceRobotPointCloud.py │ ├── SpaceRobotReorientation.py │ ├── SpaceRobotState.py │ └── __init__.py └── images │ ├── Simulation.jpg │ ├── ccc.png │ ├── iros.gif │ ├── ral.gif │ └── robot.png ├── requirements.txt ├── setup.py └── test_env.py /.gitignore: -------------------------------------------------------------------------------- 1 | 1/ 2 | 2/ 3 | 3/ 4 | SpaceRobotEnv.egg-info/ 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.pyc 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 
106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 165 | .idea/ 166 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SpaceRobotEnv 2 | 3 | > Note: our repo can be found in the OpenAI Gym Documentation now. Please see [SpaceRobotEnv](https://www.gymlibrary.dev/environments/third_party_environments/#spacerobotenv). 4 | 5 | **SpaceRobotEnv** is an open-sourced environments for trajectory planning of free-floating space robots. 
6 | Different from a traditional robot, the free-floating space robot is a dynamically coupled system because of its non-actuated base, as shown in the figure below. 7 | Therefore, model-based trajectory planning methods encounter many difficulties in modeling and computation. 8 | 9 | 10 | Accordingly, research focuses on how to utilize model-free methods, such as reinforcement learning algorithms, to obtain the trajectory directly. 11 | However, reaching high planning accuracy, bimanual coordination and end-to-end control remains an open challenge for space robotics researchers. 12 | To better help the community study this problem, SpaceRobotEnv is developed with the following key features: 13 | * **Real Space Environment**: we construct environments that resemble the space setting, with the free-floating space robot placed in a low-gravity condition. 14 | * **Dynamic coupling control**: compared with robots on the ground, the joint torques have a significant impact on the posture of the base. The movement of the base disturbs the positions of the end-effectors, leading to a more complex trajectory planning task. 15 | * **Image input**: we provide the ability to use images as observations, and we also demonstrate that our environment is effective; please see [our paper](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9550509). 16 | 17 | - **Quick Demos** 18 | 19 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9636681) 20 |
21 | 22 |
23 | 24 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9636681) 25 |
26 | 27 |
28 | 29 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9550509) 30 |
31 | 32 |
33 | 34 | The environments of this repo are as follows: 35 | * **SpaceRobotState-v0** 36 | * The state vector contains the angular positions and velocities of the joints, the positions and velocities of the end-effectors, and the positions of the goals. The core goal is to make the end-effector reach a goal randomly selected within a large space. 37 | * **SpaceRobotCost-v0** 38 | * The task is to make the end-effector reach a random goal while avoiding obvious movement of the base, especially of its orientation, because rotation of the base would interrupt communication with the Earth. 39 | * **SpaceRobotImage-v0** 40 | * The state vector only contains image information. The core goal is the same as that of the `SpaceRobotState-v0` environment. 41 | * **SpaceRobotDualArm-v0** 42 | * The free-floating space robot has two robotic arms attached to the base, so the two end-effectors correspond to two goal positions. 43 | The task is finished when both end-effectors reach their goals. 44 | * **SpaceRobotReorientation-v0** 45 | * The free-floating space robot has two robotic arms attached to the base. The initial orientation of the base is sampled randomly in each episode. 46 | The task is finished when the two arms bring the base to the target orientation. 47 | * **SpaceRobotPointCloud-v0** 48 | * The state vector contains point cloud information. The core goal is the same as that of the `SpaceRobotState-v0` environment. 49 | 50 | ## Installation 51 | 52 | Our environments are built on the [MuJoCo simulator](https://github.com/deepmind/mujoco), so before using this repo, please make sure you install the [MuJoCo](https://github.com/deepmind/mujoco) platform. 53 | Additionally, our framework is based on [Gym](https://github.com/openai/gym). 54 | Details regarding the installation of Gym can be found [here](https://github.com/openai/gym). 55 | 56 | After you finish installing MuJoCo and Gym and test some toy examples with them, you can install this repo from source: 57 | 58 | ```bash 59 | pip install -e . 60 | ``` 61 | 62 | ## Quick Start 63 | 64 | We provide a Gym-like API for interacting with the environments; `test_env.py` shows a toy example that verifies them. 65 | This Gym-like API also makes it easy to apply popular RL libraries, such as [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3), to our environments. 66 | ```python 67 | import gym 68 | 69 | import SpaceRobotEnv 70 | import numpy as np 71 | 72 | env = gym.make("SpaceRobotState-v0") 73 | 74 | dim_u = env.action_space.shape[0] 75 | print(dim_u) 76 | dim_o = env.observation_space["observation"].shape[0] 77 | print(dim_o) 78 | 79 | 80 | observation = env.reset() 81 | max_action = env.action_space.high 82 | print("max_action:", max_action) 83 | print("min_action:", env.action_space.low) 84 | for e_step in range(20): 85 | observation = env.reset() 86 | for i_step in range(50): 87 | env.render() 88 | action = np.random.uniform(low=-1.0, high=1.0, size=(dim_u,)) 89 | observation, reward, done, info = env.step(max_action * action) 90 | 91 | env.close() 92 | ``` 93 | 94 | ## Introduction to the free-floating space robot 95 | 96 | The free-floating space robot consists of two parts, a robotic arm and a base satellite. The robot arm is rigidly connected to the base, and the whole space robot remains in a low-gravity condition.
97 | The 6-DoF UR5 model is chosen as the robot arm, and for simplicity the base is modeled as a cubic structure. The specific structure is shown below. 98 | 99 |
100 | 101 |
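As noted in the Quick Start above, the Gym-like API means off-the-shelf RL libraries can be pointed at these environments with little glue code. The snippet below is a minimal, untested sketch of how that might look with [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3); it assumes an SB3 release that still accepts classic Gym environments, and the algorithm choice (SAC), the `MultiInputPolicy` for the dict observation, and all hyperparameters are illustrative placeholders rather than settings shipped with this repo.

```python
import gym

import SpaceRobotEnv  # importing the package registers the SpaceRobot environments with Gym
from stable_baselines3 import SAC

env = gym.make("SpaceRobotState-v0")

# The observation space is a Dict (observation / goal entries), so we use
# SB3's MultiInputPolicy, which builds a feature extractor per dict key.
model = SAC("MultiInputPolicy", env, verbose=1, buffer_size=100_000)
model.learn(total_timesteps=50_000)

# Roll out the trained policy for a few steps.
obs = env.reset()
for _ in range(200):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
env.close()
```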
102 | 103 | 104 | ## Future plan 105 | 106 | 107 | ### Tasks under development: 108 | - [x] Point cloud inputs 109 | - [ ] Add new torque controllers, such as an impedance controller 110 | - [ ] Build new environments 111 | 112 | ### Algorithms: 113 | - [x] PPO 114 | - [ ] TRPO 115 | - [x] DDPG 116 | - [ ] TD3 117 | - [x] SAC 118 | - [ ] HER 119 | - [ ] [HDO](https://ieeexplore.ieee.org/abstract/document/9718193) 120 | 121 | ## Citing SpaceRobotEnv 122 | 123 | If you find SpaceRobotEnv useful, please cite our recent work in your publications. 124 | 125 | ``` 126 | @article{wang2022collision, 127 | title={Collision-Free Trajectory Planning for a 6-DoF Free-Floating Space Robot via Hierarchical Decoupling Optimization}, 128 | author={Wang, Shengjie and Cao, Yuxue and Zheng, Xiang and Zhang, Tao}, 129 | journal={IEEE Robotics and Automation Letters}, 130 | volume={7}, 131 | number={2}, 132 | pages={4953--4960}, 133 | year={2022}, 134 | publisher={IEEE} 135 | } 136 | 137 | @inproceedings{wang2021multi, 138 | title={A Multi-Target Trajectory Planning of a 6-DoF Free-Floating Space Robot via Reinforcement Learning}, 139 | author={Wang, Shengjie and Zheng, Xiang and Cao, Yuxue and Zhang, Tao}, 140 | booktitle={2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, 141 | pages={3724--3730}, 142 | year={2021}, organization={IEEE} 143 | } 144 | 145 | @inproceedings{wang2021end, 146 | title={An End-to-End Trajectory Planning Strategy for Free-floating Space Robots}, 147 | author={Wang, Shengjie and Cao, Yuxue and Zheng, Xiang and Zhang, Tao}, 148 | booktitle={2021 40th Chinese Control Conference (CCC)}, 149 | pages={4236--4241}, 150 | year={2021}, 151 | organization={IEEE} 152 | } 153 | 154 | @article{cao2022reinforcement, 155 | title={Reinforcement Learning with Prior Policy Guidance for Motion Planning of Dual-Arm Free-Floating Space Robot}, 156 | author={Cao, Yuxue and Wang, Shengjie and Zheng, Xiang and Ma, Wenke and Xie, Xinru and Liu, Lei}, 157 | journal={arXiv preprint arXiv:2209.01434}, 158 | year={2022} 159 | } 160 | 161 | ``` 162 | 163 | ## The Team 164 | 165 | SpaceRobotEnv is a project maintained by 166 | [Shengjie Wang](https://github.com/Shengjie-bob), [Xiang Zheng](https://github.com/x-zheng16), [Yuxue Cao](https://github.com/ShenGe123000), [Fengbo Lan](https://github.com/lanrobot) at Tsinghua University. Many thanks to [Tosin](https://github.com/tohsin) for his great contributions. 167 | 168 | 169 | ## License 170 | 171 | SpaceRobotEnv is released under the Apache 2.0 license, as found in the [LICENSE](LICENSE) file.
172 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/.DS_Store -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/ddpg.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import torch 4 | from torch.optim import Adam 5 | import gym 6 | import time 7 | import SpaceRobotEnv 8 | import core 9 | from torch.utils.tensorboard import SummaryWriter 10 | # run tensor board tensorboard --logdir = /Users/emma/dev/SpaceRobotEnv/RL_algorithms/Torch/DDPG/DDPG_ENV/logger 11 | #tensorboard --logdir=/Users/emma/dev/SpaceRobotEnv/RL_algorithms/Torch/DDPG/DDPG_ENV/logger 12 | class ReplayBuffer: 13 | """ 14 | A simple FIFO experience replay buffer for DDPG agents. 
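    Transitions (obs, act, rew, next_obs, done) are kept in pre-allocated numpy arrays; once the buffer is full the oldest entries are overwritten, and `sample_batch` draws a uniform random minibatch (with replacement) from whatever has been stored so far.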
15 | """ 16 | 17 | def __init__(self, obs_dim, act_dim, size): 18 | self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 19 | self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 20 | self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32) 21 | self.rew_buf = np.zeros(size, dtype=np.float32) 22 | self.done_buf = np.zeros(size, dtype=np.float32) 23 | self.ptr, self.size, self.max_size = 0, 0, size 24 | 25 | def store(self, obs, act, rew, next_obs, done): 26 | self.obs_buf[self.ptr] = obs 27 | self.obs2_buf[self.ptr] = next_obs 28 | self.act_buf[self.ptr] = act 29 | self.rew_buf[self.ptr] = rew 30 | self.done_buf[self.ptr] = done 31 | self.ptr = (self.ptr+1) % self.max_size 32 | self.size = min(self.size+1, self.max_size) 33 | 34 | def sample_batch(self, batch_size=32): 35 | idxs = np.random.randint(0, self.size, size=batch_size) 36 | batch = dict(obs=self.obs_buf[idxs], 37 | obs2=self.obs2_buf[idxs], 38 | act=self.act_buf[idxs], 39 | rew=self.rew_buf[idxs], 40 | done=self.done_buf[idxs]) 41 | return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()} 42 | 43 | 44 | 45 | def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 46 | steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 47 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 48 | update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 49 | max_ep_len=1000, save_freq=1): 50 | """ 51 | Deep Deterministic Policy Gradient (DDPG) 52 | 53 | 54 | Args: 55 | env_fn : A function which creates a copy of the environment. 56 | The environment must satisfy the OpenAI Gym API. 57 | 58 | actor_critic: The constructor method for a PyTorch Module with an ``act`` 59 | method, a ``pi`` module, and a ``q`` module. The ``act`` method and 60 | ``pi`` module should accept batches of observations as inputs, 61 | and ``q`` should accept a batch of observations and a batch of 62 | actions as inputs. When called, these should return: 63 | 64 | =========== ================ ====================================== 65 | Call Output Shape Description 66 | =========== ================ ====================================== 67 | ``act`` (batch, act_dim) | Numpy array of actions for each 68 | | observation. 69 | ``pi`` (batch, act_dim) | Tensor containing actions from policy 70 | | given observations. 71 | ``q`` (batch,) | Tensor containing the current estimate 72 | | of Q* for the provided observations 73 | | and actions. (Critical: make sure to 74 | | flatten this!) 75 | =========== ================ ====================================== 76 | 77 | ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 78 | you provided to DDPG. 79 | 80 | seed (int): Seed for random number generators. 81 | 82 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 83 | for the agent and the environment in each epoch. 84 | 85 | epochs (int): Number of epochs to run and train agent. 86 | 87 | replay_size (int): Maximum length of replay buffer. 88 | 89 | gamma (float): Discount factor. (Always between 0 and 1.) 90 | 91 | polyak (float): Interpolation factor in polyak averaging for target 92 | networks. Target networks are updated towards main networks 93 | according to: 94 | 95 | .. math:: \\theta_{\\text{targ}} \\leftarrow 96 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 97 | 98 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 99 | close to 1.) 
100 | 101 | pi_lr (float): Learning rate for policy. 102 | 103 | q_lr (float): Learning rate for Q-networks. 104 | 105 | batch_size (int): Minibatch size for SGD. 106 | 107 | start_steps (int): Number of steps for uniform-random action selection, 108 | before running real policy. Helps exploration. 109 | 110 | update_after (int): Number of env interactions to collect before 111 | starting to do gradient descent updates. Ensures replay buffer 112 | is full enough for useful updates. 113 | 114 | update_every (int): Number of env interactions that should elapse 115 | between gradient descent updates. Note: Regardless of how long 116 | you wait between updates, the ratio of env steps to gradient steps 117 | is locked to 1. 118 | 119 | act_noise (float): Stddev for Gaussian exploration noise added to 120 | policy at training time. (At test time, no noise is added.) 121 | 122 | num_test_episodes (int): Number of episodes to test the deterministic 123 | policy at the end of each epoch. 124 | 125 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 126 | 127 | logger_kwargs (dict): Keyword args for EpochLogger. 128 | 129 | save_freq (int): How often (in terms of gap between epochs) to save 130 | the current policy and value function. 131 | 132 | """ 133 | 134 | # logger = EpochLogger(**logger_kwargs) 135 | # logger.save_config(locals()) 136 | n_update_step = 0 137 | n_test_step = 0 138 | n_played_games = 0 139 | score_history = [] 140 | torch.manual_seed(seed) 141 | np.random.seed(seed) 142 | 143 | env, test_env = env_fn(), env_fn() 144 | obs_dim = env.observation_space['observation'].shape[0] 145 | act_dim = env.action_space.shape[0] 146 | 147 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 148 | act_limit = env.action_space.high[0] 149 | 150 | # Create actor-critic module and target networks 151 | ac = actor_critic(env.observation_space['observation'], env.action_space, **ac_kwargs) 152 | ac_targ = deepcopy(ac) 153 | 154 | # Freeze target networks with respect to optimizers (only update via polyak averaging) 155 | for p in ac_targ.parameters(): 156 | p.requires_grad = False 157 | 158 | # Experience buffer 159 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 160 | 161 | # Count variables (protip: try to get a feel for how different size networks behave!) 
162 | var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) 163 | # logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts) 164 | 165 | # Set up function for computing DDPG Q-loss 166 | def compute_loss_q(data): 167 | o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] 168 | 169 | q = ac.q(o,a) 170 | 171 | # Bellman backup for Q function 172 | with torch.no_grad(): 173 | q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) 174 | backup = r + gamma * (1 - d) * q_pi_targ 175 | 176 | # MSE loss against Bellman backup 177 | loss_q = ((q - backup)**2).mean() 178 | 179 | # Useful info for logging 180 | loss_info = dict(QVals=q.detach().numpy()) 181 | 182 | return loss_q, loss_info 183 | 184 | # Set up function for computing DDPG pi loss 185 | def compute_loss_pi(data): 186 | o = data['obs'] 187 | q_pi = ac.q(o, ac.pi(o)) 188 | return -q_pi.mean() 189 | 190 | # Set up optimizers for policy and q-function 191 | pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) 192 | q_optimizer = Adam(ac.q.parameters(), lr=q_lr) 193 | 194 | # Set up model saving 195 | # logger.setup_pytorch_saver(ac) 196 | 197 | def update(data): 198 | # First run one gradient descent step for Q. 199 | q_optimizer.zero_grad() 200 | loss_q, loss_info = compute_loss_q(data) 201 | loss_q.backward() 202 | q_optimizer.step() 203 | writer.add_scalar("Loss_Q", loss_q.item(), n_update_step ) 204 | 205 | # Freeze Q-network so you don't waste computational effort 206 | # computing gradients for it during the policy learning step. 207 | for p in ac.q.parameters(): 208 | p.requires_grad = False 209 | 210 | # Next run one gradient descent step for pi. 211 | pi_optimizer.zero_grad() 212 | loss_pi = compute_loss_pi(data) 213 | loss_pi.backward() 214 | pi_optimizer.step() 215 | writer.add_scalar("Loss_Pi", loss_pi.item(), n_update_step) 216 | 217 | # Unfreeze Q-network so you can optimize it at next DDPG step. 218 | for p in ac.q.parameters(): 219 | p.requires_grad = True 220 | 221 | 222 | # Record things 223 | # logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) 224 | 225 | # Finally, update target networks by polyak averaging. 226 | with torch.no_grad(): 227 | for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): 228 | # NB: We use an in-place operations "mul_", "add_" to update target 229 | # params, as opposed to "mul" and "add", which would make new tensors. 
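                # In other words: p_targ <- polyak * p_targ + (1 - polyak) * p (the soft/Polyak target update).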
230 | p_targ.data.mul_(polyak) 231 | p_targ.data.add_((1 - polyak) * p.data) 232 | 233 | def get_action(o, noise_scale): 234 | a = ac.act(torch.as_tensor(o, dtype=torch.float32)) 235 | a += noise_scale * np.random.randn(act_dim) 236 | return np.clip(a, -act_limit, act_limit) 237 | 238 | def test_agent(): 239 | avg_score_test = [] 240 | for j in range(num_test_episodes): 241 | o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 242 | o = o['observation'] 243 | while not(d or (ep_len == max_ep_len)): 244 | # Take deterministic actions at test time (noise_2scale=0) 245 | o, r, d, _ = test_env.step(get_action(o, 0)) 246 | o = o['observation'] 247 | ep_ret += r 248 | ep_len += 1 249 | avg_score_test.append(ep_ret) 250 | writer.add_scalar("Test_score avg", np.mean(avg_score_test), n_test_step) 251 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 252 | 253 | # Prepare for interaction with environment 254 | total_steps = steps_per_epoch * epochs 255 | start_time = time.time() 256 | o, ep_ret, ep_len = env.reset(), 0, 0 257 | o = o["observation"] 258 | 259 | # Main loop: collect experience in env and update/log each epoch 260 | for t in range(total_steps): 261 | 262 | # Until start_steps have elapsed, randomly sample actions 263 | # from a uniform distribution for better exploration. Afterwards, 264 | # use the learned policy (with some noise, via act_noise). 265 | if t > start_steps: 266 | a = get_action(o, act_noise) 267 | else: 268 | a = env.action_space.sample() 269 | 270 | # Step the env 271 | o2, r, d, _ = env.step(a) 272 | o2 = o2["observation"] 273 | ep_ret += r 274 | ep_len += 1 275 | 276 | # Ignore the "done" signal if it comes from hitting the time 277 | # horizon (that is, when it's an artificial terminal signal 278 | # that isn't based on the agent's state) 279 | d = False if ep_len==max_ep_len else d 280 | 281 | # Store experience to replay buffer 282 | replay_buffer.store(o, a, r, o2, d) 283 | 284 | # Super critical, easy to overlook step: make sure to update 285 | # most recent observation! 286 | o = o2 287 | 288 | # End of trajectory handling 289 | if d or (ep_len == max_ep_len): 290 | # logger.store(EpRet=ep_ret, EpLen=ep_len) 291 | n_played_games += 1 292 | score_history.append(ep_ret) 293 | avg_score = np.mean(score_history[-100:]) 294 | writer.add_scalar("Avg Reward", avg_score, n_played_games ) 295 | print( 'score %.1f' %ep_ret, 'avg_score %.1f' %avg_score,'num_games', n_played_games, ) 296 | 297 | o, ep_ret, ep_len = env.reset(), 0, 0 298 | o= o["observation"] 299 | 300 | # Update handling 301 | if t >= update_after and t % update_every == 0: 302 | for _ in range(update_every): 303 | n_update_step += 1 304 | batch = replay_buffer.sample_batch(batch_size) 305 | update(data=batch) 306 | 307 | # End of epoch handling 308 | if (t+1) % steps_per_epoch == 0: 309 | epoch = (t+1) // steps_per_epoch 310 | 311 | # Test the performance of the deterministic version of the agent. 
312 | n_test_step +=1 313 | test_agent() 314 | 315 | # Log info about epoch 316 | 317 | 318 | if __name__ == '__main__': 319 | import argparse 320 | parser = argparse.ArgumentParser() 321 | parser.add_argument('--env', type=str, default='SpaceRobotState-v0') 322 | parser.add_argument('--hid', type=int, default=256) 323 | parser.add_argument('--l', type=int, default=2) 324 | parser.add_argument('--gamma', type=float, default=0.99) 325 | parser.add_argument('--seed', '-s', type=int, default=0) 326 | parser.add_argument('--epochs', type=int, default=50) 327 | parser.add_argument('--exp_name', type=str, default='ddpg') 328 | args = parser.parse_args() 329 | 330 | writer = SummaryWriter("RL_algorithms/Torch/DDPG/DDPG_ENV/logger") 331 | writer.add_text( 332 | "hyperparameters", 333 | "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), 334 | ) 335 | ddpg(lambda : gym.make(args.env), actor_critic=core.MLPActorCritic, 336 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 337 | gamma=args.gamma, seed=args.seed, epochs=args.epochs) 338 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/DDPG/DDPG_ENV/memory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/memory.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/actor.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.distributions.normal import Normal 9 | PATH = os.getcwd() 10 | 11 | class ActorNetwork(nn.Module): 12 | 13 | def __init__(self, n_actions, input_dims, alpha, model_name : str, 14 | fc1_dims=256, fc2_dims=256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO/models/'): 15 | super(ActorNetwork, self).__init__() 16 | self.n_actions = n_actions 17 | 18 | log_std = -0.5 * np.ones(n_actions, dtype=np.float32) 19 | self.log_std = T.nn.Parameter(T.as_tensor(log_std)) 20 | 21 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 22 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name) 23 | 24 | self.actor = 
nn.Sequential( 25 | nn.Linear(*input_dims, fc1_dims), 26 | nn.ReLU(), 27 | nn.Linear(fc1_dims, fc2_dims), 28 | nn.ReLU(), 29 | nn.Linear(fc2_dims, n_actions), 30 | nn.Tanh() 31 | ) 32 | 33 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 34 | 35 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 36 | 37 | self.to(self.device) 38 | 39 | def forward(self, obs, act = None): 40 | pi = self._distribution(obs) 41 | logp_a = None 42 | if act is not None: 43 | logp_a = self._log_prob_from_distribution(pi, act) 44 | return pi, logp_a 45 | 46 | def _distribution(self, state): 47 | mu = self.actor(state) 48 | std = T.exp(self.log_std) 49 | return Normal(mu, std) 50 | 51 | def _log_prob_from_distribution(self, pi, act): 52 | return pi.log_prob(act).sum(axis=-1) 53 | 54 | def save_checkpoint(self): 55 | T.save(self.state_dict(), self.checkpoint_file) 56 | 57 | def load_checkpoint(self): 58 | self.load_state_dict(T.load(self.checkpoint_file)) 59 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/agent.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from multiprocessing.context import BaseContext 3 | import os 4 | import copy 5 | from tqdm import tqdm 6 | import numpy as np 7 | import torch as T 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | 11 | from actor import ActorNetwork 12 | from critic import CriticNetwork 13 | from memory import PPOBuffer 14 | 15 | 16 | PATH = os.getcwd() 17 | # MODEL_XML_PATH = os.path.join( 18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml" 19 | # ) 20 | 21 | class Agent: 22 | def __init__(self, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \ 23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \ 24 | policy_clip = 0.2, n_epoch = 10, batch_size = 64): 25 | ''' 26 | parameter 27 | arguments: 28 | - model_name_actor : model name for actor to be used in model savind directory 29 | - model_name_critic :model name for critic to be used in model savind directory 30 | ''' 31 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10 32 | self.gamma = gamma 33 | self.gae_lambda = gae_lambda 34 | self.policy_clip = policy_clip 35 | self.n_epoch = n_epoch 36 | 37 | self.actor = ActorNetwork(n_actions, input_dims, alpha, model_name = model_name_actor) 38 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic) 39 | self.memory_handler = PPOBuffer( batch_size ) 40 | 41 | def remember(self, state, action, probs, vals, reward, done): 42 | self.memory_handler.store_memory(state, action, probs, vals, reward, done) 43 | 44 | def save_models(self): 45 | print("Saving models now") 46 | self.actor.save_checkpoint() 47 | self.critic.save_checkpoint() 48 | 49 | def load_model(self): 50 | print("Load model") 51 | self.actor.load_checkpoint() 52 | self.critic.load_checkpoint() 53 | 54 | def play_optimal(self, observation): 55 | with T.no_grad(): 56 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 57 | dist = self.actor(state) 58 | # action shoulnt be sampe it should be arg max 59 | action = dist.sample() 60 | action =T.squeeze(action).item() 61 | return action 62 | 63 | def choose_action(self, observation): 64 | with T.no_grad(): 65 | observation = T.tensor([observation], dtype=T.float).to(self.actor.device) 66 | policy = self.actor._distribution(observation) 67 | 
action = policy.sample() 68 | logp_a = self.actor._log_prob_from_distribution(policy, action) 69 | value = self.critic(observation) 70 | 71 | return action.numpy(), logp_a.numpy(), value.numpy() 72 | 73 | def learn(self): 74 | for _ in range(self.n_epoch): 75 | 76 | state_arr, action_arr, old_prob_arr, vals_arr,\ 77 | reward_arr, dones_arr, batches = \ 78 | self.memory_handler.generate_batches() 79 | 80 | values = vals_arr.copy() 81 | advantage = np.zeros(len(reward_arr), dtype=np.float32) 82 | # calculate advantage = sigma_t + (gamma * lamda) * sigma_t+1 + (gamma * lamda) ^ 2 * sigma_t+2..... 83 | # sigma_t = reward_t + gamma * Value(s_ t+1 ) - Value(s_t) 84 | for t in range(len(reward_arr)-1): 85 | discount = 1 86 | a_t = 0 87 | for k in range(t, len(reward_arr)-1): 88 | 89 | a_t += discount * (reward_arr[k] + self.gamma*values[k+1]*\ 90 | (1-int(dones_arr[k])) - values[k]) 91 | 92 | # discount term gamma * gae_lamda (y*lamda) 93 | discount *= self.gamma * self.gae_lambda 94 | advantage[t] = a_t 95 | advantage = T.tensor(advantage).to(self.actor.device) 96 | 97 | values = T.tensor(values).to(self.actor.device) 98 | 99 | for batch in batches: 100 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device) 101 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device) 102 | 103 | actions = T.tensor(action_arr[batch]).to(self.actor.device) 104 | 105 | pi, new_probs = self.actor(states, actions) 106 | 107 | critic_value = self.critic(states) 108 | 109 | critic_value = T.squeeze(critic_value) 110 | 111 | # new_probs = dist.log_prob(actions) 112 | 113 | 114 | # prob_ratio = new_probs.exp() / old_probs.exp() 115 | prob_ratio = T.exp(new_probs - old_probs) 116 | weighted_probs = advantage[batch] * prob_ratio 117 | 118 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip, 119 | 1 + self.policy_clip) * advantage[batch] 120 | 121 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean() 122 | 123 | returns = advantage[batch] + values[batch] 124 | critic_loss = (returns-critic_value)**2 125 | critic_loss = critic_loss.mean() 126 | 127 | total_loss = actor_loss + 0.5* critic_loss 128 | self.actor.optimizer.zero_grad() 129 | self.critic.optimiser.zero_grad() 130 | # print("total loss", total_loss.item()) 131 | total_loss.backward() 132 | self.actor.optimizer.step() 133 | self.critic.optimiser.step() 134 | 135 | self.memory_handler.clear_memory() 136 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/critic.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\ 11 | fc2_dims = 256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO/models') -> None: 12 | super(CriticNetwork, self).__init__() 13 | 14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 15 | self.check_point_file = os.path.join(check_point_base_dir, model_name) 16 | self.critic = nn.Sequential( 17 | nn.Linear(*input_dims , fc1_dims), 18 | nn.ReLU(), 19 | nn.Linear(fc1_dims , fc2_dims), 20 | nn.ReLU(), 21 | nn.Linear(fc2_dims , 1), 22 | 23 | ) 24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha) 
25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 26 | self.to(self.device) 27 | 28 | def forward(self, state): 29 | value = self.critic(state) 30 | return value 31 | 32 | def save_checkpoint(self): 33 | T.save(self.state_dict(), self.check_point_file) 34 | 35 | def load_checkpoint(self): 36 | self.load_state_dict(T.load(self.check_point_file)) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/main.py: -------------------------------------------------------------------------------- 1 | from turtle import shape 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | 6 | from utils import plot_learning_curve 7 | import gym 8 | 9 | import SpaceRobotEnv 10 | import numpy as np 11 | 12 | 13 | 14 | if __name__ == '__main__': 15 | env = gym.make("SpaceRobotState-v0") 16 | N = 30 17 | batch_size = 16 18 | n_epochs = 3 19 | alpha = 0.0003 20 | action_space = env.action_space.shape[0] 21 | obs_shape = env.observation_space["observation"].shape 22 | 23 | 24 | 25 | agent = Agent( n_actions = action_space, 26 | batch_size=batch_size, 27 | alpha = alpha, 28 | n_epoch = n_epochs, 29 | input_dims = obs_shape, 30 | model_name_actor = "space_robot_actor.pt", 31 | model_name_critic = "space_robot_critic.pt") 32 | n_iter = 3000 33 | figure_file = 'RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png' 34 | best_score = env.reward_range[0] 35 | score_history = [] 36 | n_steps = 0 37 | learn_iters = 0 38 | avg_score = 0 39 | 40 | for i in range(n_iter): 41 | obs = env.reset() 42 | observation = obs["observation"] 43 | done = False 44 | score = 0 45 | while not done: 46 | action, prob, val = agent.choose_action(observation) 47 | v = prob 48 | # a = action 49 | a = action.reshape(6,) 50 | observation_, reward, done, info = env.step(a) 51 | n_steps+=1 52 | score += reward 53 | 54 | agent.remember(observation, action, prob, val, reward, done) 55 | #steps before we begin learning 20 56 | if n_steps % N ==0: 57 | agent.learn() 58 | learn_iters += 1 59 | observation = observation_["observation"] 60 | score_history.append(score) 61 | avg_score = np.mean(score_history[-100:]) 62 | 63 | if avg_score>best_score: 64 | best_score= avg_score 65 | agent.save_models() 66 | print('episode', i , 'score %.1f', 'avg_score %.1f' %avg_score, 67 | 'time_steps',n_steps, 'learning_steps', learn_iters) 68 | 69 | x = [i+1 for i in range(len(score_history))] 70 | plot_learning_curve(x, score_history,figure_file) 71 | env.close() 72 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/memory.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | 9 | 10 | class PPOBuffer: 11 | def __init__(self, batch_size): 12 | self.states = [] 13 | self.probs = [] 14 | self.vals = [] 15 | self.actions = [] 16 | self.rewards = [] 17 | self.dones = [] 18 | 19 | self.batch_size = batch_size 20 | 21 | def generate_batches(self): 22 | n_states = len(self.states) 23 | batch_start = np.arange(0, n_states, self.batch_size) 24 | indices = np.arange(n_states, dtype=np.int64) 25 | np.random.shuffle(indices) 26 | batches = [indices[i:i+self.batch_size] for i in batch_start] 27 | 28 | return np.array(self.states),\ 29 | np.array(self.actions),\ 30 | 
np.array(self.probs),\ 31 | np.array(self.vals),\ 32 | np.array(self.rewards),\ 33 | np.array(self.dones),\ 34 | batches 35 | 36 | def store_memory(self, state, action, probs, vals, reward, done): 37 | self.states.append(state) 38 | self.actions.append(action) 39 | self.probs.append(probs) 40 | self.vals.append(vals) 41 | self.rewards.append(reward) 42 | self.dones.append(done) 43 | 44 | def clear_memory(self): 45 | self.states = [] 46 | self.probs = [] 47 | self.actions = [] 48 | self.rewards = [] 49 | self.dones = [] 50 | self.vals = [] 51 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_actor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_actor.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_critic.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_critic.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/actor.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn.functional as F 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.distributions.normal import Normal 9 | PATH = os.getcwd() 10 | 11 | class ActorNetwork(nn.Module): 12 | 13 | def __init__(self, max_actions, n_actions, input_dims, alpha, model_name : str, 14 | fc1_dims=256, fc2_dims=256, check_point_base_dir = 
'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models'): 15 | super(ActorNetwork, self).__init__() 16 | self.n_actions = n_actions 17 | self.max_actions = max_actions 18 | 19 | 20 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 21 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name) 22 | self.base_model = nn.Sequential( 23 | nn.Linear(*input_dims, fc1_dims), 24 | nn.ReLU(), 25 | nn.Linear(fc1_dims, fc2_dims), 26 | nn.ReLU(), 27 | ) 28 | fc = [nn.Linear(fc2_dims, 2*n_actions)] 29 | self.fc = nn.Sequential(*fc) 30 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 31 | 32 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 33 | 34 | self.to(self.device) 35 | 36 | def forward(self, state): 37 | x = self.base_model(state) 38 | x = self.fc(x) 39 | mean, std = T.chunk(x, chunks=2, dim=-1) 40 | mean, std = self.max_actions * T.tanh(mean), F.softplus(std) 41 | return mean, std 42 | 43 | def get_logprob(self, state, action): 44 | mean, std = self.forward(state) 45 | dist = Normal(mean, std) 46 | log_prob = dist.log_prob(action).sum(axis=-1) 47 | return log_prob 48 | 49 | 50 | 51 | def save_checkpoint(self): 52 | T.save(self.state_dict(), self.checkpoint_file) 53 | 54 | def load_checkpoint(self): 55 | self.load_state_dict(T.load(self.checkpoint_file)) 56 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/agent.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from multiprocessing.context import BaseContext 3 | import os 4 | import copy 5 | from tqdm import tqdm 6 | import numpy as np 7 | import torch as T 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from torch.distributions import Normal 11 | 12 | from actor import ActorNetwork 13 | from critic import CriticNetwork 14 | from memory import PPOBuffer 15 | 16 | 17 | PATH = os.getcwd() 18 | # MODEL_XML_PATH = os.path.join( 19 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml" 20 | # ) 21 | 22 | class Agent: 23 | def __init__(self, env_max_action, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \ 24 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \ 25 | policy_clip = 0.2, n_epoch = 3, batch_size = 64): 26 | ''' 27 | parameter 28 | arguments: 29 | - model_name_actor : model name for actor to be used in model savind directory 30 | - model_name_critic :model name for critic to be used in model savind directory 31 | ''' 32 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10 33 | self.gamma = gamma 34 | self.gae_lambda = gae_lambda 35 | self.policy_clip = policy_clip 36 | self.n_epoch = n_epoch 37 | 38 | self.actor = ActorNetwork( env_max_action , n_actions, input_dims, alpha, model_name = model_name_actor) 39 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic) 40 | self.memory_handler = PPOBuffer( batch_size ) 41 | 42 | def remember(self, state, action, probs, vals, reward, done): 43 | self.memory_handler.store_memory(state, action, probs, vals, reward, done) 44 | 45 | def save_models(self): 46 | print("Saving models now") 47 | self.actor.save_checkpoint() 48 | self.critic.save_checkpoint() 49 | 50 | def load_model(self): 51 | print("Load model") 52 | self.actor.load_checkpoint() 53 | self.critic.load_checkpoint() 54 | 55 | 56 | def choose_action(self, state): 57 | # state = 
T.as_tensor(state, dtype=T.float, device=device) 58 | state = T.tensor([state], dtype=T.float).to(self.actor.device) 59 | 60 | mean, std = self.actor.forward(state) 61 | 62 | dist = Normal(mean, std) 63 | 64 | 65 | action = dist.sample() 66 | action_logprob = dist.log_prob(action).sum(axis=-1) 67 | value = self.critic(state) 68 | 69 | return action, action_logprob, value 70 | 71 | # def choose_action(self, observation): 72 | # with T.no_grad(): 73 | # observation = T.tensor([observation], dtype=T.float).to(self.actor.device) 74 | # action , logp_a = self.actor.sample_normal(observation) 75 | # value = self.critic(observation) 76 | # return action.numpy(), logp_a.numpy(), value.numpy() 77 | def learn(self): 78 | for _ in range(self.n_epoch): 79 | 80 | state_arr, action_arr, old_prob_arr, vals_arr,\ 81 | reward_arr, dones_arr, batches = \ 82 | self.memory_handler.generate_batches() 83 | 84 | values = vals_arr.copy() 85 | advantage = np.zeros(len(reward_arr), dtype=np.float32) 86 | # calculate advantage = sigma_t + (gamma * lamda) * sigma_t+1 + (gamma * lamda) ^ 2 * sigma_t+2..... 87 | # sigma_t = reward_t + gamma * Value(s_ t+1 ) - Value(s_t) 88 | for t in range(len(reward_arr)-1): 89 | discount = 1 90 | a_t = 0 91 | for k in range(t, len(reward_arr)-1): 92 | 93 | a_t += discount * (reward_arr[k] + self.gamma*values[k+1]*\ 94 | (1-int(dones_arr[k])) - values[k]) 95 | 96 | # discount term gamma * gae_lamda (y*lamda) 97 | discount *= self.gamma * self.gae_lambda 98 | advantage[t] = a_t 99 | advantage = T.tensor(advantage).to(self.actor.device) 100 | 101 | values = T.tensor(values).to(self.actor.device) 102 | 103 | for batch in batches: 104 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device) 105 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device) 106 | 107 | actions = T.tensor(action_arr[batch]).to(self.actor.device) 108 | 109 | new_probs = self.actor.get_logprob(states, actions) 110 | 111 | critic_value = self.critic(states) 112 | 113 | critic_value = T.squeeze(critic_value) 114 | 115 | prob_ratio = T.exp(new_probs - old_probs) 116 | 117 | weighted_probs = advantage[batch] * prob_ratio 118 | 119 | 120 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip, 121 | 1 + self.policy_clip)*advantage[batch] 122 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean() 123 | 124 | returns = advantage[batch] + values[batch] 125 | critic_loss = (returns-critic_value)**2 126 | critic_loss = critic_loss.mean() 127 | 128 | total_loss = actor_loss + 0.5* critic_loss 129 | self.actor.optimizer.zero_grad() 130 | self.critic.optimiser.zero_grad() 131 | # print("total loss", total_loss.item()) 132 | total_loss.backward() 133 | self.actor.optimizer.step() 134 | self.critic.optimiser.step() 135 | 136 | self.memory_handler.clear_memory() 137 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/critic.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\ 11 | fc2_dims = 256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models') -> None: 12 | 
super(CriticNetwork, self).__init__() 13 | 14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 15 | self.check_point_file = os.path.join(check_point_base_dir, model_name) 16 | self.critic = nn.Sequential( 17 | nn.Linear(*input_dims , fc1_dims), 18 | nn.ReLU(), 19 | nn.Linear(fc1_dims , fc2_dims), 20 | nn.ReLU(), 21 | nn.Linear(fc2_dims , 1), 22 | 23 | ) 24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha) 25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 26 | self.to(self.device) 27 | 28 | def forward(self, state): 29 | value = self.critic(state) 30 | return value 31 | 32 | def save_checkpoint(self): 33 | T.save(self.state_dict(), self.check_point_file) 34 | 35 | def load_checkpoint(self): 36 | self.load_state_dict(T.load(self.check_point_file)) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/main.py: -------------------------------------------------------------------------------- 1 | from turtle import shape 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | 6 | from utils import plot_learning_curve 7 | import gym 8 | 9 | import SpaceRobotEnv 10 | import numpy as np 11 | 12 | 13 | 14 | if __name__ == '__main__': 15 | env = gym.make("SpaceRobotState-v0") 16 | N = 30 17 | batch_size = 16 18 | n_epochs = 3 19 | alpha = 0.0003 20 | action_space = env.action_space.shape[0] 21 | obs_shape = env.observation_space["observation"].shape 22 | env_max_action = float(env.action_space.high[0]) 23 | 24 | agent = Agent( env_max_action = env_max_action, 25 | n_actions = action_space, 26 | batch_size = batch_size, 27 | alpha = alpha, 28 | n_epoch = n_epochs, 29 | input_dims = obs_shape, 30 | model_name_actor = "space_robot_actor.pt", 31 | model_name_critic = "space_robot_critic.pt") 32 | n_iter = 300 33 | figure_file = 'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png' 34 | best_score = env.reward_range[0] 35 | score_history = [] 36 | n_steps = 0 37 | learn_iters = 0 38 | avg_score = 0 39 | 40 | for i in range(n_iter): 41 | obs = env.reset() 42 | observation = obs["observation"] 43 | 44 | done = False 45 | score = 0 46 | while not done: 47 | action, prob, val = agent.choose_action(observation) 48 | 49 | action = action.detach().cpu().numpy().flatten() 50 | action = action.clip(env.action_space.low, env.action_space.high) 51 | 52 | action_logprob = prob.detach().cpu().numpy().flatten() 53 | val = val.detach().cpu().numpy().flatten() 54 | 55 | observation_, reward, done, info = env.step(action) 56 | n_steps+=1 57 | score += reward 58 | 59 | agent.remember(observation, action, action_logprob, val, reward, done) 60 | #steps before we begin learning 20 61 | if n_steps % N ==0: 62 | agent.learn() 63 | learn_iters += 1 64 | observation = observation_["observation"] 65 | score_history.append(score) 66 | avg_score = np.mean(score_history[-100:]) 67 | 68 | if avg_score>best_score: 69 | best_score= avg_score 70 | agent.save_models() 71 | print('episode', i , 'score %.1f', 'avg_score %.1f' %avg_score, 72 | 'time_steps',n_steps, 'learning_steps', learn_iters) 73 | 74 | x = [i+1 for i in range(len(score_history))] 75 | plot_learning_curve(x, score_history,figure_file) 76 | env.close() 77 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/memory.py: -------------------------------------------------------------------------------- 1 | from 
multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | 9 | 10 | class PPOBuffer: 11 | def __init__(self, batch_size): 12 | self.states = [] 13 | self.probs = [] 14 | self.vals = [] 15 | self.actions = [] 16 | self.rewards = [] 17 | self.dones = [] 18 | 19 | self.batch_size = batch_size 20 | 21 | def generate_batches(self): 22 | n_states = len(self.states) 23 | batch_start = np.arange(0, n_states, self.batch_size) 24 | indices = np.arange(n_states, dtype=np.int64) 25 | np.random.shuffle(indices) 26 | batches = [indices[i:i+self.batch_size] for i in batch_start] 27 | 28 | return np.array(self.states),\ 29 | np.array(self.actions),\ 30 | np.array(self.probs),\ 31 | np.array(self.vals),\ 32 | np.array(self.rewards),\ 33 | np.array(self.dones),\ 34 | batches 35 | 36 | def store_memory(self, state, action, probs, vals, reward, done): 37 | self.states.append(state) 38 | self.actions.append(action) 39 | self.probs.append(probs) 40 | self.vals.append(vals) 41 | self.rewards.append(reward) 42 | self.dones.append(done) 43 | 44 | def clear_memory(self): 45 | self.states = [] 46 | self.probs = [] 47 | self.actions = [] 48 | self.rewards = [] 49 | self.dones = [] 50 | self.vals = [] 51 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_actor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_actor.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_critic.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_critic.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Continious/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/actor.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | 10 | class ActorNetwork(nn.Module): 11 | 12 | def __init__(self, n_actions, input_dims, alpha, model_name : str, 13 | fc1_dims=256, fc2_dims=256, check_point_base_dir = 'Learning_algorithm/Torch/PPO/models/'): 14 | super(ActorNetwork, self).__init__() 15 | 16 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 17 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name) 18 | self.actor = nn.Sequential( 19 | nn.Linear(*input_dims, fc1_dims), 20 | nn.ReLU(), 21 | nn.Linear(fc1_dims, fc2_dims), 22 | nn.ReLU(), 23 | nn.Linear(fc2_dims, n_actions), 24 | nn.Softmax(dim=-1) 25 | ) 26 | 27 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 28 | 29 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 30 | 31 | self.to(self.device) 32 | 33 | def forward(self, state): 34 | dist = self.actor(state) 35 | dist = Categorical(dist) 36 | return dist 37 | 38 | def save_checkpoint(self): 39 | T.save(self.state_dict(), self.checkpoint_file) 40 | 41 | def load_checkpoint(self): 42 | self.load_state_dict(T.load(self.checkpoint_file)) 43 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/agent.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from multiprocessing.context import BaseContext 3 | import os 4 | import copy 5 | from tqdm import tqdm 6 | import numpy as np 7 | import torch as T 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from torch.distributions.categorical import Categorical 11 | from actor import ActorNetwork 12 | from critic import CriticNetwork 13 | from memory import PPOMemory 14 | 15 | 16 | PATH = os.getcwd() 17 | # MODEL_XML_PATH = os.path.join( 18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml" 19 | # ) 20 | 21 | class Agent: 22 | def __init__(self, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \ 23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \ 24 | policy_clip = 0.1, n_epoch = 10, batch_size = 64): 25 | ''' 26 | parameter 27 | arguments: 28 | - model_name_actor : model name for actor to be used in model savind directory 29 | - model_name_critic :model name for critic to be used in model savind directory 30 | ''' 31 | seed = 10000 32 | T.manual_seed(seed) 33 | np.random.seed(seed) 34 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10 35 | self.gamma = gamma 
36 | self.gae_lambda = gae_lambda 37 | self.policy_clip = policy_clip 38 | self.n_epoch = n_epoch 39 | 40 | self.actor = ActorNetwork(n_actions, input_dims, alpha, model_name = model_name_actor) 41 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic) 42 | self.memory_handler = PPOMemory( batch_size ) 43 | 44 | def remember(self, state, action, probs, vals, reward, done): 45 | self.memory_handler.store_memory(state, action, probs, vals, reward, done) 46 | 47 | def save_models(self): 48 | print("Saving models now") 49 | self.actor.save_checkpoint() 50 | self.critic.save_checkpoint() 51 | 52 | def load_model(self): 53 | print("Load model") 54 | self.actor.load_checkpoint() 55 | self.critic.load_checkpoint() 56 | 57 | def play_optimal(self, observation): 58 | with T.no_grad(): 59 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 60 | dist = self.actor(state) 61 | # action shoulnt be sampe it should be arg max 62 | action = dist.sample() 63 | action =T.squeeze(action).item() 64 | return action 65 | 66 | def choose_action(self, observation): 67 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 68 | dist = self.actor(state) 69 | value = self.critic(state) 70 | 71 | action = dist.sample() 72 | 73 | # this is equivalent to the reinforce algorithm of probablity distribition 74 | probs = T.squeeze(dist.log_prob(action)).item() 75 | 76 | action =T.squeeze(action).item() 77 | value =T.squeeze(value).item() 78 | 79 | return action, probs , value 80 | 81 | def learn(self): 82 | for _ in range(self.n_epoch): 83 | 84 | state_arr, action_arr, old_prob_arr, vals_arr,\ 85 | reward_arr, dones_arr, batches = \ 86 | self.memory_handler.generate_batches() 87 | 88 | values = vals_arr.copy() 89 | advantage = np.zeros(len(reward_arr), dtype=np.float32) 90 | 91 | for t in range(len(reward_arr)-1): 92 | discount = 1 93 | a_t = 0 94 | for k in range(t, len(reward_arr)-1): 95 | a_t += discount*(reward_arr[k] + self.gamma*values[k+1]*\ 96 | ( 1 - int(dones_arr[k]) ) - values[k]) 97 | discount *= self.gamma*self.gae_lambda 98 | advantage[t] = a_t 99 | advantage = T.tensor(advantage).to(self.actor.device) 100 | 101 | values = T.tensor(values).to(self.actor.device) 102 | for batch in batches: 103 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device) 104 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device) 105 | actions = T.tensor(action_arr[batch]).to(self.actor.device) 106 | 107 | dist = self.actor(states) 108 | critic_value = self.critic(states) 109 | 110 | critic_value = T.squeeze(critic_value) 111 | 112 | new_probs = dist.log_prob(actions) 113 | prob_ratio = new_probs.exp() / old_probs.exp() 114 | #prob_ratio = (new_probs - old_probs).exp() 115 | weighted_probs = advantage[batch] * prob_ratio 116 | weighted_clipped_probs = T.clamp(prob_ratio, 1 - self.policy_clip, 117 | 1 + self.policy_clip ) * advantage[batch] 118 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean() 119 | 120 | returns = advantage[batch] + values[batch] 121 | critic_loss = (returns-critic_value) ** 2 122 | critic_loss = critic_loss.mean() 123 | 124 | total_loss = actor_loss + 0.5 * critic_loss 125 | self.actor.optimizer.zero_grad() 126 | self.critic.optimiser.zero_grad() 127 | # print("total loss", total_loss.item()) 128 | total_loss.backward() 129 | self.actor.optimizer.step() 130 | self.critic.optimiser.step() 131 | 132 | self.memory_handler.clear_memory() 133 | 
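
Note on the advantage computation: the learn() methods above (and the continuous variants earlier) estimate GAE with a nested loop over (t, k), i.e. A_t = delta_t + (gamma*lambda)*delta_{t+1} + (gamma*lambda)^2*delta_{t+2} + ..., with delta_t = r_t + gamma*V(s_{t+1})*(1 - done_t) - V(s_t), which is quadratic in rollout length. The same numbers fall out of a single backward sweep. The sketch below is illustrative only; the function and argument names are not from this repository, and it assumes the same flat rewards/values/dones arrays that the buffers above return.

import numpy as np

def gae_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """One backward pass equivalent to the nested loop in learn().

    Note: like the loop above, the carried term is not reset at episode
    boundaries; many PPO implementations additionally multiply the carry
    by (1 - done_t) to cut it at terminal states.
    """
    T = len(rewards)
    adv = np.zeros(T, dtype=np.float32)
    last = 0.0
    for t in reversed(range(T - 1)):
        delta = rewards[t] + gamma * values[t + 1] * (1 - int(dones[t])) - values[t]
        last = delta + gamma * gae_lambda * last
        adv[t] = last
    return adv  # adv[-1] stays 0, exactly as in the loop above
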
-------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/critic.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\ 11 | fc2_dims = 256, check_point_base_dir = 'Learning_algorithm/Torch/PPO/models/') -> None: 12 | super(CriticNetwork, self).__init__() 13 | 14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 15 | self.check_point_file = os.path.join(check_point_base_dir, model_name) 16 | self.critic = nn.Sequential( 17 | nn.Linear(*input_dims , fc1_dims), 18 | nn.ReLU(), 19 | nn.Linear(fc1_dims , fc2_dims), 20 | nn.ReLU(), 21 | nn.Linear(fc2_dims , 1), 22 | 23 | ) 24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha) 25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 26 | self.to(self.device) 27 | 28 | def forward(self, state): 29 | value = self.critic(state) 30 | return value 31 | 32 | def save_checkpoint(self): 33 | T.save(self.state_dict(), self.check_point_file) 34 | 35 | def load_checkpoint(self): 36 | self.load_state_dict(T.load(self.check_point_file)) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/main.py: -------------------------------------------------------------------------------- 1 | from turtle import shape 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | 6 | from utils import plot_learning_curve 7 | import gym 8 | 9 | import SpaceRobotEnv 10 | import numpy as np 11 | 12 | 13 | 14 | if __name__ == '__main__': 15 | env = gym.make("SpaceRobotState-v0") 16 | N = 20 17 | batch_size = 5 18 | n_epochs = 4 19 | alpha = 0.0003 20 | action_space = env.action_space.shape[0] 21 | obs_shape = env.observation_space["observation"].shape 22 | 23 | 24 | 25 | agent = Agent( n_actions = action_space, 26 | batch_size=batch_size, 27 | alpha = alpha, 28 | n_epoch = n_epochs, 29 | input_dims = obs_shape, 30 | model_name_actor = "space_robot_actor.pt", 31 | model_name_critic = "space_robot_critic.pt") 32 | n_iter = 300 33 | figure_file = 'Learning_algorithm/Torch/PPO/plots/space_robot_performance.png' 34 | best_score = env.reward_range[0] 35 | score_history = [] 36 | n_steps = 0 37 | learn_iters = 0 38 | avg_score = 0 39 | 40 | for i in range(n_iter): 41 | obs = env.reset() 42 | observation = obs["observation"] 43 | done = False 44 | score = 0 45 | while not done: 46 | action, prob, val = agent.choose_action(observation) 47 | act = action 48 | pr = prob 49 | observation_, reward, done, info = env.step(action) 50 | n_steps+=1 51 | score += reward 52 | 53 | agent.remember(observation, action, prob, val, reward, done) 54 | #steps before we begin learning 20 55 | if n_steps % N ==0: 56 | agent.learn() 57 | learn_iters += 1 58 | observation = observation_["observation"] 59 | score_history.append(score) 60 | avg_score = np.mean(score_history[-100:]) 61 | 62 | if avg_score>best_score: 63 | best_score= avg_score 64 | agent.save_models() 65 | print('episode', i , 'score %.1f', 'avg_score %.1f' %avg_score, 66 | 'time_steps',n_steps, 'learning_steps', learn_iters) 67 | 68 | x = [i+1 for i in 
range(len(score_history))] 69 | plot_learning_curve(x, score_history,figure_file) 70 | env.close() 71 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/memory.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | class Mem: 9 | def __init__(self, state , prob, val ,action, reward, done) -> None: 10 | self.state = state 11 | self.prob = prob 12 | self.val = val 13 | self.action = action 14 | self.reward = reward 15 | self.done = done 16 | 17 | class PPOMemory: 18 | def __init__(self, batch_size): 19 | self.states = [] 20 | self.probs = [] 21 | self.vals = [] 22 | self.actions = [] 23 | self.rewards = [] 24 | self.dones = [] 25 | 26 | self.batch_size = batch_size 27 | 28 | def generate_batches(self): 29 | n_states = len(self.states) 30 | batch_start = np.arange(0, n_states, self.batch_size) 31 | indices = np.arange(n_states, dtype=np.int64) 32 | np.random.shuffle(indices) 33 | batches = [indices[i:i+self.batch_size] for i in batch_start] 34 | 35 | return np.array(self.states),\ 36 | np.array(self.actions),\ 37 | np.array(self.probs),\ 38 | np.array(self.vals),\ 39 | np.array(self.rewards),\ 40 | np.array(self.dones),\ 41 | batches 42 | 43 | def store_memory(self, state, action, probs, vals, reward, done): 44 | self.states.append(state) 45 | self.actions.append(action) 46 | self.probs.append(probs) 47 | self.vals.append(vals) 48 | self.rewards.append(reward) 49 | self.dones.append(done) 50 | 51 | def clear_memory(self): 52 | self.states = [] 53 | self.probs = [] 54 | self.actions = [] 55 | self.rewards = [] 56 | self.dones = [] 57 | self.vals = [] 58 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_actor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_actor.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_critic.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_critic.pt -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/plots/space_robot_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/plots/space_robot_performance.png -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPO/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = 
np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/actor.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | 10 | class ActorNetwork(nn.Module): 11 | 12 | def __init__(self, n_actions, alpha, model_name : str, 13 | check_point_base_dir = 'RL_algorithms/Torch/PPOImage/models'): 14 | super(ActorNetwork, self).__init__() 15 | 16 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 17 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name) 18 | 19 | self.actor = nn.Sequential( 20 | nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size=5, stride=1), 21 | nn.ReLU(), 22 | nn.BatchNorm2d(32), 23 | nn.ReLU(), 24 | nn.MaxPool2d(2,2), 25 | 26 | nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size=5, stride=1), 27 | nn.ReLU(), 28 | nn.BatchNorm2d(64), 29 | nn.ReLU(), 30 | nn.MaxPool2d(2,2), 31 | 32 | nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size=5, stride=1), 33 | nn.ReLU(), 34 | nn.BatchNorm2d(64), 35 | nn.ReLU(), 36 | nn.MaxPool2d(2,2), 37 | 38 | nn.Flatten(), 39 | nn.Linear(1024, 4096), 40 | nn.ReLU(), 41 | nn.Linear(4096, 256), 42 | nn.ReLU(), 43 | nn.Linear(256, 64), 44 | nn.ReLU(), 45 | nn.Linear(64, n_actions), 46 | 47 | nn.Softmax(dim=-1) 48 | ) 49 | 50 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 51 | 52 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 53 | 54 | self.to(self.device) 55 | 56 | def forward(self, state): 57 | dist = self.actor(state) 58 | dist = Categorical(dist) 59 | return dist 60 | 61 | def save_checkpoint(self): 62 | T.save(self.state_dict(), self.checkpoint_file) 63 | 64 | def load_checkpoint(self): 65 | self.load_state_dict(T.load(self.checkpoint_file)) 66 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/agent.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from multiprocessing.context import BaseContext 3 | import os 4 | import copy 5 | from tqdm import tqdm 6 | import numpy as np 7 | import torch as T 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from torch.distributions.categorical import Categorical 11 | from actor import ActorNetwork 12 | from critic import CriticNetwork 13 | from memory import PPOMemory 14 | 15 | 16 | PATH = os.getcwd() 17 | # MODEL_XML_PATH = os.path.join( 18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml" 19 | # ) 20 | 21 | class Agent: 22 | def __init__(self, n_actions, model_name_actor : str, model_name_critic : str, \ 23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \ 24 | policy_clip 
= 0.1, n_epoch = 10, batch_size = 64): 25 | ''' 26 | parameter 27 | arguments: 28 | - model_name_actor : model name for actor to be used in model savind directory 29 | - model_name_critic :model name for critic to be used in model savind directory 30 | ''' 31 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10 32 | self.gamma = gamma 33 | self.gae_lambda = gae_lambda 34 | self.policy_clip = policy_clip 35 | self.n_epoch = n_epoch 36 | 37 | self.actor = ActorNetwork(n_actions, alpha, model_name = model_name_actor) 38 | self.critic = CriticNetwork(alpha, model_name = model_name_critic) 39 | self.memory_handler = PPOMemory( batch_size ) 40 | 41 | def remember(self, state, action, probs, vals, reward, done): 42 | self.memory_handler.store_memory(state, action, probs, vals, reward, done) 43 | 44 | def save_models(self): 45 | print("Saving models now") 46 | self.actor.save_checkpoint() 47 | self.critic.save_checkpoint() 48 | 49 | def load_model(self): 50 | print("Load model") 51 | self.actor.load_checkpoint() 52 | self.critic.load_checkpoint() 53 | 54 | def play_optimal(self, observation): 55 | with T.no_grad(): 56 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 57 | dist = self.actor(state) 58 | # action shoulnt be sampe it should be arg max 59 | action = dist.sample() 60 | action =T.squeeze(action).item() 61 | return action 62 | 63 | def choose_action(self, observation): 64 | observation = np.array(observation) 65 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 66 | dist = self.actor(state) 67 | value = self.critic(state) 68 | 69 | action = dist.sample() 70 | 71 | # this is equivalent to the reinforce algorithm of probablity distribition 72 | probs = T.squeeze(dist.log_prob(action)).item() 73 | 74 | action =T.squeeze(action).item() 75 | value =T.squeeze(value).item() 76 | 77 | return action, probs , value 78 | 79 | def learn(self): 80 | for _ in range(self.n_epoch): 81 | 82 | state_arr, action_arr, old_prob_arr, vals_arr,\ 83 | reward_arr, dones_arr, batches = \ 84 | self.memory_handler.generate_batches() 85 | 86 | values = vals_arr.copy() 87 | advantage = np.zeros(len(reward_arr), dtype=np.float32) 88 | 89 | for t in range(len(reward_arr)-1): 90 | discount = 0.95 91 | a_t = 0 92 | for k in range(t, len(reward_arr)-1): 93 | a_t += discount*(reward_arr[k] + self.gamma*values[k+1]*\ 94 | (1-int(dones_arr[k])) - values[k]) 95 | discount *= self.gamma*self.gae_lambda 96 | advantage[t] = a_t 97 | advantage = T.tensor(advantage).to(self.actor.device) 98 | 99 | values = T.tensor(values).to(self.actor.device) 100 | for batch in batches: 101 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device) 102 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device) 103 | actions = T.tensor(action_arr[batch]).to(self.actor.device) 104 | 105 | dist = self.actor(states) 106 | critic_value = self.critic(states) 107 | 108 | critic_value = T.squeeze(critic_value) 109 | 110 | new_probs = dist.log_prob(actions) 111 | prob_ratio = new_probs.exp() / old_probs.exp() 112 | #prob_ratio = (new_probs - old_probs).exp() 113 | weighted_probs = advantage[batch] * prob_ratio 114 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip, 115 | 1+self.policy_clip)*advantage[batch] 116 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean() 117 | 118 | returns = advantage[batch] + values[batch] 119 | critic_loss = (returns-critic_value)**2 120 | critic_loss = 
critic_loss.mean() 121 | 122 | total_loss = actor_loss + 0.5*critic_loss 123 | self.actor.optimizer.zero_grad() 124 | self.critic.optimiser.zero_grad() 125 | # print("total loss", total_loss.item()) 126 | total_loss.backward() 127 | self.actor.optimizer.step() 128 | self.critic.optimiser.step() 129 | 130 | self.memory_handler.clear_memory() 131 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/critic.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.distributions.categorical import Categorical 8 | PATH = os.getcwd() 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, alpha, model_name : str ,\ 11 | check_point_base_dir = 'RL_algorithms/Torch/PPOImage/models/') -> None: 12 | super(CriticNetwork, self).__init__() 13 | 14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir ) 15 | self.check_point_file = os.path.join(check_point_base_dir, model_name) 16 | self.critic = nn.Sequential( 17 | nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size=5, stride=1), 18 | nn.ReLU(), 19 | nn.BatchNorm2d(32), 20 | nn.ReLU(), 21 | nn.MaxPool2d(2,2), 22 | 23 | # nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size=5, stride=1), 24 | # nn.ReLU(), 25 | # nn.BatchNorm2d(64), 26 | # nn.ReLU(), 27 | # nn.MaxPool2d(2,2), 28 | 29 | # nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size=5, stride=1), 30 | # nn.ReLU(), 31 | # nn.BatchNorm2d(64), 32 | # nn.ReLU(), 33 | # nn.MaxPool2d(2,2), 34 | 35 | nn.Flatten(), 36 | nn.Linear(28800, 512), 37 | nn.ReLU(), 38 | nn.Linear(512, 64), 39 | nn.ReLU(), 40 | nn.Linear(64, 1), 41 | 42 | ) 43 | self.optimiser = optim.Adam(self.parameters(), lr = alpha) 44 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 45 | self.to(self.device) 46 | 47 | def forward(self, state): 48 | value = self.critic(state) 49 | return value 50 | 51 | def save_checkpoint(self): 52 | T.save(self.state_dict(), self.check_point_file) 53 | 54 | def load_checkpoint(self): 55 | self.load_state_dict(T.load(self.check_point_file)) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/main.py: -------------------------------------------------------------------------------- 1 | from turtle import shape 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | 6 | from utils import plot_learning_curve 7 | 8 | from SpaceRobotEnv.envs import SpaceRobotImage 9 | 10 | import numpy as np 11 | 12 | 13 | 14 | if __name__ == '__main__': 15 | # env = SpaceRobotImage() 16 | env = SpaceRobotImage() 17 | #N = 20 18 | N = 20 19 | batch_size = 5 20 | n_epochs = 4 21 | alpha = 0.0003 22 | action_space = env.action_space.shape[0] 23 | agent = Agent( n_actions = action_space, 24 | batch_size=batch_size, 25 | alpha = alpha, 26 | n_epoch = n_epochs, 27 | model_name_actor = "space_robot_actor.pt", 28 | model_name_critic = "space_robot_critic.pt") 29 | n_iter = 300 30 | figure_file = 'RL_algorithms/Torch/PPOImage/plots/space_robot_performance.png' 31 | best_score = env.reward_range[0] 32 | score_history = [] 33 | n_steps = 0 34 | learn_iters = 0 35 | avg_score = 0 36 | 37 | for i in range(n_iter): 38 | obs = env.reset() 39 | observation = obs["rawimage"].reshape(3, 64, 64) 40 | done = False 41 | score = 0 42 
| while not done: 43 | action, prob, val = agent.choose_action(observation) 44 | observation_, reward, done, info = env.step(action) 45 | n_steps += 1 46 | score += reward 47 | 48 | agent.remember(observation, action, prob, val, reward, done) 49 | #steps before we begin learning 20 50 | if n_steps % N ==0: 51 | agent.learn() 52 | learn_iters += 1 53 | observation = observation_["rawimage"].reshape(3, 64, 64) 54 | 55 | print("done") 56 | score_history.append(score) 57 | avg_score = np.mean(score_history[-100:]) 58 | 59 | if avg_score>best_score: 60 | best_score= avg_score 61 | agent.save_models() 62 | print('episode', i , 'score %.1f',score, 'avg_score %.1f' %avg_score, 63 | 'time_steps',n_steps, 'learning_steps', learn_iters) 64 | 65 | x = [i+1 for i in range(len(score_history))] 66 | plot_learning_curve(x, score_history,figure_file) 67 | env.close() 68 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/memory.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | import os 3 | import numpy as np 4 | import torch as T 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | class Mem: 9 | def __init__(self, state , prob, val ,action, reward, done) -> None: 10 | self.state = state 11 | self.prob = prob 12 | self.val = val 13 | self.action = action 14 | self.reward = reward 15 | self.done = done 16 | 17 | class PPOMemory: 18 | def __init__(self, batch_size): 19 | self.states = [] 20 | self.probs = [] 21 | self.vals = [] 22 | self.actions = [] 23 | self.rewards = [] 24 | self.dones = [] 25 | 26 | self.batch_size = batch_size 27 | 28 | def generate_batches(self): 29 | n_states = len(self.states) 30 | batch_start = np.arange(0, n_states, self.batch_size) 31 | indices = np.arange(n_states, dtype=np.int64) 32 | np.random.shuffle(indices) 33 | batches = [indices[i:i+self.batch_size] for i in batch_start] 34 | 35 | return np.array(self.states),\ 36 | np.array(self.actions),\ 37 | np.array(self.probs),\ 38 | np.array(self.vals),\ 39 | np.array(self.rewards),\ 40 | np.array(self.dones),\ 41 | batches 42 | 43 | def store_memory(self, state, action, probs, vals, reward, done): 44 | self.states.append(state) 45 | self.actions.append(action) 46 | self.probs.append(probs) 47 | self.vals.append(vals) 48 | self.rewards.append(reward) 49 | self.dones.append(done) 50 | 51 | def clear_memory(self): 52 | self.states = [] 53 | self.probs = [] 54 | self.actions = [] 55 | self.rewards = [] 56 | self.dones = [] 57 | self.vals = [] 58 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_actor.pt.icloud: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_actor.pt.icloud -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_critic.pt.icloud: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_critic.pt.icloud 
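
Usage note: the PPOMemory buffer above hands learn() the whole rollout as flat NumPy arrays plus a list of shuffled minibatch index arrays. A small standalone sketch of how it is consumed (toy transition values, not repository code; the plain `from memory import ...` path mirrors the import style used by agent.py above):

import numpy as np
from memory import PPOMemory  # same import style as agent.py above

buf = PPOMemory(batch_size=5)
for step in range(12):
    # toy transition; in main.py these values come from env.step()
    buf.store_memory(state=np.zeros(4), action=0, probs=-0.7,
                     vals=0.1, reward=1.0, done=False)

states, actions, probs, vals, rewards, dones, batches = buf.generate_batches()
# `batches` is a list of shuffled index arrays covering every stored step;
# with 12 transitions and batch_size=5 the batch sizes are 5, 5 and 2.
for idx in batches:
    print(idx, states[idx].shape)   # e.g. (5, 4)

buf.clear_memory()  # learn() empties the buffer after each update phase
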
-------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/plots/space_robot_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/plots/space_robot_performance.png -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/Discrete/PPOImage/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) -------------------------------------------------------------------------------- /RL_algorithms/Torch/PPO/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/__init__.py -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.distributions.normal import Normal 8 | 9 | 10 | def combined_shape(length, shape=None): 11 | if shape is None: 12 | return (length,) 13 | return (length, shape) if np.isscalar(shape) else (length, *shape) 14 | 15 | def mlp(sizes, activation, output_activation=nn.Identity): 16 | # converts array of layer shape to neural net 17 | layers = [] 18 | for j in range(len(sizes)-1): 19 | act = activation if j < len(sizes) -2 else output_activation 20 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 21 | return nn.Sequential(*layers) 22 | 23 | def count_vars(module): 24 | return sum([np.prod(p.shape) for p in module.parameters()]) 25 | 26 | 27 | LOG_STD_MAX = 2 28 | LOG_STD_MIN = -20 29 | 30 | class SquashedGaussianMLPActor(nn.Module): 31 | 32 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 33 | super().__init__() 34 | self.net = mlp([obs_dim] + list(hidden_sizes), activation, activation) 35 | self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) 36 | self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim) 37 | self.act_limit = act_limit 38 | 39 | def forward(self, obs, deterministic=False, with_logprob=True): 40 | net_out = self.net(obs) 41 | mu = self.mu_layer(net_out) 42 | log_std = self.log_std_layer(net_out) 43 | log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) 44 | std = torch.exp(log_std) 45 | 46 | # Pre-squash distribution and sample 47 | pi_distribution = Normal(mu, std) 48 | if deterministic: 49 | # Only used for evaluating policy at test time. 50 | pi_action = mu 51 | else: 52 | pi_action = pi_distribution.rsample() 53 | 54 | if with_logprob: 55 | # Compute logprob from Gaussian, and then apply correction for Tanh squashing. 56 | # NOTE: The correction formula is a little bit magic. 
To get an understanding 57 | # of where it comes from, check out the original SAC paper (arXiv 1801.01290) 58 | # and look in appendix C. This is a more numerically-stable equivalent to Eq 21. 59 | # Try deriving it yourself as a (very difficult) exercise. :) 60 | logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1) 61 | logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=1) 62 | else: 63 | logp_pi = None 64 | 65 | pi_action = torch.tanh(pi_action) 66 | pi_action = self.act_limit * pi_action 67 | 68 | return pi_action, logp_pi 69 | 70 | 71 | class MLPQFunction(nn.Module): 72 | 73 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 74 | super().__init__() 75 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 76 | 77 | def forward(self, obs, act): 78 | q = self.q(torch.cat([obs, act], dim=-1)) 79 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 80 | 81 | class MLPActorCritic(nn.Module): 82 | 83 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 84 | activation=nn.ReLU): 85 | super().__init__() 86 | 87 | obs_dim = observation_space.shape[0] 88 | act_dim = action_space.shape[0] 89 | act_limit = action_space.high[0] 90 | 91 | # build policy and value functions 92 | self.pi = SquashedGaussianMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 93 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 94 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 95 | 96 | def act(self, obs, deterministic=False): 97 | with torch.no_grad(): 98 | a, _ = self.pi(obs, deterministic, False) 99 | return a.numpy() 100 | -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847118.Tosins-Air.19214.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847118.Tosins-Air.19214.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847140.Tosins-Air.19431.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847140.Tosins-Air.19431.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847454.Tosins-Air.19535.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847454.Tosins-Air.19535.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847513.Tosins-Air.19931.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847513.Tosins-Air.19931.0 
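
The "a little bit magic" correction in SquashedGaussianMLPActor above is the change-of-variables term for the tanh squashing, written in a numerically stable form: log(1 - tanh(a)^2) = 2*(log 2 - a - softplus(-2a)). A short, illustrative check of that identity (not part of the repository):

import math
import torch
import torch.nn.functional as F

# For moderate |a| both sides agree to float precision; for large |a|
# the naive form underflows to log(0) = -inf, the stable one stays finite.
a = torch.linspace(-20.0, 20.0, steps=9)
naive = torch.log(1.0 - torch.tanh(a) ** 2)
stable = 2.0 * (math.log(2.0) - a - F.softplus(-2.0 * a))
print(torch.stack([a, naive, stable], dim=1))
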
-------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847612.Tosins-Air.19979.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847612.Tosins-Air.19979.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847918.Tosins-Air.20089.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847918.Tosins-Air.20089.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848049.Tosins-Air.20232.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848049.Tosins-Air.20232.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848339.Tosins-Air.20384.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848339.Tosins-Air.20384.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848364.Tosins-Air.20423.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848364.Tosins-Air.20423.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848673.Tosins-Air.20649.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848673.Tosins-Air.20649.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848831.Tosins-Air.20793.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848831.Tosins-Air.20793.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849191.Tosins-Air.20924.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849191.Tosins-Air.20924.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849218.Tosins-Air.20984.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849218.Tosins-Air.20984.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849777.Tosins-Air.21229.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849777.Tosins-Air.21229.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849785.Tosins-Air.21269.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849785.Tosins-Air.21269.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849885.Tosins-Air.21429.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849885.Tosins-Air.21429.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849941.Tosins-Air.21521.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849941.Tosins-Air.21521.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658850278.Tosins-Air.21678.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658850278.Tosins-Air.21678.0 -------------------------------------------------------------------------------- /RL_algorithms/Torch/SAC/SAC_ENV/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import core 3 | import torch 4 | 5 | class ReplayBuffer: 6 | """ 7 | A simple FIFO experience replay buffer for SAC agents. 
8 |     """
9 | 
10 |     def __init__(self, obs_dim, act_dim, size):
11 |         self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
12 |         self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
13 |         self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
14 |         self.rew_buf = np.zeros(size, dtype=np.float32)
15 |         self.done_buf = np.zeros(size, dtype=np.float32)
16 |         self.ptr, self.size, self.max_size = 0, 0, size
17 | 
18 |     def store(self, obs, act, rew, next_obs, done):
19 |         self.obs_buf[self.ptr] = obs
20 |         self.obs2_buf[self.ptr] = next_obs
21 |         self.act_buf[self.ptr] = act
22 |         self.rew_buf[self.ptr] = rew
23 |         self.done_buf[self.ptr] = done
24 |         self.ptr = (self.ptr+1) % self.max_size
25 |         self.size = min(self.size+1, self.max_size)
26 | 
27 |     def sample_batch(self, batch_size=32):
28 |         idxs = np.random.randint(0, self.size, size=batch_size)
29 |         batch = dict(obs=self.obs_buf[idxs],
30 |                      obs2=self.obs2_buf[idxs],
31 |                      act=self.act_buf[idxs],
32 |                      rew=self.rew_buf[idxs],
33 |                      done=self.done_buf[idxs])
34 |         return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/utils/mpi_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mpi4py import MPI  # MPI.COMM_WORLD and MPI.SUM/MIN/MAX are used below
3 | def allreduce(*args, **kwargs):
4 |     return MPI.COMM_WORLD.Allreduce(*args, **kwargs)
5 | 
6 | def mpi_op(x, op):
7 |     # Apply an MPI reduction op to a scalar or array across all processes.
8 |     x, scalar = ([x], True) if np.isscalar(x) else (x, False)
9 |     x = np.asarray(x, dtype=np.float32)
10 |     buff = np.zeros_like(x, dtype=np.float32)
11 |     allreduce(x, buff, op=op)
12 |     return buff[0] if scalar else buff
13 | 
14 | def mpi_sum(x):
15 |     return mpi_op(x, MPI.SUM)
16 | def mpi_statistics_scalar(x, with_min_and_max=False):
17 |     """
18 |     Get mean/std and optional min/max of scalar x across MPI processes.
19 |     Args: x: an array of scalar samples; with_min_and_max (bool): if true, also return min and max of x.
20 | """ 21 | x = np.array(x, dtype=np.float32) 22 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 23 | mean = global_sum / global_n 24 | 25 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 26 | std = np.sqrt(global_sum_sq / global_n) # compute global std 27 | 28 | if with_min_and_max: 29 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 30 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 31 | return mean, std, global_min, global_max 32 | return mean, std -------------------------------------------------------------------------------- /Simulation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/Simulation.jpg -------------------------------------------------------------------------------- /SpaceRobotEnv/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/.DS_Store -------------------------------------------------------------------------------- /SpaceRobotEnv/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from gym.envs.registration import register 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | register( 8 | id="SpaceRobotState-v0", 9 | entry_point="SpaceRobotEnv.envs:SpaceRobotState", 10 | max_episode_steps=512, 11 | ) 12 | 13 | register( 14 | id="SpaceRobotImage-v0", 15 | entry_point="SpaceRobotEnv.envs:SpaceRobotImage", 16 | max_episode_steps=512, 17 | ) 18 | 19 | register( 20 | id="SpaceRobotDualArm-v0", 21 | entry_point="SpaceRobotEnv.envs:SpaceRobotDualArm", 22 | max_episode_steps=512, 23 | ) 24 | 25 | register( 26 | id="SpaceRobotPointCloud-v0", 27 | entry_point="SpaceRobotEnv.envs:SpaceRobotPointCloud", 28 | max_episode_steps=512, 29 | ) 30 | 31 | register( 32 | id="SpaceRobotCost-v0", 33 | entry_point="SpaceRobotEnv.envs:SpaceRobotCost", 34 | max_episode_steps=512, 35 | ) 36 | 37 | register( 38 | id="SpaceRobotReorientation-v0", 39 | entry_point="SpaceRobotEnv.envs:SpaceRobotReorientation", 40 | max_episode_steps=512, 41 | ) -------------------------------------------------------------------------------- /SpaceRobotEnv/assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/.DS_Store -------------------------------------------------------------------------------- /SpaceRobotEnv/assets/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The dm_control Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================
15 | 
16 | """Functions to manage the common assets for domains."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import os
23 | from dm_control.utils import resources
24 | 
25 | _SUITE_DIR = os.path.dirname(os.path.dirname(__file__))
26 | _FILENAMES = [
27 |     "common/materials.xml",
28 |     "common/skybox.xml",
29 |     "common/visual.xml",
30 | ]
31 | 
32 | ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename))
33 |           for filename in _FILENAMES}
34 | 
35 | 
36 | def read_model(model_filename):
37 |     """Reads a model XML file and returns its contents as a string."""
38 |     return resources.GetResource(os.path.join(_SUITE_DIR, model_filename))
39 | 
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/materials.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/skybox.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/visual.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/arm_v3.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/arm_v31.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/asset.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/sensor.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_cost.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_dualarm.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_image.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_state.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/R10.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/R10.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/cube.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/cube.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_base.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_base.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_forearm.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_forearm.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_shoulder.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_shoulder.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_upperarm.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_upperarm.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist1.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist1.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist2.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist2.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist3.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist3.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/subgoal.xml:
--------------------------------------------------------------------------------
[XML markup not preserved in this text export]
--------------------------------------------------------------------------------
/SpaceRobotEnv/envs/SpaceRobotReorientation.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import copy
4 | import numpy as np
5 | 
6 | import gym
7 | from gym import spaces
8 | from gym.utils import seeding
9 | 
10 | from gym.envs.robotics import utils
11 | from gym.envs.robotics import rotations
12 | 
13 | import mujoco_py
14 | 
15 | 
16 | PATH = os.getcwd()
17 | MODEL_XML_PATH = os.path.join(PATH, 'SpaceRobotEnv', 'assets', 'spacerobot', 'spacerobot_dualarm.xml')
18 | DEFAULT_SIZE = 500
19 | 
20 | 
21 | class RobotEnv(gym.GoalEnv):
22 |     def __init__(self, model_path, initial_qpos, n_substeps):
23 | 
24 |         # load model and simulator
25 |         self.model = mujoco_py.load_model_from_path(model_path)
26 |         self.sim = mujoco_py.MjSim(self.model, nsubsteps=n_substeps)
27 | 
28 |         # render setting
29 |         self.viewer = None
30 |         self._viewers = {}
31 |         self.metadata = {
32 |             "render.modes": ["human", "rgb_array"],
33 |             "video.frames_per_second": int(np.round(1.0 / self.dt)),
34 |         }
35 | 
36 |         # seed
37 |         self.seed()
38 | 
39 |         # initialization
40 |         self._env_setup(initial_qpos=initial_qpos)
41 |         self.initial_state = copy.deepcopy(self.sim.get_state())
42 |         self.goal = self._sample_goal()
43 | 
44 |         # set action_space and observation_space
45 |         obs = self._get_obs()
46 |         self._set_action_space()
47 |         self.observation_space = spaces.Dict(
48 |             dict(
49 |                 desired_goal=spaces.Box(
50 |                     -np.inf, np.inf, shape=obs["desired_goal"].shape, dtype="float32"
51 |                 ),
52 |                 achieved_goal=spaces.Box(
53 |                     -np.inf, np.inf, shape=obs["achieved_goal"].shape, dtype="float32"
54 |                 ),
55 |                 observation=spaces.Box(
56 |                     -np.inf, np.inf, shape=obs["observation"].shape, dtype="float32"
57 |                 ),
58 |             )
59 |         )
60 | 
61 |     def _set_action_space(self):
62 |         bounds = self.model.actuator_ctrlrange.copy()
63 |         low, high = bounds.T
64 |         self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)
65 |         return self.action_space
66 | 
67 |     @property
68 |     def dt(self):
69 |         return
self.sim.model.opt.timestep * self.sim.nsubsteps 70 | 71 | def _detecte_collision(self): 72 | self.collision = self.sim.data.ncon 73 | return self.collision 74 | 75 | def _sensor_torque(self): 76 | self.sensor_data = self.sim.data.sensordata 77 | return self.sensor_data 78 | 79 | def seed(self, seed=None): 80 | self.np_random, seed = seeding.np_random(seed) 81 | return [seed] 82 | 83 | def step(self, action): 84 | action = np.clip(action, self.action_space.low, self.action_space.high) 85 | self._set_action(action) # do one step simulation here 86 | self._step_callback() 87 | obs = self._get_obs() 88 | done = False 89 | info = { 90 | "is_success": self._is_success(obs["achieved_goal"], self.goal) 91 | } 92 | reward = self.compute_reward(obs["achieved_goal"], self.goal, info) 93 | # reward = self.compute_reward(obs['achieved_goal'], self.goal, info) + self.compute_reward(obs['achieved_goal1'], self.goal1, info) 94 | return obs, reward, done, info 95 | 96 | def reset(self): 97 | """Attempt to reset the simulator. Since we randomize initial conditions, it 98 | is possible to get into a state with numerical issues (e.g. due to penetration or 99 | Gimbel lock) or we may not achieve an initial condition (e.g. an object is within the hand). 100 | In this case, we just keep randomizing until we eventually achieve a valid initial 101 | configuration. 102 | """ 103 | super(RobotEnv, self).reset() 104 | did_reset_sim = False 105 | while not did_reset_sim: 106 | did_reset_sim = self._reset_sim() 107 | 108 | self.goal = self._sample_goal() 109 | obs = self._get_obs() 110 | 111 | # TODO: set the position of cube 112 | # body_id = self.sim.model.geom_name2id("cube") 113 | # self.sim.model.geom_pos[body_id] = np.array([0, 0, 6]) 114 | return obs 115 | 116 | def close(self): 117 | if self.viewer is not None: 118 | # self.viewer.finish() 119 | self.viewer = None 120 | self._viewers = {} 121 | 122 | def render(self, mode="human", width=DEFAULT_SIZE, height=DEFAULT_SIZE): 123 | # self._render_callback() 124 | if mode == "rgb_array": 125 | self._get_viewer(mode).render(width, height) 126 | # window size used for old mujoco-py: 127 | datargb, datadepth = self._get_viewer(mode).read_pixels( 128 | width, height, depth=True 129 | ) 130 | # original image is upside-down, so flip it 131 | return datargb[::-1, :, :], datadepth[::-1] 132 | elif mode == "human": 133 | self._get_viewer(mode).render() 134 | 135 | def _get_viewer(self, mode): 136 | self.viewer = self._viewers.get(mode) 137 | 138 | if self.viewer is None: 139 | if mode == "human": 140 | self.viewer = mujoco_py.MjViewer(self.sim) 141 | self._viewer_setup() 142 | 143 | elif mode == "rgb_array": 144 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, device_id=-1) 145 | self._viewer_setup() 146 | # self.viewer.cam.trackbodyid = 0 147 | # latest modification 148 | cam_pos = np.array([0.5, 0, 5, 0.3, -30, 0]) 149 | for i in range(3): 150 | self.viewer.cam.lookat[i] = cam_pos[i] 151 | self.viewer.cam.distance = cam_pos[3] 152 | self.viewer.cam.elevation = cam_pos[4] 153 | self.viewer.cam.azimuth = cam_pos[5] 154 | # self.viewer.cam.trackbodyid = -1 155 | 156 | self._viewers[mode] = self.viewer 157 | return self.viewer 158 | 159 | def _reset_sim(self): 160 | """Resets a simulation and indicates whether or not it is successful. 161 | If a reset is unsuccessful (e.g. if a randomized state caused an error in the 162 | simulation), this method should indicate such a failure by returning False. 
163 |         In such a case, this method will be called again to attempt the reset again.
164 |         """
165 |         self.sim.set_state(self.initial_state)
166 |         self.sim.forward()
167 |         return True
168 | 
169 |     def _get_obs(self):
170 |         """Returns the observation."""
171 |         raise NotImplementedError()
172 | 
173 |     def _set_action(self, action):
174 |         """Applies the given action to the simulation."""
175 |         raise NotImplementedError()
176 | 
177 |     def _is_success(self, achieved_goal, desired_goal):
178 |         """Indicates whether or not the achieved goal successfully achieved the desired goal."""
179 |         raise NotImplementedError()
180 | 
181 |     def _sample_goal(self):
182 |         """Samples a new goal and returns it."""
183 |         raise NotImplementedError()
184 | 
185 |     def _env_setup(self, initial_qpos):
186 |         """Initial configuration of the environment. Can be used to configure initial state
187 |         and extract information from the simulation.
188 |         """
189 |         pass
190 | 
191 |     def _viewer_setup(self):
192 |         """Initial configuration of the viewer. Can be used to set the camera position,
193 |         for example.
194 |         """
195 |         pass
196 | 
197 |     def _render_callback(self):
198 |         """A custom callback that is called before rendering. Can be used
199 |         to implement custom visualizations.
200 |         """
201 |         pass
202 | 
203 |     def _step_callback(self):
204 |         """A custom callback that is called after stepping the simulation. Can be used
205 |         to enforce additional constraints on the simulation state.
206 |         """
207 |         pass
208 | 
209 | 
210 | def goal_distance(goal_a, goal_b):
211 |     assert goal_a.shape == goal_b.shape
212 |     return np.linalg.norm(goal_a - goal_b, axis=-1)
213 | 
214 | 
215 | class SpacerobotEnv(RobotEnv):
216 |     """Superclass for all SpaceRobot environments."""
217 | 
218 |     def __init__(
219 |         self,
220 |         model_path,
221 |         n_substeps,
222 |         distance_threshold,
223 |         initial_qpos,
224 |         reward_type,
225 |         pro_type,
226 |         c_coeff,
227 |     ):
228 |         """Initializes a new SpaceRobot environment.
229 |         Args:
230 |             model_path (string): path to the environment's XML file
231 |             n_substeps (int): number of substeps the simulation runs on every call to step
232 |             distance_threshold (float): the threshold after which a goal is considered achieved
233 |             initial_qpos (dict): a dictionary of joint names and values that define the initial configuration
234 |             reward_type ('sparse' or 'dense'): the reward type, i.e. sparse or dense
235 |             pro_type ('MDP' or 'CMDP'): whether the problem setting includes a cost term (CMDP) or not (MDP)
236 |             c_coeff: cost coefficient
237 |         """
238 |         self.n_substeps = n_substeps
239 |         # self.target_range = target_range
240 |         self.distance_threshold = distance_threshold
241 |         self.reward_type = reward_type
242 |         self.c_coeff = c_coeff
243 |         self.pro_type = pro_type
244 | 
245 |         super(SpacerobotEnv, self).__init__(
246 |             model_path=model_path,
247 |             n_substeps=n_substeps,
248 |             initial_qpos=initial_qpos,
249 |         )
250 | 
251 |     def compute_reward(self, achieved_goal, desired_goal, info):
252 |         # Compute distance between goal and the achieved goal.
253 |         d = goal_distance(achieved_goal, desired_goal)
254 | 
255 |         reward = {
256 |             "sparse": -(d > self.distance_threshold).astype(np.float32),
257 |             "dense": -(0.001 * d ** 2 + np.log10(d ** 2 + 1e-6)),
258 |         }
259 | 
260 |         return reward[self.reward_type]  # return the reward matching the configured reward_type
261 | 
262 | 
263 |     def _set_action(self, action):
264 |         """
265 |         Apply the commanded joint velocities for one control step.
266 |         :param action: angular velocity command for the 12 arm joints
267 |         :return: None (the simulation is stepped in place)
268 |         """
269 |         assert action.shape == (12,)
270 |         self.sim.data.ctrl[:] = action * 0.5
271 |         for _ in range(self.n_substeps):
272 |             self.sim.step()
273 | 
274 |     def _get_obs(self):
275 |         # positions
276 |         # grip_pos = self.sim.data.get_body_xpos("tip_frame")
277 |         # grip_pos1 = self.sim.data.get_body_xpos("tip_frame1")
278 |         """
279 |         # get the rotation angle of the target
280 |         grip_rot = self.sim.data.get_body_xquat('tip_frame')
281 |         grip_rot = rotations.quat2euler(grip_rot)
282 |         grip_rot1 = self.sim.data.get_body_xquat('tip_frame1')
283 |         grip_rot1 = rotations.quat2euler(grip_rot1)
284 |         """
285 |         # dt = self.sim.nsubsteps * self.sim.model.opt.timestep
286 |         # grip_velp = self.sim.data.get_body_xvelp("tip_frame") * dt
287 |         # grip_velp1 = self.sim.data.get_body_xvelp("tip_frame1") * dt
288 |         """
289 |         achieved_goal = np.concatenate([grip_pos.copy(),grip_rot.copy()])
290 |         achieved_goal1 = np.concatenate([grip_pos1.copy(),grip_rot1.copy()])
291 |         """
292 |         post_base_att = self.sim.data.get_body_xquat('chasersat')
293 | 
294 |         obs = np.concatenate(
295 |             [
296 |                 self.sim.data.qpos[:].copy(),
297 |                 self.sim.data.qvel[:].copy(),
298 |                 self.goal.copy(),
299 |             ]
300 |         )
301 | 
302 |         return {
303 |             "observation": obs.copy(),
304 |             "achieved_goal": post_base_att.copy(),
305 |             "desired_goal": self.goal.copy(),
306 |         }
307 | 
308 |     def _viewer_setup(self):
309 |         # body_id = self.sim.model.body_name2id('forearm_link')
310 |         body_id = self.sim.model.body_name2id("wrist_3_link")
311 |         lookat = self.sim.data.body_xpos[body_id]
312 |         for idx, value in enumerate(lookat):
313 |             self.viewer.cam.lookat[idx] = value
314 |         self.viewer.cam.distance = 2.5
315 |         self.viewer.cam.azimuth = 132.0
316 |         self.viewer.cam.elevation = -14.0
317 | 
318 |     def _reset_sim(self):
319 |         self.sim.set_state(self.initial_state)
320 |         self.sim.forward()
321 |         return True
322 | 
323 |     def _sample_goal(self):
324 |         goal = self.initial_base_att
325 | 
326 |         return goal.copy()
327 | 
328 |     def _is_success(self, achieved_goal, desired_goal):
329 |         d = goal_distance(achieved_goal, desired_goal)
330 |         return (d < self.distance_threshold).astype(np.float32)
331 |         # return d
332 | 
333 |     def _env_setup(self, initial_qpos):
334 | 
335 |         # set qpos of chasersat
336 |         chasersat_pos = [0., 0., 4.]  # initial position of the base
337 |         chasersat_ori = np.random.rand(3) * 0.5  # initial base attitude, each Euler angle in [0, 0.5) rad
338 |         chasersat_quat = rotations.euler2quat(chasersat_ori)
339 |         initial_qpos['chasersat:joint'] = list(chasersat_pos) + list(chasersat_quat)
340 |         # print('initial qpos of base is {}'.format(initial_qpos['chasersat:joint']))
341 | 
342 |         for name, value in initial_qpos.items():
343 |             self.sim.data.set_joint_qpos(name, value)
344 |         utils.reset_mocap_welds(self.sim)
345 | 
346 |         # Extract information for sampling goals.
347 | self.initial_gripper_xpos = self.sim.data.get_body_xpos("tip_frame").copy() 348 | self.initial_gripper_xpos1 = self.sim.data.get_body_xpos("tip_frame1").copy() 349 | 350 | # get the initial base attitude 351 | self.initial_base_att = self.sim.data.get_body_xquat("chasersat").copy() 352 | 353 | # get the initial base position 354 | self.initial_base_pos = self.sim.data.get_body_xpos("chasersat").copy() 355 | # print('initial base att is {}'.format(self.initial_base_att)) 356 | # print('initial base pos is {}'.format(self.initial_base_pos)) 357 | # print('initial pos is {}'.format(self.sim.data.qpos[:])) 358 | 359 | def render(self, mode="human", width=500, height=500): 360 | return super(SpacerobotEnv, self).render(mode, width, height) 361 | 362 | 363 | class SpaceRobotReorientation(SpacerobotEnv, gym.utils.EzPickle): 364 | def __init__(self, reward_type="sparse", pro_type="MDP"): 365 | initial_qpos = { 366 | "arm:shoulder_pan_joint": 0.0, 367 | "arm:shoulder_lift_joint": 0.0, 368 | "arm:elbow_joint": 0.0, 369 | "arm:wrist_1_joint": 0.0, 370 | "arm:wrist_2_joint": 0.0, 371 | "arm:wrist_3_joint": 0.0, 372 | "arm:shoulder_pan_joint1": 0.0, 373 | "arm:shoulder_lift_joint1": 0.0, 374 | "arm:elbow_joint1": 0.0, 375 | "arm:wrist_1_joint1": 0.0, 376 | "arm:wrist_2_joint1": 0.0, 377 | "arm:wrist_3_joint1": 0.0, 378 | } 379 | SpacerobotEnv.__init__( 380 | self, 381 | MODEL_XML_PATH, 382 | n_substeps=20, 383 | distance_threshold=0.05, 384 | initial_qpos=initial_qpos, 385 | reward_type=reward_type, 386 | pro_type=pro_type, 387 | c_coeff=0.1, 388 | ) 389 | gym.utils.EzPickle.__init__(self) 390 | -------------------------------------------------------------------------------- /SpaceRobotEnv/envs/SpaceRobotState.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import copy 4 | import numpy as np 5 | 6 | import gym 7 | from gym import spaces 8 | from gym.utils import seeding 9 | 10 | from gym.envs.robotics import utils 11 | from gym.envs.robotics import rotations 12 | 13 | import mujoco_py 14 | 15 | PATH = os.getcwd() 16 | 17 | MODEL_XML_PATH = os.path.join( 18 | PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_state.xml" 19 | ) 20 | DEFAULT_SIZE = 500 21 | 22 | 23 | class RobotEnv(gym.GoalEnv): 24 | def __init__(self, model_path, initial_qpos, n_substeps): 25 | 26 | # load model and simulator 27 | self.model = mujoco_py.load_model_from_path(model_path) 28 | self.sim = mujoco_py.MjSim(self.model, nsubsteps=n_substeps) 29 | 30 | # render setting 31 | self.viewer = None 32 | self._viewers = {} 33 | self.metadata = { 34 | "render.modes": ["human", "rgb_array"], 35 | "video.frames_per_second": int(np.round(1.0 / self.dt)), 36 | } 37 | 38 | # seed 39 | self.seed() 40 | 41 | # initalization 42 | self._env_setup(initial_qpos=initial_qpos) 43 | self.initial_state = copy.deepcopy(self.sim.get_state()) 44 | self.goal = self._sample_goal() 45 | 46 | # set action_space and observation_space 47 | obs = self._get_obs() 48 | self._set_action_space() 49 | self.observation_space = spaces.Dict( 50 | dict( 51 | desired_goal=spaces.Box( 52 | -np.inf, np.inf, shape=obs["desired_goal"].shape, dtype="float32" 53 | ), 54 | achieved_goal=spaces.Box( 55 | -np.inf, np.inf, shape=obs["achieved_goal"].shape, dtype="float32" 56 | ), 57 | observation=spaces.Box( 58 | -np.inf, np.inf, shape=obs["observation"].shape, dtype="float32" 59 | ), 60 | ) 61 | ) 62 | 63 | def _set_action_space(self): 64 | bounds = self.model.actuator_ctrlrange.copy() 65 | low, high = 
bounds.T 66 | self.action_space = spaces.Box( low = low, high = high, dtype = np.float32) 67 | return self.action_space 68 | 69 | @property 70 | def dt(self): 71 | return self.sim.model.opt.timestep * self.sim.nsubsteps 72 | 73 | def _detecte_collision(self): 74 | self.collision = self.sim.data.ncon 75 | return self.collision 76 | 77 | def _sensor_torque(self): 78 | self.sensor_data = self.sim.data.sensordata 79 | return self.sensor_data 80 | 81 | def seed(self, seed=None): 82 | self.np_random, seed = seeding.np_random(seed) 83 | return [seed] 84 | 85 | def step(self, action): 86 | old_action = self.sim.data.ctrl.copy() * (1 / 0.5) 87 | action = np.clip(action, self.action_space.low, self.action_space.high) 88 | self._set_action(action) 89 | self._step_callback() 90 | obs = self._get_obs() 91 | done = False 92 | info = { 93 | "is_success": self._is_success(obs["achieved_goal"], self.goal), 94 | "act": action, 95 | "old_act": old_action, 96 | } 97 | reward = self.compute_reward( 98 | obs["achieved_goal"], self.goal, action, old_action, info 99 | ) 100 | return obs, reward, done, info 101 | 102 | def reset(self): 103 | """Attempt to reset the simulator. Since we randomize initial conditions, it 104 | is possible to get into a state with numerical issues (e.g. due to penetration or 105 | Gimbel lock) or we may not achieve an initial condition (e.g. an object is within the hand). 106 | In this case, we just keep randomizing until we eventually achieve a valid initial 107 | configuration. 108 | """ 109 | super(RobotEnv, self).reset() 110 | did_reset_sim = False 111 | while not did_reset_sim: 112 | did_reset_sim = self._reset_sim() 113 | 114 | self.goal = self._sample_goal() 115 | obs = self._get_obs() 116 | 117 | return obs 118 | 119 | def close(self): 120 | if self.viewer is not None: 121 | # self.viewer.finish() 122 | self.viewer = None 123 | self._viewers = {} 124 | 125 | def render(self, mode="human", width=DEFAULT_SIZE, height=DEFAULT_SIZE): 126 | # self._render_callback() 127 | if mode == "rgb_array": 128 | self._get_viewer(mode).render(width, height) 129 | # window size used for old mujoco-py: 130 | data = self._get_viewer(mode).read_pixels(width, height, depth=False) 131 | # original image is upside-down, so flip it 132 | return data[::-1, :, :] 133 | elif mode == "human": 134 | self._get_viewer(mode).render() 135 | 136 | def _get_viewer(self, mode): 137 | self.viewer = self._viewers.get(mode) 138 | if self.viewer is None: 139 | if mode == "human": 140 | self.viewer = mujoco_py.MjViewer(self.sim) 141 | elif mode == "rgb_array": 142 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, device_id=-1) 143 | self._viewer_setup() 144 | self._viewers[mode] = self.viewer 145 | return self.viewer 146 | 147 | def _reset_sim(self): 148 | """Resets a simulation and indicates whether or not it is successful. 149 | If a reset is unsuccessful (e.g. if a randomized state caused an error in the 150 | simulation), this method should indicate such a failure by returning False. 151 | In such a case, this method will be called again to attempt a the reset again. 
152 |         """
153 |         self.sim.set_state(self.initial_state)
154 |         self.sim.forward()
155 |         return True
156 | 
157 |     def _get_obs(self):
158 |         """Returns the observation."""
159 |         raise NotImplementedError()
160 | 
161 |     def _set_action(self, action):
162 |         """Applies the given action to the simulation."""
163 |         raise NotImplementedError()
164 | 
165 |     def _is_success(self, achieved_goal, desired_goal):
166 |         """Indicates whether or not the achieved goal successfully achieved the desired goal."""
167 |         raise NotImplementedError()
168 | 
169 |     def _sample_goal(self):
170 |         """Samples a new goal and returns it."""
171 |         raise NotImplementedError()
172 | 
173 |     def _env_setup(self, initial_qpos):
174 |         """Initial configuration of the environment. Can be used to configure initial state
175 |         and extract information from the simulation.
176 |         """
177 |         pass
178 | 
179 |     def _viewer_setup(self):
180 |         """Initial configuration of the viewer. Can be used to set the camera position,
181 |         for example.
182 |         """
183 |         pass
184 | 
185 |     def _render_callback(self):
186 |         """A custom callback that is called before rendering. Can be used
187 |         to implement custom visualizations.
188 |         """
189 |         pass
190 | 
191 |     def _step_callback(self):
192 |         """A custom callback that is called after stepping the simulation. Can be used
193 |         to enforce additional constraints on the simulation state.
194 |         """
195 |         pass
196 | 
197 | 
198 | def goal_distance(goal_a, goal_b):
199 |     assert goal_a.shape == goal_b.shape
200 |     return np.linalg.norm(goal_a - goal_b, axis=-1)
201 | 
202 | 
203 | class SpacerobotEnv(RobotEnv):
204 |     """Superclass for all SpaceRobot environments."""
205 | 
206 |     def __init__(
207 |         self,
208 |         model_path,
209 |         n_substeps,
210 |         distance_threshold,
211 |         initial_qpos,
212 |         reward_type,
213 |     ):
214 |         """Initializes a new SpaceRobot environment.
215 |         Args:
216 |             model_path (string): path to the environment's XML file
217 |             n_substeps (int): number of substeps the simulation runs on every call to step
218 |             distance_threshold (float): the threshold after which a goal is considered achieved
219 |             initial_qpos (dict): a dictionary of joint names and values that define the initial configuration
220 |             reward_type ('sparse', 'distance', or any other value for the dense reward): the reward type
221 |         """
222 |         self.n_substeps = n_substeps
223 |         self.distance_threshold = distance_threshold
224 |         self.reward_type = reward_type
225 | 
226 |         super(SpacerobotEnv, self).__init__(
227 |             model_path=model_path,
228 |             n_substeps=n_substeps,
229 |             initial_qpos=initial_qpos,
230 |         )
231 | 
232 |     def compute_reward(self, achieved_goal, desired_goal, action, old_action, info):
233 | 
234 |         # Compute distance between goal and the achieved goal.
235 | d = goal_distance(achieved_goal, desired_goal) 236 | if self.reward_type == "sparse": 237 | return -(d > self.distance_threshold).astype(np.float32) 238 | elif self.reward_type == "distance": 239 | return d 240 | else: 241 | # dense reward 242 | return -( 243 | 0.001 * d ** 2 244 | + np.log10(d ** 2 + 1e-6) 245 | + 0.01 * np.linalg.norm(action - old_action) ** 2 246 | ) 247 | 248 | def _set_action(self, action): 249 | """ 250 | :param action: 3*None->6*None 251 | :return: 252 | """ 253 | assert action.shape == (6,) 254 | self.sim.data.ctrl[:] = action * 0.5 255 | for _ in range(self.n_substeps): 256 | self.sim.step() 257 | 258 | def _get_obs(self): 259 | # positions 260 | grip_pos = self.sim.data.get_body_xpos("tip_frame") 261 | grip_velp = self.sim.data.get_body_xvelp("tip_frame") * self.dt 262 | robot_qpos, robot_qvel = utils.robot_get_obs(self.sim) 263 | 264 | gripper_state = robot_qpos[-1:] 265 | gripper_vel = ( 266 | robot_qvel[-1:] * self.dt 267 | ) # change to a scalar if the gripper is made symmetric 268 | 269 | achieved_goal = grip_pos.copy() 270 | 271 | obs = np.concatenate( 272 | [ 273 | self.sim.data.qpos[7:13].copy(), 274 | self.sim.data.qvel[6:12].copy(), 275 | grip_pos, 276 | grip_velp, 277 | self.goal.copy(), 278 | ] 279 | ) 280 | 281 | return { 282 | "observation": obs.copy(), 283 | "achieved_goal": achieved_goal.copy(), 284 | "desired_goal": self.goal.copy(), 285 | } 286 | 287 | def _viewer_setup(self): 288 | body_id = self.sim.model.body_name2id("tip_frame") 289 | lookat = self.sim.data.body_xpos[body_id] 290 | for idx, value in enumerate(lookat): 291 | self.viewer.cam.lookat[idx] = value 292 | self.viewer.cam.distance = 2.5 293 | self.viewer.cam.azimuth = 132.0 294 | self.viewer.cam.elevation = -14.0 295 | 296 | def _reset_sim(self): 297 | self.sim.set_state(self.initial_state) 298 | self.sim.forward() 299 | return True 300 | 301 | def _sample_goal(self): 302 | 303 | goal = self.initial_gripper_xpos[:3].copy() 304 | d = goal_distance(self.sim.data.get_body_xpos("tip_frame").copy(), goal) 305 | 306 | goal[0] = self.initial_gripper_xpos[0] + np.random.uniform(-0.4, 0) 307 | goal[1] = self.initial_gripper_xpos[1] + np.random.uniform(-0.3, 0.3) 308 | goal[2] = self.initial_gripper_xpos[2] + np.random.uniform(0, 0.3) 309 | 310 | d = goal_distance(self.sim.data.get_body_xpos("tip_frame").copy(), goal) 311 | 312 | site_id = self.sim.model.site_name2id("target0") 313 | self.sim.model.site_pos[site_id] = goal 314 | self.sim.forward() 315 | 316 | return goal.copy() 317 | 318 | def _is_success(self, achieved_goal, desired_goal): 319 | d = goal_distance(achieved_goal, desired_goal) 320 | return (d < self.distance_threshold).astype(np.float32) 321 | # return d 322 | 323 | def _env_setup(self, initial_qpos): 324 | for name, value in initial_qpos.items(): 325 | self.sim.data.set_joint_qpos(name, value) 326 | utils.reset_mocap_welds(self.sim) 327 | 328 | # Extract information for sampling goals. 
329 | self.initial_gripper_xpos = self.sim.data.get_body_xpos("tip_frame").copy() 330 | 331 | def render(self, mode="human", width=500, height=500): 332 | return super(SpacerobotEnv, self).render(mode, width, height) 333 | 334 | 335 | class SpaceRobotState(SpacerobotEnv, gym.utils.EzPickle): 336 | def __init__(self, reward_type="nosparse"): 337 | initial_qpos = { 338 | "arm:shoulder_pan_joint": 0.0, 339 | "arm:shoulder_lift_joint": 0.0, 340 | "arm:elbow_joint": 0.0, 341 | "arm:wrist_1_joint": 0.0, 342 | "arm:wrist_2_joint": 0.0, 343 | "arm:wrist_3_joint": 0.0, 344 | } 345 | SpacerobotEnv.__init__( 346 | self, 347 | MODEL_XML_PATH, 348 | n_substeps=20, 349 | distance_threshold=0.05, 350 | initial_qpos=initial_qpos, 351 | reward_type=reward_type, 352 | ) 353 | gym.utils.EzPickle.__init__(self) 354 | -------------------------------------------------------------------------------- /SpaceRobotEnv/envs/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from .SpaceRobotDualArm import SpaceRobotDualArm 3 | from .SpaceRobotImage import SpaceRobotImage 4 | from .SpaceRobotState import SpaceRobotState 5 | from .SpaceRobotCost import SpaceRobotCost 6 | from .SpaceRobotReorientation import SpaceRobotReorientation 7 | -------------------------------------------------------------------------------- /SpaceRobotEnv/images/Simulation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/Simulation.jpg -------------------------------------------------------------------------------- /SpaceRobotEnv/images/ccc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/ccc.png -------------------------------------------------------------------------------- /SpaceRobotEnv/images/iros.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/iros.gif -------------------------------------------------------------------------------- /SpaceRobotEnv/images/ral.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/ral.gif -------------------------------------------------------------------------------- /SpaceRobotEnv/images/robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/robot.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym>=0.15.4 2 | mujoco-py>=1.15.1.0 3 | torch>=1.12.0 4 | torchvision>=0.13.0 5 | torchaudio>=0.12.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, realpath 2 | from 
setuptools import find_packages, setup 3 | 4 | def read_requirements_file(filename): 5 | req_file_path = '%s/%s' % (dirname(realpath(__file__)), filename) 6 | with open(req_file_path) as f: 7 | return [line.strip() for line in f] 8 | 9 | setup( 10 | name="SpaceRobotEnv", 11 | version="0.0.1", 12 | install_requires=read_requirements_file('requirements.txt'), 13 | packages=find_packages(exclude=("image",)), 14 | ) -------------------------------------------------------------------------------- /test_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | import SpaceRobotEnv 4 | import numpy as np 5 | 6 | env = gym.make("SpaceRobotReorientation-v0") 7 | 8 | dim_u = env.action_space.shape[0] 9 | print(dim_u) 10 | dim_o = env.observation_space["observation"].shape[0] 11 | print(dim_o) 12 | 13 | 14 | observation = env.reset() 15 | max_action = env.action_space.high 16 | print("max_action:", max_action) 17 | print("min_action", env.action_space.low) 18 | for e_step in range(20): 19 | observation = env.reset() 20 | for i_step in range(50): 21 | env.render() 22 | action = np.random.uniform(low=-1.0, high=1.0, size=(dim_u,)) 23 | observation, reward, done, info = env.step(max_action * action) 24 | 25 | env.close() 26 | --------------------------------------------------------------------------------
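test_env.py above exercises the environments with uniform random actions. As a complement, here is a minimal evaluation sketch under the same old-style Gym API used throughout this repository; the evaluate helper and policy_fn argument are illustrative names, not part of the package, and a trained agent's action function could be passed in place of the random baseline.

import gym
import numpy as np

import SpaceRobotEnv  # noqa: F401  (importing registers the SpaceRobot* environments)


def evaluate(policy_fn, env_id="SpaceRobotState-v0", episodes=10, horizon=50):
    """Return the fraction of rollouts whose final step reports info['is_success'] == 1."""
    env = gym.make(env_id)
    successes = 0
    for _ in range(episodes):
        obs = env.reset()  # dict with "observation", "achieved_goal", "desired_goal"
        info = {}
        for _ in range(horizon):
            action = np.clip(policy_fn(obs), env.action_space.low, env.action_space.high)
            obs, reward, done, info = env.step(action)
            if done:
                break
        successes += int(info.get("is_success", 0.0))
    env.close()
    return successes / episodes


if __name__ == "__main__":
    # Random-action baseline, mirroring test_env.py (6 joint-velocity commands for SpaceRobotState-v0).
    print("success rate:", evaluate(lambda obs: np.random.uniform(-1.0, 1.0, size=6)))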