├── .gitignore
├── LICENSE
├── README.md
├── RL_algorithms
├── Torch
│ ├── .DS_Store
│ ├── DDPG
│ │ └── DDPG_ENV
│ │ │ ├── core.py
│ │ │ ├── ddpg.py
│ │ │ ├── logger
│ │ │ │ ├── events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0
│ │ │ │ ├── events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0
│ │ │ │ ├── events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0
│ │ │ │ ├── events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0
│ │ │ │ ├── events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0
│ │ │ │ ├── events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0
│ │ │ │ ├── events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0
│ │ │ │ └── events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0
│ │ │ ├── memory.py
│ │ │ └── training_log_csv
│ │ │ │ ├── Avg Reward (1).svg
│ │ │ │ └── run-.-tag-Avg Reward (1).csv
│ ├── PPO
│ │ ├── Continious
│ │ │ ├── PPO
│ │ │ │ ├── __init__.py
│ │ │ │ ├── actor.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── critic.py
│ │ │ │ ├── main.py
│ │ │ │ ├── memory.py
│ │ │ │ ├── models
│ │ │ │ │ ├── space_robot_actor.pt
│ │ │ │ │ └── space_robot_critic.pt
│ │ │ │ ├── plots
│ │ │ │ │ └── space_robot_performance.png
│ │ │ │ └── utils.py
│ │ │ ├── PPO_Two_heads
│ │ │ │ ├── __init__.py
│ │ │ │ ├── actor.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── critic.py
│ │ │ │ ├── main.py
│ │ │ │ ├── memory.py
│ │ │ │ ├── models
│ │ │ │ │ ├── space_robot_actor.pt
│ │ │ │ │ └── space_robot_critic.pt
│ │ │ │ ├── plots
│ │ │ │ │ └── space_robot_performance.png
│ │ │ │ └── utils.py
│ │ │ └── __init__.py
│ │ ├── Discrete
│ │ │ ├── PPO
│ │ │ │ ├── __init__.py
│ │ │ │ ├── actor.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── critic.py
│ │ │ │ ├── main.py
│ │ │ │ ├── memory.py
│ │ │ │ ├── models
│ │ │ │ │ ├── space_robot_actor.pt
│ │ │ │ │ └── space_robot_critic.pt
│ │ │ │ ├── plots
│ │ │ │ │ └── space_robot_performance.png
│ │ │ │ ├── training_log
│ │ │ │ └── utils.py
│ │ │ └── PPOImage
│ │ │ │ ├── __init__.py
│ │ │ │ ├── actor.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── critic.py
│ │ │ │ ├── main.py
│ │ │ │ ├── memory.py
│ │ │ │ ├── models
│ │ │ │ │ ├── .space_robot_actor.pt.icloud
│ │ │ │ │ └── .space_robot_critic.pt.icloud
│ │ │ │ ├── plots
│ │ │ │ │ └── space_robot_performance.png
│ │ │ │ └── utils.py
│ │ └── __init__.py
│ ├── SAC
│ │ ├── SAC_ENV
│ │ │ ├── core.py
│ │ │ ├── logger
│ │ │ │ ├── events.out.tfevents.1658847118.Tosins-Air.19214.0
│ │ │ │ ├── events.out.tfevents.1658847140.Tosins-Air.19431.0
│ │ │ │ ├── events.out.tfevents.1658847454.Tosins-Air.19535.0
│ │ │ │ ├── events.out.tfevents.1658847513.Tosins-Air.19931.0
│ │ │ │ ├── events.out.tfevents.1658847612.Tosins-Air.19979.0
│ │ │ │ ├── events.out.tfevents.1658847918.Tosins-Air.20089.0
│ │ │ │ ├── events.out.tfevents.1658848049.Tosins-Air.20232.0
│ │ │ │ ├── events.out.tfevents.1658848339.Tosins-Air.20384.0
│ │ │ │ ├── events.out.tfevents.1658848364.Tosins-Air.20423.0
│ │ │ │ ├── events.out.tfevents.1658848673.Tosins-Air.20649.0
│ │ │ │ ├── events.out.tfevents.1658848831.Tosins-Air.20793.0
│ │ │ │ ├── events.out.tfevents.1658849191.Tosins-Air.20924.0
│ │ │ │ ├── events.out.tfevents.1658849218.Tosins-Air.20984.0
│ │ │ │ ├── events.out.tfevents.1658849777.Tosins-Air.21229.0
│ │ │ │ ├── events.out.tfevents.1658849785.Tosins-Air.21269.0
│ │ │ │ ├── events.out.tfevents.1658849885.Tosins-Air.21429.0
│ │ │ │ ├── events.out.tfevents.1658849941.Tosins-Air.21521.0
│ │ │ │ └── events.out.tfevents.1658850278.Tosins-Air.21678.0
│ │ │ ├── memory.py
│ │ │ ├── sac.py
│ │ │ └── training_log_csv
│ │ │ │ ├── run-.-tag-Avg Reward.csv
│ │ │ │ └── run-.-tag-Loss_Pi.csv
│ │ └── __init__.py
│ └── __init__.py
├── __init__.py
└── utils
│ └── mpi_tools.py
├── Simulation.jpg
├── SpaceRobotEnv
├── .DS_Store
├── __init__.py
├── assets
│ ├── .DS_Store
│ ├── common
│ │ ├── __init__.py
│ │ ├── materials.xml
│ │ ├── skybox.xml
│ │ └── visual.xml
│ └── spacerobot
│ │ ├── arm_v3.xml
│ │ ├── arm_v31.xml
│ │ ├── asset.xml
│ │ ├── sensor.xml
│ │ ├── spacerobot_cost.xml
│ │ ├── spacerobot_dualarm.xml
│ │ ├── spacerobot_image.xml
│ │ ├── spacerobot_state.xml
│ │ ├── stls
│ │ │ ├── R10.stl
│ │ │ ├── cube.stl
│ │ │ ├── v_base.stl
│ │ │ ├── v_forearm.stl
│ │ │ ├── v_shoulder.stl
│ │ │ ├── v_upperarm.stl
│ │ │ ├── v_wrist1.stl
│ │ │ ├── v_wrist2.stl
│ │ │ └── v_wrist3.stl
│ │ └── subgoal.xml
├── envs
│ ├── SpaceRobotCost.py
│ ├── SpaceRobotDualArm.py
│ ├── SpaceRobotImage.py
│ ├── SpaceRobotPointCloud.py
│ ├── SpaceRobotReorientation.py
│ ├── SpaceRobotState.py
│ └── __init__.py
└── images
│ ├── Simulation.jpg
│ ├── ccc.png
│ ├── iros.gif
│ ├── ral.gif
│ └── robot.png
├── requirements.txt
├── setup.py
└── test_env.py
/.gitignore:
--------------------------------------------------------------------------------
1 | 1/
2 | 2/
3 | 3/
4 | SpaceRobotEnv.egg-info/
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.pyc
8 | *.py[cod]
9 | *$py.class
10 |
11 | # C extensions
12 | *.so
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 |
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .nox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | *.py,cover
55 | .hypothesis/
56 | .pytest_cache/
57 | cover/
58 |
59 | # Translations
60 | *.mo
61 | *.pot
62 |
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | .pybuilder/
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 |
90 | # pyenv
91 | # For a library or package, you might want to ignore these files since the code is
92 | # intended to run in multiple environments; otherwise, check them in:
93 | # .python-version
94 |
95 | # pipenv
96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
99 | # install all needed dependencies.
100 | #Pipfile.lock
101 |
102 | # poetry
103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
104 | # This is especially recommended for binary packages to ensure reproducibility, and is more
105 | # commonly ignored for libraries.
106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
107 | #poetry.lock
108 |
109 | # pdm
110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
111 | #pdm.lock
112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
113 | # in version control.
114 | # https://pdm.fming.dev/#use-with-ide
115 | .pdm.toml
116 |
117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118 | __pypackages__/
119 |
120 | # Celery stuff
121 | celerybeat-schedule
122 | celerybeat.pid
123 |
124 | # SageMath parsed files
125 | *.sage.py
126 |
127 | # Environments
128 | .env
129 | .venv
130 | env/
131 | venv/
132 | ENV/
133 | env.bak/
134 | venv.bak/
135 |
136 | # Spyder project settings
137 | .spyderproject
138 | .spyproject
139 |
140 | # Rope project settings
141 | .ropeproject
142 |
143 | # mkdocs documentation
144 | /site
145 |
146 | # mypy
147 | .mypy_cache/
148 | .dmypy.json
149 | dmypy.json
150 |
151 | # Pyre type checker
152 | .pyre/
153 |
154 | # pytype static type analyzer
155 | .pytype/
156 |
157 | # Cython debug symbols
158 | cython_debug/
159 |
160 | # PyCharm
161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163 | # and can be added to the global gitignore or merged into this file. For a more nuclear
164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165 | .idea/
166 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SpaceRobotEnv
2 |
3 | > Note: our repo is now listed in the OpenAI Gym documentation; please see [SpaceRobotEnv](https://www.gymlibrary.dev/environments/third_party_environments/#spacerobotenv).
4 |
5 | **SpaceRobotEnv** is a set of open-source environments for trajectory planning of free-floating space robots.
6 | Unlike a traditional fixed-base robot, a free-floating space robot is a dynamically coupled system because of its non-actuated base, as shown in the figure below.
7 | Therefore, model-based trajectory planning methods encounter many difficulties in modeling and computation.
8 |
9 |
10 | Accordingly, research has focused on model-free methods, such as reinforcement learning algorithms, to obtain the trajectory directly.
11 | However, achieving high planning accuracy, bimanual coordination and end-to-end control remains an open challenge for space robotics researchers.
12 | To help the community study this problem, SpaceRobotEnv is developed with the following key features:
13 | * **Realistic space environment**: we construct environments that mimic conditions in space. The free-floating space robot operates in a low-gravity condition.
14 | * **Dynamic coupling control**: compared with robots on the ground, the joint torques have a significant impact on the posture of the base. The resulting motion of the base disturbs the end-effector positions, leading to a more complex trajectory planning task.
15 | * **Image input**: we provide the ability to use images as observations, and we demonstrate that this setting is effective; please see [our paper](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9550509).
16 |
17 | - **Quick Demos**
18 |
19 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9636681)
20 |
21 |
22 |
23 |
24 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9636681)
25 |
26 |
27 |
28 |
29 | [Paper link](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9550509)
30 |
31 |
32 |
33 |
34 | The environments in this repo are as follows:
35 | * **SpaceRobotState-v0**
36 |   * The state vector contains the angular positions and velocities of the joints, the positions and velocities of the end-effector, and the goal position. The core goal is to make the end-effector reach a goal randomly selected within a large workspace.
37 | * **SpaceRobotCost-v0**
38 |   * The task is to make the end-effector reach a random goal while avoiding noticeable movement of the base, especially of its orientation, because rotation of the base would interrupt communication with the Earth.
39 | * **SpaceRobotImage-v0**
40 |   * The state contains only image information. The core goal is the same as that of the `SpaceRobotState-v0` environment.
41 | * **SpaceRobotDualArm-v0**
42 |   * The free-floating space robot has two robotic arms attached to the base, so there are two end-effectors, each with its own goal position.
43 |   The task is finished when both end-effectors reach their goals.
44 | * **SpaceRobotReorientation-v0**
45 |   * The free-floating space robot has two robotic arms attached to the base. The initial orientation of the base is sampled randomly in each episode.
46 |   The task is finished when the two arms bring the base to the target orientation.
47 | * **SpaceRobotPointCloud-v0**
48 |   * The state vector contains point cloud information. The core goal is the same as that of the `SpaceRobotState-v0` environment.
49 |
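All of the environments above are registered with Gym when the `SpaceRobotEnv` package is imported, so they can be created by ID. The loop below is a minimal sketch for inspecting their action and observation spaces (it assumes MuJoCo and Gym are already installed; the list of IDs simply mirrors the environments listed above):

```python
import gym

import SpaceRobotEnv  # importing the package registers the environments with Gym

ENV_IDS = [
    "SpaceRobotState-v0",
    "SpaceRobotCost-v0",
    "SpaceRobotImage-v0",
    "SpaceRobotDualArm-v0",
    "SpaceRobotReorientation-v0",
    "SpaceRobotPointCloud-v0",
]

for env_id in ENV_IDS:
    env = gym.make(env_id)
    print(env_id, env.action_space, env.observation_space)
    env.close()
```
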
50 | ## Installation
51 |
52 | Our environments are built on the [MuJoCo simulator](https://github.com/deepmind/mujoco), so before using this repo, please make sure you have installed the [MuJoCo](https://github.com/deepmind/mujoco) platform.
53 | Additionally, our framework is based on [Gym](https://github.com/openai/gym).
54 | Details regarding installation of Gym can be found [here](https://github.com/openai/gym).
55 |
56 | After you have installed MuJoCo and Gym and tested some toy examples with them, you can install this repo from source:
57 |
58 | ```bash
59 | pip install -e .
60 | ```
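
As a quick check that the installation succeeded, you can run the bundled toy example (assuming MuJoCo, Gym and a working display for rendering are available; see the Quick Start below for what it does):

```bash
python test_env.py
```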
61 |
62 | ## Quick Start
63 |
64 | We provide a Gym-like API for interacting with the environments. `test_env.py` shows a toy example that verifies the environments.
65 | Because of this Gym-like API, popular RL libraries such as [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) can easily be used with our environments (a short sketch follows the example below).
66 | ```python
67 | import gym
68 |
69 | import SpaceRobotEnv
70 | import numpy as np
71 |
72 | env = gym.make("SpaceRobotState-v0")
73 |
74 | dim_u = env.action_space.shape[0]
75 | print(dim_u)
76 | dim_o = env.observation_space["observation"].shape[0]
77 | print(dim_o)
78 |
79 |
80 | observation = env.reset()
81 | max_action = env.action_space.high
82 | print("max_action:", max_action)
83 | print("mmin_action", env.action_space.low)
84 | for e_step in range(20):
85 | observation = env.reset()
86 | for i_step in range(50):
87 | env.render()
88 | action = np.random.uniform(low=-1.0, high=1.0, size=(dim_u,))
89 | observation, reward, done, info = env.step(max_action * action)
90 |
91 | env.close()
92 | ```
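
For example, training with [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) takes only a few lines. The sketch below is illustrative rather than a tuned setup: it assumes Stable Baselines3 is installed, uses its `MultiInputPolicy` because the observation space is a `Dict`, and leaves all hyperparameters at their defaults.

```python
import gym
import SpaceRobotEnv  # registers the environments with Gym
from stable_baselines3 import PPO

env = gym.make("SpaceRobotState-v0")

# "MultiInputPolicy" handles Dict observation spaces such as this one
model = PPO("MultiInputPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)

obs = env.reset()
action, _ = model.predict(obs, deterministic=True)
env.close()
```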
93 |
94 | ## Introduction of free-floating space robot
95 |
96 | The free-floating space robot consists of two parts: a robotic arm and a base satellite. The arm is rigidly connected to the base, and the whole robot operates in a low-gravity condition.
97 | The 6-DoF UR5 model is used as the robot arm, and for simplicity the base is modeled as a cubic structure. The specific structure is shown below.
98 |
99 |
100 |
101 |
102 |
103 |
104 | ## Future plan
105 |
106 |
107 | ### Tasks under development:
108 | - [x] Point cloud inputs
109 | - [ ] Add new torque controllers, such as an impedance controller.
110 | - [ ] Build new environments
111 |
112 | ### Algorithms:
113 | - [x] PPO
114 | - [ ] TRPO
115 | - [x] DDPG
116 | - [ ] TD3
117 | - [x] SAC
118 | - [ ] HER
119 | - [ ] [HDO](https://ieeexplore.ieee.org/abstract/document/9718193)
120 |
121 | ## Citing SpaceRobotEnv
122 |
123 | If you find SpaceRobotEnv useful, please cite our recent work in your publications.
124 |
125 | ```
126 | @article{wang2022collision,
127 | title={Collision-Free Trajectory Planning for a 6-DoF Free-Floating Space Robot via Hierarchical Decoupling Optimization},
128 | author={Wang, Shengjie and Cao, Yuxue and Zheng, Xiang and Zhang, Tao},
129 | journal={IEEE Robotics and Automation Letters},
130 | volume={7},
131 | number={2},
132 | pages={4953--4960},
133 | year={2022},
134 | publisher={IEEE}
135 | }
136 |
137 | @inproceedings{wang2021multi,
138 | title={A Multi-Target Trajectory Planning of a 6-DoF Free-Floating Space Robot via Reinforcement Learning},
139 | author={Wang, Shengjie and Zheng, Xiang and Cao, Yuxue and Zhang, Tao},
140 | booktitle={2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
141 | pages={3724--3730},
142 | organization={IEEE}
143 | }
144 |
145 | @inproceedings{wang2021end,
146 | title={An End-to-End Trajectory Planning Strategy for Free-floating Space Robots},
147 | author={Wang, Shengjie and Cao, Yuxue and Zheng, Xiang and Zhang, Tao},
148 | booktitle={2021 40th Chinese Control Conference (CCC)},
149 | pages={4236--4241},
150 | year={2021},
151 | organization={IEEE}
152 | }
153 |
154 | @article{cao2022reinforcement,
155 | title={Reinforcement Learning with Prior Policy Guidance for Motion Planning of Dual-Arm Free-Floating Space Robot},
156 | author={Cao, Yuxue and Wang, Shengjie and Zheng, Xiang and Ma, Wenke and Xie, Xinru and Liu, Lei},
157 | journal={arXiv preprint arXiv:2209.01434},
158 | year={2022}
159 | }
160 |
161 | ```
162 |
163 | ## The Team
164 |
165 | SpaceRobotEnv is a project maintained by
166 | [Shengjie Wang](https://github.com/Shengjie-bob), [Xiang Zheng](https://github.com/x-zheng16), [Yuxue Cao](https://github.com/ShenGe123000), and [Fengbo Lan](https://github.com/lanrobot) at Tsinghua University. Many thanks as well to [Tosin](https://github.com/tohsin) for his great contributions.
167 |
168 |
169 | ## License
170 |
171 | SpaceRobotEnv is released under the Apache 2.0 license, as found in the [LICENSE](LICENSE) file.
172 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/.DS_Store
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/core.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.signal
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 |
8 | def combined_shape(length, shape=None):
9 | if shape is None:
10 | return (length,)
11 | return (length, shape) if np.isscalar(shape) else (length, *shape)
12 |
13 | def mlp(sizes, activation, output_activation=nn.Identity):
14 | layers = []
15 | for j in range(len(sizes)-1):
16 | act = activation if j < len(sizes)-2 else output_activation
17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
18 | return nn.Sequential(*layers)
19 |
20 | def count_vars(module):
21 | return sum([np.prod(p.shape) for p in module.parameters()])
22 |
23 | class MLPActor(nn.Module):
24 |
25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit):
26 | super().__init__()
27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim]
28 | self.pi = mlp(pi_sizes, activation, nn.Tanh)
29 | self.act_limit = act_limit
30 |
31 | def forward(self, obs):
32 | # Return output from network scaled to action space limits.
33 | return self.act_limit * self.pi(obs)
34 |
35 | class MLPQFunction(nn.Module):
36 |
37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
38 | super().__init__()
39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation)
40 |
41 | def forward(self, obs, act):
42 | q = self.q(torch.cat([obs, act], dim=-1))
43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape.
44 |
45 | class MLPActorCritic(nn.Module):
46 |
47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256),
48 | activation=nn.ReLU):
49 | super().__init__()
50 |
51 | obs_dim = observation_space.shape[0]
52 | act_dim = action_space.shape[0]
53 | act_limit = action_space.high[0]
54 |
55 | # build policy and value functions
56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit)
57 | self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation)
58 |
59 | def act(self, obs):
60 | with torch.no_grad():
61 | return self.pi(obs).numpy()
62 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/ddpg.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | import numpy as np
3 | import torch
4 | from torch.optim import Adam
5 | import gym
6 | import time
7 | import SpaceRobotEnv
8 | import core
9 | from torch.utils.tensorboard import SummaryWriter
10 | # To view training curves with TensorBoard, point it at the logger directory, e.g.:
11 | #   tensorboard --logdir RL_algorithms/Torch/DDPG/DDPG_ENV/logger
12 | class ReplayBuffer:
13 | """
14 | A simple FIFO experience replay buffer for DDPG agents.
15 | """
16 |
17 | def __init__(self, obs_dim, act_dim, size):
18 | self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
19 | self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
20 | self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
21 | self.rew_buf = np.zeros(size, dtype=np.float32)
22 | self.done_buf = np.zeros(size, dtype=np.float32)
23 | self.ptr, self.size, self.max_size = 0, 0, size
24 |
25 | def store(self, obs, act, rew, next_obs, done):
26 | self.obs_buf[self.ptr] = obs
27 | self.obs2_buf[self.ptr] = next_obs
28 | self.act_buf[self.ptr] = act
29 | self.rew_buf[self.ptr] = rew
30 | self.done_buf[self.ptr] = done
31 | self.ptr = (self.ptr+1) % self.max_size
32 | self.size = min(self.size+1, self.max_size)
33 |
34 | def sample_batch(self, batch_size=32):
35 | idxs = np.random.randint(0, self.size, size=batch_size)
36 | batch = dict(obs=self.obs_buf[idxs],
37 | obs2=self.obs2_buf[idxs],
38 | act=self.act_buf[idxs],
39 | rew=self.rew_buf[idxs],
40 | done=self.done_buf[idxs])
41 | return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
42 |
43 |
44 |
45 | def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
46 | steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
47 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
48 | update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10,
49 | max_ep_len=1000, save_freq=1):
50 | """
51 | Deep Deterministic Policy Gradient (DDPG)
52 |
53 |
54 | Args:
55 | env_fn : A function which creates a copy of the environment.
56 | The environment must satisfy the OpenAI Gym API.
57 |
58 | actor_critic: The constructor method for a PyTorch Module with an ``act``
59 | method, a ``pi`` module, and a ``q`` module. The ``act`` method and
60 | ``pi`` module should accept batches of observations as inputs,
61 | and ``q`` should accept a batch of observations and a batch of
62 | actions as inputs. When called, these should return:
63 |
64 | =========== ================ ======================================
65 | Call Output Shape Description
66 | =========== ================ ======================================
67 | ``act`` (batch, act_dim) | Numpy array of actions for each
68 | | observation.
69 | ``pi`` (batch, act_dim) | Tensor containing actions from policy
70 | | given observations.
71 | ``q`` (batch,) | Tensor containing the current estimate
72 | | of Q* for the provided observations
73 | | and actions. (Critical: make sure to
74 | | flatten this!)
75 | =========== ================ ======================================
76 |
77 | ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
78 | you provided to DDPG.
79 |
80 | seed (int): Seed for random number generators.
81 |
82 | steps_per_epoch (int): Number of steps of interaction (state-action pairs)
83 | for the agent and the environment in each epoch.
84 |
85 | epochs (int): Number of epochs to run and train agent.
86 |
87 | replay_size (int): Maximum length of replay buffer.
88 |
89 | gamma (float): Discount factor. (Always between 0 and 1.)
90 |
91 | polyak (float): Interpolation factor in polyak averaging for target
92 | networks. Target networks are updated towards main networks
93 | according to:
94 |
95 | .. math:: \\theta_{\\text{targ}} \\leftarrow
96 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
97 |
98 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually
99 | close to 1.)
100 |
101 | pi_lr (float): Learning rate for policy.
102 |
103 | q_lr (float): Learning rate for Q-networks.
104 |
105 | batch_size (int): Minibatch size for SGD.
106 |
107 | start_steps (int): Number of steps for uniform-random action selection,
108 | before running real policy. Helps exploration.
109 |
110 | update_after (int): Number of env interactions to collect before
111 | starting to do gradient descent updates. Ensures replay buffer
112 | is full enough for useful updates.
113 |
114 | update_every (int): Number of env interactions that should elapse
115 | between gradient descent updates. Note: Regardless of how long
116 | you wait between updates, the ratio of env steps to gradient steps
117 | is locked to 1.
118 |
119 | act_noise (float): Stddev for Gaussian exploration noise added to
120 | policy at training time. (At test time, no noise is added.)
121 |
122 | num_test_episodes (int): Number of episodes to test the deterministic
123 | policy at the end of each epoch.
124 |
125 | max_ep_len (int): Maximum length of trajectory / episode / rollout.
126 |
127 |         logger_kwargs (dict): Keyword args for EpochLogger. (Unused in this version; logging is done with TensorBoard instead.)
128 |
129 | save_freq (int): How often (in terms of gap between epochs) to save
130 | the current policy and value function.
131 |
132 | """
133 |
134 | # logger = EpochLogger(**logger_kwargs)
135 | # logger.save_config(locals())
136 | n_update_step = 0
137 | n_test_step = 0
138 | n_played_games = 0
139 | score_history = []
140 | torch.manual_seed(seed)
141 | np.random.seed(seed)
142 |
143 | env, test_env = env_fn(), env_fn()
144 | obs_dim = env.observation_space['observation'].shape[0]
145 | act_dim = env.action_space.shape[0]
146 |
147 | # Action limit for clamping: critically, assumes all dimensions share the same bound!
148 | act_limit = env.action_space.high[0]
149 |
150 | # Create actor-critic module and target networks
151 | ac = actor_critic(env.observation_space['observation'], env.action_space, **ac_kwargs)
152 | ac_targ = deepcopy(ac)
153 |
154 | # Freeze target networks with respect to optimizers (only update via polyak averaging)
155 | for p in ac_targ.parameters():
156 | p.requires_grad = False
157 |
158 | # Experience buffer
159 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
160 |
161 | # Count variables (protip: try to get a feel for how different size networks behave!)
162 | var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
163 | # logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts)
164 |
165 | # Set up function for computing DDPG Q-loss
166 | def compute_loss_q(data):
167 | o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
168 |
169 | q = ac.q(o,a)
170 |
171 | # Bellman backup for Q function
172 | with torch.no_grad():
173 | q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
174 | backup = r + gamma * (1 - d) * q_pi_targ
175 |
176 | # MSE loss against Bellman backup
177 | loss_q = ((q - backup)**2).mean()
178 |
179 | # Useful info for logging
180 | loss_info = dict(QVals=q.detach().numpy())
181 |
182 | return loss_q, loss_info
183 |
184 | # Set up function for computing DDPG pi loss
185 | def compute_loss_pi(data):
186 | o = data['obs']
187 | q_pi = ac.q(o, ac.pi(o))
188 | return -q_pi.mean()
189 |
190 | # Set up optimizers for policy and q-function
191 | pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
192 | q_optimizer = Adam(ac.q.parameters(), lr=q_lr)
193 |
194 | # Set up model saving
195 | # logger.setup_pytorch_saver(ac)
196 |
197 | def update(data):
198 | # First run one gradient descent step for Q.
199 | q_optimizer.zero_grad()
200 | loss_q, loss_info = compute_loss_q(data)
201 | loss_q.backward()
202 | q_optimizer.step()
203 | writer.add_scalar("Loss_Q", loss_q.item(), n_update_step )
204 |
205 | # Freeze Q-network so you don't waste computational effort
206 | # computing gradients for it during the policy learning step.
207 | for p in ac.q.parameters():
208 | p.requires_grad = False
209 |
210 | # Next run one gradient descent step for pi.
211 | pi_optimizer.zero_grad()
212 | loss_pi = compute_loss_pi(data)
213 | loss_pi.backward()
214 | pi_optimizer.step()
215 | writer.add_scalar("Loss_Pi", loss_pi.item(), n_update_step)
216 |
217 | # Unfreeze Q-network so you can optimize it at next DDPG step.
218 | for p in ac.q.parameters():
219 | p.requires_grad = True
220 |
221 |
222 | # Record things
223 | # logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)
224 |
225 | # Finally, update target networks by polyak averaging.
226 | with torch.no_grad():
227 | for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
228 | # NB: We use an in-place operations "mul_", "add_" to update target
229 | # params, as opposed to "mul" and "add", which would make new tensors.
230 | p_targ.data.mul_(polyak)
231 | p_targ.data.add_((1 - polyak) * p.data)
232 |
233 | def get_action(o, noise_scale):
234 | a = ac.act(torch.as_tensor(o, dtype=torch.float32))
235 | a += noise_scale * np.random.randn(act_dim)
236 | return np.clip(a, -act_limit, act_limit)
237 |
238 | def test_agent():
239 | avg_score_test = []
240 | for j in range(num_test_episodes):
241 | o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
242 | o = o['observation']
243 | while not(d or (ep_len == max_ep_len)):
244 |                 # Take deterministic actions at test time (noise_scale=0)
245 | o, r, d, _ = test_env.step(get_action(o, 0))
246 | o = o['observation']
247 | ep_ret += r
248 | ep_len += 1
249 | avg_score_test.append(ep_ret)
250 | writer.add_scalar("Test_score avg", np.mean(avg_score_test), n_test_step)
251 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
252 |
253 | # Prepare for interaction with environment
254 | total_steps = steps_per_epoch * epochs
255 | start_time = time.time()
256 | o, ep_ret, ep_len = env.reset(), 0, 0
257 | o = o["observation"]
258 |
259 | # Main loop: collect experience in env and update/log each epoch
260 | for t in range(total_steps):
261 |
262 | # Until start_steps have elapsed, randomly sample actions
263 | # from a uniform distribution for better exploration. Afterwards,
264 | # use the learned policy (with some noise, via act_noise).
265 | if t > start_steps:
266 | a = get_action(o, act_noise)
267 | else:
268 | a = env.action_space.sample()
269 |
270 | # Step the env
271 | o2, r, d, _ = env.step(a)
272 | o2 = o2["observation"]
273 | ep_ret += r
274 | ep_len += 1
275 |
276 | # Ignore the "done" signal if it comes from hitting the time
277 | # horizon (that is, when it's an artificial terminal signal
278 | # that isn't based on the agent's state)
279 | d = False if ep_len==max_ep_len else d
280 |
281 | # Store experience to replay buffer
282 | replay_buffer.store(o, a, r, o2, d)
283 |
284 | # Super critical, easy to overlook step: make sure to update
285 | # most recent observation!
286 | o = o2
287 |
288 | # End of trajectory handling
289 | if d or (ep_len == max_ep_len):
290 | # logger.store(EpRet=ep_ret, EpLen=ep_len)
291 | n_played_games += 1
292 | score_history.append(ep_ret)
293 | avg_score = np.mean(score_history[-100:])
294 | writer.add_scalar("Avg Reward", avg_score, n_played_games )
295 | print( 'score %.1f' %ep_ret, 'avg_score %.1f' %avg_score,'num_games', n_played_games, )
296 |
297 | o, ep_ret, ep_len = env.reset(), 0, 0
298 | o= o["observation"]
299 |
300 | # Update handling
301 | if t >= update_after and t % update_every == 0:
302 | for _ in range(update_every):
303 | n_update_step += 1
304 | batch = replay_buffer.sample_batch(batch_size)
305 | update(data=batch)
306 |
307 | # End of epoch handling
308 | if (t+1) % steps_per_epoch == 0:
309 | epoch = (t+1) // steps_per_epoch
310 |
311 | # Test the performance of the deterministic version of the agent.
312 | n_test_step +=1
313 | test_agent()
314 |
315 | # Log info about epoch
316 |
317 |
318 | if __name__ == '__main__':
319 | import argparse
320 | parser = argparse.ArgumentParser()
321 | parser.add_argument('--env', type=str, default='SpaceRobotState-v0')
322 | parser.add_argument('--hid', type=int, default=256)
323 | parser.add_argument('--l', type=int, default=2)
324 | parser.add_argument('--gamma', type=float, default=0.99)
325 | parser.add_argument('--seed', '-s', type=int, default=0)
326 | parser.add_argument('--epochs', type=int, default=50)
327 | parser.add_argument('--exp_name', type=str, default='ddpg')
328 | args = parser.parse_args()
329 |
330 | writer = SummaryWriter("RL_algorithms/Torch/DDPG/DDPG_ENV/logger")
331 | writer.add_text(
332 | "hyperparameters",
333 | "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
334 | )
335 | ddpg(lambda : gym.make(args.env), actor_critic=core.MLPActorCritic,
336 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l),
337 | gamma=args.gamma, seed=args.seed, epochs=args.epochs)
338 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931832.Tosins-MacBook-Air.local.42757.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931911.Tosins-MacBook-Air.local.42851.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658931934.Tosins-MacBook-Air.local.42904.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932019.Tosins-MacBook-Air.local.42965.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932066.Tosins-MacBook-Air.local.43009.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932179.Tosins-MacBook-Air.local.43178.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932723.Tosins-MacBook-Air.local.43565.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/logger/events.out.tfevents.1658932834.Tosins-MacBook-Air.local.43628.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/DDPG/DDPG_ENV/memory.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/DDPG/DDPG_ENV/memory.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/actor.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 |
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from torch.distributions.normal import Normal
9 | PATH = os.getcwd()
10 |
11 | class ActorNetwork(nn.Module):
12 |
13 | def __init__(self, n_actions, input_dims, alpha, model_name : str,
14 | fc1_dims=256, fc2_dims=256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO/models/'):
15 | super(ActorNetwork, self).__init__()
16 | self.n_actions = n_actions
17 |
18 | log_std = -0.5 * np.ones(n_actions, dtype=np.float32)
19 | self.log_std = T.nn.Parameter(T.as_tensor(log_std))
20 |
21 | check_point_base_dir = os.path.join( PATH , check_point_base_dir )
22 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name)
23 |
24 | self.actor = nn.Sequential(
25 | nn.Linear(*input_dims, fc1_dims),
26 | nn.ReLU(),
27 | nn.Linear(fc1_dims, fc2_dims),
28 | nn.ReLU(),
29 | nn.Linear(fc2_dims, n_actions),
30 | nn.Tanh()
31 | )
32 |
33 | self.optimizer = optim.Adam(self.parameters(), lr=alpha)
34 |
35 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
36 |
37 | self.to(self.device)
38 |
39 | def forward(self, obs, act = None):
40 | pi = self._distribution(obs)
41 | logp_a = None
42 | if act is not None:
43 | logp_a = self._log_prob_from_distribution(pi, act)
44 | return pi, logp_a
45 |
46 | def _distribution(self, state):
47 | mu = self.actor(state)
48 | std = T.exp(self.log_std)
49 | return Normal(mu, std)
50 |
51 | def _log_prob_from_distribution(self, pi, act):
52 | return pi.log_prob(act).sum(axis=-1)
53 |
54 | def save_checkpoint(self):
55 | T.save(self.state_dict(), self.checkpoint_file)
56 |
57 | def load_checkpoint(self):
58 | self.load_state_dict(T.load(self.checkpoint_file))
59 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/agent.py:
--------------------------------------------------------------------------------
1 | import imp
2 | from multiprocessing.context import BaseContext
3 | import os
4 | import copy
5 | from tqdm import tqdm
6 | import numpy as np
7 | import torch as T
8 | import torch.nn as nn
9 | import torch.optim as optim
10 |
11 | from actor import ActorNetwork
12 | from critic import CriticNetwork
13 | from memory import PPOBuffer
14 |
15 |
16 | PATH = os.getcwd()
17 | # MODEL_XML_PATH = os.path.join(
18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml"
19 | # )
20 |
21 | class Agent:
22 | def __init__(self, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \
23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \
24 | policy_clip = 0.2, n_epoch = 10, batch_size = 64):
25 | '''
26 |         PPO agent.
27 |         Arguments:
28 |         - model_name_actor : file name used for the actor checkpoint in the model saving directory
29 |         - model_name_critic : file name used for the critic checkpoint in the model saving directory
30 | '''
31 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10
32 | self.gamma = gamma
33 | self.gae_lambda = gae_lambda
34 | self.policy_clip = policy_clip
35 | self.n_epoch = n_epoch
36 |
37 | self.actor = ActorNetwork(n_actions, input_dims, alpha, model_name = model_name_actor)
38 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic)
39 | self.memory_handler = PPOBuffer( batch_size )
40 |
41 | def remember(self, state, action, probs, vals, reward, done):
42 | self.memory_handler.store_memory(state, action, probs, vals, reward, done)
43 |
44 | def save_models(self):
45 | print("Saving models now")
46 | self.actor.save_checkpoint()
47 | self.critic.save_checkpoint()
48 |
49 | def load_model(self):
50 | print("Load model")
51 | self.actor.load_checkpoint()
52 | self.critic.load_checkpoint()
53 |
54 | def play_optimal(self, observation):
55 | with T.no_grad():
56 | state = T.tensor([observation], dtype=T.float).to(self.actor.device)
57 |             dist = self.actor._distribution(state)
58 |             # deterministic evaluation: use the distribution mean (the argmax of a Gaussian) rather than sampling
59 |             action = dist.mean
60 |             action = T.squeeze(action).cpu().numpy()
61 | return action
62 |
63 | def choose_action(self, observation):
64 | with T.no_grad():
65 | observation = T.tensor([observation], dtype=T.float).to(self.actor.device)
66 | policy = self.actor._distribution(observation)
67 | action = policy.sample()
68 | logp_a = self.actor._log_prob_from_distribution(policy, action)
69 | value = self.critic(observation)
70 |
71 | return action.numpy(), logp_a.numpy(), value.numpy()
72 |
73 | def learn(self):
74 | for _ in range(self.n_epoch):
75 |
76 | state_arr, action_arr, old_prob_arr, vals_arr,\
77 | reward_arr, dones_arr, batches = \
78 | self.memory_handler.generate_batches()
79 |
80 | values = vals_arr.copy()
81 | advantage = np.zeros(len(reward_arr), dtype=np.float32)
82 |             # GAE: advantage_t = delta_t + (gamma * lambda) * delta_{t+1} + (gamma * lambda)^2 * delta_{t+2} + ...
83 |             # where delta_t = reward_t + gamma * Value(s_{t+1}) * (1 - done_t) - Value(s_t)
84 | for t in range(len(reward_arr)-1):
85 | discount = 1
86 | a_t = 0
87 | for k in range(t, len(reward_arr)-1):
88 |
89 | a_t += discount * (reward_arr[k] + self.gamma*values[k+1]*\
90 | (1-int(dones_arr[k])) - values[k])
91 |
92 | # discount term gamma * gae_lamda (y*lamda)
93 | discount *= self.gamma * self.gae_lambda
94 | advantage[t] = a_t
95 | advantage = T.tensor(advantage).to(self.actor.device)
96 |
97 | values = T.tensor(values).to(self.actor.device)
98 |
99 | for batch in batches:
100 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device)
101 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device)
102 |
103 | actions = T.tensor(action_arr[batch]).to(self.actor.device)
104 |
105 | pi, new_probs = self.actor(states, actions)
106 |
107 | critic_value = self.critic(states)
108 |
109 | critic_value = T.squeeze(critic_value)
110 |
111 | # new_probs = dist.log_prob(actions)
112 |
113 |
114 | # prob_ratio = new_probs.exp() / old_probs.exp()
115 | prob_ratio = T.exp(new_probs - old_probs)
116 | weighted_probs = advantage[batch] * prob_ratio
117 |
118 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip,
119 | 1 + self.policy_clip) * advantage[batch]
120 |
121 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()
122 |
123 | returns = advantage[batch] + values[batch]
124 | critic_loss = (returns-critic_value)**2
125 | critic_loss = critic_loss.mean()
126 |
127 | total_loss = actor_loss + 0.5* critic_loss
128 | self.actor.optimizer.zero_grad()
129 | self.critic.optimiser.zero_grad()
130 | # print("total loss", total_loss.item())
131 | total_loss.backward()
132 | self.actor.optimizer.step()
133 | self.critic.optimiser.step()
134 |
135 | self.memory_handler.clear_memory()
136 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/critic.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 | from torch.distributions.categorical import Categorical
8 | PATH = os.getcwd()
9 | class CriticNetwork(nn.Module):
10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\
11 | fc2_dims = 256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO/models') -> None:
12 | super(CriticNetwork, self).__init__()
13 |
14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir )
15 | self.check_point_file = os.path.join(check_point_base_dir, model_name)
16 | self.critic = nn.Sequential(
17 | nn.Linear(*input_dims , fc1_dims),
18 | nn.ReLU(),
19 | nn.Linear(fc1_dims , fc2_dims),
20 | nn.ReLU(),
21 | nn.Linear(fc2_dims , 1),
22 |
23 | )
24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha)
25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
26 | self.to(self.device)
27 |
28 | def forward(self, state):
29 | value = self.critic(state)
30 | return value
31 |
32 | def save_checkpoint(self):
33 | T.save(self.state_dict(), self.check_point_file)
34 |
35 | def load_checkpoint(self):
36 | self.load_state_dict(T.load(self.check_point_file))
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 |
4 | from agent import Agent
5 | from utils import plot_learning_curve
6 |
7 | import SpaceRobotEnv
8 |
9 |
10 |
11 |
12 |
13 |
14 | if __name__ == '__main__':
15 | env = gym.make("SpaceRobotState-v0")
16 | N = 30
17 | batch_size = 16
18 | n_epochs = 3
19 | alpha = 0.0003
20 | action_space = env.action_space.shape[0]
21 | obs_shape = env.observation_space["observation"].shape
22 |
23 |
24 |
25 | agent = Agent( n_actions = action_space,
26 | batch_size=batch_size,
27 | alpha = alpha,
28 | n_epoch = n_epochs,
29 | input_dims = obs_shape,
30 | model_name_actor = "space_robot_actor.pt",
31 | model_name_critic = "space_robot_critic.pt")
32 | n_iter = 3000
33 | figure_file = 'RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png'
34 | best_score = env.reward_range[0]
35 | score_history = []
36 | n_steps = 0
37 | learn_iters = 0
38 | avg_score = 0
39 |
40 | for i in range(n_iter):
41 | obs = env.reset()
42 | observation = obs["observation"]
43 | done = False
44 | score = 0
45 | while not done:
46 | action, prob, val = agent.choose_action(observation)
47 |
48 |             # flatten the sampled action to the env's action dimension
49 |             a = action.reshape(action_space,)
50 | observation_, reward, done, info = env.step(a)
51 | n_steps+=1
52 | score += reward
53 |
54 | agent.remember(observation, action, prob, val, reward, done)
55 | #steps before we begin learning 20
56 | if n_steps % N ==0:
57 | agent.learn()
58 | learn_iters += 1
59 | observation = observation_["observation"]
60 | score_history.append(score)
61 | avg_score = np.mean(score_history[-100:])
62 |
63 | if avg_score>best_score:
64 | best_score= avg_score
65 | agent.save_models()
66 |         print('episode', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score,
67 |               'time_steps', n_steps, 'learning_steps', learn_iters)
68 |
69 | x = [i+1 for i in range(len(score_history))]
70 | plot_learning_curve(x, score_history,figure_file)
71 | env.close()
72 |
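For completeness, a trained policy saved by agent.save_models() could be evaluated with a loop like the hypothetical sketch below (it reuses env, agent and action_space exactly as defined in main.py and is not part of the repository):

    agent.load_model()
    obs = env.reset()
    observation = obs["observation"]
    done = False
    eval_return = 0.0
    while not done:
        action, _, _ = agent.choose_action(observation)
        obs, reward, done, _ = env.step(action.reshape(action_space,))
        observation = obs["observation"]
        eval_return += reward
    print('evaluation return %.1f' % eval_return)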
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/memory.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 |
8 |
9 |
10 | class PPOBuffer:
11 | def __init__(self, batch_size):
12 | self.states = []
13 | self.probs = []
14 | self.vals = []
15 | self.actions = []
16 | self.rewards = []
17 | self.dones = []
18 |
19 | self.batch_size = batch_size
20 |
21 | def generate_batches(self):
22 | n_states = len(self.states)
23 | batch_start = np.arange(0, n_states, self.batch_size)
24 | indices = np.arange(n_states, dtype=np.int64)
25 | np.random.shuffle(indices)
26 | batches = [indices[i:i+self.batch_size] for i in batch_start]
27 |
28 | return np.array(self.states),\
29 | np.array(self.actions),\
30 | np.array(self.probs),\
31 | np.array(self.vals),\
32 | np.array(self.rewards),\
33 | np.array(self.dones),\
34 | batches
35 |
36 | def store_memory(self, state, action, probs, vals, reward, done):
37 | self.states.append(state)
38 | self.actions.append(action)
39 | self.probs.append(probs)
40 | self.vals.append(vals)
41 | self.rewards.append(reward)
42 | self.dones.append(done)
43 |
44 | def clear_memory(self):
45 | self.states = []
46 | self.probs = []
47 | self.actions = []
48 | self.rewards = []
49 | self.dones = []
50 | self.vals = []
51 |
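generate_batches() shuffles the indices of all stored transitions once and slices the permutation into contiguous chunks of batch_size, so every transition lands in exactly one mini-batch per pass over the buffer. An illustrative standalone sketch of the resulting index structure (the printed output is only an example, since the permutation is random):

    buffer = PPOBuffer(batch_size=3)
    for step in range(7):
        buffer.store_memory(state=step, action=0, probs=0.0, vals=0.0, reward=1.0, done=False)
    *_, batches = buffer.generate_batches()
    print([b.tolist() for b in batches])   # e.g. [[4, 1, 6], [0, 5, 2], [3]]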
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_actor.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_actor.pt
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_critic.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/models/space_robot_critic.pt
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO/plots/space_robot_performance.png
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/actor.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn.functional as F
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from torch.distributions.normal import Normal
9 | PATH = os.getcwd()
10 |
11 | class ActorNetwork(nn.Module):
12 |
13 | def __init__(self, max_actions, n_actions, input_dims, alpha, model_name : str,
14 | fc1_dims=256, fc2_dims=256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models'):
15 | super(ActorNetwork, self).__init__()
16 | self.n_actions = n_actions
17 | self.max_actions = max_actions
18 |
19 |
20 | check_point_base_dir = os.path.join( PATH , check_point_base_dir )
21 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name)
22 | self.base_model = nn.Sequential(
23 | nn.Linear(*input_dims, fc1_dims),
24 | nn.ReLU(),
25 | nn.Linear(fc1_dims, fc2_dims),
26 | nn.ReLU(),
27 | )
28 | fc = [nn.Linear(fc2_dims, 2*n_actions)]
29 | self.fc = nn.Sequential(*fc)
30 | self.optimizer = optim.Adam(self.parameters(), lr=alpha)
31 |
32 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
33 |
34 | self.to(self.device)
35 |
36 | def forward(self, state):
37 | x = self.base_model(state)
38 | x = self.fc(x)
39 | mean, std = T.chunk(x, chunks=2, dim=-1)
40 | mean, std = self.max_actions * T.tanh(mean), F.softplus(std)
41 | return mean, std
42 |
43 | def get_logprob(self, state, action):
44 | mean, std = self.forward(state)
45 | dist = Normal(mean, std)
46 | log_prob = dist.log_prob(action).sum(axis=-1)
47 | return log_prob
48 |
49 |
50 |
51 | def save_checkpoint(self):
52 | T.save(self.state_dict(), self.checkpoint_file)
53 |
54 | def load_checkpoint(self):
55 | self.load_state_dict(T.load(self.checkpoint_file))
56 |
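The single fully-connected head above emits 2 * n_actions values that are chunked into a mean head and a standard-deviation head: the mean is squashed with tanh and scaled by max_actions so it stays inside the action bounds, and softplus keeps the standard deviation positive. A minimal sketch of how these two heads parameterise the policy distribution (this mirrors what Agent.choose_action in agent.py does):

    mean, std = actor(state)                        # each of shape (batch, n_actions)
    dist = Normal(mean, std)
    action = dist.sample()
    log_prob = dist.log_prob(action).sum(axis=-1)   # joint log-prob over independent action dimensions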
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/agent.py:
--------------------------------------------------------------------------------
1 |
2 | from multiprocessing.context import BaseContext
3 | import os
4 | import copy
5 | from tqdm import tqdm
6 | import numpy as np
7 | import torch as T
8 | import torch.nn as nn
9 | import torch.optim as optim
10 | from torch.distributions import Normal
11 |
12 | from actor import ActorNetwork
13 | from critic import CriticNetwork
14 | from memory import PPOBuffer
15 |
16 |
17 | PATH = os.getcwd()
18 | # MODEL_XML_PATH = os.path.join(
19 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml"
20 | # )
21 |
22 | class Agent:
23 | def __init__(self, env_max_action, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \
24 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \
25 | policy_clip = 0.2, n_epoch = 3, batch_size = 64):
26 |         '''
27 |         Arguments:
28 |         - model_name_actor : file name for the actor checkpoint in the model saving directory
29 |         - model_name_critic : file name for the critic checkpoint in the model saving directory
30 |         '''
31 |
32 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10
33 | self.gamma = gamma
34 | self.gae_lambda = gae_lambda
35 | self.policy_clip = policy_clip
36 | self.n_epoch = n_epoch
37 |
38 | self.actor = ActorNetwork( env_max_action , n_actions, input_dims, alpha, model_name = model_name_actor)
39 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic)
40 | self.memory_handler = PPOBuffer( batch_size )
41 |
42 | def remember(self, state, action, probs, vals, reward, done):
43 | self.memory_handler.store_memory(state, action, probs, vals, reward, done)
44 |
45 | def save_models(self):
46 | print("Saving models now")
47 | self.actor.save_checkpoint()
48 | self.critic.save_checkpoint()
49 |
50 | def load_model(self):
51 | print("Load model")
52 | self.actor.load_checkpoint()
53 | self.critic.load_checkpoint()
54 |
55 |
56 | def choose_action(self, state):
57 | # state = T.as_tensor(state, dtype=T.float, device=device)
58 | state = T.tensor([state], dtype=T.float).to(self.actor.device)
59 |
60 | mean, std = self.actor.forward(state)
61 |
62 | dist = Normal(mean, std)
63 |
64 |
65 | action = dist.sample()
66 | action_logprob = dist.log_prob(action).sum(axis=-1)
67 | value = self.critic(state)
68 |
69 | return action, action_logprob, value
70 |
71 | # def choose_action(self, observation):
72 | # with T.no_grad():
73 | # observation = T.tensor([observation], dtype=T.float).to(self.actor.device)
74 | # action , logp_a = self.actor.sample_normal(observation)
75 | # value = self.critic(observation)
76 | # return action.numpy(), logp_a.numpy(), value.numpy()
77 | def learn(self):
78 | for _ in range(self.n_epoch):
79 |
80 | state_arr, action_arr, old_prob_arr, vals_arr,\
81 | reward_arr, dones_arr, batches = \
82 | self.memory_handler.generate_batches()
83 |
84 | values = vals_arr.copy()
85 | advantage = np.zeros(len(reward_arr), dtype=np.float32)
86 |             # advantage_t = delta_t + (gamma * lambda) * delta_{t+1} + (gamma * lambda)^2 * delta_{t+2} + ...
87 |             # where delta_t = reward_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
88 | for t in range(len(reward_arr)-1):
89 | discount = 1
90 | a_t = 0
91 | for k in range(t, len(reward_arr)-1):
92 |
93 | a_t += discount * (reward_arr[k] + self.gamma*values[k+1]*\
94 | (1-int(dones_arr[k])) - values[k])
95 |
96 | # discount term gamma * gae_lamda (y*lamda)
97 | discount *= self.gamma * self.gae_lambda
98 | advantage[t] = a_t
99 | advantage = T.tensor(advantage).to(self.actor.device)
100 |
101 | values = T.tensor(values).to(self.actor.device)
102 |
103 | for batch in batches:
104 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device)
105 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device)
106 |
107 | actions = T.tensor(action_arr[batch]).to(self.actor.device)
108 |
109 | new_probs = self.actor.get_logprob(states, actions)
110 |
111 | critic_value = self.critic(states)
112 |
113 | critic_value = T.squeeze(critic_value)
114 |
115 | prob_ratio = T.exp(new_probs - old_probs)
116 |
117 | weighted_probs = advantage[batch] * prob_ratio
118 |
119 |
120 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip,
121 | 1 + self.policy_clip)*advantage[batch]
122 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()
123 |
124 | returns = advantage[batch] + values[batch]
125 | critic_loss = (returns-critic_value)**2
126 | critic_loss = critic_loss.mean()
127 |
128 | total_loss = actor_loss + 0.5* critic_loss
129 | self.actor.optimizer.zero_grad()
130 | self.critic.optimiser.zero_grad()
131 | # print("total loss", total_loss.item())
132 | total_loss.backward()
133 | self.actor.optimizer.step()
134 | self.critic.optimiser.step()
135 |
136 | self.memory_handler.clear_memory()
137 |
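The actor update in learn() above is the standard PPO clipped surrogate objective. In the notation of the code, with ratio r_t = exp(new_probs - old_probs), advantage A_t and epsilon = policy_clip:

    L_{actor} = -\mathbb{E}_t\Big[\min\big(r_t A_t,\ \operatorname{clip}(r_t,\, 1-\epsilon,\, 1+\epsilon)\, A_t\big)\Big]

and total_loss = L_{actor} + 0.5 * mean((returns - V(s_t))^2), with returns = advantage + values.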
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/critic.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 | from torch.distributions.categorical import Categorical
8 | PATH = os.getcwd()
9 | class CriticNetwork(nn.Module):
10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\
11 | fc2_dims = 256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models') -> None:
12 | super(CriticNetwork, self).__init__()
13 |
14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir )
15 | self.check_point_file = os.path.join(check_point_base_dir, model_name)
16 | self.critic = nn.Sequential(
17 | nn.Linear(*input_dims , fc1_dims),
18 | nn.ReLU(),
19 | nn.Linear(fc1_dims , fc2_dims),
20 | nn.ReLU(),
21 | nn.Linear(fc2_dims , 1),
22 |
23 | )
24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha)
25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
26 | self.to(self.device)
27 |
28 | def forward(self, state):
29 | value = self.critic(state)
30 | return value
31 |
32 | def save_checkpoint(self):
33 | T.save(self.state_dict(), self.check_point_file)
34 |
35 | def load_checkpoint(self):
36 | self.load_state_dict(T.load(self.check_point_file))
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 |
4 | from agent import Agent
5 | from utils import plot_learning_curve
6 |
7 | import SpaceRobotEnv
8 |
9 |
10 |
11 |
12 |
13 |
14 | if __name__ == '__main__':
15 | env = gym.make("SpaceRobotState-v0")
16 | N = 30
17 | batch_size = 16
18 | n_epochs = 3
19 | alpha = 0.0003
20 | action_space = env.action_space.shape[0]
21 | obs_shape = env.observation_space["observation"].shape
22 | env_max_action = float(env.action_space.high[0])
23 |
24 | agent = Agent( env_max_action = env_max_action,
25 | n_actions = action_space,
26 | batch_size = batch_size,
27 | alpha = alpha,
28 | n_epoch = n_epochs,
29 | input_dims = obs_shape,
30 | model_name_actor = "space_robot_actor.pt",
31 | model_name_critic = "space_robot_critic.pt")
32 | n_iter = 300
33 | figure_file = 'RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png'
34 | best_score = env.reward_range[0]
35 | score_history = []
36 | n_steps = 0
37 | learn_iters = 0
38 | avg_score = 0
39 |
40 | for i in range(n_iter):
41 | obs = env.reset()
42 | observation = obs["observation"]
43 |
44 | done = False
45 | score = 0
46 | while not done:
47 | action, prob, val = agent.choose_action(observation)
48 |
49 | action = action.detach().cpu().numpy().flatten()
50 | action = action.clip(env.action_space.low, env.action_space.high)
51 |
52 | action_logprob = prob.detach().cpu().numpy().flatten()
53 | val = val.detach().cpu().numpy().flatten()
54 |
55 | observation_, reward, done, info = env.step(action)
56 | n_steps+=1
57 | score += reward
58 |
59 | agent.remember(observation, action, action_logprob, val, reward, done)
60 | #steps before we begin learning 20
61 | if n_steps % N ==0:
62 | agent.learn()
63 | learn_iters += 1
64 | observation = observation_["observation"]
65 | score_history.append(score)
66 | avg_score = np.mean(score_history[-100:])
67 |
68 | if avg_score>best_score:
69 | best_score= avg_score
70 | agent.save_models()
71 |         print('episode', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score,
72 |               'time_steps', n_steps, 'learning_steps', learn_iters)
73 |
74 | x = [i+1 for i in range(len(score_history))]
75 | plot_learning_curve(x, score_history,figure_file)
76 | env.close()
77 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/memory.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 |
8 |
9 |
10 | class PPOBuffer:
11 | def __init__(self, batch_size):
12 | self.states = []
13 | self.probs = []
14 | self.vals = []
15 | self.actions = []
16 | self.rewards = []
17 | self.dones = []
18 |
19 | self.batch_size = batch_size
20 |
21 | def generate_batches(self):
22 | n_states = len(self.states)
23 | batch_start = np.arange(0, n_states, self.batch_size)
24 | indices = np.arange(n_states, dtype=np.int64)
25 | np.random.shuffle(indices)
26 | batches = [indices[i:i+self.batch_size] for i in batch_start]
27 |
28 | return np.array(self.states),\
29 | np.array(self.actions),\
30 | np.array(self.probs),\
31 | np.array(self.vals),\
32 | np.array(self.rewards),\
33 | np.array(self.dones),\
34 | batches
35 |
36 | def store_memory(self, state, action, probs, vals, reward, done):
37 | self.states.append(state)
38 | self.actions.append(action)
39 | self.probs.append(probs)
40 | self.vals.append(vals)
41 | self.rewards.append(reward)
42 | self.dones.append(done)
43 |
44 | def clear_memory(self):
45 | self.states = []
46 | self.probs = []
47 | self.actions = []
48 | self.rewards = []
49 | self.dones = []
50 | self.vals = []
51 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_actor.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_actor.pt
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_critic.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/models/space_robot_critic.pt
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/plots/space_robot_performance.png
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/PPO_Two_heads/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Continious/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Continious/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/actor.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 | from torch.distributions.categorical import Categorical
8 | PATH = os.getcwd()
9 |
10 | class ActorNetwork(nn.Module):
11 |
12 | def __init__(self, n_actions, input_dims, alpha, model_name : str,
13 |             fc1_dims=256, fc2_dims=256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Discrete/PPO/models'):
14 | super(ActorNetwork, self).__init__()
15 |
16 | check_point_base_dir = os.path.join( PATH , check_point_base_dir )
17 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name)
18 | self.actor = nn.Sequential(
19 | nn.Linear(*input_dims, fc1_dims),
20 | nn.ReLU(),
21 | nn.Linear(fc1_dims, fc2_dims),
22 | nn.ReLU(),
23 | nn.Linear(fc2_dims, n_actions),
24 | nn.Softmax(dim=-1)
25 | )
26 |
27 | self.optimizer = optim.Adam(self.parameters(), lr=alpha)
28 |
29 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
30 |
31 | self.to(self.device)
32 |
33 | def forward(self, state):
34 | dist = self.actor(state)
35 | dist = Categorical(dist)
36 | return dist
37 |
38 | def save_checkpoint(self):
39 | T.save(self.state_dict(), self.checkpoint_file)
40 |
41 | def load_checkpoint(self):
42 | self.load_state_dict(T.load(self.checkpoint_file))
43 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/agent.py:
--------------------------------------------------------------------------------
1 |
2 | from multiprocessing.context import BaseContext
3 | import os
4 | import copy
5 | from tqdm import tqdm
6 | import numpy as np
7 | import torch as T
8 | import torch.nn as nn
9 | import torch.optim as optim
10 | from torch.distributions.categorical import Categorical
11 | from actor import ActorNetwork
12 | from critic import CriticNetwork
13 | from memory import PPOMemory
14 |
15 |
16 | PATH = os.getcwd()
17 | # MODEL_XML_PATH = os.path.join(
18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml"
19 | # )
20 |
21 | class Agent:
22 | def __init__(self, n_actions, input_dims, model_name_actor : str, model_name_critic : str, \
23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \
24 | policy_clip = 0.1, n_epoch = 10, batch_size = 64):
25 |         '''
26 |         Arguments:
27 |         - model_name_actor : file name for the actor checkpoint in the model saving directory
28 |         - model_name_critic : file name for the critic checkpoint in the model saving directory
29 |         '''
30 |
31 | seed = 10000
32 | T.manual_seed(seed)
33 | np.random.seed(seed)
34 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10
35 | self.gamma = gamma
36 | self.gae_lambda = gae_lambda
37 | self.policy_clip = policy_clip
38 | self.n_epoch = n_epoch
39 |
40 | self.actor = ActorNetwork(n_actions, input_dims, alpha, model_name = model_name_actor)
41 | self.critic = CriticNetwork(input_dims, alpha, model_name = model_name_critic)
42 | self.memory_handler = PPOMemory( batch_size )
43 |
44 | def remember(self, state, action, probs, vals, reward, done):
45 | self.memory_handler.store_memory(state, action, probs, vals, reward, done)
46 |
47 | def save_models(self):
48 | print("Saving models now")
49 | self.actor.save_checkpoint()
50 | self.critic.save_checkpoint()
51 |
52 | def load_model(self):
53 | print("Load model")
54 | self.actor.load_checkpoint()
55 | self.critic.load_checkpoint()
56 |
57 | def play_optimal(self, observation):
58 | with T.no_grad():
59 | state = T.tensor([observation], dtype=T.float).to(self.actor.device)
60 | dist = self.actor(state)
61 |             # NOTE: for a deterministic "optimal" policy, take the argmax of the action probabilities instead of sampling
62 |             action = dist.sample()
63 |             action = T.squeeze(action).item()
64 | return action
65 |
66 | def choose_action(self, observation):
67 | state = T.tensor([observation], dtype=T.float).to(self.actor.device)
68 | dist = self.actor(state)
69 | value = self.critic(state)
70 |
71 | action = dist.sample()
72 |
73 |         # log-probability of the sampled action under the current policy (REINFORCE-style term)
74 |         probs = T.squeeze(dist.log_prob(action)).item()
75 |
76 |         action = T.squeeze(action).item()
77 |         value = T.squeeze(value).item()
78 |
79 | return action, probs , value
80 |
81 | def learn(self):
82 | for _ in range(self.n_epoch):
83 |
84 | state_arr, action_arr, old_prob_arr, vals_arr,\
85 | reward_arr, dones_arr, batches = \
86 | self.memory_handler.generate_batches()
87 |
88 | values = vals_arr.copy()
89 | advantage = np.zeros(len(reward_arr), dtype=np.float32)
90 |
91 | for t in range(len(reward_arr)-1):
92 | discount = 1
93 | a_t = 0
94 | for k in range(t, len(reward_arr)-1):
95 | a_t += discount*(reward_arr[k] + self.gamma*values[k+1]*\
96 | ( 1 - int(dones_arr[k]) ) - values[k])
97 | discount *= self.gamma*self.gae_lambda
98 | advantage[t] = a_t
99 | advantage = T.tensor(advantage).to(self.actor.device)
100 |
101 | values = T.tensor(values).to(self.actor.device)
102 | for batch in batches:
103 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device)
104 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device)
105 | actions = T.tensor(action_arr[batch]).to(self.actor.device)
106 |
107 | dist = self.actor(states)
108 | critic_value = self.critic(states)
109 |
110 | critic_value = T.squeeze(critic_value)
111 |
112 | new_probs = dist.log_prob(actions)
113 | prob_ratio = new_probs.exp() / old_probs.exp()
114 | #prob_ratio = (new_probs - old_probs).exp()
115 | weighted_probs = advantage[batch] * prob_ratio
116 | weighted_clipped_probs = T.clamp(prob_ratio, 1 - self.policy_clip,
117 | 1 + self.policy_clip ) * advantage[batch]
118 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()
119 |
120 | returns = advantage[batch] + values[batch]
121 | critic_loss = (returns-critic_value) ** 2
122 | critic_loss = critic_loss.mean()
123 |
124 | total_loss = actor_loss + 0.5 * critic_loss
125 | self.actor.optimizer.zero_grad()
126 | self.critic.optimiser.zero_grad()
127 | # print("total loss", total_loss.item())
128 | total_loss.backward()
129 | self.actor.optimizer.step()
130 | self.critic.optimiser.step()
131 |
132 | self.memory_handler.clear_memory()
133 |
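As the note in play_optimal() points out, a deterministic "optimal" action corresponds to the argmax of the categorical probabilities rather than a sample. A hypothetical variant, not part of the repository, could look like:

    def play_greedy(self, observation):
        with T.no_grad():
            state = T.tensor([observation], dtype=T.float).to(self.actor.device)
            dist = self.actor(state)               # Categorical distribution over n_actions
            action = T.argmax(dist.probs, dim=-1)  # greedy action instead of dist.sample()
        return T.squeeze(action).item()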
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/critic.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 | from torch.distributions.categorical import Categorical
8 | PATH = os.getcwd()
9 | class CriticNetwork(nn.Module):
10 | def __init__(self, input_dims, alpha, model_name : str , fc1_dims = 256,\
11 |             fc2_dims = 256, check_point_base_dir = 'RL_algorithms/Torch/PPO/Discrete/PPO/models') -> None:
12 | super(CriticNetwork, self).__init__()
13 |
14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir )
15 | self.check_point_file = os.path.join(check_point_base_dir, model_name)
16 | self.critic = nn.Sequential(
17 | nn.Linear(*input_dims , fc1_dims),
18 | nn.ReLU(),
19 | nn.Linear(fc1_dims , fc2_dims),
20 | nn.ReLU(),
21 | nn.Linear(fc2_dims , 1),
22 |
23 | )
24 | self.optimiser = optim.Adam(self.parameters(), lr = alpha)
25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
26 | self.to(self.device)
27 |
28 | def forward(self, state):
29 | value = self.critic(state)
30 | return value
31 |
32 | def save_checkpoint(self):
33 | T.save(self.state_dict(), self.check_point_file)
34 |
35 | def load_checkpoint(self):
36 | self.load_state_dict(T.load(self.check_point_file))
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 |
4 | from agent import Agent
5 | from utils import plot_learning_curve
6 |
7 | import SpaceRobotEnv
8 |
9 |
10 |
11 |
12 |
13 |
14 | if __name__ == '__main__':
15 | env = gym.make("SpaceRobotState-v0")
16 | N = 20
17 | batch_size = 5
18 | n_epochs = 4
19 | alpha = 0.0003
20 | action_space = env.action_space.shape[0]
21 | obs_shape = env.observation_space["observation"].shape
22 |
23 |
24 |
25 | agent = Agent( n_actions = action_space,
26 | batch_size=batch_size,
27 | alpha = alpha,
28 | n_epoch = n_epochs,
29 | input_dims = obs_shape,
30 | model_name_actor = "space_robot_actor.pt",
31 | model_name_critic = "space_robot_critic.pt")
32 | n_iter = 300
33 |     figure_file = 'RL_algorithms/Torch/PPO/Discrete/PPO/plots/space_robot_performance.png'
34 | best_score = env.reward_range[0]
35 | score_history = []
36 | n_steps = 0
37 | learn_iters = 0
38 | avg_score = 0
39 |
40 | for i in range(n_iter):
41 | obs = env.reset()
42 | observation = obs["observation"]
43 | done = False
44 | score = 0
45 | while not done:
46 | action, prob, val = agent.choose_action(observation)
47 |
48 |
49 | observation_, reward, done, info = env.step(action)
50 | n_steps+=1
51 | score += reward
52 |
53 | agent.remember(observation, action, prob, val, reward, done)
54 | #steps before we begin learning 20
55 | if n_steps % N ==0:
56 | agent.learn()
57 | learn_iters += 1
58 | observation = observation_["observation"]
59 | score_history.append(score)
60 | avg_score = np.mean(score_history[-100:])
61 |
62 | if avg_score>best_score:
63 | best_score= avg_score
64 | agent.save_models()
65 |         print('episode', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score,
66 |               'time_steps', n_steps, 'learning_steps', learn_iters)
67 |
68 | x = [i+1 for i in range(len(score_history))]
69 | plot_learning_curve(x, score_history,figure_file)
70 | env.close()
71 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/memory.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 |
8 | class Mem:
9 | def __init__(self, state , prob, val ,action, reward, done) -> None:
10 | self.state = state
11 | self.prob = prob
12 | self.val = val
13 | self.action = action
14 | self.reward = reward
15 | self.done = done
16 |
17 | class PPOMemory:
18 | def __init__(self, batch_size):
19 | self.states = []
20 | self.probs = []
21 | self.vals = []
22 | self.actions = []
23 | self.rewards = []
24 | self.dones = []
25 |
26 | self.batch_size = batch_size
27 |
28 | def generate_batches(self):
29 | n_states = len(self.states)
30 | batch_start = np.arange(0, n_states, self.batch_size)
31 | indices = np.arange(n_states, dtype=np.int64)
32 | np.random.shuffle(indices)
33 | batches = [indices[i:i+self.batch_size] for i in batch_start]
34 |
35 | return np.array(self.states),\
36 | np.array(self.actions),\
37 | np.array(self.probs),\
38 | np.array(self.vals),\
39 | np.array(self.rewards),\
40 | np.array(self.dones),\
41 | batches
42 |
43 | def store_memory(self, state, action, probs, vals, reward, done):
44 | self.states.append(state)
45 | self.actions.append(action)
46 | self.probs.append(probs)
47 | self.vals.append(vals)
48 | self.rewards.append(reward)
49 | self.dones.append(done)
50 |
51 | def clear_memory(self):
52 | self.states = []
53 | self.probs = []
54 | self.actions = []
55 | self.rewards = []
56 | self.dones = []
57 | self.vals = []
58 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_actor.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_actor.pt
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_critic.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/models/space_robot_critic.pt
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/plots/space_robot_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPO/plots/space_robot_performance.png
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPO/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/actor.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 | from torch.distributions.categorical import Categorical
8 | PATH = os.getcwd()
9 |
10 | class ActorNetwork(nn.Module):
11 |
12 | def __init__(self, n_actions, alpha, model_name : str,
13 |             check_point_base_dir = 'RL_algorithms/Torch/PPO/Discrete/PPOImage/models'):
14 | super(ActorNetwork, self).__init__()
15 |
16 | check_point_base_dir = os.path.join( PATH , check_point_base_dir )
17 | self.checkpoint_file = os.path.join(check_point_base_dir, model_name)
18 |
19 | self.actor = nn.Sequential(
20 | nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size=5, stride=1),
21 | nn.ReLU(),
22 | nn.BatchNorm2d(32),
23 | nn.ReLU(),
24 | nn.MaxPool2d(2,2),
25 |
26 | nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size=5, stride=1),
27 | nn.ReLU(),
28 | nn.BatchNorm2d(64),
29 | nn.ReLU(),
30 | nn.MaxPool2d(2,2),
31 |
32 | nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size=5, stride=1),
33 | nn.ReLU(),
34 | nn.BatchNorm2d(64),
35 | nn.ReLU(),
36 | nn.MaxPool2d(2,2),
37 |
38 | nn.Flatten(),
39 | nn.Linear(1024, 4096),
40 | nn.ReLU(),
41 | nn.Linear(4096, 256),
42 | nn.ReLU(),
43 | nn.Linear(256, 64),
44 | nn.ReLU(),
45 | nn.Linear(64, n_actions),
46 |
47 | nn.Softmax(dim=-1)
48 | )
49 |
50 | self.optimizer = optim.Adam(self.parameters(), lr=alpha)
51 |
52 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
53 |
54 | self.to(self.device)
55 |
56 | def forward(self, state):
57 | dist = self.actor(state)
58 | dist = Categorical(dist)
59 | return dist
60 |
61 | def save_checkpoint(self):
62 | T.save(self.state_dict(), self.checkpoint_file)
63 |
64 | def load_checkpoint(self):
65 | self.load_state_dict(T.load(self.checkpoint_file))
66 |
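The 1024 input features of the first linear layer follow from a 3 x 64 x 64 image (the reshaped "rawimage" observation used in main.py): each 5x5 convolution with stride 1 removes 4 pixels per side and each 2x2 max-pool halves the resolution, so the spatial size shrinks 64 -> 60 -> 30 -> 26 -> 13 -> 9 -> 4 and the flattened feature map is 64 * 4 * 4 = 1024. A quick check of that arithmetic:

    size = 64
    for _ in range(3):
        size = (size - 4) // 2      # 5x5 conv (stride 1) followed by 2x2 max-pool
    print(64 * size * size)         # 1024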
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/agent.py:
--------------------------------------------------------------------------------
1 |
2 | from multiprocessing.context import BaseContext
3 | import os
4 | import copy
5 | from tqdm import tqdm
6 | import numpy as np
7 | import torch as T
8 | import torch.nn as nn
9 | import torch.optim as optim
10 | from torch.distributions.categorical import Categorical
11 | from actor import ActorNetwork
12 | from critic import CriticNetwork
13 | from memory import PPOMemory
14 |
15 |
16 | PATH = os.getcwd()
17 | # MODEL_XML_PATH = os.path.join(
18 | # PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_image.xml"
19 | # )
20 |
21 | class Agent:
22 | def __init__(self, n_actions, model_name_actor : str, model_name_critic : str, \
23 | gamma = 0.99, alpha = 0.0003, gae_lambda = 0.95, \
24 | policy_clip = 0.1, n_epoch = 10, batch_size = 64):
25 |         '''
26 |         Arguments:
27 |         - model_name_actor : file name for the actor checkpoint in the model saving directory
28 |         - model_name_critic : file name for the critic checkpoint in the model saving directory
29 |         '''
30 |
31 | #self, n_actions, gae_lamda = 0.95, gamma = 0.99, alpha = 0 .0003, policy_clip = 0.2, batch_size = 64, N = 2048 , n_epoch = 10
32 | self.gamma = gamma
33 | self.gae_lambda = gae_lambda
34 | self.policy_clip = policy_clip
35 | self.n_epoch = n_epoch
36 |
37 | self.actor = ActorNetwork(n_actions, alpha, model_name = model_name_actor)
38 | self.critic = CriticNetwork(alpha, model_name = model_name_critic)
39 | self.memory_handler = PPOMemory( batch_size )
40 |
41 | def remember(self, state, action, probs, vals, reward, done):
42 | self.memory_handler.store_memory(state, action, probs, vals, reward, done)
43 |
44 | def save_models(self):
45 | print("Saving models now")
46 | self.actor.save_checkpoint()
47 | self.critic.save_checkpoint()
48 |
49 | def load_model(self):
50 | print("Load model")
51 | self.actor.load_checkpoint()
52 | self.critic.load_checkpoint()
53 |
54 | def play_optimal(self, observation):
55 | with T.no_grad():
56 | state = T.tensor([observation], dtype=T.float).to(self.actor.device)
57 | dist = self.actor(state)
58 |             # NOTE: for a deterministic "optimal" policy, take the argmax of the action probabilities instead of sampling
59 |             action = dist.sample()
60 |             action = T.squeeze(action).item()
61 | return action
62 |
63 | def choose_action(self, observation):
64 | observation = np.array(observation)
65 | state = T.tensor([observation], dtype=T.float).to(self.actor.device)
66 | dist = self.actor(state)
67 | value = self.critic(state)
68 |
69 | action = dist.sample()
70 |
71 |         # log-probability of the sampled action under the current policy (REINFORCE-style term)
72 |         probs = T.squeeze(dist.log_prob(action)).item()
73 |
74 |         action = T.squeeze(action).item()
75 |         value = T.squeeze(value).item()
76 |
77 | return action, probs , value
78 |
79 | def learn(self):
80 | for _ in range(self.n_epoch):
81 |
82 | state_arr, action_arr, old_prob_arr, vals_arr,\
83 | reward_arr, dones_arr, batches = \
84 | self.memory_handler.generate_batches()
85 |
86 | values = vals_arr.copy()
87 | advantage = np.zeros(len(reward_arr), dtype=np.float32)
88 |
89 | for t in range(len(reward_arr)-1):
90 |                 discount = 1  # GAE discount starts at 1; gamma*gae_lambda is applied per step below
91 | a_t = 0
92 | for k in range(t, len(reward_arr)-1):
93 | a_t += discount*(reward_arr[k] + self.gamma*values[k+1]*\
94 | (1-int(dones_arr[k])) - values[k])
95 | discount *= self.gamma*self.gae_lambda
96 | advantage[t] = a_t
97 | advantage = T.tensor(advantage).to(self.actor.device)
98 |
99 | values = T.tensor(values).to(self.actor.device)
100 | for batch in batches:
101 | states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device)
102 | old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device)
103 | actions = T.tensor(action_arr[batch]).to(self.actor.device)
104 |
105 | dist = self.actor(states)
106 | critic_value = self.critic(states)
107 |
108 | critic_value = T.squeeze(critic_value)
109 |
110 | new_probs = dist.log_prob(actions)
111 | prob_ratio = new_probs.exp() / old_probs.exp()
112 | #prob_ratio = (new_probs - old_probs).exp()
113 | weighted_probs = advantage[batch] * prob_ratio
114 | weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip,
115 | 1+self.policy_clip)*advantage[batch]
116 | actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()
117 |
118 | returns = advantage[batch] + values[batch]
119 | critic_loss = (returns-critic_value)**2
120 | critic_loss = critic_loss.mean()
121 |
122 | total_loss = actor_loss + 0.5*critic_loss
123 | self.actor.optimizer.zero_grad()
124 | self.critic.optimiser.zero_grad()
125 | # print("total loss", total_loss.item())
126 | total_loss.backward()
127 | self.actor.optimizer.step()
128 | self.critic.optimiser.step()
129 |
130 | self.memory_handler.clear_memory()
131 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/critic.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 | from torch.distributions.categorical import Categorical
8 | PATH = os.getcwd()
9 | class CriticNetwork(nn.Module):
10 | def __init__(self, alpha, model_name : str ,\
11 |             check_point_base_dir = 'RL_algorithms/Torch/PPO/Discrete/PPOImage/models/') -> None:
12 | super(CriticNetwork, self).__init__()
13 |
14 | check_point_base_dir = os.path.join( PATH , check_point_base_dir )
15 | self.check_point_file = os.path.join(check_point_base_dir, model_name)
16 | self.critic = nn.Sequential(
17 | nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size=5, stride=1),
18 | nn.ReLU(),
19 | nn.BatchNorm2d(32),
20 | nn.ReLU(),
21 | nn.MaxPool2d(2,2),
22 |
23 | # nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size=5, stride=1),
24 | # nn.ReLU(),
25 | # nn.BatchNorm2d(64),
26 | # nn.ReLU(),
27 | # nn.MaxPool2d(2,2),
28 |
29 | # nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size=5, stride=1),
30 | # nn.ReLU(),
31 | # nn.BatchNorm2d(64),
32 | # nn.ReLU(),
33 | # nn.MaxPool2d(2,2),
34 |
35 | nn.Flatten(),
36 | nn.Linear(28800, 512),
37 | nn.ReLU(),
38 | nn.Linear(512, 64),
39 | nn.ReLU(),
40 | nn.Linear(64, 1),
41 |
42 | )
43 | self.optimiser = optim.Adam(self.parameters(), lr = alpha)
44 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
45 | self.to(self.device)
46 |
47 | def forward(self, state):
48 | value = self.critic(state)
49 | return value
50 |
51 | def save_checkpoint(self):
52 | T.save(self.state_dict(), self.check_point_file)
53 |
54 | def load_checkpoint(self):
55 | self.load_state_dict(T.load(self.check_point_file))
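With only the first convolutional block active (the other two are commented out), a 3 x 64 x 64 input becomes a 32 x 30 x 30 feature map after the 5x5 convolution and 2x2 max-pool (64 -> 60 -> 30), which matches the 28800 input features of the first linear layer:

    print(32 * ((64 - 4) // 2) ** 2)   # 28800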
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from agent import Agent
4 | from utils import plot_learning_curve
5 |
6 | from SpaceRobotEnv.envs import SpaceRobotImage
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | if __name__ == '__main__':
15 |     env = SpaceRobotImage()
16 |
17 |     N = 20
18 |
19 | batch_size = 5
20 | n_epochs = 4
21 | alpha = 0.0003
22 | action_space = env.action_space.shape[0]
23 | agent = Agent( n_actions = action_space,
24 | batch_size=batch_size,
25 | alpha = alpha,
26 | n_epoch = n_epochs,
27 | model_name_actor = "space_robot_actor.pt",
28 | model_name_critic = "space_robot_critic.pt")
29 | n_iter = 300
30 |     figure_file = 'RL_algorithms/Torch/PPO/Discrete/PPOImage/plots/space_robot_performance.png'
31 | best_score = env.reward_range[0]
32 | score_history = []
33 | n_steps = 0
34 | learn_iters = 0
35 | avg_score = 0
36 |
37 | for i in range(n_iter):
38 | obs = env.reset()
39 | observation = obs["rawimage"].reshape(3, 64, 64)
40 | done = False
41 | score = 0
42 | while not done:
43 | action, prob, val = agent.choose_action(observation)
44 | observation_, reward, done, info = env.step(action)
45 | n_steps += 1
46 | score += reward
47 |
48 | agent.remember(observation, action, prob, val, reward, done)
49 | #steps before we begin learning 20
50 | if n_steps % N ==0:
51 | agent.learn()
52 | learn_iters += 1
53 | observation = observation_["rawimage"].reshape(3, 64, 64)
54 |
55 | print("done")
56 | score_history.append(score)
57 | avg_score = np.mean(score_history[-100:])
58 |
59 | if avg_score>best_score:
60 | best_score= avg_score
61 | agent.save_models()
62 |         print('episode', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score,
63 |               'time_steps', n_steps, 'learning_steps', learn_iters)
64 |
65 | x = [i+1 for i in range(len(score_history))]
66 | plot_learning_curve(x, score_history,figure_file)
67 | env.close()
68 |
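A caveat on the image preprocessing above: if the environment returns "rawimage" in height x width x channel layout, i.e. shape (64, 64, 3), then reshape(3, 64, 64) reorders the raw pixel values instead of moving the channel axis to the front. Under that assumption, the channel-first tensor expected by the Conv2d layers would instead be obtained with a transpose:

    observation = np.transpose(obs["rawimage"], (2, 0, 1))   # (64, 64, 3) -> (3, 64, 64)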
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/memory.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.context import BaseContext
2 | import os
3 | import numpy as np
4 | import torch as T
5 | import torch.nn as nn
6 | import torch.optim as optim
7 |
8 | class Mem:
9 | def __init__(self, state , prob, val ,action, reward, done) -> None:
10 | self.state = state
11 | self.prob = prob
12 | self.val = val
13 | self.action = action
14 | self.reward = reward
15 | self.done = done
16 |
17 | class PPOMemory:
18 | def __init__(self, batch_size):
19 | self.states = []
20 | self.probs = []
21 | self.vals = []
22 | self.actions = []
23 | self.rewards = []
24 | self.dones = []
25 |
26 | self.batch_size = batch_size
27 |
28 | def generate_batches(self):
29 | n_states = len(self.states)
30 | batch_start = np.arange(0, n_states, self.batch_size)
31 | indices = np.arange(n_states, dtype=np.int64)
32 | np.random.shuffle(indices)
33 | batches = [indices[i:i+self.batch_size] for i in batch_start]
34 |
35 | return np.array(self.states),\
36 | np.array(self.actions),\
37 | np.array(self.probs),\
38 | np.array(self.vals),\
39 | np.array(self.rewards),\
40 | np.array(self.dones),\
41 | batches
42 |
43 | def store_memory(self, state, action, probs, vals, reward, done):
44 | self.states.append(state)
45 | self.actions.append(action)
46 | self.probs.append(probs)
47 | self.vals.append(vals)
48 | self.rewards.append(reward)
49 | self.dones.append(done)
50 |
51 | def clear_memory(self):
52 | self.states = []
53 | self.probs = []
54 | self.actions = []
55 | self.rewards = []
56 | self.dones = []
57 | self.vals = []
58 |
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_actor.pt.icloud:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_actor.pt.icloud
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_critic.pt.icloud:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/models/.space_robot_critic.pt.icloud
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/plots/space_robot_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/Discrete/PPOImage/plots/space_robot_performance.png
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/Discrete/PPOImage/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
--------------------------------------------------------------------------------
/RL_algorithms/Torch/PPO/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/PPO/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/core.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.signal
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.distributions.normal import Normal
8 |
9 |
10 | def combined_shape(length, shape=None):
11 | if shape is None:
12 | return (length,)
13 | return (length, shape) if np.isscalar(shape) else (length, *shape)
14 |
15 | def mlp(sizes, activation, output_activation=nn.Identity):
16 | # converts array of layer shape to neural net
17 | layers = []
18 | for j in range(len(sizes)-1):
19 | act = activation if j < len(sizes) -2 else output_activation
20 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
21 | return nn.Sequential(*layers)
22 |
23 | def count_vars(module):
24 | return sum([np.prod(p.shape) for p in module.parameters()])
25 |
26 |
27 | LOG_STD_MAX = 2
28 | LOG_STD_MIN = -20
29 |
30 | class SquashedGaussianMLPActor(nn.Module):
31 |
32 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit):
33 | super().__init__()
34 | self.net = mlp([obs_dim] + list(hidden_sizes), activation, activation)
35 | self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim)
36 | self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim)
37 | self.act_limit = act_limit
38 |
39 | def forward(self, obs, deterministic=False, with_logprob=True):
40 | net_out = self.net(obs)
41 | mu = self.mu_layer(net_out)
42 | log_std = self.log_std_layer(net_out)
43 | log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
44 | std = torch.exp(log_std)
45 |
46 | # Pre-squash distribution and sample
47 | pi_distribution = Normal(mu, std)
48 | if deterministic:
49 | # Only used for evaluating policy at test time.
50 | pi_action = mu
51 | else:
52 | pi_action = pi_distribution.rsample()
53 |
54 | if with_logprob:
55 | # Compute logprob from Gaussian, and then apply correction for Tanh squashing.
56 | # NOTE: The correction formula is a little bit magic. To get an understanding
57 | # of where it comes from, check out the original SAC paper (arXiv 1801.01290)
58 | # and look in appendix C. This is a more numerically-stable equivalent to Eq 21.
59 | # Try deriving it yourself as a (very difficult) exercise. :)
60 | logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
61 | logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=1)
62 | else:
63 | logp_pi = None
64 |
65 | pi_action = torch.tanh(pi_action)
66 | pi_action = self.act_limit * pi_action
67 |
68 | return pi_action, logp_pi
69 |
70 |
71 | class MLPQFunction(nn.Module):
72 |
73 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
74 | super().__init__()
75 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation)
76 |
77 | def forward(self, obs, act):
78 | q = self.q(torch.cat([obs, act], dim=-1))
79 | return torch.squeeze(q, -1) # Critical to ensure q has right shape.
80 |
81 | class MLPActorCritic(nn.Module):
82 |
83 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256),
84 | activation=nn.ReLU):
85 | super().__init__()
86 |
87 | obs_dim = observation_space.shape[0]
88 | act_dim = action_space.shape[0]
89 | act_limit = action_space.high[0]
90 |
91 | # build policy and value functions
92 | self.pi = SquashedGaussianMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit)
93 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation)
94 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation)
95 |
96 | def act(self, obs, deterministic=False):
97 | with torch.no_grad():
98 | a, _ = self.pi(obs, deterministic, False)
99 | return a.numpy()
100 |
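The "magic" correction in SquashedGaussianMLPActor comes from the change of variables a = tanh(u) applied to the Gaussian sample u, which subtracts the log-determinant of the squashing:

    \log \pi(a \mid s) = \log \mu(u \mid s) - \sum_i \log\big(1 - \tanh^2(u_i)\big)

and the identity 1 - \tanh^2(u) = 4 / (e^{u} + e^{-u})^2 yields the numerically stable form used in the code:

    \log\big(1 - \tanh^2(u)\big) = 2\big(\log 2 - u - \operatorname{softplus}(-2u)\big)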
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847118.Tosins-Air.19214.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847118.Tosins-Air.19214.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847140.Tosins-Air.19431.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847140.Tosins-Air.19431.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847454.Tosins-Air.19535.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847454.Tosins-Air.19535.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847513.Tosins-Air.19931.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847513.Tosins-Air.19931.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847612.Tosins-Air.19979.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847612.Tosins-Air.19979.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847918.Tosins-Air.20089.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658847918.Tosins-Air.20089.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848049.Tosins-Air.20232.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848049.Tosins-Air.20232.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848339.Tosins-Air.20384.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848339.Tosins-Air.20384.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848364.Tosins-Air.20423.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848364.Tosins-Air.20423.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848673.Tosins-Air.20649.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848673.Tosins-Air.20649.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848831.Tosins-Air.20793.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658848831.Tosins-Air.20793.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849191.Tosins-Air.20924.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849191.Tosins-Air.20924.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849218.Tosins-Air.20984.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849218.Tosins-Air.20984.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849777.Tosins-Air.21229.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849777.Tosins-Air.21229.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849785.Tosins-Air.21269.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849785.Tosins-Air.21269.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849885.Tosins-Air.21429.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849885.Tosins-Air.21429.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849941.Tosins-Air.21521.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658849941.Tosins-Air.21521.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658850278.Tosins-Air.21678.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/SAC_ENV/logger/events.out.tfevents.1658850278.Tosins-Air.21678.0
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/SAC_ENV/memory.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import core
3 | import torch
4 |
5 | class ReplayBuffer:
6 | """
7 | A simple FIFO experience replay buffer for SAC agents.
8 | """
9 |
10 | def __init__(self, obs_dim, act_dim, size):
11 | self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
12 | self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
13 | self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
14 | self.rew_buf = np.zeros(size, dtype=np.float32)
15 | self.done_buf = np.zeros(size, dtype=np.float32)
16 | self.ptr, self.size, self.max_size = 0, 0, size
17 |
18 | def store(self, obs, act, rew, next_obs, done):
19 | self.obs_buf[self.ptr] = obs
20 | self.obs2_buf[self.ptr] = next_obs
21 | self.act_buf[self.ptr] = act
22 | self.rew_buf[self.ptr] = rew
23 | self.done_buf[self.ptr] = done
24 | self.ptr = (self.ptr+1) % self.max_size
25 | self.size = min(self.size+1, self.max_size)
26 |
27 | def sample_batch(self, batch_size=32):
28 | idxs = np.random.randint(0, self.size, size=batch_size)
29 | batch = dict(obs=self.obs_buf[idxs],
30 | obs2=self.obs2_buf[idxs],
31 | act=self.act_buf[idxs],
32 | rew=self.rew_buf[idxs],
33 | done=self.done_buf[idxs])
34 | return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
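A minimal usage sketch of this buffer (added for illustration, not part of the repository; the dimensions and dummy transitions are assumptions, not taken from sac.py). It assumes the interpreter is started inside RL_algorithms/Torch/SAC/SAC_ENV so that core.py and memory.py are importable:

import numpy as np
from memory import ReplayBuffer

obs_dim, act_dim = 3, 1                      # illustrative sizes
buf = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=1000)

# store a few dummy transitions: (obs, act, rew, next_obs, done)
for _ in range(10):
    o, o2 = np.random.randn(obs_dim), np.random.randn(obs_dim)
    a = np.random.randn(act_dim)
    buf.store(o, a, 0.0, o2, False)

batch = buf.sample_batch(batch_size=4)         # dict of float32 torch tensors
print(batch["obs"].shape, batch["act"].shape)  # torch.Size([4, 3]) torch.Size([4, 1])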
--------------------------------------------------------------------------------
/RL_algorithms/Torch/SAC/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/SAC/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/Torch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/Torch/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/RL_algorithms/__init__.py
--------------------------------------------------------------------------------
/RL_algorithms/utils/mpi_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mpi4py import MPI
3 | 
4 | def allreduce(*args, **kwargs):
5 |     return MPI.COMM_WORLD.Allreduce(*args, **kwargs)
6 | 
7 | def mpi_op(x, op):
8 |     x, scalar = ([x], True) if np.isscalar(x) else (x, False)
9 |     x = np.asarray(x, dtype=np.float32)
10 |     buff = np.zeros_like(x, dtype=np.float32)
11 |     allreduce(x, buff, op=op)
12 |     return buff[0] if scalar else buff
13 | 
14 | def mpi_sum(x):
15 |     return mpi_op(x, MPI.SUM)
16 | 
17 | def mpi_statistics_scalar(x, with_min_and_max=False):
18 |     """
19 |     Get mean/std and optional min/max of scalar x across MPI processes.
20 |     Args:
21 |         x: An array containing samples of the scalar to produce statistics
22 |             for.
23 |         with_min_and_max (bool): If true, return min and max of x in
24 |             addition to mean and std.
25 |     """
26 |     x = np.array(x, dtype=np.float32)
27 |     global_sum, global_n = mpi_sum([np.sum(x), len(x)])
28 |     mean = global_sum / global_n
29 | 
30 |     global_sum_sq = mpi_sum(np.sum((x - mean)**2))
31 |     std = np.sqrt(global_sum_sq / global_n)  # compute global std
32 | 
33 |     if with_min_and_max:
34 |         global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN)
35 |         global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX)
36 |         return mean, std, global_min, global_max
37 |     return mean, std
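A hedged usage sketch (added here, not part of the repository): every MPI rank passes its local samples and all ranks receive the same global statistics. It assumes mpi4py is installed, the file above is importable as mpi_tools, and the script is launched with an MPI launcher, e.g. mpirun -np 4 python stats_demo.py (the script name is illustrative):

import numpy as np
from mpi4py import MPI
from mpi_tools import mpi_statistics_scalar

rank = MPI.COMM_WORLD.Get_rank()
local_returns = np.random.randn(100) + rank   # each process holds its own samples
mean, std = mpi_statistics_scalar(local_returns)
if rank == 0:
    print("global mean %.3f, global std %.3f" % (mean, std))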
--------------------------------------------------------------------------------
/Simulation.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/Simulation.jpg
--------------------------------------------------------------------------------
/SpaceRobotEnv/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/.DS_Store
--------------------------------------------------------------------------------
/SpaceRobotEnv/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from gym.envs.registration import register
3 |
4 | logger = logging.getLogger(__name__)
5 |
6 |
7 | register(
8 | id="SpaceRobotState-v0",
9 | entry_point="SpaceRobotEnv.envs:SpaceRobotState",
10 | max_episode_steps=512,
11 | )
12 |
13 | register(
14 | id="SpaceRobotImage-v0",
15 | entry_point="SpaceRobotEnv.envs:SpaceRobotImage",
16 | max_episode_steps=512,
17 | )
18 |
19 | register(
20 | id="SpaceRobotDualArm-v0",
21 | entry_point="SpaceRobotEnv.envs:SpaceRobotDualArm",
22 | max_episode_steps=512,
23 | )
24 |
25 | register(
26 | id="SpaceRobotPointCloud-v0",
27 | entry_point="SpaceRobotEnv.envs:SpaceRobotPointCloud",
28 | max_episode_steps=512,
29 | )
30 |
31 | register(
32 | id="SpaceRobotCost-v0",
33 | entry_point="SpaceRobotEnv.envs:SpaceRobotCost",
34 | max_episode_steps=512,
35 | )
36 |
37 | register(
38 | id="SpaceRobotReorientation-v0",
39 | entry_point="SpaceRobotEnv.envs:SpaceRobotReorientation",
40 | max_episode_steps=512,
41 | )
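Once this module is imported, the IDs registered above can be passed to gym.make; a minimal sketch (added for illustration, mirroring the pattern used in test_env.py at the end of this dump):

import gym
import SpaceRobotEnv   # importing the package runs the register() calls above

env = gym.make("SpaceRobotState-v0")   # any of the registered IDs works here
obs = env.reset()                      # dict with observation / achieved_goal / desired_goal
print(env.action_space.shape, obs["observation"].shape)
env.close()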
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/.DS_Store
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The dm_control Authors.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | """Functions to manage the common assets for domains."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | from dm_control.utils import resources
24 |
25 | _SUITE_DIR = os.path.dirname(os.path.dirname(__file__))
26 | _FILENAMES = [
27 | "common/materials.xml",
28 | "common/skybox.xml",
29 | "common/visual.xml",
30 | ]
31 |
32 | ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename))
33 | for filename in _FILENAMES}
34 |
35 |
36 | def read_model(model_filename):
37 | """Reads a model XML file and returns its contents as a string."""
38 | return resources.GetResource(os.path.join(_SUITE_DIR, model_filename))
39 |
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/materials.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/skybox.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/common/visual.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/arm_v3.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/arm_v31.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/asset.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/sensor.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_cost.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_dualarm.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_image.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/spacerobot_state.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/R10.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/R10.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/cube.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/cube.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_base.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_base.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_forearm.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_forearm.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_shoulder.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_shoulder.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_upperarm.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_upperarm.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist1.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist1.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist2.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist2.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/stls/v_wrist3.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/assets/spacerobot/stls/v_wrist3.stl
--------------------------------------------------------------------------------
/SpaceRobotEnv/assets/spacerobot/subgoal.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SpaceRobotEnv/envs/SpaceRobotReorientation.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import copy
4 | import numpy as np
5 |
6 | import gym
7 | from gym import spaces
8 | from gym.utils import seeding
9 |
10 | from gym.envs.robotics import utils
11 | from gym.envs.robotics import rotations
12 |
13 | import mujoco_py
14 |
15 |
16 | PATH = os.getcwd()
17 | MODEL_XML_PATH = os.path.join(PATH,'SpaceRobotEnv','assets', 'spacerobot', 'spacerobot_dualarm.xml')
18 | DEFAULT_SIZE = 500
19 |
20 |
21 | class RobotEnv(gym.GoalEnv):
22 | def __init__(self, model_path, initial_qpos, n_substeps):
23 |
24 | # load model and simulator
25 | self.model = mujoco_py.load_model_from_path(model_path)
26 | self.sim = mujoco_py.MjSim(self.model, nsubsteps=n_substeps)
27 |
28 | # render setting
29 | self.viewer = None
30 | self._viewers = {}
31 | self.metadata = {
32 | "render.modes": ["human", "rgb_array"],
33 | "video.frames_per_second": int(np.round(1.0 / self.dt)),
34 | }
35 |
36 | # seed
37 | self.seed()
38 |
39 | # initialization
40 | self._env_setup(initial_qpos=initial_qpos)
41 | self.initial_state = copy.deepcopy(self.sim.get_state())
42 | self.goal = self._sample_goal()
43 |
44 | # set action_space and observation_space
45 | obs = self._get_obs()
46 | self._set_action_space()
47 | self.observation_space = spaces.Dict(
48 | dict(
49 | desired_goal=spaces.Box(
50 | -np.inf, np.inf, shape=obs["desired_goal"].shape, dtype="float32"
51 | ),
52 | achieved_goal=spaces.Box(
53 | -np.inf, np.inf, shape=obs["achieved_goal"].shape, dtype="float32"
54 | ),
55 | observation=spaces.Box(
56 | -np.inf, np.inf, shape=obs["observation"].shape, dtype="float32"
57 | ),
58 | )
59 | )
60 |
61 | def _set_action_space(self):
62 | bounds = self.model.actuator_ctrlrange.copy()
63 | low, high = bounds.T
64 | self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)
65 | return self.action_space
66 |
67 | @property
68 | def dt(self):
69 | return self.sim.model.opt.timestep * self.sim.nsubsteps
70 |
71 | def _detecte_collision(self):
72 | self.collision = self.sim.data.ncon
73 | return self.collision
74 |
75 | def _sensor_torque(self):
76 | self.sensor_data = self.sim.data.sensordata
77 | return self.sensor_data
78 |
79 | def seed(self, seed=None):
80 | self.np_random, seed = seeding.np_random(seed)
81 | return [seed]
82 |
83 | def step(self, action):
84 | action = np.clip(action, self.action_space.low, self.action_space.high)
85 | self._set_action(action) # do one step simulation here
86 | self._step_callback()
87 | obs = self._get_obs()
88 | done = False
89 | info = {
90 | "is_success": self._is_success(obs["achieved_goal"], self.goal)
91 | }
92 | reward = self.compute_reward(obs["achieved_goal"], self.goal, info)
93 | # reward = self.compute_reward(obs['achieved_goal'], self.goal, info) + self.compute_reward(obs['achieved_goal1'], self.goal1, info)
94 | return obs, reward, done, info
95 |
96 | def reset(self):
97 | """Attempt to reset the simulator. Since we randomize initial conditions, it
98 | is possible to get into a state with numerical issues (e.g. due to penetration or
99 | Gimbal lock) or we may not achieve an initial condition (e.g. an object is within the hand).
100 | In this case, we just keep randomizing until we eventually achieve a valid initial
101 | configuration.
102 | """
103 | super(RobotEnv, self).reset()
104 | did_reset_sim = False
105 | while not did_reset_sim:
106 | did_reset_sim = self._reset_sim()
107 |
108 | self.goal = self._sample_goal()
109 | obs = self._get_obs()
110 |
111 | # TODO: set the position of cube
112 | # body_id = self.sim.model.geom_name2id("cube")
113 | # self.sim.model.geom_pos[body_id] = np.array([0, 0, 6])
114 | return obs
115 |
116 | def close(self):
117 | if self.viewer is not None:
118 | # self.viewer.finish()
119 | self.viewer = None
120 | self._viewers = {}
121 |
122 | def render(self, mode="human", width=DEFAULT_SIZE, height=DEFAULT_SIZE):
123 | # self._render_callback()
124 | if mode == "rgb_array":
125 | self._get_viewer(mode).render(width, height)
126 | # window size used for old mujoco-py:
127 | datargb, datadepth = self._get_viewer(mode).read_pixels(
128 | width, height, depth=True
129 | )
130 | # original image is upside-down, so flip it
131 | return datargb[::-1, :, :], datadepth[::-1]
132 | elif mode == "human":
133 | self._get_viewer(mode).render()
134 |
135 | def _get_viewer(self, mode):
136 | self.viewer = self._viewers.get(mode)
137 |
138 | if self.viewer is None:
139 | if mode == "human":
140 | self.viewer = mujoco_py.MjViewer(self.sim)
141 | self._viewer_setup()
142 |
143 | elif mode == "rgb_array":
144 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, device_id=-1)
145 | self._viewer_setup()
146 | # self.viewer.cam.trackbodyid = 0
147 | # latest modification
148 | cam_pos = np.array([0.5, 0, 5, 0.3, -30, 0])
149 | for i in range(3):
150 | self.viewer.cam.lookat[i] = cam_pos[i]
151 | self.viewer.cam.distance = cam_pos[3]
152 | self.viewer.cam.elevation = cam_pos[4]
153 | self.viewer.cam.azimuth = cam_pos[5]
154 | # self.viewer.cam.trackbodyid = -1
155 |
156 | self._viewers[mode] = self.viewer
157 | return self.viewer
158 |
159 | def _reset_sim(self):
160 | """Resets a simulation and indicates whether or not it is successful.
161 | If a reset is unsuccessful (e.g. if a randomized state caused an error in the
162 | simulation), this method should indicate such a failure by returning False.
164 | In such a case, this method will be called again to attempt the reset.
164 | """
165 | self.sim.set_state(self.initial_state)
166 | self.sim.forward()
167 | return True
168 |
169 | def _get_obs(self):
170 | """Returns the observation."""
171 | raise NotImplementedError()
172 |
173 | def _set_action(self, action):
174 | """Applies the given action to the simulation."""
175 | raise NotImplementedError()
176 |
177 | def _is_success(self, achieved_goal, desired_goal):
178 | """Indicates whether or not the achieved goal successfully achieved the desired goal."""
179 | raise NotImplementedError()
180 |
181 | def _sample_goal(self):
182 | """Samples a new goal and returns it."""
183 | raise NotImplementedError()
184 |
185 | def _env_setup(self, initial_qpos):
186 | """Initial configuration of the environment. Can be used to configure initial state
187 | and extract information from the simulation.
188 | """
189 | pass
190 |
191 | def _viewer_setup(self):
192 | """Initial configuration of the viewer. Can be used to set the camera position,
193 | for example.
194 | """
195 | pass
196 |
197 | def _render_callback(self):
198 | """A custom callback【自定义回调】 that is called before rendering. Can be used
199 | to implement custom visualizations.【可实现自定义可视化】
200 | """
201 | pass
202 |
203 | def _step_callback(self):
204 | """A custom callback that is called after stepping the simulation. Can be used
205 | to enforce additional constraints on the simulation state.
206 | """
207 | pass
208 |
209 |
210 | def goal_distance(goal_a, goal_b):
211 | assert goal_a.shape == goal_b.shape
212 | return np.linalg.norm(goal_a - goal_b, axis=-1)
213 |
214 |
215 | class SpacerobotEnv(RobotEnv):
216 | """Superclass for all SpaceRobot environments."""
217 |
218 | def __init__(
219 | self,
220 | model_path,
221 | n_substeps,
222 | distance_threshold,
223 | initial_qpos,
224 | reward_type,
225 | pro_type,
226 | c_coeff,
227 | ):
228 | """Initializes a new Fetch environment.
229 | Args:
230 | model_path (string): path to the environment's XML file
231 | n_substeps (int): number of substeps the simulation runs on every call to step
232 | distance_threshold (float): the threshold after which a goal is considered achieved
233 | initial_qpos (dict): a dictionary of joint names and values that define the initial configuration
234 | reward_type ('sparse' or 'dense'): the reward type, i.e. sparse or dense
235 | pro_type ('MDP' or 'CMDP'): the problem setting, i.e. whether a cost signal is included (CMDP) or not (MDP)
236 | c_coeff: cost coefficient
237 | """
238 | self.n_substeps = n_substeps
239 | # self.target_range = target_range
240 | self.distance_threshold = distance_threshold
241 | self.reward_type = reward_type
242 | self.c_coeff = c_coeff
243 | self.pro_type = pro_type
244 |
245 | super(SpacerobotEnv, self).__init__(
246 | model_path=model_path,
247 | n_substeps=n_substeps,
248 | initial_qpos=initial_qpos,
249 | )
250 |
251 | def compute_reward(self, achieved_goal, desired_goal, info):
252 | # Compute distance between goal and the achieved goal.
253 | d = goal_distance(achieved_goal, desired_goal)
254 |
255 | reward = {
256 | "sparse": -(d > self.distance_threshold).astype(np.float32),
257 | "dense": -(0.001 * d ** 2 + np.log10(d ** 2 + 1e-6)),
258 | }
259 |
260 | return reward[self.reward_type]
261 |
262 |
263 | def _set_action(self, action):
264 | """
265 | output action (velocity)
266 | :param action: angle velocity of joints
267 | :return: angle velocity of joints
268 | """
269 | assert action.shape == (12,)
270 | self.sim.data.ctrl[:] = action * 0.5
271 | for _ in range(self.n_substeps):
272 | self.sim.step()
273 |
274 | def _get_obs(self):
275 | # positions
276 | # grip_pos = self.sim.data.get_body_xpos("tip_frame")
277 | # grip_pos1 = self.sim.data.get_body_xpos("tip_frame1")
278 | """
279 | # get the rotation angle of the target
280 | grip_rot = self.sim.data.get_body_xquat('tip_frame')
281 | grip_rot = rotations.quat2euler(grip_rot)
282 | grip_rot1 = self.sim.data.get_body_xquat('tip_frame1')
283 | grip_rot1 = rotations.quat2euler(grip_rot1)
284 | """
285 | # dt = self.sim.nsubsteps * self.sim.model.opt.timestep
286 | # grip_velp = self.sim.data.get_body_xvelp("tip_frame") * dt
287 | # grip_velp1 = self.sim.data.get_body_xvelp("tip_frame1") * dt
288 | """
289 | achieved_goal = np.concatenate([grip_pos.copy(),grip_rot.copy()])
290 | achieved_goal1 = np.concatenate([grip_pos1.copy(),grip_rot1.copy()])
291 | """
292 | post_base_att = self.sim.data.get_body_xquat('chasersat')
293 |
294 | obs = np.concatenate(
295 | [
296 | self.sim.data.qpos[:].copy(),
297 | self.sim.data.qvel[:].copy(),
298 | self.goal.copy(),
299 | ]
300 | )
301 |
302 | return {
303 | "observation": obs.copy(),
304 | "achieved_goal": post_base_att.copy(),
305 | "desired_goal": self.goal.copy(),
306 | }
307 |
308 | def _viewer_setup(self):
309 | # body_id = self.sim.model.body_name2id('forearm_link')
310 | body_id = self.sim.model.body_name2id("wrist_3_link")
311 | lookat = self.sim.data.body_xpos[body_id]
312 | for idx, value in enumerate(lookat):
313 | self.viewer.cam.lookat[idx] = value
314 | self.viewer.cam.distance = 2.5
315 | self.viewer.cam.azimuth = 132.0
316 | self.viewer.cam.elevation = -14.0
317 |
318 | def _reset_sim(self):
319 | self.sim.set_state(self.initial_state)
320 | self.sim.forward()
321 | return True
322 |
323 | def _sample_goal(self):
324 | goal = self.initial_base_att
325 |
326 | return goal.copy()
327 |
328 | def _is_success(self, achieved_goal, desired_goal):
329 | d = goal_distance(achieved_goal, desired_goal)
330 | return (d < self.distance_threshold).astype(np.float32)
331 | # return d
332 |
333 | def _env_setup(self, initial_qpos):
334 |
335 | # set qpos of chasersat
336 | chasersat_pos = [0.,0.,4.] # init pos of base
337 | chasersat_ori = np.random.rand(3) * 0.5 # initial base attitude: Euler angles sampled uniformly in [0, 0.5) rad
338 | chasersat_quat = rotations.euler2quat(chasersat_ori)
339 | initial_qpos['chasersat:joint'] = list(chasersat_pos) + list(chasersat_quat)
340 | # print('initial qpos of base is {}'.format(initial_qpos['chasersat:joint']))
341 |
342 | for name, value in initial_qpos.items():
343 | self.sim.data.set_joint_qpos(name, value)
344 | utils.reset_mocap_welds(self.sim)
345 |
346 | # Extract information for sampling goals.
347 | self.initial_gripper_xpos = self.sim.data.get_body_xpos("tip_frame").copy()
348 | self.initial_gripper_xpos1 = self.sim.data.get_body_xpos("tip_frame1").copy()
349 |
350 | # get the initial base attitude
351 | self.initial_base_att = self.sim.data.get_body_xquat("chasersat").copy()
352 |
353 | # get the initial base position
354 | self.initial_base_pos = self.sim.data.get_body_xpos("chasersat").copy()
355 | # print('initial base att is {}'.format(self.initial_base_att))
356 | # print('initial base pos is {}'.format(self.initial_base_pos))
357 | # print('initial pos is {}'.format(self.sim.data.qpos[:]))
358 |
359 | def render(self, mode="human", width=500, height=500):
360 | return super(SpacerobotEnv, self).render(mode, width, height)
361 |
362 |
363 | class SpaceRobotReorientation(SpacerobotEnv, gym.utils.EzPickle):
364 | def __init__(self, reward_type="sparse", pro_type="MDP"):
365 | initial_qpos = {
366 | "arm:shoulder_pan_joint": 0.0,
367 | "arm:shoulder_lift_joint": 0.0,
368 | "arm:elbow_joint": 0.0,
369 | "arm:wrist_1_joint": 0.0,
370 | "arm:wrist_2_joint": 0.0,
371 | "arm:wrist_3_joint": 0.0,
372 | "arm:shoulder_pan_joint1": 0.0,
373 | "arm:shoulder_lift_joint1": 0.0,
374 | "arm:elbow_joint1": 0.0,
375 | "arm:wrist_1_joint1": 0.0,
376 | "arm:wrist_2_joint1": 0.0,
377 | "arm:wrist_3_joint1": 0.0,
378 | }
379 | SpacerobotEnv.__init__(
380 | self,
381 | MODEL_XML_PATH,
382 | n_substeps=20,
383 | distance_threshold=0.05,
384 | initial_qpos=initial_qpos,
385 | reward_type=reward_type,
386 | pro_type=pro_type,
387 | c_coeff=0.1,
388 | )
389 | gym.utils.EzPickle.__init__(self)
390 |
--------------------------------------------------------------------------------
/SpaceRobotEnv/envs/SpaceRobotState.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import copy
4 | import numpy as np
5 |
6 | import gym
7 | from gym import spaces
8 | from gym.utils import seeding
9 |
10 | from gym.envs.robotics import utils
11 | from gym.envs.robotics import rotations
12 |
13 | import mujoco_py
14 |
15 | PATH = os.getcwd()
16 |
17 | MODEL_XML_PATH = os.path.join(
18 | PATH, "SpaceRobotEnv", "assets", "spacerobot", "spacerobot_state.xml"
19 | )
20 | DEFAULT_SIZE = 500
21 |
22 |
23 | class RobotEnv(gym.GoalEnv):
24 | def __init__(self, model_path, initial_qpos, n_substeps):
25 |
26 | # load model and simulator
27 | self.model = mujoco_py.load_model_from_path(model_path)
28 | self.sim = mujoco_py.MjSim(self.model, nsubsteps=n_substeps)
29 |
30 | # render setting
31 | self.viewer = None
32 | self._viewers = {}
33 | self.metadata = {
34 | "render.modes": ["human", "rgb_array"],
35 | "video.frames_per_second": int(np.round(1.0 / self.dt)),
36 | }
37 |
38 | # seed
39 | self.seed()
40 |
41 | # initialization
42 | self._env_setup(initial_qpos=initial_qpos)
43 | self.initial_state = copy.deepcopy(self.sim.get_state())
44 | self.goal = self._sample_goal()
45 |
46 | # set action_space and observation_space
47 | obs = self._get_obs()
48 | self._set_action_space()
49 | self.observation_space = spaces.Dict(
50 | dict(
51 | desired_goal=spaces.Box(
52 | -np.inf, np.inf, shape=obs["desired_goal"].shape, dtype="float32"
53 | ),
54 | achieved_goal=spaces.Box(
55 | -np.inf, np.inf, shape=obs["achieved_goal"].shape, dtype="float32"
56 | ),
57 | observation=spaces.Box(
58 | -np.inf, np.inf, shape=obs["observation"].shape, dtype="float32"
59 | ),
60 | )
61 | )
62 |
63 | def _set_action_space(self):
64 | bounds = self.model.actuator_ctrlrange.copy()
65 | low, high = bounds.T
66 | self.action_space = spaces.Box( low = low, high = high, dtype = np.float32)
67 | return self.action_space
68 |
69 | @property
70 | def dt(self):
71 | return self.sim.model.opt.timestep * self.sim.nsubsteps
72 |
73 | def _detecte_collision(self):
74 | self.collision = self.sim.data.ncon
75 | return self.collision
76 |
77 | def _sensor_torque(self):
78 | self.sensor_data = self.sim.data.sensordata
79 | return self.sensor_data
80 |
81 | def seed(self, seed=None):
82 | self.np_random, seed = seeding.np_random(seed)
83 | return [seed]
84 |
85 | def step(self, action):
86 | old_action = self.sim.data.ctrl.copy() * (1 / 0.5)  # undo the 0.5 scaling applied in _set_action to recover the previous action
87 | action = np.clip(action, self.action_space.low, self.action_space.high)
88 | self._set_action(action)
89 | self._step_callback()
90 | obs = self._get_obs()
91 | done = False
92 | info = {
93 | "is_success": self._is_success(obs["achieved_goal"], self.goal),
94 | "act": action,
95 | "old_act": old_action,
96 | }
97 | reward = self.compute_reward(
98 | obs["achieved_goal"], self.goal, action, old_action, info
99 | )
100 | return obs, reward, done, info
101 |
102 | def reset(self):
103 | """Attempt to reset the simulator. Since we randomize initial conditions, it
104 | is possible to get into a state with numerical issues (e.g. due to penetration or
105 | Gimbal lock) or we may not achieve an initial condition (e.g. an object is within the hand).
106 | In this case, we just keep randomizing until we eventually achieve a valid initial
107 | configuration.
108 | """
109 | super(RobotEnv, self).reset()
110 | did_reset_sim = False
111 | while not did_reset_sim:
112 | did_reset_sim = self._reset_sim()
113 |
114 | self.goal = self._sample_goal()
115 | obs = self._get_obs()
116 |
117 | return obs
118 |
119 | def close(self):
120 | if self.viewer is not None:
121 | # self.viewer.finish()
122 | self.viewer = None
123 | self._viewers = {}
124 |
125 | def render(self, mode="human", width=DEFAULT_SIZE, height=DEFAULT_SIZE):
126 | # self._render_callback()
127 | if mode == "rgb_array":
128 | self._get_viewer(mode).render(width, height)
129 | # window size used for old mujoco-py:
130 | data = self._get_viewer(mode).read_pixels(width, height, depth=False)
131 | # original image is upside-down, so flip it
132 | return data[::-1, :, :]
133 | elif mode == "human":
134 | self._get_viewer(mode).render()
135 |
136 | def _get_viewer(self, mode):
137 | self.viewer = self._viewers.get(mode)
138 | if self.viewer is None:
139 | if mode == "human":
140 | self.viewer = mujoco_py.MjViewer(self.sim)
141 | elif mode == "rgb_array":
142 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, device_id=-1)
143 | self._viewer_setup()
144 | self._viewers[mode] = self.viewer
145 | return self.viewer
146 |
147 | def _reset_sim(self):
148 | """Resets a simulation and indicates whether or not it is successful.
149 | If a reset is unsuccessful (e.g. if a randomized state caused an error in the
150 | simulation), this method should indicate such a failure by returning False.
151 | In such a case, this method will be called again to attempt the reset.
152 | """
153 | self.sim.set_state(self.initial_state)
154 | self.sim.forward()
155 | return True
156 |
157 | def _get_obs(self):
158 | """Returns the observation."""
159 | raise NotImplementedError()
160 |
161 | def _set_action(self, action):
162 | """Applies the given action to the simulation."""
163 | raise NotImplementedError()
164 |
165 | def _is_success(self, achieved_goal, desired_goal):
166 | """Indicates whether or not the achieved goal successfully achieved the desired goal."""
167 | raise NotImplementedError()
168 |
169 | def _sample_goal(self):
170 | """Samples a new goal and returns it."""
171 | raise NotImplementedError()
172 |
173 | def _env_setup(self, initial_qpos):
174 | """Initial configuration of the environment. Can be used to configure initial state
175 | and extract information from the simulation.
176 | """
177 | pass
178 |
179 | def _viewer_setup(self):
180 | """Initial configuration of the viewer. Can be used to set the camera position,
181 | for example.
182 | """
183 | pass
184 |
185 | def _render_callback(self):
186 | """A custom callback【自定义回调】 that is called before rendering. Can be used
187 | to implement custom visualizations.【可实现自定义可视化】
188 | """
189 | pass
190 |
191 | def _step_callback(self):
192 | """A custom callback that is called after stepping the simulation. Can be used
193 | to enforce additional constraints on the simulation state.
194 | """
195 | pass
196 |
197 |
198 | def goal_distance(goal_a, goal_b):
199 | assert goal_a.shape == goal_b.shape
200 | return np.linalg.norm(goal_a - goal_b, axis=-1)
201 |
202 |
203 | class SpacerobotEnv(RobotEnv):
204 | """Superclass for all SpaceRobot environments."""
205 |
206 | def __init__(
207 | self,
208 | model_path,
209 | n_substeps,
210 | distance_threshold,
211 | initial_qpos,
212 | reward_type,
213 | ):
214 | """Initializes a new Fetch environment.
215 | Args:
216 | model_path (string): path to the environment's XML file
217 | n_substeps (int): number of substeps the simulation runs on every call to step
218 | distance_threshold (float): the threshold after which a goal is considered achieved
219 | initial_qpos (dict): a dictionary of joint names and values that define the initial configuration
220 | reward_type ('sparse' or 'dense'): the reward type, i.e. sparse or dense
221 | """
222 | self.n_substeps = n_substeps
223 | self.distance_threshold = distance_threshold
224 | self.reward_type = reward_type
225 |
226 | super(SpacerobotEnv, self).__init__(
227 | model_path=model_path,
228 | n_substeps=n_substeps,
229 | initial_qpos=initial_qpos,
230 | )
231 |
232 | def compute_reward(self, achieved_goal, desired_goal, action, old_action, info):
233 |
234 | # Compute distance between goal and the achieved goal.
235 | d = goal_distance(achieved_goal, desired_goal)
236 | if self.reward_type == "sparse":
237 | return -(d > self.distance_threshold).astype(np.float32)
238 | elif self.reward_type == "distance":
239 | return d
240 | else:
241 | # dense reward
242 | return -(
243 | 0.001 * d ** 2
244 | + np.log10(d ** 2 + 1e-6)
245 | + 0.01 * np.linalg.norm(action - old_action) ** 2
246 | )
247 |
248 | def _set_action(self, action):
249 | """
250 | :param action: angular velocity command for the 6 arm joints, shape (6,)
251 | :return: None; the simulation is stepped n_substeps times
252 | """
253 | assert action.shape == (6,)
254 | self.sim.data.ctrl[:] = action * 0.5
255 | for _ in range(self.n_substeps):
256 | self.sim.step()
257 |
258 | def _get_obs(self):
259 | # positions
260 | grip_pos = self.sim.data.get_body_xpos("tip_frame")
261 | grip_velp = self.sim.data.get_body_xvelp("tip_frame") * self.dt
262 | robot_qpos, robot_qvel = utils.robot_get_obs(self.sim)
263 |
264 | gripper_state = robot_qpos[-1:]
265 | gripper_vel = (
266 | robot_qvel[-1:] * self.dt
267 | ) # change to a scalar if the gripper is made symmetric
268 |
269 | achieved_goal = grip_pos.copy()
270 |
271 | obs = np.concatenate(
272 | [
273 | self.sim.data.qpos[7:13].copy(),
274 | self.sim.data.qvel[6:12].copy(),
275 | grip_pos,
276 | grip_velp,
277 | self.goal.copy(),
278 | ]
279 | )
280 |
281 | return {
282 | "observation": obs.copy(),
283 | "achieved_goal": achieved_goal.copy(),
284 | "desired_goal": self.goal.copy(),
285 | }
286 |
287 | def _viewer_setup(self):
288 | body_id = self.sim.model.body_name2id("tip_frame")
289 | lookat = self.sim.data.body_xpos[body_id]
290 | for idx, value in enumerate(lookat):
291 | self.viewer.cam.lookat[idx] = value
292 | self.viewer.cam.distance = 2.5
293 | self.viewer.cam.azimuth = 132.0
294 | self.viewer.cam.elevation = -14.0
295 |
296 | def _reset_sim(self):
297 | self.sim.set_state(self.initial_state)
298 | self.sim.forward()
299 | return True
300 |
301 | def _sample_goal(self):
302 |
303 | goal = self.initial_gripper_xpos[:3].copy()
304 | d = goal_distance(self.sim.data.get_body_xpos("tip_frame").copy(), goal)
305 |
306 | goal[0] = self.initial_gripper_xpos[0] + np.random.uniform(-0.4, 0)
307 | goal[1] = self.initial_gripper_xpos[1] + np.random.uniform(-0.3, 0.3)
308 | goal[2] = self.initial_gripper_xpos[2] + np.random.uniform(0, 0.3)
309 |
310 | d = goal_distance(self.sim.data.get_body_xpos("tip_frame").copy(), goal)
311 |
312 | site_id = self.sim.model.site_name2id("target0")
313 | self.sim.model.site_pos[site_id] = goal
314 | self.sim.forward()
315 |
316 | return goal.copy()
317 |
318 | def _is_success(self, achieved_goal, desired_goal):
319 | d = goal_distance(achieved_goal, desired_goal)
320 | return (d < self.distance_threshold).astype(np.float32)
321 | # return d
322 |
323 | def _env_setup(self, initial_qpos):
324 | for name, value in initial_qpos.items():
325 | self.sim.data.set_joint_qpos(name, value)
326 | utils.reset_mocap_welds(self.sim)
327 |
328 | # Extract information for sampling goals.
329 | self.initial_gripper_xpos = self.sim.data.get_body_xpos("tip_frame").copy()
330 |
331 | def render(self, mode="human", width=500, height=500):
332 | return super(SpacerobotEnv, self).render(mode, width, height)
333 |
334 |
335 | class SpaceRobotState(SpacerobotEnv, gym.utils.EzPickle):
336 | def __init__(self, reward_type="nosparse"):
337 | initial_qpos = {
338 | "arm:shoulder_pan_joint": 0.0,
339 | "arm:shoulder_lift_joint": 0.0,
340 | "arm:elbow_joint": 0.0,
341 | "arm:wrist_1_joint": 0.0,
342 | "arm:wrist_2_joint": 0.0,
343 | "arm:wrist_3_joint": 0.0,
344 | }
345 | SpacerobotEnv.__init__(
346 | self,
347 | MODEL_XML_PATH,
348 | n_substeps=20,
349 | distance_threshold=0.05,
350 | initial_qpos=initial_qpos,
351 | reward_type=reward_type,
352 | )
353 | gym.utils.EzPickle.__init__(self)
354 |
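For reference (an added note, not part of SpaceRobotState.py): the dense branch of compute_reward above corresponds to

    r(d, a, a_{\text{old}}) = -\left( 0.001\, d^{2} + \log_{10}\!\left(d^{2} + 10^{-6}\right) + 0.01\, \lVert a - a_{\text{old}} \rVert_2^{2} \right),
    \qquad d = \lVert g_{\text{achieved}} - g_{\text{desired}} \rVert_2,

while the sparse branch returns -1 whenever d exceeds distance_threshold and 0 otherwise, and the "distance" option returns d itself.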
--------------------------------------------------------------------------------
/SpaceRobotEnv/envs/__init__.py:
--------------------------------------------------------------------------------
1 | import imp
2 | from .SpaceRobotDualArm import SpaceRobotDualArm
3 | from .SpaceRobotImage import SpaceRobotImage
4 | from .SpaceRobotState import SpaceRobotState
5 | from .SpaceRobotCost import SpaceRobotCost
6 | from .SpaceRobotReorientation import SpaceRobotReorientation
7 |
--------------------------------------------------------------------------------
/SpaceRobotEnv/images/Simulation.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/Simulation.jpg
--------------------------------------------------------------------------------
/SpaceRobotEnv/images/ccc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/ccc.png
--------------------------------------------------------------------------------
/SpaceRobotEnv/images/iros.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/iros.gif
--------------------------------------------------------------------------------
/SpaceRobotEnv/images/ral.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/ral.gif
--------------------------------------------------------------------------------
/SpaceRobotEnv/images/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tsinghua-Space-Robot-Learning-Group/SpaceRobotEnv/155989c2ae94a3afeedf9b8601b6125d83b9c097/SpaceRobotEnv/images/robot.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gym>=0.15.4
2 | mujoco-py>=1.15.1.0
3 | torch>=1.12.0
4 | torchvision>=0.13.0
5 | torchaudio>=0.12.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from os.path import dirname, realpath
2 | from setuptools import find_packages, setup
3 |
4 | def read_requirements_file(filename):
5 | req_file_path = '%s/%s' % (dirname(realpath(__file__)), filename)
6 | with open(req_file_path) as f:
7 | return [line.strip() for line in f]
8 |
9 | setup(
10 | name="SpaceRobotEnv",
11 | version="0.0.1",
12 | install_requires=read_requirements_file('requirements.txt'),
13 | packages=find_packages(exclude=("image",)),
14 | )
--------------------------------------------------------------------------------
/test_env.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | import SpaceRobotEnv
4 | import numpy as np
5 |
6 | env = gym.make("SpaceRobotReorientation-v0")
7 |
8 | dim_u = env.action_space.shape[0]
9 | print(dim_u)
10 | dim_o = env.observation_space["observation"].shape[0]
11 | print(dim_o)
12 |
13 |
14 | observation = env.reset()
15 | max_action = env.action_space.high
16 | print("max_action:", max_action)
17 | print("min_action", env.action_space.low)
18 | for e_step in range(20):
19 | observation = env.reset()
20 | for i_step in range(50):
21 | env.render()
22 | action = np.random.uniform(low=-1.0, high=1.0, size=(dim_u,))
23 | observation, reward, done, info = env.step(max_action * action)
24 |
25 | env.close()
26 |
--------------------------------------------------------------------------------