├── .gitignore
├── .style.yapf
├── .vscode
    ├── launch.json
    └── settings.json
├── README.md
├── __init__.py
├── dockerfiles
    ├── devel-training-gpu.Dockerfile
    └── devel-training.Dockerfile
├── envs
    ├── __init__.py
    ├── arl
    │   ├── __init__.py
    │   └── arl_env.py
    └── dVRK
    │   ├── PSM_cartesian_ddpg_env.py
    │   ├── PSM_cartesian_env.py
    │   ├── PSM_cartesian_herddpg_env.py
    │   └── __init__.py
├── install
    ├── stable_baseline_fix
    │   └── ddpg.py
    └── training-pip-requirements.txt
├── media
    ├── AMBF_Robots.png
    └── architecture-overview.png
├── scripts
    └── dVRK
    │   ├── PSM_cartesian_ddpg_algorithm.py
    │   ├── PSM_cartesian_herddpg_algorithm.py
    │   ├── bagging.bash
    │   ├── launch_ambf.bash
    │   ├── main.py
    │   ├── print_output.bash
    │   ├── psmIK_service.bash
    │   ├── roscore.bash
    │   ├── tensorboard_launch.bash
    │   └── training.bash
├── setup.py
└── utilities
    └── Create Expert Dataset
        ├── examples
            └── dVRK
            │   └── move_ambf_psm_from_rosbagdata.py
        └── generate_expert_dataset.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | ### C ###
  3 | # Prerequisites
  4 | *.d
  5 | 
  6 | # Object files
  7 | *.o
  8 | *.ko
  9 | *.obj
 10 | *.elf
 11 | 
 12 | # Linker output
 13 | *.ilk
 14 | *.map
 15 | *.exp
 16 | 
 17 | # Precompiled Headers
 18 | *.gch
 19 | *.pch
 20 | 
 21 | # Libraries
 22 | *.lib
 23 | *.a
 24 | *.la
 25 | *.lo
 26 | 
 27 | # Shared objects (inc. Windows DLLs)
 28 | *.dll
 29 | *.so
 30 | *.so.*
 31 | *.dylib
 32 | 
 33 | # Executables
 34 | *.exe
 35 | *.out
 36 | *.app
 37 | *.i*86
 38 | *.x86_64
 39 | *.hex
 40 | 
 41 | # Debug files
 42 | *.dSYM/
 43 | *.su
 44 | *.idb
 45 | *.pdb
 46 | 
 47 | # Kernel Module Compile Results
 48 | *.mod*
 49 | *.cmd
 50 | .tmp_versions/
 51 | modules.order
 52 | Module.symvers
 53 | Mkfile.old
 54 | dkms.conf
 55 | 
 56 | ### C++ ###
 57 | # Prerequisites
 58 | 
 59 | # Compiled Object files
 60 | *.slo
 61 | 
 62 | # Precompiled Headers
 63 | 
 64 | # Compiled Dynamic libraries
 65 | 
 66 | # Fortran module files
 67 | *.mod
 68 | *.smod
 69 | 
 70 | # Compiled Static libraries
 71 | *.lai
 72 | 
 73 | # Executables
 74 | 
 75 | ### Python ###
 76 | # Byte-compiled / optimized / DLL files
 77 | __pycache__/
 78 | *.py[cod]
 79 | *$py.class
 80 | 
 81 | # C extensions
 82 | 
 83 | # Distribution / packaging
 84 | .Python
 85 | build/
 86 | develop-eggs/
 87 | dist/
 88 | downloads/
 89 | eggs/
 90 | .eggs/
 91 | lib/
 92 | lib64/
 93 | parts/
 94 | sdist/
 95 | var/
 96 | wheels/
 97 | pip-wheel-metadata/
 98 | share/python-wheels/
 99 | *.egg-info/
100 | .installed.cfg
101 | *.egg
102 | MANIFEST
103 | 
104 | # PyInstaller
105 | #  Usually these files are written by a python script from a template
106 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
107 | *.manifest
108 | *.spec
109 | 
110 | # Installer logs
111 | pip-log.txt
112 | pip-delete-this-directory.txt
113 | 
114 | # Unit test / coverage reports
115 | htmlcov/
116 | .tox/
117 | .nox/
118 | .coverage
119 | .coverage.*
120 | .cache
121 | nosetests.xml
122 | coverage.xml
123 | *.cover
124 | *.py,cover
125 | .hypothesis/
126 | .pytest_cache/
127 | pytestdebug.log
128 | 
129 | # Translations
130 | *.mo
131 | *.pot
132 | 
133 | # Django stuff:
134 | *.log
135 | local_settings.py
136 | db.sqlite3
137 | db.sqlite3-journal
138 | 
139 | # Flask stuff:
140 | instance/
141 | .webassets-cache
142 | 
143 | # Scrapy stuff:
144 | .scrapy
145 | 
146 | # Sphinx documentation
147 | docs/_build/
148 | doc/_build/
149 | 
150 | # PyBuilder
151 | target/
152 | 
153 | # Jupyter Notebook
154 | .ipynb_checkpoints
155 | 
156 | # IPython
157 | profile_default/
158 | ipython_config.py
159 | 
160 | # pyenv
161 | .python-version
162 | 
163 | # pipenv
164 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
165 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
166 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
167 | #   install all needed dependencies.
168 | #Pipfile.lock
169 | 
170 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
171 | __pypackages__/
172 | 
173 | # Celery stuff
174 | celerybeat-schedule
175 | celerybeat.pid
176 | 
177 | # SageMath parsed files
178 | *.sage.py
179 | 
180 | # Environments
181 | .env
182 | .venv
183 | env/
184 | venv/
185 | ENV/
186 | env.bak/
187 | venv.bak/
188 | pyenv_arl_training
189 | 
190 | # Spyder project settings
191 | .spyderproject
192 | .spyproject
193 | 
194 | # Rope project settings
195 | .ropeproject
196 | 
197 | # mkdocs documentation
198 | /site
199 | 
200 | # mypy
201 | .mypy_cache/
202 | .dmypy.json
203 | dmypy.json
204 | 
205 | # Pyre type checker
206 | .pyre/
207 | 
208 | # pytype static type analyzer
209 | .pytype/
210 | 
211 | # Flags for Clang Code Model
212 | *.cxxflags
213 | *.cflags
214 | 
215 | # Ignoring generated logs and tensorboard folders
216 | .logs
217 | ddpg_dvrk_tensorboard/
218 | training_scripts/
219 | training-gpu.Dockerfile


--------------------------------------------------------------------------------
/.style.yapf:
--------------------------------------------------------------------------------
 1 | [style]
 2 | based_on_style = pep8
 3 | align_closing_bracket_with_visual_indent: true
 4 | continuation_indent_width: 2
 5 | use_tabs: false
 6 | each_dict_entry_on_separate_line: false
 7 | column_limit: 80
 8 | dedent_closing_brackets: true
 9 | blank_line_before_nested_class_or_def: true
10 | arithmetic_precedence_indication: true
11 | indent_dictionary_value: true
12 | indent_width: 2
13 | split_all_comma_separated_values: true
14 | split_before_logical_operator: true
15 | split_complex_comprehension: true
16 | split_before_named_assigns: false
17 | split_penalty_import_names=0
18 | split_penalty_for_added_line_split=30


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   // Use IntelliSense to learn about possible attributes.
 3 |   // Hover to view descriptions of existing attributes.
 4 |   // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 5 |   "version": "0.2.0",
 6 |   "configurations": [
 7 |     {
 8 |       "name": "Python: Current File",
 9 |       "cwd": "${workspaceFolder}/scripts/dVRK/",
10 |       "type": "python",
11 |       "request": "launch",
12 |       "program": "${file}",
13 |       "console": "integratedTerminal"
14 |     }
15 |   ]
16 | }


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "editor.rulers": [
 3 |     80
 4 |   ],
 5 |   "githubIssues.createIssueTriggers": [
 6 |     "feature",
 7 |     "bug",
 8 |     "documentation",
 9 |     "question"
10 |   ],
11 |   "githubIssues.issueCompletions.enabled": true,
12 |   "githubIssues.useBranchForIssues": "prompt",
13 |   "githubIssues.issueBranchTitle": "$feature/${issueNumber}-${sanitizedIssueTitle}",
14 |   "githubIssues.queries": [
15 |     {
16 |       "label": "My Open Issues",
17 |       "query": "assignee:${user} state:open"
18 |     },
19 |     {
20 |       "label": "Issues",
21 |       "query": "repo:${owner}/${repository} state:open" //  sort:created-desc
22 |     }
23 |   ],
24 |   "python.formatting.provider": "yapf",
25 |   "python.autoComplete.extraPaths": [
26 |     "${env:AMBF_WS}/",
27 |     "${env:AMBF_WS}/ambf_ros_modules/",
28 |     "${env:AMBF_WS}/ambf_ros_modules/ambf_comm/",
29 |     "${env:AMBF_WS}/ambf_ros_modules/ambf_comm/scripts/",
30 |     "${env:AMBF_WS}/ambf_controller/dvrk_functions/scripts/",
31 |     "${env:AMBF_RL_WS}/**",
32 |     "${env:AMBF_RL_WS}/envs/**",
33 |     "${env:AMBF_RL_WS}/envs/ambf_rl/**",
34 |     "${env:AMBF_RL_WS}/envs/dVRK/**",
35 |     "${env:AMBF_RL_WS}/scripts/**",
36 |   ],
37 |   "python.analysis.extraPaths": [
38 |     "${env:AMBF_WS}/**",
39 |     "${env:AMBF_WS}/ambf_ros_modules/**",
40 |     "${env:AMBF_WS}/ambf_ros_modules/ambf_comm/**",
41 |     "${env:AMBF_WS}/ambf_ros_modules/ambf_comm/scripts/**",
42 |     "${env:AMBF_WS}/ambf_controller/dvrk_functions/scripts/**",
43 |     "${env:AMBF_RL_WS}/**",
44 |     "${env:AMBF_RL_WS}/envs/**",
45 |     "${env:AMBF_RL_WS}/envs/ambf_rl/**",
46 |     "${env:AMBF_RL_WS}/envs/dVRK/**",
47 |     "${env:AMBF_RL_WS}/scripts/**",
48 |   ],
49 |   "python.pythonPath": "pyenv_arl_training/bin/python"
50 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # AMBF Reinforcement Learning (AMBF-RL) Toolkit
  2 | 
  3 | ![ambf-robots](media/AMBF_Robots.png)
  4 | 
  5 | The AMBF-RL (ARL) toolkit assists in designing control algorithms for medical 
  6 | robots by providing environments conducive to medical settings and accurate 
  7 | models of well-known medical robotic platforms. 
  8 | It wraps [OpenAI-gym](https://github.com/openai/gym) and 
  9 | [stable-baselines](https://github.com/hill-a/stable-baselines) to provide an 
 10 | interface to train state of the art RL algorithms for robots in the 
 11 | [Asynchronous Multibody Framework (AMBF)](https://github.com/WPI-AIM/ambf) 
 12 | simulation environment.
 13 | 
 14 | ## Architecture
 15 | 
 16 | ![architecture-overview](media/architecture-overview.png)
 17 | 
 18 | ## Requirements  
 19 | 
 20 | - Computer with Ubuntu Bionic (18.04) or Ubuntu Bionic Server (18.04)
 21 | - [ROS Melodic - 1.14.*](http://wiki.ros.org/melodic/Installation/Ubuntu)
 22 | - (Optional) NVIDIA GPU
 23 |   - [CUDA - 10.0 (10.0.130)](https://developer.nvidia.com/cuda-10.0-download-archive)
 24 |   - [CUDNN - 7.4.2](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v7.4.2/prod/10.0_20181213/cudnn-10.0-linux-x64-v7.4.2.24.tgz)
 25 | - (Optional) Docker
 26 | 
 27 | ## Install
 28 | ### 1. Docker method
 29 | 
 30 | If docker is installed, the dockerfile [devel-training-gpu](dockerfiles/devel-training-gpu.Dockerfile) can be run.
 31 | ```bash
 32 | cd ~/
 33 | git clone https://github.com/WPI-AIM/ambf_rl.git && cd ambf_rl
 34 | sudo service docker start
 35 | docker build --rm -f "dockerfiles/devel-training-gpu.Dockerfile" -t ambf_rl:devel-training-gpu "."
 36 | docker run --rm -it ambf_rl:devel-training-gpu
 37 | # Now inside the interactive container
 38 | cd ~/ambf_rl
 39 | python scripts/dVRK/main.py
 40 | ```
 41 | 
 42 | ### 2. Install dependencies
 43 | 
 44 | Clone and install ambf requirements
 45 | ```bash
 46 | source /opt/ros/melodic/setup.bash
 47 | cd ~/
 48 | git clone --branch ambf-1.0-python3-fix \ 
 49 |   https://github.com/DhruvKoolRajamani/ambf.git
 50 | cd ~/ambf
 51 | sudo apt-get -y -qq install --fix-missing \ 
 52 |   $(cat install/apt-requirements.txt)
 53 | cat install/pip-requirements.txt | xargs -n 1 -L 1 pip install -U
 54 | mkdir build && cd build
 55 | cmake ../
 56 | make -j$(nproc)
 57 | ```
 58 | 
 59 | **Optional:** Install python-venv to use a virtual environment
 60 | ```bash
 61 | sudo apt-get install python3-venv python-venv
 62 | ```
 63 | 
 64 | Now clone this repository and install training requirements
 65 | ```bash
 66 | source ~/ambf/build/devel/setup.bash
 67 | cd ~/
 68 | git clone https://github.com/WPI-AIM/ambf_rl.git
 69 | ```
 70 | 
 71 | **Optional:** If venv is already installed, create and source the virtual environment.
 72 | ```bash
 73 | cd ~/ambf_rl
 74 | virtualenv --python=/usr/bin/python3 pyenv_arl_training
 75 | source ~/ambf_rl/pyenv_arl_training/bin/activate
 76 | ```
 77 | 
 78 | Continue with the installation of training specific requirements.
 79 | ```bash
 80 | cd ~/ambf_rl
 81 | cat install/training-pip-requirements.txt | xargs -n 1 -L 1 pip3 install -U
 82 | ```
 83 | 
 84 | Copy the ddpg fix to the installed stable-baselines folder.  
 85 | If the virtual environment was installed,  
 86 | `<path to python site-packages>=~/ambf_rl/pyenv_arl_training/lib/python3.6/site-packages/>` ortherwise  
 87 | `<path to python site-packages>=~/.local/lib/python3.6/site-packages/>`
 88 | ```bash
 89 | mv <path to python site-packages>/stable_baselines/ddpg/ddpg.py \
 90 |   <path to python site-packages>/stable_baselines/ddpg/ddpg_old.py && \
 91 |   cp ${AMBF_RL_WS}/install/stable_baseline_fix/ddpg.py \
 92 |   <path to python site-packages>/stable_baselines/ddpg/
 93 | ```
 94 | 
 95 | Example: if venv exists
 96 | ```bash
 97 | mv ~/ambf_rl/pyenv_arl_training/lib/python3.6/site-packages/stable_baselines/ddpg/ddpg.py \
 98 |   ~/ambf_rl/pyenv_arl_training/lib/python3.6/site-packages/stable_baselines/ddpg/ddpg_old.py && \
 99 |   cp ${AMBF_RL_WS}/install/stable_baseline_fix/ddpg.py \
100 |   ~/ambf_rl/pyenv_arl_training/lib/python3.6/site-packages/stable_baselines/ddpg/
101 | ```
102 | 
103 | Finally install AMBF-RL
104 | ```bash
105 | cd ~/ambf_rl
106 | python setup.py install
107 | ```
108 | 
109 | Test the installation by running the example DDPG or HER+DDPG algorithms to train the dVRK PSM to reach a point.
110 | ```bash
111 | cd ~/ambf_rl
112 | source /opt/ros/melodic/setup.bash
113 | source ~/ambf/build/devel/setup.bash
114 | # If a virtual environment was created
115 | source ~/ambf_rl/pyenv_arl_training/bin/activate
116 | cd scripts/dVRK/
117 | python main.py
118 | ```
119 | An instance of AMBF will spawn and training should begin.
120 | 
121 | ## Developer Guide
122 | 
123 | If using .vscode as an IDE, it would be helpful to export the path to the AMBF-RL workspace to your .bashrc, or individual terminal session.
124 | ```bash
125 | # Individual terminal session
126 | export AMBF_WS=~/ambf
127 | export AMBF_RL_WS=~/ambf_rl
128 | 
129 | # .bashrc
130 | echo "export AMBF_WS=~/ambf" >> ~/.bashrc
131 | echo "export AMBF_RL_WS=~/ambf_rl" >> ~/.bashrc
132 | source ~/.bashrc
133 | ```
134 | 
135 | ### Abstract AMBF-RL Base Class
136 | 
137 | - [arl_env.py](envs/arl/arl_env.py) is an abstract base class that automatically connects to the `ambf_client` and objects spawned in ambf.
138 | ```python
139 | class ARLEnv(gym.Env, metaclass=ABCMeta):
140 |   """Base class for the ARLEnv
141 | 
142 |   This class should not be instantiated on its own. It does not contain the 
143 |   desired RL Algorithm or the Robot. This should remain an abstract class and
144 |   should be wrapped by at least an algorithmic and robot environment wrapper.
145 |   An example of this can be PSM_cartesian_env_ddpg(ARLEnv).
146 | 
147 |   Attributes
148 |   ----------
149 |   
150 |   Methods
151 |   -------
152 |   skip_skipped_sim_steps(num)
153 |       Define number of steps to skip if step-throttling is enabled.
154 |   
155 |   set_throttling_enable(check):
156 |       Set the step-throttling Boolean.
157 |   """
158 | 
159 |   def __init__(
160 |     self,
161 |     enable_step_throttling: bool,
162 |     n_skip_steps: int,
163 |     env_name: str = "arl_env"
164 |   ) -> None:
165 |     """Initialize the abstract class which handles all AMBF related interactions
166 |     
167 |     Parameters
168 |     ----------
169 |     enable_step_throttling : bool
170 |         Flag to enable throttling of the simulator
171 |     n_skip_steps : int
172 |         Number of steps to skip after an update step
173 |     env_name : str
174 |         Name of the environment to train
175 |     """
176 | ```
177 | 
178 | - For dVRK PSM environments that are based in the cartesian space, an abstract class [PSM_cartesian_env.py](envs/dVRK/PSM_cartesian_env.py) has been created and should be inherited.
179 | ```python
180 | class PSMCartesianEnv(ARLEnv, metaclass=ABCMeta):
181 |   """Single task based environment for PSM to perform debris removal as shown in
182 |   the paper:
183 | 
184 |   The environment performs actions in the cartesian space in R3, with translational
185 |   movements in x, y, and z.
186 |   """
187 | 
188 |   def __init__(
189 |     self,
190 |     action_space_limit: float,
191 |     goal_position_range: float,
192 |     position_error_threshold: float,
193 |     goal_error_margin: float,
194 |     joint_limits: Dict[str,
195 |                        np.ndarray or List[str]],
196 |     workspace_limits: Dict[str,
197 |                            np.ndarray or List[str]],
198 |     enable_step_throttling: bool,
199 |     joints_to_control: List[str] = [
200 |       'baselink-yawlink',
201 |       'yawlink-pitchbacklink',
202 |       'pitchendlink-maininsertionlink',
203 |       'maininsertionlink-toolrolllink',
204 |       'toolrolllink-toolpitchlink',
205 |       'toolpitchlink-toolgripper1link',
206 |       'toolpitchlink-toolgripper2link'
207 |     ],
208 |     steps_to_print: int = 10000,
209 |     n_actions: int = 3,
210 |     n_skip_steps: int = 5,
211 |     env_name: str = "PSM_cartesian_ddpgenv"
212 |   ) -> None:
213 |     """Initialize an object of the PSM robot in cartesian space.
214 | 
215 |     Parameters
216 |     ----------
217 |     action_space_limit : float
218 |         Action space limit for cartesian actions
219 |     goal_position_range : int, optional
220 |         The variance in goal position
221 |     position_error_threshold : float
222 |         Maximum acceptable error in cartesian position
223 |     goal_error_margin : float
224 |         Maximum margin of error for an epoch to be considered successful
225 |     joint_limits : Dict(str, List(float) | np.array(float))
226 |         Robot joint limits in radians
227 |     workspace_limits : Dict(str, List(float) | np.array(float))
228 |         Workspace limits in x,y, and z for the robots workspace in Cartesian space
229 |     enable_step_throttling : bool
230 |         Flag to enable throttling of the simulator
231 |     joints_to_control : np.array(str) | List(str)
232 |         The list of joint links for the psm.
233 |     steps_to_print : int
234 |         Number of steps before model prints information to stdout
235 |     n_actions : int
236 |         Number of possible actions
237 |     n_skip_steps : int
238 |         Number of steps to skip after an update step
239 |     env_name : str
240 |         Name of the environment to train
241 |     """
242 | ```
243 | 
244 | ## Citation
245 | 
246 | If this work is helpful for your research, please use the following reference for citation:  
247 | ```latex
248 | @INPROCEEDINGS{9807609,  author={Varier, Vignesh Manoj and Rajamani, Dhruv Kool and Tavakkolmoghaddam, Farid and Munawar, Adnan and Fischer, Gregory S},  booktitle={2022 International Symposium on Medical Robotics (ISMR)},   title={AMBF-RL: A real-time simulation based Reinforcement Learning toolkit for Medical Robotics},   year={2022},  volume={},  number={},  pages={1-8},  doi={10.1109/ISMR48347.2022.9807609}}
249 | ```
250 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from .envs.arl.arl_env import ARLEnv, Goal, Action, Observation
2 | from .envs.dVRK.PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv


--------------------------------------------------------------------------------
/dockerfiles/devel-training-gpu.Dockerfile:
--------------------------------------------------------------------------------
  1 | FROM tensorflow/tensorflow:1.14.0-gpu-py3
  2 | 
  3 | ENV USERNAME="admin"
  4 | ENV NVIDIA_VISIBLE_DEVICES all
  5 | ENV NVIDIA_DRIVER_CAPABILITIES graphics,utility,compute
  6 | 
  7 | RUN useradd -ms /bin/bash ${USERNAME}
  8 | RUN usermod -aG sudo ${USERNAME}
  9 | 
 10 | ENV HOME="/home/${USERNAME}" \
 11 |   AMBF_WS="/home/${USERNAME}/ambf" \ 
 12 |   AMBF_RL_WS="/home/${USERNAME}/ambf_rl"
 13 | 
 14 | # Add apt-utils
 15 | RUN apt clean && \
 16 |   rm -rf /var/lib/apt/lists/* && \
 17 |   apt-get update && \
 18 |   apt-get install apt-utils -q -y \
 19 |   && rm -rf /var/lib/apt/lists/*
 20 | 
 21 | RUN apt-get update && \
 22 |   apt-get -y -qq install wget gdb
 23 | 
 24 | # setup timezone
 25 | RUN echo 'Etc/UTC' > /etc/timezone && \
 26 |   ln -s /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
 27 |   apt-get update && apt-get install -q -y tzdata
 28 | 
 29 | # install packages
 30 | RUN apt-get update && apt-get install -q -y \
 31 |   dirmngr \
 32 |   gnupg2 \
 33 |   && rm -rf /var/lib/apt/lists/*
 34 | 
 35 | # setup keys
 36 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654
 37 | 
 38 | # setup sources.list
 39 | RUN echo "deb http://packages.ros.org/ros/ubuntu bionic main" > /etc/apt/sources.list.d/ros1-latest.list
 40 | 
 41 | # install bootstrap tools
 42 | RUN apt-get update && apt-get install --no-install-recommends -y \
 43 |   python-rosdep \
 44 |   python-rosinstall \
 45 |   python-vcstools \
 46 |   && rm -rf /var/lib/apt/lists/*
 47 | 
 48 | # setup environment
 49 | ENV LANG C.UTF-8
 50 | ENV LC_ALL C.UTF-8
 51 | 
 52 | ENV ROS_DISTRO melodic
 53 | # bootstrap rosdep
 54 | RUN rosdep init && \
 55 |   rosdep update --rosdistro $ROS_DISTRO
 56 | 
 57 | RUN apt-get update && apt-get install -y \
 58 |   ros-melodic-ros-base=1.4.1-0* apt-utils git \
 59 |   && rm -rf /var/lib/apt/lists/*
 60 | 
 61 | WORKDIR ${HOME}
 62 | # Make Directory AMBF_WS
 63 | RUN git clone --branch ambf-1.0-python3-fix https://github.com/DhruvKoolRajamani/ambf.git
 64 | WORKDIR ${AMBF_WS}
 65 | RUN cd ${AMBF_WS} && \
 66 |   git submodule update --init --recursive
 67 | 
 68 | # Install apt and pip packages listed in (*-requirements.txt)
 69 | WORKDIR ${AMBF_WS}
 70 | RUN apt-get update && \
 71 |   apt-get -y -qq -o Dpkg::Use-Pty=0 install --no-install-recommends \
 72 |   --fix-missing $(cat install/apt-requirements.txt) && \
 73 |   cat install/pip-requirements.txt | xargs -n 1 -L 1 pip install -U && \
 74 |   apt-get clean && \
 75 |   rm -rf /var/lib/apt/lists/*
 76 | 
 77 | # Build AMBF
 78 | RUN . /opt/ros/melodic/setup.sh && \
 79 |   mkdir -p ${AMBF_WS}/build && \
 80 |   cd ${AMBF_WS}/build && \
 81 |   cmake ../ -DCMAKE_BUILD_TYPE=Debug && \
 82 |   make -j$(nproc)
 83 | 
 84 | WORKDIR ${HOME}
 85 | # Make Directory AMBF_RL_WS
 86 | RUN git clone https://github.com/WPI-AIM/ambf_rl.git
 87 | WORKDIR ${AMBF_RL_WS}
 88 | RUN apt-get update && \
 89 |   cat install/training-pip-requirements.txt | xargs -n 1 -L 1 pip3 install -U && \
 90 |   apt-get clean && \
 91 |   rm -rf /var/lib/apt/lists/*
 92 | 
 93 | # Stable Baselines fix
 94 | RUN mv /usr/local/lib/python3.6/dist-packages/stable_baselines/ddpg/ddpg.py \
 95 |   /usr/local/lib/python3.6/dist-packages/stable_baselines/ddpg/ddpg_old.py && \
 96 |   cp ${AMBF_RL_WS}/install/stable_baseline_fix/ddpg.py \
 97 |   /usr/local/lib/python3.6/dist-packages/stable_baselines/ddpg/
 98 | 
 99 | RUN touch ${HOME}/.bashrc && \
100 |   echo "source /opt/ros/melodic/setup.bash" >> ${HOME}/.bashrc && \
101 |   echo "source /home/admin/ambf/build/devel/setup.bash" >> ${HOME}/.bashrc
102 | 
103 | RUN . ${HOME}/.bashrc
104 | 
105 | WORKDIR ${AMBF_RL_WS}
106 | RUN python setup.py install
107 | 
108 | ENV ROS_HOSTNAME="localhost" \
109 |   ROS_MASTER_URI="http://localhost:11311"
110 | 
111 | WORKDIR ${AMBF_RL_WS}


--------------------------------------------------------------------------------
/dockerfiles/devel-training.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM tensorflow/tensorflow:1.14.0-py3
 2 | 
 3 | # setup timezone
 4 | RUN echo 'Etc/UTC' > /etc/timezone && \
 5 |     ln -s /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
 6 |     apt-get update && apt-get install -q -y tzdata && rm -rf /var/lib/apt/lists/*
 7 | 
 8 | # install packages
 9 | RUN apt-get update && apt-get install -q -y \
10 |     dirmngr \
11 |     gnupg2 \
12 |     && rm -rf /var/lib/apt/lists/*
13 | 
14 | # setup keys
15 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654
16 | 
17 | # setup sources.list
18 | RUN echo "deb http://packages.ros.org/ros/ubuntu bionic main" > /etc/apt/sources.list.d/ros1-latest.list
19 | 
20 | # install bootstrap tools
21 | RUN apt-get update && apt-get install --no-install-recommends -y \
22 |     python-rosdep \
23 |     python-rosinstall \
24 |     python-vcstools \
25 |     && rm -rf /var/lib/apt/lists/*
26 | 
27 | # setup environment
28 | ENV LANG C.UTF-8
29 | ENV LC_ALL C.UTF-8
30 | 
31 | ENV ROS_DISTRO melodic
32 | # bootstrap rosdep
33 | RUN rosdep init && \
34 |   rosdep update --rosdistro $ROS_DISTRO
35 | 
36 | RUN apt-get update && apt-get install -y \
37 |     ros-melodic-ros-base=1.4.1-0* apt-utils git \
38 |     && rm -rf /var/lib/apt/lists/*
39 | 
40 | ENV HOME="/root" \
41 |   AMBF_WS="/root/ambf"
42 | 
43 | WORKDIR ${HOME}
44 | # Make Directory AMBF_WS
45 | RUN git clone --single-branch --branch feat-rl https://github.com/WPI-AIM/ambf.git
46 | WORKDIR ${AMBF_WS}
47 | RUN cd ${AMBF_WS} && \
48 |   git submodule update --init --recursive
49 | 
50 | # Install apt and pip packages listed in (*-requirements.txt)
51 | WORKDIR ${AMBF_WS}
52 | RUN apt-get update && \
53 |   apt-get -y -qq -o Dpkg::Use-Pty=0 install --no-install-recommends \
54 |   --fix-missing $(cat install/apt-requirements.txt) && \
55 |   cat install/pip-requirements.txt | xargs -n 1 -L 1 pip install -U && \
56 |   apt-get clean && \
57 |   rm -rf /var/lib/apt/lists/*
58 | 
59 | # Build AMBF
60 | RUN . /opt/ros/melodic/setup.sh && \
61 |   mkdir -p ${AMBF_WS}/build && \
62 |   cd ${AMBF_WS}/build && \
63 |   cmake ../ && \
64 |   make -j$(nproc)
65 | 
66 | RUN touch /root/.bashrc && \
67 |   echo "source /opt/ros/melodic/setup.bash" >> /root/.bashrc && \
68 |   echo "source /root/ambf/build/devel/setup.bash" >> /root/.bashrc
69 |   
70 | WORKDIR ${AMBF_WS}/training_scripts
71 | # CMD ./wrapper_script.sh
72 | 
73 | # CMD python3 -c "import tensorflow as tf; print(tf.reduce_sum(tf.random.normal([1000, 1000])))"


--------------------------------------------------------------------------------
/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from .arl.arl_env import ARLEnv, Goal, Action, Observation
2 | from .dVRK.PSM_cartesian_env import PSMCartesianEnv, CartesianAction
3 | from .dVRK.PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv, DDPGObservation
4 | from .dVRK.PSM_cartesian_herddpg_env import PSMCartesianHERDDPGEnv, HERDDPGObservation


--------------------------------------------------------------------------------
/envs/arl/__init__.py:
--------------------------------------------------------------------------------
1 | from .arl_env import ARLEnv, Goal, Action, Observation


--------------------------------------------------------------------------------
/envs/arl/arl_env.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # //============================================================================
  4 | # /*
  5 | #     Software License Agreement (BSD License)
  6 | #     Copyright (c) 2019, AMBF
  7 | #     (www.aimlab.wpi.edu)
  8 | 
  9 | #     All rights reserved.
 10 | 
 11 | #     Redistribution and use in source and binary forms, with or without
 12 | #     modification, are permitted provided that the following conditions
 13 | #     are met:
 14 | 
 15 | #     * Redistributions of source code must retain the above copyright
 16 | #     notice, this list of conditions and the following disclaimer.
 17 | 
 18 | #     * Redistributions in binary form must reproduce the above
 19 | #     copyright notice, this list of conditions and the following
 20 | #     disclaimer in the documentation and/or other materials provided
 21 | #     with the distribution.
 22 | 
 23 | #     * Neither the name of authors nor the names of its contributors may
 24 | #     be used to endorse or promote products derived from this software
 25 | #     without specific prior written permission.
 26 | 
 27 | #     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 28 | #     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 29 | #     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 30 | #     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 31 | #     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 32 | #     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 33 | #     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 34 | #     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 35 | #     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 36 | #     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 37 | #     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 38 | #     POSSIBILITY OF SUCH DAMAGE.
 39 | 
 40 | #     \author    <http://aimlab.wpi.edu>
 41 | #     \author    <dkoolrajamani@wpi.edu>, <vvarier@wpi.edu>, <amunawar@wpi.edu>
 42 | #     \author    Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar
 43 | #     \version   1.0.0
 44 | # */
 45 | # //============================================================================
 46 | 
 47 | from typing import List, Set, Tuple, Dict, Any, Type
 48 | from ambf_client import Client
 49 | import ambf_client
 50 | from gym import spaces
 51 | import numpy as np
 52 | import copy
 53 | import time
 54 | import gym
 55 | from gym.utils import seeding
 56 | from ambf_world import World
 57 | from ambf_object import Object
 58 | import sys
 59 | from abc import ABC, abstractmethod, ABCMeta
 60 | 
 61 | 
 62 | class Observation:
 63 | 
 64 |   def __init__(
 65 |     self,
 66 |     state: Any or np.ndarray or List[float] or Dict,
 67 |     dist: int = 0,
 68 |     reward: float = 0.0,
 69 |     prev_reward: float = 0.0,
 70 |     cur_reward: float = 0.0,
 71 |     is_done: bool = False,
 72 |     info: Dict = {},
 73 |     sim_step_no: int = 0
 74 |   ) -> None:
 75 | 
 76 |     self.state = state
 77 |     self.dist = dist
 78 |     self.reward = reward
 79 |     self.prev_reward = prev_reward
 80 |     self.cur_reward = cur_reward
 81 |     self.is_done = is_done
 82 |     self.info = info
 83 |     self.sim_step_no = sim_step_no
 84 | 
 85 |     return
 86 | 
 87 |   def cur_observation(self) -> Tuple[Any or np.ndarray, float, bool, Dict]:
 88 |     return self.state, self.reward, self.is_done, self.info
 89 | 
 90 | 
 91 | class ClientHandleError(BaseException):
 92 | 
 93 |   def __init__(self, *args: object) -> None:
 94 |     super().__init__(*args)
 95 | 
 96 | 
 97 | class Action:
 98 | 
 99 |   def __init__(
100 |     self,
101 |     n_actions: int,
102 |     action_space_limit: float,
103 |     action_lims_low: List[float] = None,
104 |     action_lims_high: List[float] = None
105 |   ) -> None:
106 |     self.n_actions = n_actions
107 |     self.action = [0.0 for i in range(n_actions)]
108 |     self.action_space_limit = action_space_limit
109 |     if action_lims_low is None:
110 |       self.action_lims_low = -action_space_limit * np.ones(self.n_actions)
111 |     else:
112 |       self.action_lims_low = action_lims_low
113 | 
114 |     if action_lims_high is None:
115 |       self.action_lims_high = action_space_limit * np.ones(self.n_actions)
116 |     else:
117 |       self.action_lims_high = action_lims_high
118 | 
119 |     self.action_space = spaces.Space()
120 | 
121 |     return
122 | 
123 |   def check_if_valid_action(self):
124 |     """Check if the length of the action matches the defined action in the action space
125 |     
126 |     Raises
127 |     ------
128 |     TypeError
129 |         If the length of action does not match the number of actions defined
130 |     """
131 | 
132 |     assert len(self.action) == self.n_actions, TypeError("Incorrect length of actions provided!")
133 |     # Clips the action value between the upper and lower limits
134 |     np.clip(self.action, self.action_lims_low, self.action_lims_high, out=self.action)
135 | 
136 |     return
137 | 
138 |   def apply(self, state: np.ndarray, out: np.ndarray = None) -> np.ndarray:
139 |     """Apply the action to the given state.
140 | 
141 |     This function can be overrided if the application of the action to state corresponds
142 |     to a complex relationship to obtain the next state.
143 | 
144 |     Parameter
145 |     ---------
146 |     state : np.ndarray
147 |         The state (s) to which the action should be added
148 |     
149 |     out : np.ndarray, optional
150 |         The output parameter, passed by reference if the return argument is not used
151 | 
152 |     Raises
153 |     ------
154 |     Exception
155 | 
156 |     """
157 |     if out is None:
158 |       result = np.add(state, self.action)
159 |     else:
160 |       result = np.add(state, self.action, out=out)
161 | 
162 |     return result
163 | 
164 | 
165 | class Goal:
166 | 
167 |   def __init__(
168 |     self,
169 |     goal_position_range: float,
170 |     goal_error_margin: float,
171 |     goal: Any or np.ndarray
172 |   ) -> None:
173 | 
174 |     self.goal_position_range = goal_position_range
175 |     self.goal_error_margin = goal_error_margin
176 |     self.goal = goal
177 | 
178 |     return
179 | 
180 |   def _sample_goal(self) -> Any or np.ndarray:
181 | 
182 |     return self.goal
183 | 
184 | 
185 | class ARLEnv(gym.Env, metaclass=ABCMeta):
186 |   """Base class for the ARLEnv
187 | 
188 |   This class should not be instantiated on its own. It does not contain the 
189 |   desired RL Algorithm or the Robot. This should remain an abstract class and
190 |   should be wrapped by at least an algorithmic and robot environment wrapper.
191 |   An example of this can be PSM_cartesian_env_ddpg(ARLEnv).
192 | 
193 |   Attributes
194 |   ----------
195 |   
196 |   Methods
197 |   -------
198 |   skip_skipped_sim_steps(num)
199 |       Define number of steps to skip if step-throttling is enabled.
200 |   
201 |   set_throttling_enable(check):
202 |       Set the step-throttling Boolean.
203 |   """
204 | 
205 |   def __init__(
206 |     self,
207 |     enable_step_throttling: bool,
208 |     n_skip_steps: int,
209 |     env_name: str = "arl_env"
210 |   ) -> None:
211 |     """Initialize the abstract class which handles all AMBF related interactions
212 |     
213 |     Parameters
214 |     ----------
215 |     enable_step_throttling : bool
216 |         Flag to enable throttling of the simulator
217 |     n_skip_steps : int
218 |         Number of steps to skip after an update step
219 |     env_name : str
220 |         Name of the environment to train
221 |     """
222 |     super(ARLEnv, self).__init__()
223 | 
224 |     # Set the environment name
225 |     self._env_name = env_name
226 |     self._client_name = self._env_name + '_client'
227 | 
228 |     # Set environment and task parameters
229 |     self._n_skip_steps = n_skip_steps
230 |     self._enable_step_throttling = enable_step_throttling
231 | 
232 |     # Initialize sim steps
233 |     self._skipped_sim_steps = 0
234 |     self._prev_sim_step = 0
235 |     self._count_for_print = 0
236 | 
237 |     # AMBF Sim Environment Declarations
238 |     self._obj_handle = None  # Object
239 |     self._world_handle = None  # World
240 |     self._ambf_client = None  # Client(client_name=self._client_name)
241 |     # Initialize the ambf client
242 |     self.ambf_client = Client(client_name=self._client_name)
243 | 
244 |     # Set default Observation, Goal, and Action
245 |     self._goal = Goal(0.0, 0.0, None)
246 |     self._obs = Observation(None)
247 |     self._action = Action(0, 0.0)
248 | 
249 |     # Set action space and observation space
250 |     self.action_space = self.action.action_space
251 |     self.observation_space = spaces.Space()
252 | 
253 |     # Sleep to allow the handle to connect to the AMBF server
254 |     time.sleep(1)
255 | 
256 |     # Random seed the environment
257 |     self.seed(5)
258 | 
259 |     return
260 | 
261 | # Properties
262 | 
263 |   @property
264 |   def env_name(self) -> str:
265 |     """Return the name of the environment
266 |     """
267 |     return self._env_name
268 | 
269 |   @env_name.setter
270 |   def env_name(self, value: str):
271 |     """Set the environment name
272 |     """
273 |     self._env_name = value
274 | 
275 |   @property
276 |   def client_name(self) -> str:
277 |     """Return the AMBF Client Name
278 |     """
279 |     return self._client_name
280 | 
281 |   @client_name.setter
282 |   def client_name(self, value: str):
283 |     """Sets the AMBF Client Name
284 |     """
285 |     self._client_name = value
286 |     return
287 | 
288 |   @property
289 |   def n_skip_steps(self) -> int:
290 |     """Return number of steps to skip.
291 | 
292 |     TODO: Provide reference for step-throttling
293 |     """
294 |     return self._n_skip_steps
295 | 
296 |   @n_skip_steps.setter
297 |   def n_skip_steps(self, value: int):
298 |     """Define number of steps to skip if step-throttling is enabled.
299 |     """
300 |     self._n_skip_steps = value
301 |     self._world_handle.set_num_step_skips(value)
302 |     return
303 | 
304 |   @property
305 |   def enable_step_throttling(self) -> bool:
306 |     """Return the step-throttling state
307 | 
308 |     TODO: Provide reference for step-throttling
309 |     """
310 |     return self._enable_step_throttling
311 | 
312 |   @enable_step_throttling.setter
313 |   def enable_step_throttling(self, value: bool):
314 |     """Set the step-throttling state
315 |     """
316 |     self._enable_step_throttling = value
317 |     self._world_handle.enable_throttling(self._enable_step_throttling)
318 |     return
319 | 
320 |   @property
321 |   def skipped_sim_steps(self) -> int:
322 |     """Return the simulation steps skipped
323 |     """
324 |     skipped_steps = 0
325 |     if self.enable_step_throttling:
326 |       while skipped_steps < self.n_skip_steps:
327 |         skipped_steps = self.obj_handle.get_sim_step() - self.prev_sim_step
328 |         time.sleep(1e-5)
329 |       self.prev_sim_step = self.obj_handle.get_sim_step()
330 |       if skipped_steps > self.n_skip_steps:
331 |         print(
332 |           'WARN: Skipped {} steps, Default skip limit {} Steps'.format(
333 |             skipped_steps,
334 |             self.n_skip_steps
335 |           )
336 |         )
337 |     else:
338 |       skipped_steps = self.obj_handle.get_sim_step() - self.prev_sim_step
339 |       self.prev_sim_step = self.obj_handle.get_sim_step()
340 | 
341 |     self._skipped_sim_steps = skipped_steps
342 | 
343 |     return self._skipped_sim_steps
344 | 
345 |   @property
346 |   def prev_sim_step(self) -> int:
347 |     """Return the previous simulation step number
348 |     """
349 |     return self._prev_sim_step
350 | 
351 |   @prev_sim_step.setter
352 |   def prev_sim_step(self, value: int):
353 |     """Set the previous simulation step number
354 |     """
355 |     self._prev_sim_step = value
356 |     return
357 | 
358 |   @property
359 |   def count_for_print(self) -> int:
360 |     """Return the number of counts to print
361 |     """
362 |     return self._count_for_print
363 | 
364 |   @count_for_print.setter
365 |   def count_for_print(self, value: int):
366 |     """Set the number of counts to print
367 |     """
368 |     self._count_for_print = value
369 |     return
370 | 
371 |   @property
372 |   def obj_handle(self) -> Object:
373 |     """Return the AMBF Object Handle
374 |     """
375 |     return self._obj_handle
376 | 
377 |   @obj_handle.setter
378 |   def obj_handle(self, value: Object):
379 |     """Set the AMBF Object Handle
380 |     """
381 |     self._obj_handle = value
382 |     return
383 | 
384 |   @property
385 |   def world_handle(self) -> World:
386 |     """Return the AMBF World Handle
387 |     """
388 |     return self._world_handle
389 | 
390 |   @world_handle.setter
391 |   def world_handle(self, value: World):
392 |     """Set the AMBF World Handle
393 |     """
394 |     self._world_handle = value
395 |     return
396 | 
397 |   @property
398 |   def ambf_client(self) -> Client:
399 |     """Return the AMBF Client
400 |     """
401 |     return self._ambf_client
402 | 
403 |   @ambf_client.setter
404 |   def ambf_client(self, value: Client):
405 |     """Set the AMBF Client
406 |     """
407 |     self._ambf_client = value
408 |     self._ambf_client.connect()
409 |     time.sleep(1)
410 |     self._ambf_client.create_objs_from_rostopics()
411 |     return
412 | 
413 |   @property
414 |   def goal(self) -> Goal:
415 |     """Return the goal object
416 |     """
417 |     return self._goal
418 | 
419 |   @goal.setter
420 |   def goal(self, value: Goal):
421 |     """Set the goal object
422 |     """
423 |     self._goal = value
424 |     return
425 | 
426 |   @property
427 |   def obs(self) -> Observation:
428 |     """Return the Observation object
429 |     """
430 |     return self._obs
431 | 
432 |   @obs.setter
433 |   def obs(self, value: Observation):
434 |     """Set the Observation object
435 |     """
436 |     self._obs = value
437 |     return
438 | 
439 |   @property
440 |   def action(self) -> Action:
441 |     """Return the action object
442 |     """
443 |     return self._action
444 | 
445 |   @action.setter
446 |   def action(self, value: Action):
447 |     """Set the action object
448 |     """
449 |     self._action = value
450 |     return
451 | 
452 |   @property
453 |   def action_space(self) -> spaces.Space:
454 |     """Return the action_space
455 |     """
456 |     return self._action_space
457 | 
458 |   @action_space.setter
459 |   def action_space(self, value: spaces.Space):
460 |     """Set the action_space
461 |     """
462 |     self._action_space = value
463 |     return
464 | 
465 |   @property
466 |   def observation_space(self) -> spaces.Space:
467 |     """Return the observation_space
468 |     """
469 |     return self._observation_space
470 | 
471 |   @observation_space.setter
472 |   def observation_space(self, value: spaces.Space):
473 |     """Set the observation_space
474 |     """
475 |     self._observation_space = value
476 |     return
477 | 
478 | 
479 | # Gym requirements
480 | 
481 |   def make(self, robot_root_link: str):
482 |     """Creates an object handle of the robot and world in AMBF
483 | 
484 |     Parameters
485 |     ----------
486 |     robot_root_link : string  
487 |     
488 |         Name of the root link of the robot.  
489 | 
490 |         eg. for the dVRK PSM: robot_root_link = 'psm/baselink'  
491 | 
492 |     Raises
493 |     ------
494 |     ClientHandleError
495 |         If obj handle or world handle are None, then a ClientHandleError is
496 |         raised.
497 |     """
498 | 
499 |     self.world_handle = self.ambf_client.get_world_handle()
500 |     if self.world_handle is None:
501 |       raise ClientHandleError("World handle not found, please make sure AMBF is running")
502 |     self.obj_handle = self.ambf_client.get_obj_handle(robot_root_link)
503 |     if self.obj_handle is None:
504 |       raise ClientHandleError("Object handle not found, please make sure robot is loaded in AMBF")
505 |     time.sleep(2)
506 |     self.world_handle.enable_throttling(self.enable_step_throttling)
507 |     self.world_handle.set_num_step_skips(self.n_skip_steps)
508 | 
509 |     # Verify obj_handle
510 |     if self.obj_handle.get_num_joints() == 0:
511 |       raise ClientHandleError(
512 |         "Object handle returned {} objects, please make sure robot is loaded in AMBF".format(
513 |           self.obj_handle.get_num_joints()
514 |         )
515 |       )
516 | 
517 |     return
518 | 
519 |   def seed(self, seed: int) -> List[int]:
520 |     """Randomize the environment
521 |     """
522 |     self.np_random, seed = seeding.np_random(seed)
523 | 
524 |     return [seed]
525 | 
526 |   @abstractmethod
527 |   def reset(self) -> np.ndarray or List[float] or Dict:
528 |     """Reset the robot environment
529 | 
530 |     Raises
531 |     ------
532 |     NotImplementedError
533 |         If function is not overridden, a Not Implemented error is raised if the 
534 |         reset function from the base class is called.
535 |     """
536 |     raise NotImplementedError("Abstract method reset needs to be overridden")
537 |     return Observation(None)
538 | 
539 |   def render(self, mode):
540 |     return
541 | 
542 |   @abstractmethod
543 |   def step(
544 |     self,
545 |     action  #: Action
546 |   ) -> Tuple[List[Any] or np.ndarray,
547 |              float,
548 |              bool,
549 |              Dict[str,
550 |                   bool]]:
551 |     """Performs the update step for the algorithm and dynamics
552 |     """
553 |     return [], 0.0, False, {'': False}
554 | 
555 |   @abstractmethod
556 |   def compute_reward(self, reached_goal: Goal, desired_goal: Goal, info: Dict[str, bool]) -> float:
557 |     """Function to compute reward received by the agent
558 |     """
559 |     return 0.0
560 | 
561 |   @abstractmethod
562 |   def _sample_goal(self, observation: Observation) -> Goal:
563 |     """Function to samples new goal positions and ensures its within the workspace of PSM
564 |     """
565 |     return Goal(0.0, 0.0, None)
566 | 
567 |   def _check_if_done(self) -> bool:
568 |     """Function to check if the episode was successful
569 |     """
570 |     if abs(self.obs.dist) < self.goal.goal_error_margin:
571 |       return True
572 |     else:
573 |       return False
574 | 
575 |   # @abstractmethod
576 |   def _update_info(self):
577 |     """Can be used to Provide information for debugging purpose
578 | 
579 |     TODO: Should this function be made abstract?
580 |     """
581 |     info = {'is_success': self._is_success()}
582 |     return info
583 | 
584 |   # @abstractmethod
585 |   def _is_success(self):
586 |     """Function to check if the robot reached the desired goal within a predefined error margin
587 |     
588 |     TODO: Should this function be made abstract?
589 |     """
590 |     return self._check_if_done()
591 | 
592 |   @abstractmethod
593 |   def send_cmd(self, cmd: np.ndarray or List[float]):
594 |     """Send the command to the robot in the AMBF Simulation.
595 |     """
596 |     return
597 | 
598 | if __name__ == '__main__':
599 | 
600 |   print(sys.path)


--------------------------------------------------------------------------------
/envs/dVRK/PSM_cartesian_ddpg_env.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # //============================================================================
  4 | # /*
  5 | #     Software License Agreement (BSD License)
  6 | #     Copyright (c) 2019, AMBF
  7 | #     (www.aimlab.wpi.edu)
  8 | 
  9 | #     All rights reserved.
 10 | 
 11 | #     Redistribution and use in source and binary forms, with or without
 12 | #     modification, are permitted provided that the following conditions
 13 | #     are met:
 14 | 
 15 | #     * Redistributions of source code must retain the above copyright
 16 | #     notice, this list of conditions and the following disclaimer.
 17 | 
 18 | #     * Redistributions in binary form must reproduce the above
 19 | #     copyright notice, this list of conditions and the following
 20 | #     disclaimer in the documentation and/or other materials provided
 21 | #     with the distribution.
 22 | 
 23 | #     * Neither the name of authors nor the names of its contributors may
 24 | #     be used to endorse or promote products derived from this software
 25 | #     without specific prior written permission.
 26 | 
 27 | #     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 28 | #     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 29 | #     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 30 | #     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 31 | #     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 32 | #     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 33 | #     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 34 | #     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 35 | #     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 36 | #     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 37 | #     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 38 | #     POSSIBILITY OF SUCH DAMAGE.
 39 | 
 40 | #     \author    <http://aimlab.wpi.edu>
 41 | #     \author    <dkoolrajamani@wpi.edu>, <vvarier@wpi.edu>, <amunawar@wpi.edu>
 42 | #     \author    Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar
 43 | #     \version   1.0.0
 44 | # */
 45 | # //============================================================================
 46 | 
 47 | import sys, os, copy, time
 48 | 
 49 | from typing import Iterable, List, Set, Tuple, Dict, Any, Type
 50 | 
 51 | from arl.arl_env import Action, Goal, Observation, ARLEnv
 52 | from .PSM_cartesian_env import PSMCartesianEnv, CartesianAction
 53 | 
 54 | import numpy as np
 55 | from numpy import linalg as LA
 56 | 
 57 | import gym
 58 | from gym import spaces
 59 | from gym.utils import seeding
 60 | 
 61 | from psmFK import compute_FK
 62 | from transformations import euler_from_matrix
 63 | from dvrk_functions.msg import HomogenousTransform
 64 | import rospy
 65 | from dvrk_functions.srv import *
 66 | 
 67 | 
 68 | class DDPGObservation(Observation):
 69 | 
 70 |   def __init__(
 71 |     self,
 72 |     state: Any or np.ndarray,
 73 |     dist: int = 0,
 74 |     reward: float = 0.0,
 75 |     prev_reward: float = 0.0,
 76 |     cur_reward: float = 0.0,
 77 |     is_done: bool = False,
 78 |     info: Dict = {},
 79 |     sim_step_no: int = 0
 80 |   ) -> None:
 81 | 
 82 |     super(DDPGObservation,
 83 |           self).__init__(state,
 84 |                          dist,
 85 |                          reward,
 86 |                          prev_reward,
 87 |                          cur_reward,
 88 |                          is_done,
 89 |                          info,
 90 |                          sim_step_no)
 91 |     return
 92 | 
 93 | 
 94 | class PSMCartesianDDPGEnv(PSMCartesianEnv):
 95 |   """Single task based environment for PSM to perform debris removal as shown in
 96 |   the paper:
 97 |   
 98 |   TODO: Enter paper doi and name
 99 | 
100 |   The environment performs actions in the cartesian space in R3, with translational
101 |   movements in x, y, and z.
102 |   """
103 | 
104 |   def __init__(
105 |     self,
106 |     action_space_limit: float,
107 |     goal_position_range: float,
108 |     position_error_threshold: float,
109 |     goal_error_margin: float,
110 |     joint_limits: Dict[str,
111 |                        np.ndarray or List[str]],
112 |     workspace_limits: Dict[str,
113 |                            np.ndarray or List[str]],
114 |     enable_step_throttling: bool,
115 |     joints_to_control: List[str] = [
116 |       'baselink-yawlink',
117 |       'yawlink-pitchbacklink',
118 |       'pitchendlink-maininsertionlink',
119 |       'maininsertionlink-toolrolllink',
120 |       'toolrolllink-toolpitchlink',
121 |       'toolpitchlink-toolgripper1link',
122 |       'toolpitchlink-toolgripper2link'
123 |     ],
124 |     steps_to_print: int = 10000,
125 |     n_actions: int = 3,
126 |     n_skip_steps: int = 5,
127 |     env_name: str = "PSM_cartesian_ddpg_env"
128 |   ) -> None:
129 |     """Initialize an object to train with DDPG on the PSM robot.
130 | 
131 |     Parameters
132 |     ----------
133 |     action_space_limit : float
134 |         Action space limit for cartesian actions
135 |     goal_position_range : int, optional
136 |         The variance in goal position
137 |     position_error_threshold : float
138 |         Maximum acceptable error in cartesian position
139 |     goal_error_margin : float
140 |         Maximum margin of error for an epoch to be considered successful
141 |     joint_limits : Dict(str, List(float) | np.array(float))
142 |         Robot joint limits in radians
143 |     workspace_limits : Dict(str, List(float) | np.array(float))
144 |         Workspace limits in x,y, and z for the robots workspace in Cartesian space
145 |     enable_step_throttling : bool
146 |         Flag to enable throttling of the simulator
147 |     joints_to_control : np.array(str) | List(str)
148 |         The list of joint links for the psm.
149 |     steps_to_print : int
150 |         Number of steps before model prints information to stdout
151 |     n_actions : int
152 |         Number of possible actions
153 |     n_skip_steps : int
154 |         Number of steps to skip after an update step
155 |     env_name : str
156 |         Name of the environment to train
157 |     """
158 |     super(PSMCartesianDDPGEnv,
159 |           self).__init__(
160 |             action_space_limit,
161 |             goal_position_range,
162 |             position_error_threshold,
163 |             goal_error_margin,
164 |             joint_limits,
165 |             workspace_limits,
166 |             enable_step_throttling,
167 |             joints_to_control,
168 |             steps_to_print,
169 |             n_actions,
170 |             n_skip_steps,
171 |             env_name
172 |           )
173 | 
174 |     # Set observation space and constraints
175 |     self.obs = DDPGObservation(state=np.zeros(20))
176 |     self.initial_pos = copy.deepcopy(self.obs.cur_observation()[0])
177 |     self.observation_space = spaces.Box(
178 |       -np.inf,
179 |       np.inf,
180 |       shape=np.array(self.initial_pos).shape,
181 |       dtype="float32"
182 |     )
183 | 
184 |     return
185 | 
186 |   def compute_reward(self, reached_goal: Goal, desired_goal: Goal, info: Dict[str, bool]) -> float:
187 |     """Function to compute reward received by the agent
188 |     """
189 |     # Find the distance between goal and achieved goal
190 |     cur_dist = LA.norm(np.subtract(desired_goal.goal[0:3], reached_goal.goal[0:3]))
191 |     # Continuous reward
192 |     reward = round(1 - float(abs(cur_dist) / 0.05) * 0.5, 5)
193 |     # Sparse reward
194 |     # if abs(cur_dist) < self.goal_error_margin:
195 |     #     reward = 1
196 |     # else:
197 |     #     reward = -1
198 |     self.obs.dist = cur_dist
199 |     return reward
200 | 
201 |   def _sample_goal(self, observation: Observation) -> Goal:
202 |     """Function to samples new goal positions and ensures its within the workspace of PSM
203 |     """
204 |     rand_val_pos = np.around(
205 |       np.add(
206 |         observation.state[0:3],
207 |         self.np_random.uniform(
208 |           -self.goal.goal_position_range,
209 |           self.goal.goal_position_range,
210 |           size=3
211 |         )
212 |       ),
213 |       decimals=4
214 |     )
215 |     rand_val_pos[0] = np.around(np.clip(rand_val_pos[0], -0.04, 0.03), decimals=4)
216 |     rand_val_pos[1] = np.around(np.clip(rand_val_pos[1], -0.03, 0.04), decimals=4)
217 |     rand_val_pos[2] = np.around(np.clip(rand_val_pos[2], -0.20, -0.09), decimals=4)
218 |     # Uncomment below lines if individual limits need to be set for generating desired goal state
219 |     '''
220 |         rand_val_pos = self.np_random.uniform(-0.1935, 0.1388, size=3)
221 |         rand_val_pos[0] = np.around(np.clip(rand_val_pos[0], -0.1388, 0.1319), decimals=4)
222 |         rand_val_pos[1] = np.around(np.clip(rand_val_pos[1], -0.1319, 0.1388), decimals=4)
223 |         rand_val_pos[2] = np.around(np.clip(rand_val_pos[2], -0.1935, -0.0477), decimals=4)
224 |         rand_val_angle[0] = np.clip(rand_val_angle[0], -0.15, 0.15)
225 |         rand_val_angle[1] = np.clip(rand_val_angle[1], -0.15, 0.15)
226 |         rand_val_angle[2] = np.clip(rand_val_angle[2], -0.15, 0.15)
227 |         '''
228 |     # Provide the range for generating the desired orientation at the terminal state
229 |     rand_val_angle = self.np_random.uniform(-1.5, 1.5, size=3)
230 |     goal = Goal(
231 |       goal=np.concatenate((rand_val_pos,
232 |                            rand_val_angle),
233 |                           axis=None),
234 |       goal_error_margin=self.goal.goal_error_margin,
235 |       goal_position_range=self.goal.goal_position_range
236 |     )
237 | 
238 |     return goal
239 | 
240 |   def _update_observation(
241 |     self,
242 |     end_effector_frame: Any or np.ndarray,
243 |     joint_pos: Any or np.ndarray,
244 |     joint_vel: Any or np.ndarray
245 |   ):
246 |     """Update the observation object in this class
247 |     """
248 |     # Function ensuring skipped steps based on step throttling
249 |     skipped_steps = self.skipped_sim_steps
250 | 
251 |     # Robot tip cartesian position and orientation
252 |     end_effector_pos = end_effector_frame[0:3, 3]
253 |     end_effector_euler = np.array(euler_from_matrix(end_effector_frame[0:3,
254 |                                                                        0:3],
255 |                                                     axes='szyx')).reshape((3,
256 |                                                                            1))
257 |     # State vec is 20x1
258 |     # [x, y, z, ez, ey, ex, j1, j2, j3, j4, j5, j6, j7, w1, w2, w3, w4, w5, w6, w7]
259 | 
260 |     obs = np.asarray(
261 |       np.concatenate((end_effector_pos,
262 |                       end_effector_euler,
263 |                       joint_pos.reshape((7,
264 |                                          1))),
265 |                      axis=0)
266 |     )
267 |     obs = np.concatenate((obs, joint_vel), axis=None)
268 |     # Update the observation object
269 |     self.obs.state = obs.copy()
270 |     # Update the obs info
271 |     self.obs.info = self._update_info()
272 |     # Compute the reward
273 |     reached_goal = copy.deepcopy(self.goal)
274 |     reached_goal.goal = self.obs.state
275 |     self.obs.reward = self.compute_reward(reached_goal, self.goal, self.obs.info)
276 |     self.obs.is_done = self._check_if_done()
277 | 
278 |     return
279 | 
280 | 
281 | if __name__ == "__main__":
282 |   # Create object of this class
283 |   root_link = 'baselink'
284 |   env_kwargs = {
285 |     'action_space_limit': 0.05,
286 |     'goal_position_range': 0.05,
287 |     'position_error_threshold': 0.01,
288 |     'goal_error_margin': 0.0075,
289 |     'joint_limits':
290 |       {
291 |         'lower_limit': np.array([-0.2,
292 |                                  -0.2,
293 |                                  0.1,
294 |                                  -1.5,
295 |                                  -1.5,
296 |                                  -1.5,
297 |                                  -1.5]),
298 |         'upper_limit': np.array([0.2,
299 |                                  0.2,
300 |                                  0.24,
301 |                                  1.5,
302 |                                  1.5,
303 |                                  1.5,
304 |                                  1.5])
305 |       },
306 |     'workspace_limits':
307 |       {
308 |         'lower_limit': np.array([-0.04,
309 |                                  -0.03,
310 |                                  -0.2]),
311 |         'upper_limit': np.array([0.03,
312 |                                  0.04,
313 |                                  -0.091])
314 |       },
315 |     'enable_step_throttling': False,
316 |     'steps_to_print': 10
317 |   }
318 |   psmEnv = PSMCartesianDDPGEnv(**env_kwargs)
319 |   psmEnv.make(root_link)
320 |   # psmEnv.world_handle = psmEnv.ambf_client.get_world_handle()
321 |   # psmEnv.world_handle.enable_throttling(False)


--------------------------------------------------------------------------------
/envs/dVRK/PSM_cartesian_env.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # //============================================================================
  4 | # /*
  5 | #     Software License Agreement (BSD License)
  6 | #     Copyright (c) 2019, AMBF
  7 | #     (www.aimlab.wpi.edu)
  8 | 
  9 | #     All rights reserved.
 10 | 
 11 | #     Redistribution and use in source and binary forms, with or without
 12 | #     modification, are permitted provided that the following conditions
 13 | #     are met:
 14 | 
 15 | #     * Redistributions of source code must retain the above copyright
 16 | #     notice, this list of conditions and the following disclaimer.
 17 | 
 18 | #     * Redistributions in binary form must reproduce the above
 19 | #     copyright notice, this list of conditions and the following
 20 | #     disclaimer in the documentation and/or other materials provided
 21 | #     with the distribution.
 22 | 
 23 | #     * Neither the name of authors nor the names of its contributors may
 24 | #     be used to endorse or promote products derived from this software
 25 | #     without specific prior written permission.
 26 | 
 27 | #     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 28 | #     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 29 | #     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 30 | #     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 31 | #     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 32 | #     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 33 | #     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 34 | #     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 35 | #     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 36 | #     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 37 | #     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 38 | #     POSSIBILITY OF SUCH DAMAGE.
 39 | 
 40 | #     \author    <http://aimlab.wpi.edu>
 41 | #     \author    <dkoolrajamani@wpi.edu>, <vvarier@wpi.edu>, <amunawar@wpi.edu>
 42 | #     \author    Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar
 43 | #     \version   1.0.0
 44 | # */
 45 | # //============================================================================
 46 | 
 47 | from abc import ABCMeta, abstractmethod
 48 | import sys, os, copy, time
 49 | 
 50 | from typing import Iterable, List, Set, Tuple, Dict, Any, Type
 51 | 
 52 | from arl.arl_env import Action, Goal, Observation, ARLEnv
 53 | 
 54 | import numpy as np
 55 | from numpy import linalg as LA
 56 | 
 57 | import gym
 58 | from gym import spaces
 59 | from gym.utils import seeding
 60 | 
 61 | from psmFK import compute_FK
 62 | from transformations import euler_from_matrix
 63 | from dvrk_functions.msg import HomogenousTransform
 64 | import rospy
 65 | from dvrk_functions.srv import *
 66 | 
 67 | 
 68 | class CartesianAction(Action):
 69 | 
 70 |   def __init__(
 71 |     self,
 72 |     n_actions: int,
 73 |     action_space_limit: float,
 74 |     action_lims_low: List[float] = None,
 75 |     action_lims_high: List[float] = None
 76 |   ) -> None:
 77 |     super(CartesianAction,
 78 |           self).__init__(n_actions,
 79 |                          action_space_limit,
 80 |                          action_lims_low,
 81 |                          action_lims_high)
 82 | 
 83 |     self.action_space = spaces.Box(
 84 |       low=-action_space_limit,
 85 |       high=action_space_limit,
 86 |       shape=(self.n_actions,
 87 |              ),
 88 |       dtype="float32"
 89 |     )
 90 | 
 91 | 
 92 | class PSMCartesianEnv(ARLEnv, metaclass=ABCMeta):
 93 |   """Single task based environment for PSM to perform debris removal as shown in
 94 |   the paper:
 95 |   
 96 |   TODO: Enter paper doi and name
 97 | 
 98 |   The environment performs actions in the cartesian space in R3, with translational
 99 |   movements in x, y, and z.
100 |   """
101 | 
102 |   def __init__(
103 |     self,
104 |     action_space_limit: float,
105 |     goal_position_range: float,
106 |     position_error_threshold: float,
107 |     goal_error_margin: float,
108 |     joint_limits: Dict[str,
109 |                        np.ndarray or List[str]],
110 |     workspace_limits: Dict[str,
111 |                            np.ndarray or List[str]],
112 |     enable_step_throttling: bool,
113 |     joints_to_control: List[str] = [
114 |       'baselink-yawlink',
115 |       'yawlink-pitchbacklink',
116 |       'pitchendlink-maininsertionlink',
117 |       'maininsertionlink-toolrolllink',
118 |       'toolrolllink-toolpitchlink',
119 |       'toolpitchlink-toolgripper1link',
120 |       'toolpitchlink-toolgripper2link'
121 |     ],
122 |     steps_to_print: int = 10000,
123 |     n_actions: int = 3,
124 |     n_skip_steps: int = 5,
125 |     env_name: str = "PSM_cartesian_ddpgenv"
126 |   ) -> None:
127 |     """Initialize an object of the PSM robot in cartesian space.
128 | 
129 |     Parameters
130 |     ----------
131 |     action_space_limit : float
132 |         Action space limit for cartesian actions
133 |     goal_position_range : int, optional
134 |         The variance in goal position
135 |     position_error_threshold : float
136 |         Maximum acceptable error in cartesian position
137 |     goal_error_margin : float
138 |         Maximum margin of error for an epoch to be considered successful
139 |     joint_limits : Dict(str, List(float) | np.array(float))
140 |         Robot joint limits in radians
141 |     workspace_limits : Dict(str, List(float) | np.array(float))
142 |         Workspace limits in x,y, and z for the robots workspace in Cartesian space
143 |     enable_step_throttling : bool
144 |         Flag to enable throttling of the simulator
145 |     joints_to_control : np.array(str) | List(str)
146 |         The list of joint links for the psm.
147 |     steps_to_print : int
148 |         Number of steps before model prints information to stdout
149 |     n_actions : int
150 |         Number of possible actions
151 |     n_skip_steps : int
152 |         Number of steps to skip after an update step
153 |     env_name : str
154 |         Name of the environment to train
155 |     """
156 |     super(PSMCartesianEnv, self).__init__(enable_step_throttling, n_skip_steps, env_name)
157 | 
158 |     # Set environment limits
159 |     self._position_error_threshold = position_error_threshold
160 |     self._joint_limits = joint_limits
161 |     self._workspace_limits = workspace_limits
162 | 
163 |     # Store controllable joints
164 |     self._joints_to_control = joints_to_control
165 | 
166 |     # Steps to print
167 |     self._steps_to_print = steps_to_print
168 | 
169 |     # Set environment and task parameters
170 |     self._n_actions = n_actions
171 | 
172 |     ## Set task constraints
173 |     # Set action space limits
174 |     self.action = CartesianAction(self._n_actions, action_space_limit)
175 |     self.action_space = self.action.action_space
176 | 
177 |     # Set goal position and constraints
178 |     # TODO: Consider using args/extra args in init() to specify goal.
179 |     self.goal = Goal(
180 |       goal_position_range,
181 |       goal_error_margin,
182 |       np.array([0.0,
183 |                 0.0,
184 |                 -0.1,
185 |                 0.0,
186 |                 0.0,
187 |                 0.0])
188 |     )
189 | 
190 |     return
191 | 
192 | # Properties
193 | 
194 |   @property
195 |   def position_error_threshold(self) -> float:
196 |     """Returns the position error threshold
197 |     """
198 |     return self._position_error_threshold
199 | 
200 |   @position_error_threshold.setter
201 |   def position_error_threshold(self, value: float):
202 |     """Set the position error threshold
203 |     """
204 |     self._position_error_threshold = value
205 |     return
206 | 
207 |   @property
208 |   def joint_limits(self) -> Dict[str, np.ndarray or List[float]]:
209 |     """Return the joint limits dictionary
210 |     """
211 |     return self._joint_limits
212 | 
213 |   @joint_limits.setter
214 |   def joint_limits(self, value: Dict[str, np.ndarray or List[float]]):
215 |     """Set the joint limits dictionary
216 |     """
217 |     self._joint_limits = value
218 |     return
219 | 
220 |   @property
221 |   def workspace_limits(self) -> Dict[str, np.ndarray or List[float]]:
222 |     """Returns the workspace limits dictionary
223 |       """
224 |     return self._workspace_limits
225 | 
226 |   @workspace_limits.setter
227 |   def workspace_limits(self, value: Dict[str, np.ndarray or List[float]]):
228 |     """Set the workspace limits dictionary
229 |     """
230 |     self._workspace_limits = value
231 |     return
232 | 
233 |   @property
234 |   def joints_to_control(self) -> Any or List[str]:
235 |     """Returns a np.array or List of joint object handles
236 |     """
237 |     return self._joints_to_control
238 | 
239 |   @joints_to_control.setter
240 |   def joints_to_control(self, value: Any or List[str]):
241 |     """Set a np.array or List of joint object handles
242 |     """
243 |     self._joints_to_control = value
244 |     return
245 | 
246 |   @property
247 |   def steps_to_print(self) -> int:
248 |     """Returns the number of steps to print
249 |     """
250 |     return self._steps_to_print
251 | 
252 |   @steps_to_print.setter
253 |   def steps_to_print(self, value: int):
254 |     """Set the number of steps to print
255 |     """
256 |     self._steps_to_print = value
257 |     return
258 | 
259 |   @property
260 |   def n_actions(self) -> int:
261 |     """Returns the number of actions possible
262 |     """
263 |     return self._n_actions
264 | 
265 |   @n_actions.setter
266 |   def n_actions(self, value: int):
267 |     """Set the number of actions possible
268 |     """
269 |     self._n_actions = value
270 |     return
271 | 
272 |   @property
273 |   def initial_pos(self) -> Any or np.ndarray or List[float] or Dict:
274 |     """Returns the initial position (state) of the environment
275 |     """
276 |     # if type(self._initial_pos) == type(np.ndarray):
277 |     #   return self._initial_pos
278 |     # elif type(self._initial_pos) == type(dict):
279 |     #   return self._initial_pos
280 |     # else:
281 |     #   return np.array(self._initial_pos)
282 |     return self._initial_pos
283 | 
284 |   @initial_pos.setter
285 |   def initial_pos(self, value: Any or np.ndarray or List[float] or Dict or float):
286 |     """Set the initial position (state) of the environment
287 |     """
288 |     if type(value) == type(float):
289 |       # Set default values
290 |       for joint_idx, jt_name in enumerate(self.joints_to_control):
291 |         # Prismatic joint is set to different value to ensure at least some part of robot tip
292 |         # goes past the cannula
293 |         if joint_idx == 2:
294 |           self.obj_handle.set_joint_pos(jt_name, self.joint_limits['lower_limit'][2])
295 |         else:
296 |           self.obj_handle.set_joint_pos(jt_name, value)
297 |       time.sleep(0.5)
298 |     else:
299 |       self._initial_pos = value
300 |     return
301 | 
302 | # Overriding ARLEnv functions
303 | 
304 |   def reset(self) -> np.ndarray or List[float] or Dict:
305 |     """Reset the robot environment
306 | 
307 |     Type 1 Reset : Uses the previous reached state as the initial state for
308 |     next iteration
309 | 
310 |     action = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
311 | 
312 |     observation, _, _, _ = self.step(action)
313 | 
314 |     Type 2 Reset : Sets the robot to a predefined initial state for each 
315 |     iteration.
316 |     """
317 | 
318 |     # Set the initial Position of the PSM
319 |     self.initial_pos = 0.0
320 |     initial_joint_pos, initial_joint_vel = self.get_joint_states()
321 |     end_effector_frame = compute_FK(initial_joint_pos)
322 |     # Updates the observation to the initialized position
323 |     self._update_observation(end_effector_frame, initial_joint_pos, initial_joint_vel)
324 |     # Samples a goal
325 |     self.goal = self._sample_goal(self.obs)
326 | 
327 |     return self.obs.state
328 | 
329 |   def step(
330 |     self,
331 |     action  #: np.ndarray or List[float]
332 |   ) -> Tuple[np.ndarray or List[float] or Dict,
333 |              float,
334 |              bool,
335 |              Dict[str,
336 |                   bool]]:
337 |     """Performs the update step for the algorithm and dynamics
338 |     """
339 |     # Set incoming action to Action member variable
340 |     self.action.action = action
341 | 
342 |     # Get the current state
343 |     cur_state = copy.deepcopy(self.obs.state)
344 | 
345 |     # Make sure the action is valid
346 |     self.action.check_if_valid_action()
347 | 
348 |     # counter for printing position, action, and reward
349 |     self.count_for_print += 1
350 | 
351 |     # Compute the End Effector frame from the current joint states
352 |     cur_joint_pos, cur_joint_vel = self.get_joint_states()
353 |     cur_end_effector_frame = compute_FK(cur_joint_pos)
354 | 
355 |     # Get the positional component from end effector Homogenous Transform
356 |     cur_end_effector_pos = np.asarray(cur_end_effector_frame[0:3, 3]).reshape(-1)
357 |     # Apply the action and compute the resulting state
358 |     next_end_effector_pos = self.action.apply(cur_end_effector_pos)
359 |     # Clip the next state to ensure joint limits aren't broken
360 |     next_end_effector_pos = self.check_if_valid_state(next_end_effector_pos)
361 | 
362 |     # Create a frame and maintain previous orientation
363 |     next_end_effector_frame = cur_end_effector_frame
364 |     for i in range(3):
365 |       next_end_effector_frame[i, 3] = next_end_effector_pos[i]
366 | 
367 |     # Create a Homogenous Transform Message
368 |     msg = HomogenousTransform()
369 |     msg.data = np.array(next_end_effector_frame).flatten()
370 | 
371 |     rospy.wait_for_service('compute_IK')
372 |     computed_joint_state = None
373 |     try:
374 |       compute_IK_service = rospy.ServiceProxy('compute_IK', ComputeIK)
375 |       compute_IK_response = compute_IK_service.call(ComputeIKRequest(msg))
376 |       computed_joint_state = list(compute_IK_response.q_des)
377 |       for i in range(0, 6):
378 |         computed_joint_state[i] = round(computed_joint_state[i], 4)
379 |     except rospy.ServiceException as e:
380 |       print("Service call failed: %s" % e, file=sys.stderr)
381 |     # Ensure the computed joint positions are within the limit of user set joint positions
382 |     next_joint_state = self.check_if_valid_joint_state(computed_joint_state)
383 |     # Ensures that PSM joints reach the desired joint positions
384 |     self.send_cmd(cmd=next_joint_state)
385 | 
386 |     # Update state, reward, done, and world values in the code
387 |     self._update_observation(next_end_effector_frame, next_joint_state, cur_joint_vel)
388 |     # Update the world handle
389 |     self.world_handle.update()
390 | 
391 |     # Print function for viewing the output intermittently
392 |     if self.count_for_print % self.steps_to_print == 0:
393 |       print("Count: {} Goal: {}".format(self.count_for_print, self.goal.goal))
394 |       print("\tState: {}".format(cur_state))
395 |       print("\tAction: {}".format(self.action.action))
396 |       print("\tReward: {}".format(self.obs.reward))
397 | 
398 |     return self.obs.state, self.obs.reward, self.obs.is_done, self.obs.info
399 | 
400 |   def send_cmd(self, cmd: Any or np.ndarray or List[float]):
401 |     """Ensure the robot tip reaches the desired goal position before moving on to next iteration
402 |     """
403 | 
404 |     count_for_joint_pos = 0
405 |     while True:
406 |       # Command joints to reach position
407 |       for joint_idx, joint_name in enumerate(self.joints_to_control):
408 |         self.obj_handle.set_joint_pos(joint_name, cmd[joint_idx])
409 | 
410 |       reached_joint_pos = np.zeros(7)
411 |       # Check to see if desired joint positions have been reached
412 |       for joint_idx, joint_name in enumerate(self.joints_to_control):
413 |         reached_joint_pos[joint_idx] = self.obj_handle.get_joint_pos(joint_name)
414 | 
415 |       # Compare the error between desired and reached pos and allow acceptable margin
416 |       error = np.around(np.subtract(cmd, reached_joint_pos), decimals=3)
417 |       # Since Prismatic joint limits are smaller compared to the limits of other joints
418 |       error[2] = np.around(np.subtract(cmd[2], reached_joint_pos[2]), decimals=4)
419 |       # Create error margin vector
420 |       error_margin = np.array([self.position_error_threshold] * len(self.joints_to_control))
421 |       error_margin[2] = 0.5 * self.position_error_threshold
422 | 
423 |       # Check to ensure the error margins have been reached
424 |       if (np.all(np.abs(error) <= error_margin)) or count_for_joint_pos > 75:
425 |         break
426 | 
427 |       # Increment counter for iterations of checks
428 |       count_for_joint_pos += 1
429 | 
430 |     return
431 | 
432 |   @abstractmethod
433 |   def compute_reward(self, reached_goal: Goal, desired_goal: Goal, info: Dict[str, bool]) -> float:
434 |     """Function to compute reward received by the agent
435 |     """
436 |     reward = 0.0
437 |     return reward
438 | 
439 |   @abstractmethod
440 |   def _sample_goal(self, observation: Observation) -> Goal:
441 |     """Function to samples new goal positions and ensures its within the workspace of PSM
442 |     """
443 |     goal = Goal(0.0, 0.0, None)
444 |     return goal
445 | 
446 | 
447 | # PSM functions, can be imitated for other robots
448 | 
449 |   @abstractmethod
450 |   def _update_observation(
451 |     self,
452 |     end_effector_frame: Any or np.ndarray,
453 |     joint_pos: Any or np.ndarray,
454 |     joint_vel: Any or np.ndarray
455 |   ):
456 |     """Update the observation object in this class
457 |     """
458 |     return
459 | 
460 |   def check_if_valid_state(self, state: np.ndarray or List[float]) -> np.ndarray or List[float]:
461 |     """Clips the state if it goes beyond the cartesian limits
462 |     """
463 |     clipped_state = np.zeros(3)
464 |     cartesian_pos_lower_limit = self.workspace_limits['lower_limit']
465 |     cartesian_pos_upper_limit = self.workspace_limits['upper_limit']
466 |     for axis in range(3):
467 |       clipped_state[axis] = np.clip(
468 |         state[axis],
469 |         cartesian_pos_lower_limit[axis],
470 |         cartesian_pos_upper_limit[axis]
471 |       )
472 |     return clipped_state
473 | 
474 |   def check_if_valid_joint_state(self,
475 |                                  state: Any or np.ndarray
476 |                                  or List[float]) -> Any or np.ndarray or List[float]:
477 |     """Limits the joint states if it goes beyond the joint limits
478 |     """
479 |     # dvrk_limits_low = np.array([-1.605, -0.93556, -0.002444, -3.0456, -3.0414, -3.0481, -3.0498])
480 |     # dvrk_limits_high = np.array([1.5994, 0.94249, 0.24001, 3.0485, 3.0528, 3.0376, 3.0399])
481 |     # Note: Joint 5 and 6, joint pos = 0, 0 is closed jaw and 0.5, 0.5 is open
482 |     limit_joint_values = np.zeros(7)
483 |     joint_lower_limit = self.joint_limits['lower_limit']
484 |     joint_upper_limit = self.joint_limits['upper_limit']
485 |     for joint_idx in range(len(state)):
486 |       limit_joint_values[joint_idx] = np.clip(
487 |         state[joint_idx],
488 |         joint_lower_limit[joint_idx],
489 |         joint_upper_limit[joint_idx]
490 |       )
491 | 
492 |     return limit_joint_values
493 | 
494 |   def get_joint_states(self) -> Tuple[List[float], List[float]]:
495 |     """Computes the joint position and velocities
496 |     """
497 |     joint_positions = np.zeros(7)
498 |     joint_velocities = np.zeros(7)
499 | 
500 |     for joint_idx, joint_name in enumerate(self.joints_to_control):
501 |       joint_positions[joint_idx] = self.obj_handle.get_joint_pos(joint_name)
502 |       joint_velocities[joint_idx] = self.obj_handle.get_joint_vel(joint_name)
503 | 
504 |     return joint_positions, joint_velocities
505 | 
506 | if __name__ == "__main__":
507 |   # Create object of this class
508 |   root_link = 'psm/baselink'
509 | 
510 |   # 'joints_to_control':
511 |   #   np.array(
512 |   #     [
513 |   #       'baselink-yawlink',
514 |   #       'yawlink-pitchbacklink',
515 |   #       'pitchendlink-maininsertionlink',
516 |   #       'maininsertionlink-toolrolllink',
517 |   #       'toolrolllink-toolpitchlink',
518 |   #       'toolpitchlink-toolgripper1link',
519 |   #       'toolpitchlink-toolgripper2link'
520 |   #     ]
521 |   #   ),
522 |   env_kwargs = {
523 |     'action_space_limit': 0.05,
524 |     'goal_position_range': 0.05,
525 |     'position_error_threshold': 0.01,
526 |     'goal_error_margin': 0.0075,
527 |     'joint_limits':
528 |       {
529 |         'lower_limit': np.array([-0.2,
530 |                                  -0.2,
531 |                                  0.1,
532 |                                  -1.5,
533 |                                  -1.5,
534 |                                  -1.5,
535 |                                  -1.5]),
536 |         'upper_limit': np.array([0.2,
537 |                                  0.2,
538 |                                  0.24,
539 |                                  1.5,
540 |                                  1.5,
541 |                                  1.5,
542 |                                  1.5])
543 |       },
544 |     'workspace_limits':
545 |       {
546 |         'lower_limit': np.array([-0.04,
547 |                                  -0.03,
548 |                                  -0.2]),
549 |         'upper_limit': np.array([0.03,
550 |                                  0.04,
551 |                                  -0.091])
552 |       },
553 |     'enable_step_throttling': False,
554 |   }
555 |   psmEnv = PSMCartesianDDPGEnv(**env_kwargs)
556 |   psmEnv.make(root_link)
557 |   # psmEnv.world_handle = psmEnv.ambf_client.get_world_handle()
558 |   # psmEnv.world_handle.enable_throttling(False)


--------------------------------------------------------------------------------
/envs/dVRK/PSM_cartesian_herddpg_env.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # //============================================================================
  4 | # /*
  5 | #     Software License Agreement (BSD License)
  6 | #     Copyright (c) 2019, AMBF
  7 | #     (www.aimlab.wpi.edu)
  8 | 
  9 | #     All rights reserved.
 10 | 
 11 | #     Redistribution and use in source and binary forms, with or without
 12 | #     modification, are permitted provided that the following conditions
 13 | #     are met:
 14 | 
 15 | #     * Redistributions of source code must retain the above copyright
 16 | #     notice, this list of conditions and the following disclaimer.
 17 | 
 18 | #     * Redistributions in binary form must reproduce the above
 19 | #     copyright notice, this list of conditions and the following
 20 | #     disclaimer in the documentation and/or other materials provided
 21 | #     with the distribution.
 22 | 
 23 | #     * Neither the name of authors nor the names of its contributors may
 24 | #     be used to endorse or promote products derived from this software
 25 | #     without specific prior written permission.
 26 | 
 27 | #     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 28 | #     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 29 | #     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 30 | #     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 31 | #     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 32 | #     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 33 | #     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 34 | #     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 35 | #     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 36 | #     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 37 | #     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 38 | #     POSSIBILITY OF SUCH DAMAGE.
 39 | 
 40 | #     \author    <http://aimlab.wpi.edu>
 41 | #     \author    <dkoolrajamani@wpi.edu>, <vvarier@wpi.edu>, <amunawar@wpi.edu>
 42 | #     \author    Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar
 43 | #     \version   1.0.0
 44 | # */
 45 | # //============================================================================
 46 | 
 47 | import sys, os, copy, time
 48 | 
 49 | from typing import Iterable, List, Set, Tuple, Dict, Any, Type
 50 | 
 51 | from gym.logger import error
 52 | from arl.arl_env import Action, Goal, Observation, ARLEnv
 53 | from .PSM_cartesian_env import PSMCartesianEnv, CartesianAction
 54 | 
 55 | import numpy as np
 56 | from numpy import linalg as LA
 57 | 
 58 | import gym
 59 | from gym import spaces
 60 | from gym.utils import seeding
 61 | 
 62 | from psmFK import compute_FK
 63 | from transformations import euler_from_matrix
 64 | from dvrk_functions.msg import HomogenousTransform
 65 | import rospy
 66 | from dvrk_functions.srv import *
 67 | 
 68 | 
 69 | class HERDDPGObservation(Observation):
 70 | 
 71 |   def __init__(
 72 |     self,
 73 |     state: Dict,
 74 |     dist: int = 0,
 75 |     reward: float = 0.0,
 76 |     prev_reward: float = 0.0,
 77 |     cur_reward: float = 0.0,
 78 |     is_done: bool = False,
 79 |     info: Dict = {},
 80 |     sim_step_no: int = 0
 81 |   ) -> None:
 82 | 
 83 |     super(HERDDPGObservation,
 84 |           self).__init__(state,
 85 |                          dist,
 86 |                          reward,
 87 |                          prev_reward,
 88 |                          cur_reward,
 89 |                          is_done,
 90 |                          info,
 91 |                          sim_step_no)
 92 |     return
 93 | 
 94 | 
 95 | class PSMCartesianHERDDPGEnv(PSMCartesianEnv, gym.GoalEnv):
 96 |   """Single task based environment for PSM to perform debris removal as shown in
 97 |   the paper:
 98 |   
 99 |   TODO: Enter paper doi and name
100 | 
101 |   The environment performs actions in the cartesian space in R3, with translational
102 |   movements in x, y, and z.
103 |   """
104 | 
105 |   def __init__(
106 |     self,
107 |     action_space_limit: float,
108 |     goal_position_range: float,
109 |     position_error_threshold: float,
110 |     goal_error_margin: float,
111 |     joint_limits: Dict[str,
112 |                        np.ndarray or List[str]],
113 |     workspace_limits: Dict[str,
114 |                            np.ndarray or List[str]],
115 |     enable_step_throttling: bool,
116 |     joints_to_control: List[str] = [
117 |       'baselink-yawlink',
118 |       'yawlink-pitchbacklink',
119 |       'pitchendlink-maininsertionlink',
120 |       'maininsertionlink-toolrolllink',
121 |       'toolrolllink-toolpitchlink',
122 |       'toolpitchlink-toolgripper1link',
123 |       'toolpitchlink-toolgripper2link'
124 |     ],
125 |     steps_to_print: int = 10000,
126 |     n_actions: int = 3,
127 |     n_skip_steps: int = 5,
128 |     env_name: str = "PSM_cartesian_ddpg_env"
129 |   ) -> None:
130 |     """Initialize an object to train with DDPG on the PSM robot.
131 | 
132 |     Parameters
133 |     ----------
134 |     action_space_limit : float
135 |         Action space limit for cartesian actions
136 |     goal_position_range : int, optional
137 |         The variance in goal position
138 |     position_error_threshold : float
139 |         Maximum acceptable error in cartesian position
140 |     goal_error_margin : float
141 |         Maximum margin of error for an epoch to be considered successful
142 |     joint_limits : Dict(str, List(float) | np.array(float))
143 |         Robot joint limits in radians
144 |     workspace_limits : Dict(str, List(float) | np.array(float))
145 |         Workspace limits in x,y, and z for the robots workspace in Cartesian space
146 |     enable_step_throttling : bool
147 |         Flag to enable throttling of the simulator
148 |     joints_to_control : np.array(str) | List(str)
149 |         The list of joint links for the psm.
150 |     steps_to_print : int
151 |         Number of steps before model prints information to stdout
152 |     n_actions : int
153 |         Number of possible actions
154 |     n_skip_steps : int
155 |         Number of steps to skip after an update step
156 |     env_name : str
157 |         Name of the environment to train
158 |     """
159 |     super(PSMCartesianHERDDPGEnv,
160 |           self).__init__(
161 |             action_space_limit,
162 |             goal_position_range,
163 |             position_error_threshold,
164 |             goal_error_margin,
165 |             joint_limits,
166 |             workspace_limits,
167 |             enable_step_throttling,
168 |             joints_to_control,
169 |             steps_to_print,
170 |             n_actions,
171 |             n_skip_steps,
172 |             env_name
173 |           )
174 | 
175 |     # Set observation space and constraints
176 |     self.obs = HERDDPGObservation(
177 |       state={
178 |         'observation': np.zeros(20),
179 |         # Prismatic joint is set to 0.1 to ensure at least some part of robot tip goes past the cannula
180 |         'achieved_goal': np.array([0.0,
181 |                                    0.0,
182 |                                    0.1,
183 |                                    0.0,
184 |                                    0.0,
185 |                                    0.0]),
186 |         'desired_goal': np.zeros(6)
187 |       }
188 |     )
189 |     self.initial_pos = copy.deepcopy(self.obs.cur_observation()[0])
190 |     self.observation_space = spaces.Dict(
191 |       dict(
192 |         desired_goal=spaces.Box(
193 |           -np.inf,
194 |           np.inf,
195 |           shape=self.initial_pos['achieved_goal'].shape,
196 |           dtype='float32'
197 |         ),
198 |         achieved_goal=spaces.Box(
199 |           -np.inf,
200 |           np.inf,
201 |           shape=self.initial_pos['achieved_goal'].shape,
202 |           dtype='float32'
203 |         ),
204 |         observation=spaces.Box(
205 |           -np.inf,
206 |           np.inf,
207 |           shape=self.initial_pos['observation'].shape,
208 |           dtype='float32'
209 |         ),
210 |       )
211 |     )
212 | 
213 |     return
214 | 
215 |   def compute_reward(self, reached_goal: Any, desired_goal: Any, info: Dict[str, bool]) -> float:
216 |     """Function to compute reward received by the agent
217 |     """
218 |     # Find the distance between goal and achieved goal
219 |     cur_dist = None
220 |     if isinstance(desired_goal, Goal) and isinstance(reached_goal, Goal):
221 |       cur_dist = LA.norm(np.subtract(desired_goal.goal[0:3], reached_goal.goal[0:3]))
222 |     elif isinstance(desired_goal, np.ndarray) and isinstance(reached_goal, np.ndarray):
223 |       cur_dist = LA.norm(np.subtract(desired_goal[0:3], reached_goal[0:3]))
224 |     else:
225 |       error_string = 'reached_goal: {}\tdesired_goal: {}'.format(
226 |         type(reached_goal),
227 |         type(desired_goal)
228 |       )
229 |       raise Exception(error_string)
230 |     # Continuous reward
231 |     # reward = round(1 - float(abs(cur_dist) / 0.05) * 0.5, 5)
232 |     # Sparse reward
233 |     if abs(cur_dist) < self.goal.goal_error_margin:
234 |       reward = 1
235 |     else:
236 |       reward = -1
237 |     self.obs.dist = cur_dist
238 |     return reward
239 | 
240 |   def _sample_goal(self, observation: Observation) -> Goal:
241 |     """Function to samples new goal positions and ensures its within the workspace of PSM
242 |     """
243 |     rand_val_pos = np.around(
244 |       np.add(
245 |         observation.state['achieved_goal'][0:3],
246 |         self.np_random.uniform(
247 |           -self.goal.goal_position_range,
248 |           self.goal.goal_position_range,
249 |           size=3
250 |         )
251 |       ),
252 |       decimals=4
253 |     )
254 |     rand_val_pos[0] = np.around(
255 |       np.clip(
256 |         rand_val_pos[0],
257 |         self.workspace_limits['lower_limit'][0],
258 |         self.workspace_limits['upper_limit'][0]
259 |       ),
260 |       decimals=4
261 |     )
262 |     rand_val_pos[1] = np.around(
263 |       np.clip(
264 |         rand_val_pos[1],
265 |         self.workspace_limits['lower_limit'][1],
266 |         self.workspace_limits['upper_limit'][1]
267 |       ),
268 |       decimals=4
269 |     )
270 |     rand_val_pos[2] = np.around(
271 |       np.clip(
272 |         rand_val_pos[2],
273 |         self.workspace_limits['lower_limit'][2],
274 |         self.workspace_limits['upper_limit'][2]
275 |       ),
276 |       decimals=4
277 |     )
278 |     # Uncomment below lines if individual limits need to be set for generating desired goal state
279 |     '''
280 |         rand_val_pos = self.np_random.uniform(-0.1935, 0.1388, size=3)
281 |         rand_val_pos[0] = np.around(np.clip(rand_val_pos[0], -0.1388, 0.1319), decimals=4)
282 |         rand_val_pos[1] = np.around(np.clip(rand_val_pos[1], -0.1319, 0.1388), decimals=4)
283 |         rand_val_pos[2] = np.around(np.clip(rand_val_pos[2], -0.1935, -0.0477), decimals=4)
284 |         rand_val_angle[0] = np.clip(rand_val_angle[0], -0.15, 0.15)
285 |         rand_val_angle[1] = np.clip(rand_val_angle[1], -0.15, 0.15)
286 |         rand_val_angle[2] = np.clip(rand_val_angle[2], -0.15, 0.15)
287 |         '''
288 |     # Provide the range for generating the desired orientation at the terminal state
289 |     rand_val_angle = self.np_random.uniform(-1.5, 1.5, size=3)
290 |     goal = Goal(
291 |       goal=np.concatenate((rand_val_pos,
292 |                            rand_val_angle),
293 |                           axis=None),
294 |       goal_error_margin=self.goal.goal_error_margin,
295 |       goal_position_range=self.goal.goal_position_range
296 |     )
297 | 
298 |     return goal
299 | 
300 |   def _update_observation(
301 |     self,
302 |     end_effector_frame: Any or np.ndarray,
303 |     joint_pos: Any or np.ndarray,
304 |     joint_vel: Any or np.ndarray
305 |   ):
306 |     """Update the observation object in this class
307 |     """
308 |     # Function ensuring skipped steps based on step throttling
309 |     skipped_steps = self.skipped_sim_steps
310 | 
311 |     # Robot tip cartesian position and orientation
312 |     end_effector_pos = end_effector_frame[0:3, 3]
313 |     end_effector_euler = np.array(euler_from_matrix(end_effector_frame[0:3,
314 |                                                                        0:3],
315 |                                                     axes='szyx')).reshape((3,
316 |                                                                            1))
317 |     # State vec is 20x1
318 |     # [x, y, z, ez, ey, ex, j1, j2, j3, j4, j5, j6, j7, w1, w2, w3, w4, w5, w6, w7]
319 |     achieved_goal = np.asarray(
320 |       np.concatenate((end_effector_pos.copy(),
321 |                       end_effector_euler.copy()),
322 |                      axis=0)
323 |     ).reshape(-1)
324 |     obs = np.asarray(
325 |       np.concatenate((end_effector_pos,
326 |                       end_effector_euler,
327 |                       joint_pos.reshape((7,
328 |                                          1))),
329 |                      axis=0)
330 |     )
331 |     obs = np.concatenate((obs, joint_vel), axis=None)
332 |     # Update the observation object
333 |     self.obs.state.update(
334 |       observation=obs.copy(),
335 |       achieved_goal=achieved_goal.copy(),
336 |       desired_goal=self.goal.goal.copy()
337 |     )
338 |     # Update the obs info
339 |     self.obs.info = self._update_info()
340 |     # Compute the reward
341 |     achieved_goal = copy.deepcopy(self.goal)
342 |     achieved_goal.goal = self.obs.state['achieved_goal']
343 |     self.obs.reward = self.compute_reward(achieved_goal, self.goal, self.obs.info)
344 |     self.obs.is_done = self._check_if_done()
345 | 
346 |     return
347 | 
348 | 
349 | if __name__ == "__main__":
350 |   # Create object of this class
351 |   root_link = 'baselink'
352 |   env_kwargs = {
353 |     'action_space_limit': 0.05,
354 |     'goal_position_range': 0.05,
355 |     'position_error_threshold': 0.01,
356 |     'goal_error_margin': 0.0075,
357 |     'joint_limits':
358 |       {
359 |         'lower_limit': np.array([-0.2,
360 |                                  -0.2,
361 |                                  0.1,
362 |                                  -1.5,
363 |                                  -1.5,
364 |                                  -1.5,
365 |                                  -1.5]),
366 |         'upper_limit': np.array([0.2,
367 |                                  0.2,
368 |                                  0.24,
369 |                                  1.5,
370 |                                  1.5,
371 |                                  1.5,
372 |                                  1.5])
373 |       },
374 |     'workspace_limits':
375 |       {
376 |         'lower_limit': np.array([-0.04,
377 |                                  -0.03,
378 |                                  -0.2]),
379 |         'upper_limit': np.array([0.03,
380 |                                  0.04,
381 |                                  -0.091])
382 |       },
383 |     'enable_step_throttling': False,
384 |   }
385 |   psmEnv = PSMCartesianHERDDPGEnv(**env_kwargs)
386 |   psmEnv.make(root_link)
387 |   # psmEnv.world_handle = psmEnv.ambf_client.get_world_handle()
388 |   # psmEnv.world_handle.enable_throttling(False)


--------------------------------------------------------------------------------
/envs/dVRK/__init__.py:
--------------------------------------------------------------------------------
1 | # from ..arl.arl_env import ARLEnv, Goal, Action, Observation
2 | from .PSM_cartesian_env import PSMCartesianEnv, CartesianAction
3 | from .PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv, DDPGObservation
4 | from .PSM_cartesian_herddpg_env import PSMCartesianHERDDPGEnv, HERDDPGObservation


--------------------------------------------------------------------------------
/install/stable_baseline_fix/ddpg.py:
--------------------------------------------------------------------------------
   1 | from functools import reduce
   2 | import os
   3 | import time
   4 | from collections import deque
   5 | import pickle
   6 | import warnings
   7 | 
   8 | import gym
   9 | import numpy as np
  10 | import tensorflow as tf
  11 | import tensorflow.contrib as tc
  12 | from mpi4py import MPI
  13 | 
  14 | from stable_baselines import logger
  15 | from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter
  16 | from stable_baselines.common.vec_env import VecEnv
  17 | from stable_baselines.common.mpi_adam import MpiAdam
  18 | from stable_baselines.common.buffers import ReplayBuffer
  19 | from stable_baselines.common.math_util import unscale_action, scale_action
  20 | from stable_baselines.common.mpi_running_mean_std import RunningMeanStd
  21 | from stable_baselines.ddpg.policies import DDPGPolicy
  22 | 
  23 | 
  24 | def normalize(tensor, stats):
  25 |     """
  26 |     normalize a tensor using a running mean and std
  27 | 
  28 |     :param tensor: (TensorFlow Tensor) the input tensor
  29 |     :param stats: (RunningMeanStd) the running mean and std of the input to normalize
  30 |     :return: (TensorFlow Tensor) the normalized tensor
  31 |     """
  32 |     if stats is None:
  33 |         return tensor
  34 |     return (tensor - stats.mean) / stats.std
  35 | 
  36 | 
  37 | def denormalize(tensor, stats):
  38 |     """
  39 |     denormalize a tensor using a running mean and std
  40 | 
  41 |     :param tensor: (TensorFlow Tensor) the normalized tensor
  42 |     :param stats: (RunningMeanStd) the running mean and std of the input to normalize
  43 |     :return: (TensorFlow Tensor) the restored tensor
  44 |     """
  45 |     if stats is None:
  46 |         return tensor
  47 |     return tensor * stats.std + stats.mean
  48 | 
  49 | 
  50 | def reduce_std(tensor, axis=None, keepdims=False):
  51 |     """
  52 |     get the standard deviation of a Tensor
  53 | 
  54 |     :param tensor: (TensorFlow Tensor) the input tensor
  55 |     :param axis: (int or [int]) the axis to itterate the std over
  56 |     :param keepdims: (bool) keep the other dimensions the same
  57 |     :return: (TensorFlow Tensor) the std of the tensor
  58 |     """
  59 |     return tf.sqrt(reduce_var(tensor, axis=axis, keepdims=keepdims))
  60 | 
  61 | 
  62 | def reduce_var(tensor, axis=None, keepdims=False):
  63 |     """
  64 |     get the variance of a Tensor
  65 | 
  66 |     :param tensor: (TensorFlow Tensor) the input tensor
  67 |     :param axis: (int or [int]) the axis to itterate the variance over
  68 |     :param keepdims: (bool) keep the other dimensions the same
  69 |     :return: (TensorFlow Tensor) the variance of the tensor
  70 |     """
  71 |     tensor_mean = tf.reduce_mean(tensor, axis=axis, keepdims=True)
  72 |     devs_squared = tf.square(tensor - tensor_mean)
  73 |     return tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims)
  74 | 
  75 | 
  76 | def get_target_updates(_vars, target_vars, tau, verbose=0):
  77 |     """
  78 |     get target update operations
  79 | 
  80 |     :param _vars: ([TensorFlow Tensor]) the initial variables
  81 |     :param target_vars: ([TensorFlow Tensor]) the target variables
  82 |     :param tau: (float) the soft update coefficient (keep old values, between 0 and 1)
  83 |     :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
  84 |     :return: (TensorFlow Operation, TensorFlow Operation) initial update, soft update
  85 |     """
  86 |     if verbose >= 2:
  87 |         logger.info('setting up target updates ...')
  88 |     soft_updates = []
  89 |     init_updates = []
  90 |     assert len(_vars) == len(target_vars)
  91 |     for var, target_var in zip(_vars, target_vars):
  92 |         if verbose >= 2:
  93 |             logger.info('  {} <- {}'.format(target_var.name, var.name))
  94 |         init_updates.append(tf.assign(target_var, var))
  95 |         soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
  96 |     assert len(init_updates) == len(_vars)
  97 |     assert len(soft_updates) == len(_vars)
  98 |     return tf.group(*init_updates), tf.group(*soft_updates)
  99 | 
 100 | 
 101 | def get_perturbable_vars(scope):
 102 |     """
 103 |     Get the trainable variables that can be perturbed when using
 104 |     parameter noise.
 105 | 
 106 |     :param scope: (str) tensorflow scope of the variables
 107 |     :return: ([tf.Variables])
 108 |     """
 109 |     return [var for var in tf_util.get_trainable_vars(scope) if 'LayerNorm' not in var.name]
 110 | 
 111 | 
 112 | def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verbose=0):
 113 |     """
 114 |     Get the actor update, with noise.
 115 | 
 116 |     :param actor: (str) the actor
 117 |     :param perturbed_actor: (str) the pertubed actor
 118 |     :param param_noise_stddev: (float) the std of the parameter noise
 119 |     :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
 120 |     :return: (TensorFlow Operation) the update function
 121 |     """
 122 |     assert len(tf_util.get_globals_vars(actor)) == len(tf_util.get_globals_vars(perturbed_actor))
 123 |     assert len(get_perturbable_vars(actor)) == len(get_perturbable_vars(perturbed_actor))
 124 | 
 125 |     updates = []
 126 |     for var, perturbed_var in zip(tf_util.get_globals_vars(actor), tf_util.get_globals_vars(perturbed_actor)):
 127 |         if var in get_perturbable_vars(actor):
 128 |             if verbose >= 2:
 129 |                 logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
 130 |             # Add Gaussian noise to the parameter
 131 |             updates.append(tf.assign(perturbed_var,
 132 |                                      var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
 133 |         else:
 134 |             if verbose >= 2:
 135 |                 logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
 136 |             updates.append(tf.assign(perturbed_var, var))
 137 |     assert len(updates) == len(tf_util.get_globals_vars(actor))
 138 |     return tf.group(*updates)
 139 | 
 140 | 
 141 | class DDPG(OffPolicyRLModel):
 142 |     """
 143 |     Deep Deterministic Policy Gradient (DDPG) model
 144 | 
 145 |     DDPG: https://arxiv.org/pdf/1509.02971.pdf
 146 | 
 147 |     :param policy: (DDPGPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
 148 |     :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
 149 |     :param gamma: (float) the discount factor
 150 |     :param memory_policy: (ReplayBuffer) the replay buffer
 151 |         (if None, default to baselines.deepq.replay_buffer.ReplayBuffer)
 152 | 
 153 |         .. deprecated:: 2.6.0
 154 |             This parameter will be removed in a future version
 155 | 
 156 |     :param eval_env: (Gym Environment) the evaluation environment (can be None)
 157 |     :param nb_train_steps: (int) the number of training steps
 158 |     :param nb_rollout_steps: (int) the number of rollout steps
 159 |     :param nb_eval_steps: (int) the number of evaluation steps
 160 |     :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None)
 161 |     :param action_noise: (ActionNoise) the action noise type (can be None)
 162 |     :param param_noise_adaption_interval: (int) apply param noise every N steps
 163 |     :param tau: (float) the soft update coefficient (keep old values, between 0 and 1)
 164 |     :param normalize_returns: (bool) should the critic output be normalized
 165 |     :param enable_popart: (bool) enable pop-art normalization of the critic output
 166 |         (https://arxiv.org/pdf/1602.07714.pdf), normalize_returns must be set to True.
 167 |     :param normalize_observations: (bool) should the observation be normalized
 168 |     :param batch_size: (int) the size of the batch for learning the policy
 169 |     :param observation_range: (tuple) the bounding values for the observation
 170 |     :param return_range: (tuple) the bounding values for the critic output
 171 |     :param critic_l2_reg: (float) l2 regularizer coefficient
 172 |     :param actor_lr: (float) the actor learning rate
 173 |     :param critic_lr: (float) the critic learning rate
 174 |     :param clip_norm: (float) clip the gradients (disabled if None)
 175 |     :param reward_scale: (float) the value the reward should be scaled by
 176 |     :param render: (bool) enable rendering of the environment
 177 |     :param render_eval: (bool) enable rendering of the evaluation environment
 178 |     :param memory_limit: (int) the max number of transitions to store, size of the replay buffer
 179 | 
 180 |         .. deprecated:: 2.6.0
 181 |             Use `buffer_size` instead.
 182 | 
 183 |     :param buffer_size: (int) the max number of transitions to store, size of the replay buffer
 184 |     :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy)
 185 |         This is not needed for DDPG normally but can help exploring when using HER + DDPG.
 186 |         This hack was present in the original OpenAI Baselines repo (DDPG + HER)
 187 |     :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
 188 |     :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
 189 |     :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
 190 |     :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
 191 |     :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
 192 |         WARNING: this logging can take a lot of space quickly
 193 |     :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
 194 |         If None (default), use random seed. Note that if you want completely deterministic
 195 |         results, you must set `n_cpu_tf_sess` to 1.
 196 |     :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
 197 |         If None, the number of cpu of the current machine will be used.
 198 |     """
 199 |     def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, nb_train_steps=50,
 200 |                  nb_rollout_steps=100, nb_eval_steps=100, param_noise=None, action_noise=None,
 201 |                  normalize_observations=False, tau=0.001, batch_size=128, param_noise_adaption_interval=50,
 202 |                  normalize_returns=False, enable_popart=False, observation_range=(-5., 5.), critic_l2_reg=0.,
 203 |                  return_range=(-np.inf, np.inf), actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.,
 204 |                  render=False, render_eval=False, memory_limit=None, buffer_size=50000, random_exploration=0.0,
 205 |                  verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None,
 206 |                  full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):
 207 | 
 208 |         super(DDPG, self).__init__(policy=policy, env=env, replay_buffer=None,
 209 |                                    verbose=verbose, policy_base=DDPGPolicy,
 210 |                                    requires_vec_env=False, policy_kwargs=policy_kwargs,
 211 |                                    seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)
 212 | 
 213 |         # Parameters.
 214 |         self.gamma = gamma
 215 |         self.tau = tau
 216 | 
 217 |         # TODO: remove this param in v3.x.x
 218 |         if memory_policy is not None:
 219 |             warnings.warn("memory_policy will be removed in a future version (v3.x.x) "
 220 |                           "it is now ignored and replaced with ReplayBuffer", DeprecationWarning)
 221 | 
 222 |         if memory_limit is not None:
 223 |             warnings.warn("memory_limit will be removed in a future version (v3.x.x) "
 224 |                           "use buffer_size instead", DeprecationWarning)
 225 |             buffer_size = memory_limit
 226 | 
 227 |         self.normalize_observations = normalize_observations
 228 |         self.normalize_returns = normalize_returns
 229 |         self.action_noise = action_noise
 230 |         self.param_noise = param_noise
 231 |         self.return_range = return_range
 232 |         self.observation_range = observation_range
 233 |         self.actor_lr = actor_lr
 234 |         self.critic_lr = critic_lr
 235 |         self.clip_norm = clip_norm
 236 |         self.enable_popart = enable_popart
 237 |         self.reward_scale = reward_scale
 238 |         self.batch_size = batch_size
 239 |         self.critic_l2_reg = critic_l2_reg
 240 |         self.eval_env = eval_env
 241 |         self.render = render
 242 |         self.render_eval = render_eval
 243 |         self.nb_eval_steps = nb_eval_steps
 244 |         self.param_noise_adaption_interval = param_noise_adaption_interval
 245 |         self.nb_train_steps = nb_train_steps
 246 |         self.nb_rollout_steps = nb_rollout_steps
 247 |         self.memory_limit = memory_limit
 248 |         self.buffer_size = buffer_size
 249 |         self.tensorboard_log = tensorboard_log
 250 |         self.full_tensorboard_log = full_tensorboard_log
 251 |         self.random_exploration = random_exploration
 252 | 
 253 |         # init
 254 |         self.graph = None
 255 |         self.stats_sample = None
 256 |         self.replay_buffer = None
 257 |         self.policy_tf = None
 258 |         self.target_init_updates = None
 259 |         self.target_soft_updates = None
 260 |         self.critic_loss = None
 261 |         self.critic_grads = None
 262 |         self.critic_optimizer = None
 263 |         self.sess = None
 264 |         self.stats_ops = None
 265 |         self.stats_names = None
 266 |         self.perturbed_actor_tf = None
 267 |         self.perturb_policy_ops = None
 268 |         self.perturb_adaptive_policy_ops = None
 269 |         self.adaptive_policy_distance = None
 270 |         self.actor_loss = None
 271 |         self.actor_grads = None
 272 |         self.actor_optimizer = None
 273 |         self.old_std = None
 274 |         self.old_mean = None
 275 |         self.renormalize_q_outputs_op = None
 276 |         self.obs_rms = None
 277 |         self.ret_rms = None
 278 |         self.target_policy = None
 279 |         self.actor_tf = None
 280 |         self.normalized_critic_tf = None
 281 |         self.critic_tf = None
 282 |         self.normalized_critic_with_actor_tf = None
 283 |         self.critic_with_actor_tf = None
 284 |         self.target_q = None
 285 |         self.obs_train = None
 286 |         self.action_train_ph = None
 287 |         self.obs_target = None
 288 |         self.action_target = None
 289 |         self.obs_noise = None
 290 |         self.action_noise_ph = None
 291 |         self.obs_adapt_noise = None
 292 |         self.action_adapt_noise = None
 293 |         self.terminals_ph = None
 294 |         self.rewards = None
 295 |         self.actions = None
 296 |         self.critic_target = None
 297 |         self.param_noise_stddev = None
 298 |         self.param_noise_actor = None
 299 |         self.adaptive_param_noise_actor = None
 300 |         self.params = None
 301 |         self.summary = None
 302 |         self.tb_seen_steps = None
 303 | 
 304 |         self.target_params = None
 305 |         self.obs_rms_params = None
 306 |         self.ret_rms_params = None
 307 | 
 308 |         if _init_setup_model:
 309 |             self.setup_model()
 310 | 
 311 |     def _get_pretrain_placeholders(self):
 312 |         policy = self.policy_tf
 313 |         # Rescale
 314 |         deterministic_action = unscale_action(self.action_space, self.actor_tf)
 315 |         return policy.obs_ph, self.actions, deterministic_action
 316 | 
 317 |     def setup_model(self):
 318 |         with SetVerbosity(self.verbose):
 319 | 
 320 |             assert isinstance(self.action_space, gym.spaces.Box), \
 321 |                 "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space)
 322 |             assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \
 323 |                                                         "an instance of DDPGPolicy."
 324 | 
 325 |             self.graph = tf.Graph()
 326 |             with self.graph.as_default():
 327 |                 self.set_random_seed(self.seed)
 328 |                 self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)
 329 | 
 330 |                 self.replay_buffer = ReplayBuffer(self.buffer_size)
 331 | 
 332 |                 with tf.variable_scope("input", reuse=False):
 333 |                     # Observation normalization.
 334 |                     if self.normalize_observations:
 335 |                         with tf.variable_scope('obs_rms'):
 336 |                             self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
 337 |                     else:
 338 |                         self.obs_rms = None
 339 | 
 340 |                     # Return normalization.
 341 |                     if self.normalize_returns:
 342 |                         with tf.variable_scope('ret_rms'):
 343 |                             self.ret_rms = RunningMeanStd()
 344 |                     else:
 345 |                         self.ret_rms = None
 346 | 
 347 |                     self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None,
 348 |                                                  **self.policy_kwargs)
 349 | 
 350 |                     # Create target networks.
 351 |                     self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None,
 352 |                                                      **self.policy_kwargs)
 353 |                     self.obs_target = self.target_policy.obs_ph
 354 |                     self.action_target = self.target_policy.action_ph
 355 | 
 356 |                     normalized_obs = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms),
 357 |                                                        self.observation_range[0], self.observation_range[1])
 358 |                     normalized_next_obs = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms),
 359 |                                                        self.observation_range[0], self.observation_range[1])
 360 | 
 361 |                     if self.param_noise is not None:
 362 |                         # Configure perturbed actor.
 363 |                         self.param_noise_actor = self.policy(self.sess, self.observation_space, self.action_space, 1, 1,
 364 |                                                              None, **self.policy_kwargs)
 365 |                         self.obs_noise = self.param_noise_actor.obs_ph
 366 |                         self.action_noise_ph = self.param_noise_actor.action_ph
 367 | 
 368 |                         # Configure separate copy for stddev adoption.
 369 |                         self.adaptive_param_noise_actor = self.policy(self.sess, self.observation_space,
 370 |                                                                       self.action_space, 1, 1, None,
 371 |                                                                       **self.policy_kwargs)
 372 |                         self.obs_adapt_noise = self.adaptive_param_noise_actor.obs_ph
 373 |                         self.action_adapt_noise = self.adaptive_param_noise_actor.action_ph
 374 | 
 375 |                     # Inputs.
 376 |                     self.obs_train = self.policy_tf.obs_ph
 377 |                     self.action_train_ph = self.policy_tf.action_ph
 378 |                     self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
 379 |                     self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
 380 |                     self.actions = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions')
 381 |                     self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
 382 |                     self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')
 383 | 
 384 |                 # Create networks and core TF parts that are shared across setup parts.
 385 |                 with tf.variable_scope("model", reuse=False):
 386 |                     self.actor_tf = self.policy_tf.make_actor(normalized_obs)
 387 |                     self.normalized_critic_tf = self.policy_tf.make_critic(normalized_obs, self.actions)
 388 |                     self.normalized_critic_with_actor_tf = self.policy_tf.make_critic(normalized_obs,
 389 |                                                                                       self.actor_tf,
 390 |                                                                                       reuse=True)
 391 |                 # Noise setup
 392 |                 if self.param_noise is not None:
 393 |                     self._setup_param_noise(normalized_obs)
 394 | 
 395 |                 with tf.variable_scope("target", reuse=False):
 396 |                     critic_target = self.target_policy.make_critic(normalized_next_obs,
 397 |                                                                    self.target_policy.make_actor(normalized_next_obs))
 398 | 
 399 |                 with tf.variable_scope("loss", reuse=False):
 400 |                     self.critic_tf = denormalize(
 401 |                         tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]),
 402 |                         self.ret_rms)
 403 | 
 404 |                     self.critic_with_actor_tf = denormalize(
 405 |                         tf.clip_by_value(self.normalized_critic_with_actor_tf,
 406 |                                          self.return_range[0], self.return_range[1]),
 407 |                         self.ret_rms)
 408 | 
 409 |                     q_next_obs = denormalize(critic_target, self.ret_rms)
 410 |                     self.target_q = self.rewards + (1. - self.terminals_ph) * self.gamma * q_next_obs
 411 | 
 412 |                     tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target))
 413 |                     if self.full_tensorboard_log:
 414 |                         tf.summary.histogram('critic_target', self.critic_target)
 415 | 
 416 |                     # Set up parts.
 417 |                     if self.normalize_returns and self.enable_popart:
 418 |                         self._setup_popart()
 419 |                     self._setup_stats()
 420 |                     self._setup_target_network_updates()
 421 | 
 422 |                 with tf.variable_scope("input_info", reuse=False):
 423 |                     tf.summary.scalar('rewards', tf.reduce_mean(self.rewards))
 424 |                     tf.summary.scalar('param_noise_stddev', tf.reduce_mean(self.param_noise_stddev))
 425 | 
 426 |                     if self.full_tensorboard_log:
 427 |                         tf.summary.histogram('rewards', self.rewards)
 428 |                         tf.summary.histogram('param_noise_stddev', self.param_noise_stddev)
 429 |                         if len(self.observation_space.shape) == 3 and self.observation_space.shape[0] in [1, 3, 4]:
 430 |                             tf.summary.image('observation', self.obs_train)
 431 |                         else:
 432 |                             tf.summary.histogram('observation', self.obs_train)
 433 | 
 434 |                 with tf.variable_scope("Adam_mpi", reuse=False):
 435 |                     self._setup_actor_optimizer()
 436 |                     self._setup_critic_optimizer()
 437 |                     tf.summary.scalar('actor_loss', self.actor_loss)
 438 |                     tf.summary.scalar('critic_loss', self.critic_loss)
 439 | 
 440 |                 self.params = tf_util.get_trainable_vars("model") \
 441 |                     + tf_util.get_trainable_vars('noise/') + tf_util.get_trainable_vars('noise_adapt/')
 442 | 
 443 |                 self.target_params = tf_util.get_trainable_vars("target")
 444 |                 self.obs_rms_params = [var for var in tf.global_variables()
 445 |                                        if "obs_rms" in var.name]
 446 |                 self.ret_rms_params = [var for var in tf.global_variables()
 447 |                                        if "ret_rms" in var.name]
 448 | 
 449 |                 with self.sess.as_default():
 450 |                     self._initialize(self.sess)
 451 | 
 452 |                 self.summary = tf.summary.merge_all()
 453 | 
 454 |     def _setup_target_network_updates(self):
 455 |         """
 456 |         set the target update operations
 457 |         """
 458 |         init_updates, soft_updates = get_target_updates(tf_util.get_trainable_vars('model/'),
 459 |                                                         tf_util.get_trainable_vars('target/'), self.tau,
 460 |                                                         self.verbose)
 461 |         self.target_init_updates = init_updates
 462 |         self.target_soft_updates = soft_updates
 463 | 
 464 |     def _setup_param_noise(self, normalized_obs):
 465 |         """
 466 |         Setup the parameter noise operations
 467 | 
 468 |         :param normalized_obs: (TensorFlow Tensor) the normalized observation
 469 |         """
 470 |         assert self.param_noise is not None
 471 | 
 472 |         with tf.variable_scope("noise", reuse=False):
 473 |             self.perturbed_actor_tf = self.param_noise_actor.make_actor(normalized_obs)
 474 | 
 475 |         with tf.variable_scope("noise_adapt", reuse=False):
 476 |             adaptive_actor_tf = self.adaptive_param_noise_actor.make_actor(normalized_obs)
 477 | 
 478 |         with tf.variable_scope("noise_update_func", reuse=False):
 479 |             if self.verbose >= 2:
 480 |                 logger.info('setting up param noise')
 481 |             self.perturb_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise/pi/', self.param_noise_stddev,
 482 |                                                                   verbose=self.verbose)
 483 | 
 484 |             self.perturb_adaptive_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise_adapt/pi/',
 485 |                                                                            self.param_noise_stddev,
 486 |                                                                            verbose=self.verbose)
 487 |             self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
 488 | 
 489 |     def _setup_actor_optimizer(self):
 490 |         """
 491 |         setup the optimizer for the actor
 492 |         """
 493 |         if self.verbose >= 2:
 494 |             logger.info('setting up actor optimizer')
 495 |         self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
 496 |         actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')]
 497 |         actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
 498 |         if self.verbose >= 2:
 499 |             logger.info('  actor shapes: {}'.format(actor_shapes))
 500 |             logger.info('  actor params: {}'.format(actor_nb_params))
 501 |         self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
 502 |                                             clip_norm=self.clip_norm)
 503 |         self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999,
 504 |                                        epsilon=1e-08)
 505 | 
 506 |     def _setup_critic_optimizer(self):
 507 |         """
 508 |         setup the optimizer for the critic
 509 |         """
 510 |         if self.verbose >= 2:
 511 |             logger.info('setting up critic optimizer')
 512 |         normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
 513 |                                                        self.return_range[0], self.return_range[1])
 514 |         self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
 515 |         if self.critic_l2_reg > 0.:
 516 |             critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/')
 517 |                                if 'bias' not in var.name and 'qf_output' not in var.name and 'b' not in var.name]
 518 |             if self.verbose >= 2:
 519 |                 for var in critic_reg_vars:
 520 |                     logger.info('  regularizing: {}'.format(var.name))
 521 |                 logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
 522 |             critic_reg = tc.layers.apply_regularization(
 523 |                 tc.layers.l2_regularizer(self.critic_l2_reg),
 524 |                 weights_list=critic_reg_vars
 525 |             )
 526 |             self.critic_loss += critic_reg
 527 |         critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')]
 528 |         critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
 529 |         if self.verbose >= 2:
 530 |             logger.info('  critic shapes: {}'.format(critic_shapes))
 531 |             logger.info('  critic params: {}'.format(critic_nb_params))
 532 |         self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'),
 533 |                                              clip_norm=self.clip_norm)
 534 |         self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999,
 535 |                                         epsilon=1e-08)
 536 | 
 537 |     def _setup_popart(self):
 538 |         """
 539 |         setup pop-art normalization of the critic output
 540 | 
 541 |         See https://arxiv.org/pdf/1602.07714.pdf for details.
 542 |         Preserving Outputs Precisely, while Adaptively Rescaling Targets”.
 543 |         """
 544 |         self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
 545 |         new_std = self.ret_rms.std
 546 |         self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
 547 |         new_mean = self.ret_rms.mean
 548 | 
 549 |         self.renormalize_q_outputs_op = []
 550 |         for out_vars in [[var for var in tf_util.get_trainable_vars('model/qf/') if 'qf_output' in var.name],
 551 |                          [var for var in tf_util.get_trainable_vars('target/qf/') if 'qf_output' in var.name]]:
 552 |             assert len(out_vars) == 2
 553 |             # wieght and bias of the last layer
 554 |             weight, bias = out_vars
 555 |             assert 'kernel' in weight.name
 556 |             assert 'bias' in bias.name
 557 |             assert weight.get_shape()[-1] == 1
 558 |             assert bias.get_shape()[-1] == 1
 559 |             self.renormalize_q_outputs_op += [weight.assign(weight * self.old_std / new_std)]
 560 |             self.renormalize_q_outputs_op += [bias.assign((bias * self.old_std + self.old_mean - new_mean) / new_std)]
 561 | 
 562 |     def _setup_stats(self):
 563 |         """
 564 |         Setup the stat logger for DDPG.
 565 |         """
 566 |         ops = [
 567 |             tf.reduce_mean(self.critic_tf),
 568 |             reduce_std(self.critic_tf),
 569 |             tf.reduce_mean(self.critic_with_actor_tf),
 570 |             reduce_std(self.critic_with_actor_tf),
 571 |             tf.reduce_mean(self.actor_tf),
 572 |             reduce_std(self.actor_tf)
 573 |         ]
 574 |         names = [
 575 |             'reference_Q_mean',
 576 |             'reference_Q_std',
 577 |             'reference_actor_Q_mean',
 578 |             'reference_actor_Q_std',
 579 |             'reference_action_mean',
 580 |             'reference_action_std'
 581 |         ]
 582 | 
 583 |         if self.normalize_returns:
 584 |             ops += [self.ret_rms.mean, self.ret_rms.std]
 585 |             names += ['ret_rms_mean', 'ret_rms_std']
 586 | 
 587 |         if self.normalize_observations:
 588 |             ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
 589 |             names += ['obs_rms_mean', 'obs_rms_std']
 590 | 
 591 |         if self.param_noise:
 592 |             ops += [tf.reduce_mean(self.perturbed_actor_tf), reduce_std(self.perturbed_actor_tf)]
 593 |             names += ['reference_perturbed_action_mean', 'reference_perturbed_action_std']
 594 | 
 595 |         self.stats_ops = ops
 596 |         self.stats_names = names
 597 | 
 598 |     def _policy(self, obs, apply_noise=True, compute_q=True):
 599 |         """
 600 |         Get the actions and critic output, from a given observation
 601 | 
 602 |         :param obs: ([float] or [int]) the observation
 603 |         :param apply_noise: (bool) enable the noise
 604 |         :param compute_q: (bool) compute the critic output
 605 |         :return: ([float], float) the action and critic value
 606 |         """
 607 |         obs = np.array(obs).reshape((-1,) + self.observation_space.shape)
 608 |         feed_dict = {self.obs_train: obs}
 609 |         if self.param_noise is not None and apply_noise:
 610 |             actor_tf = self.perturbed_actor_tf
 611 |             feed_dict[self.obs_noise] = obs
 612 |         else:
 613 |             actor_tf = self.actor_tf
 614 | 
 615 |         if compute_q:
 616 |             action, q_value = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
 617 |         else:
 618 |             action = self.sess.run(actor_tf, feed_dict=feed_dict)
 619 |             q_value = None
 620 | 
 621 |         action = action.flatten()
 622 |         if self.action_noise is not None and apply_noise:
 623 |             noise = self.action_noise()
 624 |             action += noise
 625 |         action = np.clip(action, -1, 1)
 626 |         return action, q_value
 627 | 
 628 |     def _store_transition(self, obs, action, reward, next_obs, done):
 629 |         """
 630 |         Store a transition in the replay buffer
 631 | 
 632 |         :param obs: ([float] or [int]) the last observation
 633 |         :param action: ([float]) the action
 634 |         :param reward: (float] the reward
 635 |         :param next_obs: ([float] or [int]) the current observation
 636 |         :param done: (bool) Whether the episode is over
 637 |         """
 638 |         reward *= self.reward_scale
 639 |         self.replay_buffer.add(obs, action, reward, next_obs, float(done))
 640 |         if self.normalize_observations:
 641 |             self.obs_rms.update(np.array([obs]))
 642 | 
 643 |     def _train_step(self, step, writer, log=False):
 644 |         """
 645 |         run a step of training from batch
 646 | 
 647 |         :param step: (int) the current step iteration
 648 |         :param writer: (TensorFlow Summary.writer) the writer for tensorboard
 649 |         :param log: (bool) whether or not to log to metadata
 650 |         :return: (float, float) critic loss, actor loss
 651 |         """
 652 |         # Get a batch
 653 |         obs, actions, rewards, next_obs, terminals = self.replay_buffer.sample(batch_size=self.batch_size,
 654 |                                                                                env=self._vec_normalize_env)
 655 |         # Reshape to match previous behavior and placeholder shape
 656 |         rewards = rewards.reshape(-1, 1)
 657 |         terminals = terminals.reshape(-1, 1)
 658 | 
 659 |         if self.normalize_returns and self.enable_popart:
 660 |             old_mean, old_std, target_q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_q],
 661 |                                                         feed_dict={
 662 |                                                             self.obs_target: next_obs,
 663 |                                                             self.rewards: rewards,
 664 |                                                             self.terminals_ph: terminals
 665 |                                                         })
 666 |             self.ret_rms.update(target_q.flatten())
 667 |             self.sess.run(self.renormalize_q_outputs_op, feed_dict={
 668 |                 self.old_std: np.array([old_std]),
 669 |                 self.old_mean: np.array([old_mean]),
 670 |             })
 671 | 
 672 |         else:
 673 |             target_q = self.sess.run(self.target_q, feed_dict={
 674 |                 self.obs_target: next_obs,
 675 |                 self.rewards: rewards,
 676 |                 self.terminals_ph: terminals
 677 |             })
 678 | 
 679 |         # Get all gradients and perform a synced update.
 680 |         ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
 681 |         td_map = {
 682 |             self.obs_train: obs,
 683 |             self.actions: actions,
 684 |             self.action_train_ph: actions,
 685 |             self.rewards: rewards,
 686 |             self.critic_target: target_q,
 687 |             self.param_noise_stddev: 0 if self.param_noise is None else self.param_noise.current_stddev
 688 |         }
 689 |         if writer is not None:
 690 |             # run loss backprop with summary if the step_id was not already logged (can happen with the right
 691 |             # parameters as the step value is only an estimate)
 692 |             if self.full_tensorboard_log and log and step not in self.tb_seen_steps:
 693 |                 run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
 694 |                 run_metadata = tf.RunMetadata()
 695 |                 summary, actor_grads, actor_loss, critic_grads, critic_loss = \
 696 |                     self.sess.run([self.summary] + ops, td_map, options=run_options, run_metadata=run_metadata)
 697 | 
 698 |                 writer.add_run_metadata(run_metadata, 'step%d' % step)
 699 |                 self.tb_seen_steps.append(step)
 700 |             else:
 701 |                 summary, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run([self.summary] + ops,
 702 |                                                                                             td_map)
 703 |             writer.add_summary(summary, step)
 704 |         else:
 705 |             actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, td_map)
 706 | 
 707 |         self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr)
 708 |         self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr)
 709 | 
 710 |         return critic_loss, actor_loss
 711 | 
 712 |     def _initialize(self, sess):
 713 |         """
 714 |         initialize the model parameters and optimizers
 715 | 
 716 |         :param sess: (TensorFlow Session) the current TensorFlow session
 717 |         """
 718 |         self.sess = sess
 719 |         self.sess.run(tf.global_variables_initializer())
 720 |         self.actor_optimizer.sync()
 721 |         self.critic_optimizer.sync()
 722 |         self.sess.run(self.target_init_updates)
 723 | 
 724 |     def _update_target_net(self):
 725 |         """
 726 |         run target soft update operation
 727 |         """
 728 |         self.sess.run(self.target_soft_updates)
 729 | 
 730 |     def _get_stats(self):
 731 |         """
 732 |         Get the mean and standard deviation of the model's inputs and outputs
 733 | 
 734 |         :return: (dict) the means and stds
 735 |         """
 736 |         if self.stats_sample is None:
 737 |             # Get a sample and keep that fixed for all further computations.
 738 |             # This allows us to estimate the change in value for the same set of inputs.
 739 |             obs, actions, rewards, next_obs, terminals = self.replay_buffer.sample(batch_size=self.batch_size,
 740 |                                                                                    env=self._vec_normalize_env)
 741 |             self.stats_sample = {
 742 |                 'obs': obs,
 743 |                 'actions': actions,
 744 |                 'rewards': rewards,
 745 |                 'next_obs': next_obs,
 746 |                 'terminals': terminals
 747 |             }
 748 | 
 749 |         feed_dict = {
 750 |             self.actions: self.stats_sample['actions']
 751 |         }
 752 | 
 753 |         for placeholder in [self.action_train_ph, self.action_target, self.action_adapt_noise, self.action_noise_ph]:
 754 |             if placeholder is not None:
 755 |                 feed_dict[placeholder] = self.stats_sample['actions']
 756 | 
 757 |         for placeholder in [self.obs_train, self.obs_target, self.obs_adapt_noise, self.obs_noise]:
 758 |             if placeholder is not None:
 759 |                 feed_dict[placeholder] = self.stats_sample['obs']
 760 | 
 761 |         values = self.sess.run(self.stats_ops, feed_dict=feed_dict)
 762 | 
 763 |         names = self.stats_names[:]
 764 |         assert len(names) == len(values)
 765 |         stats = dict(zip(names, values))
 766 | 
 767 |         if self.param_noise is not None:
 768 |             stats = {**stats, **self.param_noise.get_stats()}
 769 | 
 770 |         return stats
 771 | 
 772 |     def _adapt_param_noise(self):
 773 |         """
 774 |         calculate the adaptation for the parameter noise
 775 | 
 776 |         :return: (float) the mean distance for the parameter noise
 777 |         """
 778 |         if self.param_noise is None:
 779 |             return 0.
 780 | 
 781 |         # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
 782 |         obs, *_ = self.replay_buffer.sample(batch_size=self.batch_size, env=self._vec_normalize_env)
 783 |         self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
 784 |             self.param_noise_stddev: self.param_noise.current_stddev,
 785 |         })
 786 |         distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
 787 |             self.obs_adapt_noise: obs, self.obs_train: obs,
 788 |             self.param_noise_stddev: self.param_noise.current_stddev,
 789 |         })
 790 | 
 791 |         mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
 792 |         self.param_noise.adapt(mean_distance)
 793 |         return mean_distance
 794 | 
 795 |     def _reset(self):
 796 |         """
 797 |         Reset internal state after an episode is complete.
 798 |         """
 799 |         if self.action_noise is not None:
 800 |             self.action_noise.reset()
 801 |         if self.param_noise is not None:
 802 |             self.sess.run(self.perturb_policy_ops, feed_dict={
 803 |                 self.param_noise_stddev: self.param_noise.current_stddev,
 804 |             })
 805 | 
 806 |     def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DDPG",
 807 |               reset_num_timesteps=True, replay_wrapper=None):
 808 | 
 809 |         new_tb_log = self._init_num_timesteps(reset_num_timesteps)
 810 |         callback = self._init_callback(callback)
 811 | 
 812 |         if replay_wrapper is not None:
 813 |             self.replay_buffer = replay_wrapper(self.replay_buffer)
 814 | 
 815 |         with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
 816 |                 as writer:
 817 |             self._setup_learn()
 818 | 
 819 |             # a list for tensorboard logging, to prevent logging with the same step number, if it already occured
 820 |             self.tb_seen_steps = []
 821 | 
 822 |             rank = MPI.COMM_WORLD.Get_rank()
 823 | 
 824 |             if self.verbose >= 2:
 825 |                 logger.log('Using agent with the following configuration:')
 826 |                 logger.log(str(self.__dict__.items()))
 827 | 
 828 |             eval_episode_rewards_history = deque(maxlen=100)
 829 |             episode_rewards_history = deque(maxlen=100)
 830 |             episode_successes = []
 831 | 
 832 | 
 833 |             with self.sess.as_default(), self.graph.as_default():
 834 |                 # Prepare everything.
 835 |                 self._reset()
 836 |                 obs = self.env.reset()
 837 |                 # Retrieve unnormalized observation for saving into the buffer
 838 |                 if self._vec_normalize_env is not None:
 839 |                     obs_ = self._vec_normalize_env.get_original_obs().squeeze()
 840 |                 eval_obs = None
 841 |                 if self.eval_env is not None:
 842 |                     eval_obs = self.eval_env.reset()
 843 |                 episode_reward = 0.
 844 |                 episode_step = 0
 845 |                 episodes = 0
 846 |                 step = 0
 847 |                 total_steps = 0
 848 | 
 849 |                 start_time = time.time()
 850 | 
 851 |                 epoch_episode_rewards = []
 852 |                 epoch_episode_steps = []
 853 |                 epoch_actor_losses = []
 854 |                 epoch_critic_losses = []
 855 |                 epoch_adaptive_distances = []
 856 |                 eval_episode_rewards = []
 857 |                 eval_qs = []
 858 |                 epoch_actions = []
 859 |                 epoch_qs = []
 860 |                 epoch_episodes = 0
 861 |                 epoch = 0
 862 | 
 863 |                 callback.on_training_start(locals(), globals())
 864 | 
 865 |                 while True:
 866 |                     for _ in range(log_interval):
 867 |                         callback.on_rollout_start()
 868 |                         # Perform rollouts.
 869 |                         for _ in range(self.nb_rollout_steps):
 870 | 
 871 |                             if total_steps >= total_timesteps:
 872 |                                 callback.on_training_end()
 873 |                                 return self
 874 | 
 875 |                             # Predict next action.
 876 |                             action, q_value = self._policy(obs, apply_noise=True, compute_q=True)
 877 |                             assert action.shape == self.env.action_space.shape
 878 | 
 879 |                             # Execute next action.
 880 |                             if rank == 0 and self.render:
 881 |                                 self.env.render()
 882 | 
 883 |                             # Randomly sample actions from a uniform distribution
 884 |                             # with a probability self.random_exploration (used in HER + DDPG)
 885 |                             if np.random.rand() < self.random_exploration:
 886 |                                 # actions sampled from action space are from range specific to the environment
 887 |                                 # but algorithm operates on tanh-squashed actions therefore simple scaling is used
 888 |                                 unscaled_action = self.action_space.sample()
 889 |                                 action = scale_action(self.action_space, unscaled_action)
 890 |                             else:
 891 |                                 # inferred actions need to be transformed to environment action_space before stepping
 892 |                                 unscaled_action = unscale_action(self.action_space, action)
 893 | 
 894 |                             new_obs, reward, done, info = self.env.step(unscaled_action)
 895 | 
 896 |                             self.num_timesteps += 1
 897 | 
 898 |                             if callback.on_step() is False:
 899 |                                 callback.on_training_end()
 900 |                                 return self
 901 | 
 902 |                             step += 1
 903 |                             total_steps += 1
 904 |                             if rank == 0 and self.render:
 905 |                                 self.env.render()
 906 | 
 907 |                             # Book-keeping.
 908 |                             epoch_actions.append(action)
 909 |                             epoch_qs.append(q_value)
 910 | 
 911 |                             # Store only the unnormalized version
 912 |                             if self._vec_normalize_env is not None:
 913 |                                 new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
 914 |                                 reward_ = self._vec_normalize_env.get_original_reward().squeeze()
 915 |                             else:
 916 |                                 # Avoid changing the original ones
 917 |                                 obs_, new_obs_, reward_ = obs, new_obs, reward
 918 | 
 919 |                             self._store_transition(obs_, action, reward_, new_obs_, done)
 920 |                             obs = new_obs
 921 |                             # Save the unnormalized observation
 922 |                             if self._vec_normalize_env is not None:
 923 |                                 obs_ = new_obs_
 924 | 
 925 |                             episode_reward += reward_
 926 |                             episode_step += 1
 927 | 
 928 |                             if writer is not None:
 929 |                                 ep_rew = np.array([reward_]).reshape((1, -1))
 930 |                                 ep_done = np.array([done]).reshape((1, -1))
 931 |                                 tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
 932 |                                                                     writer, self.num_timesteps)
 933 | 
 934 |                             if done:
 935 |                                 # Episode done.
 936 |                                 epoch_episode_rewards.append(episode_reward)
 937 |                                 episode_rewards_history.append(episode_reward)
 938 |                                 epoch_episode_steps.append(episode_step)
 939 |                                 episode_reward = 0.
 940 |                                 episode_step = 0
 941 |                                 epoch_episodes += 1
 942 |                                 episodes += 1
 943 | 
 944 |                                 maybe_is_success = info.get('is_success')
 945 |                                 if maybe_is_success is not None:
 946 |                                     episode_successes.append(float(maybe_is_success))
 947 | 
 948 |                                 self._reset()
 949 |                                 if not isinstance(self.env, VecEnv):
 950 |                                     obs = self.env.reset()
 951 | 
 952 |                         callback.on_rollout_end()
 953 |                         # Train.
 954 |                         epoch_actor_losses = []
 955 |                         epoch_critic_losses = []
 956 |                         epoch_adaptive_distances = []
 957 |                         for t_train in range(self.nb_train_steps):
 958 |                             # Not enough samples in the replay buffer
 959 |                             if not self.replay_buffer.can_sample(self.batch_size):
 960 |                                 break
 961 | 
 962 |                             # Adapt param noise, if necessary.
 963 |                             if len(self.replay_buffer) >= self.batch_size and \
 964 |                                     t_train % self.param_noise_adaption_interval == 0:
 965 |                                 distance = self._adapt_param_noise()
 966 |                                 epoch_adaptive_distances.append(distance)
 967 | 
 968 |                             # weird equation to deal with the fact the nb_train_steps will be different
 969 |                             # to nb_rollout_steps
 970 |                             step = (int(t_train * (self.nb_rollout_steps / self.nb_train_steps)) +
 971 |                                     self.num_timesteps - self.nb_rollout_steps)
 972 | 
 973 |                             critic_loss, actor_loss = self._train_step(step, writer, log=t_train == 0)
 974 |                             epoch_critic_losses.append(critic_loss)
 975 |                             epoch_actor_losses.append(actor_loss)
 976 |                             self._update_target_net()
 977 | 
 978 |                         # Evaluate.
 979 |                         eval_episode_rewards = []
 980 |                         eval_qs = []
 981 |                         if self.eval_env is not None:
 982 |                             eval_episode_reward = 0.
 983 |                             for _ in range(self.nb_eval_steps):
 984 |                                 if total_steps >= total_timesteps:
 985 |                                     return self
 986 | 
 987 |                                 eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True)
 988 |                                 unscaled_action = unscale_action(self.action_space, eval_action)
 989 |                                 eval_obs, eval_r, eval_done, _ = self.eval_env.step(unscaled_action)
 990 |                                 if self.render_eval:
 991 |                                     self.eval_env.render()
 992 |                                 eval_episode_reward += eval_r
 993 | 
 994 |                                 eval_qs.append(eval_q)
 995 |                                 if eval_done:
 996 |                                     if not isinstance(self.env, VecEnv):
 997 |                                         eval_obs = self.eval_env.reset()
 998 |                                     eval_episode_rewards.append(eval_episode_reward)
 999 |                                     eval_episode_rewards_history.append(eval_episode_reward)
1000 |                                     eval_episode_reward = 0.
1001 | 
1002 |                     mpi_size = MPI.COMM_WORLD.Get_size()
1003 |                     # Not enough samples in the replay buffer
1004 |                     if not self.replay_buffer.can_sample(self.batch_size):
1005 |                         continue
1006 |                     # Log stats.
1007 |                     # XXX shouldn't call np.mean on variable length lists
1008 |                     duration = time.time() - start_time
1009 |                     stats = self._get_stats()
1010 |                     combined_stats = stats.copy()
1011 |                     combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
1012 |                     combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
1013 |                     combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
1014 |                     combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
1015 |                     combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
1016 |                     combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
1017 |                     combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
1018 |                     if len(epoch_adaptive_distances) != 0:
1019 |                         combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
1020 |                     combined_stats['total/duration'] = duration
1021 |                     combined_stats['total/steps_per_second'] = float(step) / float(duration)
1022 |                     combined_stats['total/episodes'] = episodes
1023 |                     combined_stats['rollout/episodes'] = epoch_episodes
1024 |                     combined_stats['rollout/actions_std'] = np.std(epoch_actions)
1025 |                     # Evaluation statistics.
1026 |                     if self.eval_env is not None:
1027 |                         combined_stats['eval/return'] = np.mean(eval_episode_rewards)
1028 |                         combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
1029 |                         combined_stats['eval/Q'] = np.mean(eval_qs)
1030 |                         combined_stats['eval/episodes'] = len(eval_episode_rewards)
1031 | 
1032 |                     def as_scalar(scalar):
1033 |                         """
1034 |                         check and return the input if it is a scalar, otherwise raise ValueError
1035 | 
1036 |                         :param scalar: (Any) the object to check
1037 |                         :return: (Number) the scalar if x is a scalar
1038 |                         """
1039 |                         if isinstance(scalar, np.ndarray):
1040 |                             assert scalar.size == 1
1041 |                             return scalar[0]
1042 |                         elif np.isscalar(scalar):
1043 |                             return scalar
1044 |                         else:
1045 |                             raise ValueError('expected scalar, got %s' % scalar)
1046 | 
1047 |                     combined_stats_sums = MPI.COMM_WORLD.allreduce(
1048 |                         np.array([as_scalar(x) for x in combined_stats.values()]))
1049 |                     combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}
1050 | 
1051 |                     # Total statistics.
1052 |                     combined_stats['total/epochs'] = epoch + 1
1053 |                     combined_stats['total/steps'] = step
1054 | 
1055 |                     for key in sorted(combined_stats.keys()):
1056 |                         logger.record_tabular(key, combined_stats[key])
1057 |                     if len(episode_successes) > 0:
1058 |                         logger.logkv("success rate", np.mean(episode_successes[-100:]))
1059 |                     logger.dump_tabular()
1060 |                     logger.info('')
1061 |                     logdir = logger.get_dir()
1062 |                     if rank == 0 and logdir:
1063 |                         if hasattr(self.env, 'get_state'):
1064 |                             with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler:
1065 |                                 pickle.dump(self.env.get_state(), file_handler)
1066 |                         if self.eval_env and hasattr(self.eval_env, 'get_state'):
1067 |                             with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler:
1068 |                                 pickle.dump(self.eval_env.get_state(), file_handler)
1069 | 
1070 |     def predict(self, observation, state=None, mask=None, deterministic=True):
1071 |         observation = np.array(observation)
1072 |         vectorized_env = self._is_vectorized_observation(observation, self.observation_space)
1073 | 
1074 |         observation = observation.reshape((-1,) + self.observation_space.shape)
1075 |         actions, _, = self._policy(observation, apply_noise=not deterministic, compute_q=False)
1076 |         actions = actions.reshape((-1,) + self.action_space.shape)  # reshape to the correct action shape
1077 |         actions = unscale_action(self.action_space, actions)  # scale the output for the prediction
1078 | 
1079 |         if not vectorized_env:
1080 |             actions = actions[0]
1081 | 
1082 |         return actions, None
1083 | 
1084 |     def action_probability(self, observation, state=None, mask=None, actions=None, logp=False):
1085 |         _ = np.array(observation)
1086 | 
1087 |         if actions is not None:
1088 |             raise ValueError("Error: DDPG does not have action probabilities.")
1089 | 
1090 |         # here there are no action probabilities, as DDPG does not use a probability distribution
1091 |         warnings.warn("Warning: action probability is meaningless for DDPG. Returning None")
1092 |         return None
1093 | 
1094 |     def get_parameter_list(self):
1095 |         return (self.params +
1096 |                 self.target_params +
1097 |                 self.obs_rms_params +
1098 |                 self.ret_rms_params)
1099 | 
1100 |     def save(self, save_path, cloudpickle=False):
1101 |         data = {
1102 |             "observation_space": self.observation_space,
1103 |             "action_space": self.action_space,
1104 |             "nb_eval_steps": self.nb_eval_steps,
1105 |             "param_noise_adaption_interval": self.param_noise_adaption_interval,
1106 |             "nb_train_steps": self.nb_train_steps,
1107 |             "nb_rollout_steps": self.nb_rollout_steps,
1108 |             "verbose": self.verbose,
1109 |             "param_noise": self.param_noise,
1110 |             "action_noise": self.action_noise,
1111 |             "gamma": self.gamma,
1112 |             "tau": self.tau,
1113 |             "normalize_returns": self.normalize_returns,
1114 |             "enable_popart": self.enable_popart,
1115 |             "normalize_observations": self.normalize_observations,
1116 |             "batch_size": self.batch_size,
1117 |             "observation_range": self.observation_range,
1118 |             "return_range": self.return_range,
1119 |             "critic_l2_reg": self.critic_l2_reg,
1120 |             "actor_lr": self.actor_lr,
1121 |             "critic_lr": self.critic_lr,
1122 |             "clip_norm": self.clip_norm,
1123 |             "reward_scale": self.reward_scale,
1124 |             "memory_limit": self.memory_limit,
1125 |             "buffer_size": self.buffer_size,
1126 |             "random_exploration": self.random_exploration,
1127 |             "policy": self.policy,
1128 |             "n_envs": self.n_envs,
1129 |             "n_cpu_tf_sess": self.n_cpu_tf_sess,
1130 |             "seed": self.seed,
1131 |             "_vectorize_action": self._vectorize_action,
1132 |             "policy_kwargs": self.policy_kwargs
1133 |         }
1134 | 
1135 |         params_to_save = self.get_parameters()
1136 | 
1137 |         self._save_to_file(save_path,
1138 |                            data=data,
1139 |                            params=params_to_save,
1140 |                            cloudpickle=cloudpickle)
1141 | 
1142 |     @classmethod
1143 |     def load(cls, load_path, env=None, custom_objects=None, **kwargs):
1144 |         data, params = cls._load_from_file(load_path, custom_objects=custom_objects)
1145 | 
1146 |         if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']:
1147 |             raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. "
1148 |                              "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'],
1149 |                                                                               kwargs['policy_kwargs']))
1150 | 
1151 |         model = cls(None, env, _init_setup_model=False)
1152 |         model.__dict__.update(data)
1153 |         model.__dict__.update(kwargs)
1154 |         model.set_env(env)
1155 |         model.setup_model()
1156 |         # Patch for version < v2.6.0, duplicated keys where saved
1157 |         if len(params) > len(model.get_parameter_list()):
1158 |             n_params = len(model.params)
1159 |             n_target_params = len(model.target_params)
1160 |             n_normalisation_params = len(model.obs_rms_params) + len(model.ret_rms_params)
1161 |             # Check that the issue is the one from
1162 |             # https://github.com/hill-a/stable-baselines/issues/363
1163 |             assert len(params) == 2 * (n_params + n_target_params) + n_normalisation_params,\
1164 |                 "The number of parameter saved differs from the number of parameters"\
1165 |                 " that should be loaded: {}!={}".format(len(params), len(model.get_parameter_list()))
1166 |             # Remove duplicates
1167 |             params_ = params[:n_params + n_target_params]
1168 |             if n_normalisation_params > 0:
1169 |                 params_ += params[-n_normalisation_params:]
1170 |             params = params_
1171 |         model.load_parameters(params)
1172 | 
1173 |         return model
1174 | 


--------------------------------------------------------------------------------
/install/training-pip-requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow==1.14.0
2 | tensorboard==1.14.0
3 | cloudpickle==1.3.0
4 | pyKDL
5 | rospkg
6 | catkin_pkg
7 | matplotlib
8 | stable-baselines[mpi]==2.10.0


--------------------------------------------------------------------------------
/media/AMBF_Robots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WPI-AIM/ambf_rl/79154bc3b1c6743752851829bf04bd6bd9431d24/media/AMBF_Robots.png


--------------------------------------------------------------------------------
/media/architecture-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WPI-AIM/ambf_rl/79154bc3b1c6743752851829bf04bd6bd9431d24/media/architecture-overview.png


--------------------------------------------------------------------------------
/scripts/dVRK/PSM_cartesian_ddpg_algorithm.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # //============================================================================
  4 | # /*
  5 | #     Software License Agreement (BSD License)
  6 | #     Copyright (c) 2019, AMBF
  7 | #     (www.aimlab.wpi.edu)
  8 | 
  9 | #     All rights reserved.
 10 | 
 11 | #     Redistribution and use in source and binary forms, with or without
 12 | #     modification, are permitted provided that the following conditions
 13 | #     are met:
 14 | 
 15 | #     * Redistributions of source code must retain the above copyright
 16 | #     notice, this list of conditions and the following disclaimer.
 17 | 
 18 | #     * Redistributions in binary form must reproduce the above
 19 | #     copyright notice, this list of conditions and the following
 20 | #     disclaimer in the documentation and/or other materials provided
 21 | #     with the distribution.
 22 | 
 23 | #     * Neither the name of authors nor the names of its contributors may
 24 | #     be used to endorse or promote products derived from this software
 25 | #     without specific prior written permission.
 26 | 
 27 | #     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 28 | #     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 29 | #     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 30 | #     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 31 | #     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 32 | #     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 33 | #     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 34 | #     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 35 | #     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 36 | #     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 37 | #     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 38 | #     POSSIBILITY OF SUCH DAMAGE.
 39 | 
 40 | #     \author    <http://aimlab.wpi.edu>
 41 | #     \author    <dkoolrajamani@wpi.edu>, <vvarier@wpi.edu>, <amunawar@wpi.edu>
 42 | #     \author    Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar
 43 | #     \version   1.0.0
 44 | # */
 45 | # //============================================================================
 46 | 
 47 | # System imports
 48 | import os, sys, time, datetime
 49 | 
 50 | # Additional packages
 51 | import numpy as np
 52 | 
 53 | # ARL Env
 54 | from dVRK.PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv
 55 | 
 56 | # Stable baselines algorithms
 57 | from stable_baselines.ddpg.policies import MlpPolicy
 58 | from stable_baselines import HER, DDPG
 59 | from stable_baselines.common.noise import NormalActionNoise
 60 | from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
 61 | from stable_baselines.common.callbacks import CheckpointCallback
 62 | 
 63 | 
 64 | def redirect_stdout(filepath: str = None):
 65 |   """Redirect the ouput stream to a file. Also redirect error output stream
 66 |   """
 67 |   cdir = os.getcwd()
 68 |   basepath = os.path.join(cdir, '.logs')
 69 | 
 70 |   if not os.path.exists(basepath):
 71 |     os.makedirs(basepath)
 72 | 
 73 |   if filepath is None:
 74 |     now = datetime.datetime.now()
 75 |     filepath = 'log_' + now.strftime("%Y_%m_%d-%H_%M_%S.txt")
 76 |     filepath = os.path.join(basepath, filepath)
 77 | 
 78 |   err_filepath = filepath[:-4] + '_err.txt'
 79 | 
 80 |   if os.path.exists(filepath):
 81 |     filepath = filepath[:-4]
 82 |     filepath += now.strftime("_%H_%M_%S") + '.txt'
 83 | 
 84 |   sys.stdout = open(filepath, 'w')
 85 |   sys.stderr = open(err_filepath, 'w')
 86 |   print("Began logging")
 87 |   return
 88 | 
 89 | 
 90 | def main(env: PSMCartesianDDPGEnv):
 91 |   # the noise objects for DDPG
 92 |   n_actions = env.action.action_space.shape[0]
 93 |   param_noise = None
 94 |   action_noise = OrnsteinUhlenbeckActionNoise(
 95 |     mean=np.zeros(n_actions),
 96 |     sigma=float(0.5) * np.ones(n_actions)
 97 |   )
 98 | 
 99 |   model = DDPG(
100 |     MlpPolicy,
101 |     env,
102 |     gamma=0.95,
103 |     verbose=1,
104 |     nb_train_steps=300,
105 |     nb_rollout_steps=150,
106 |     param_noise=param_noise,
107 |     batch_size=128,
108 |     action_noise=action_noise,
109 |     random_exploration=0.05,
110 |     normalize_observations=True,
111 |     tensorboard_log="./ddpg_dvrk_tensorboard/",
112 |     observation_range=(-1.5,
113 |                        1.5),
114 |     critic_l2_reg=0.01
115 |   )
116 | 
117 |   model.learn(
118 |     total_timesteps=4000000,
119 |     log_interval=100,
120 |     callback=CheckpointCallback(save_freq=100000,
121 |                                 save_path="./ddpg_dvrk_tensorboard/")
122 |   )
123 |   model.save("./ddpg_robot_env")
124 | 
125 |   # NOTE:
126 |   # If continuing learning from previous checkpoint,
127 |   # Comment above chunk of code {model=DDPG(''') till model.save("./her_robot_env")} and uncomment below lines:
128 |   # Replace the XXXXX below with the largest number present in (rl_model_) directory ./ddpg_dvrk_tensorboard/
129 |   # remaining_training_steps = 4000000 - XXXXX
130 |   # model_log_dir = './ddpg_dvrk_tensorboard/rl_model_XXXXX_steps.zip'
131 |   # model = DDPG.load(model_log_dir, env=env)
132 |   # # Reset the model
133 |   # env.reset()
134 |   # model.learn(remaining_training_steps, log_interval=100,
135 |   #             callback=CheckpointCallback(save_freq=100000, save_path="./ddpg_dvrk_tensorboard/"))
136 |   # model.save("./ddpg_robot_env")
137 | 
138 | 
139 | def load_model(eval_env):
140 |   model = DDPG.load('./ddpg_robot_env', env=eval_env)
141 |   count = 0
142 |   step_num_arr = []
143 |   for _ in range(20):
144 |     number_steps = 0
145 |     obs = eval_env.reset()
146 |     for _ in range(400):
147 |       action, _ = model.predict(obs)
148 |       obs, reward, done, _ = eval_env.step(action)
149 |       number_steps += 1
150 |       if done:
151 |         step_num_arr.append(number_steps)
152 |         count += 1
153 |         print("----------------It reached terminal state -------------------")
154 |         break
155 |   print(
156 |     "Robot reached the goal position successfully ",
157 |     count,
158 |     " times and the Average step count was ",
159 |     np.average(np.array(step_num_arr))
160 |   )
161 | 
162 | 
163 | if __name__ == '__main__':
164 |   # redirect_stdout()
165 |   root_link_name = 'baselink'
166 |   env_kwargs = {
167 |     'action_space_limit': 0.05,
168 |     'goal_position_range': 0.05,
169 |     'position_error_threshold': 0.01,
170 |     'goal_error_margin': 0.0075,
171 |     'joint_limits':
172 |       {
173 |         'lower_limit': np.array([-0.2,
174 |                                  -0.2,
175 |                                  0.1,
176 |                                  -1.5,
177 |                                  -1.5,
178 |                                  -1.5,
179 |                                  -1.5]),
180 |         'upper_limit': np.array([0.2,
181 |                                  0.2,
182 |                                  0.24,
183 |                                  1.5,
184 |                                  1.5,
185 |                                  1.5,
186 |                                  1.5])
187 |       },
188 |     'workspace_limits':
189 |       {
190 |         'lower_limit': np.array([-0.04,
191 |                                  -0.03,
192 |                                  -0.2]),
193 |         'upper_limit': np.array([0.03,
194 |                                  0.04,
195 |                                  -0.091])
196 |       },
197 |     'enable_step_throttling': False,
198 |     'steps_to_print': 10000
199 |   }
200 |   # Training
201 |   ambf_env = PSMCartesianDDPGEnv(**env_kwargs)
202 |   time.sleep(5)
203 |   ambf_env.make(root_link_name)
204 |   ambf_env.reset()
205 |   main(env=ambf_env)
206 |   ambf_env.ambf_client.clean_up()
207 |   # Evaluate learnt policy
208 |   eval_env = PSMCartesianDDPGEnv(**env_kwargs)
209 |   time.sleep(5)
210 |   eval_env.make(root_link_name)
211 |   eval_env.reset()
212 |   load_model(eval_env=eval_env)
213 |   eval_env.ambf_client.clean_up()
214 | 


--------------------------------------------------------------------------------
/scripts/dVRK/PSM_cartesian_herddpg_algorithm.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # //============================================================================
  4 | # /*
  5 | #     Software License Agreement (BSD License)
  6 | #     Copyright (c) 2019, AMBF
  7 | #     (www.aimlab.wpi.edu)
  8 | 
  9 | #     All rights reserved.
 10 | 
 11 | #     Redistribution and use in source and binary forms, with or without
 12 | #     modification, are permitted provided that the following conditions
 13 | #     are met:
 14 | 
 15 | #     * Redistributions of source code must retain the above copyright
 16 | #     notice, this list of conditions and the following disclaimer.
 17 | 
 18 | #     * Redistributions in binary form must reproduce the above
 19 | #     copyright notice, this list of conditions and the following
 20 | #     disclaimer in the documentation and/or other materials provided
 21 | #     with the distribution.
 22 | 
 23 | #     * Neither the name of authors nor the names of its contributors may
 24 | #     be used to endorse or promote products derived from this software
 25 | #     without specific prior written permission.
 26 | 
 27 | #     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 28 | #     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 29 | #     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 30 | #     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 31 | #     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 32 | #     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 33 | #     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 34 | #     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 35 | #     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 36 | #     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 37 | #     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 38 | #     POSSIBILITY OF SUCH DAMAGE.
 39 | 
 40 | #     \author    <http://aimlab.wpi.edu>
 41 | #     \author    <dkoolrajamani@wpi.edu>, <vvarier@wpi.edu>, <amunawar@wpi.edu>
 42 | #     \author    Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar
 43 | #     \version   1.0.0
 44 | # */
 45 | # //============================================================================
 46 | 
 47 | # System imports
 48 | import os, sys, time, datetime
 49 | 
 50 | # Additional packages
 51 | import numpy as np
 52 | 
 53 | # ARL Env
 54 | from dVRK.PSM_cartesian_herddpg_env import PSMCartesianHERDDPGEnv
 55 | 
 56 | # Stable baselines algorithms
 57 | from stable_baselines.ddpg.policies import MlpPolicy
 58 | from stable_baselines import HER, DDPG
 59 | from stable_baselines.common.noise import NormalActionNoise
 60 | from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
 61 | from stable_baselines.bench import Monitor
 62 | from stable_baselines.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
 63 | 
 64 | 
 65 | def redirect_stdout(filepath=None):
 66 |   # os.makedirs('/home/admin/ambf/.logs')
 67 |   cdir = os.getcwd()
 68 |   basepath = os.path.join(cdir, '.logs')
 69 | 
 70 |   if not os.path.exists(basepath):
 71 |     os.makedirs(basepath)
 72 | 
 73 |   if filepath is None:
 74 |     now = datetime.now()
 75 |     filepath = 'log_' + now.strftime("%Y_%m_%d-%H_%M_%S.txt")
 76 |     filepath = os.path.join(basepath, filepath)
 77 | 
 78 |   err_filepath = filepath[:-4] + '_err.txt'
 79 | 
 80 |   if os.path.exists(filepath):
 81 |     filepath = filepath[:-4]
 82 |     filepath += now.strftime("_%H_%M_%S") + '.txt'
 83 | 
 84 |   sys.stdout = open(filepath, 'w')
 85 |   sys.stderr = open(err_filepath, 'w')
 86 |   print("Began logging")
 87 |   return
 88 | 
 89 | 
 90 | def main(
 91 |   training_env: PSMCartesianHERDDPGEnv,
 92 |   eval_env: PSMCartesianHERDDPGEnv = None,
 93 |   log_dir='./.logs/results'
 94 | ):
 95 | 
 96 |   os.makedirs(log_dir, exist_ok=True)
 97 | 
 98 |   # training_env = Monitor(training_env, log_dir)
 99 | 
100 |   n_actions = training_env.action_space.shape[0]
101 |   noise_std = 0.2
102 |   # Currently using OU noise
103 |   action_noise = OrnsteinUhlenbeckActionNoise(
104 |     mean=np.zeros(n_actions),
105 |     sigma=noise_std * np.ones(n_actions)
106 |   )
107 |   model_class = DDPG  # works also with SAC, DDPG and TD3
108 | 
109 |   rl_model_kwargs = {
110 |     'actor_lr': 1e-3,
111 |     'critic_lr': 1e-3,
112 |     'action_noise': action_noise,
113 |     'nb_train_steps': 300,
114 |     'nb_rollout_steps': 100,
115 |     'gamma': 0.95,
116 |     'observation_range': (-1.5,
117 |                           1.5),
118 |     'random_exploration': 0.05,
119 |     'normalize_observations': True,
120 |     'critic_l2_reg': 0.01
121 |   }
122 | 
123 |   # Available strategies (cf paper): future, final, episode, random
124 |   model = HER(
125 |     'MlpPolicy',
126 |     training_env,
127 |     model_class,
128 |     verbose=1,
129 |     n_sampled_goal=4,
130 |     goal_selection_strategy='future',
131 |     buffer_size=int(1e5),
132 |     batch_size=128,
133 |     tensorboard_log="./ddpg_dvrk_tensorboard/",
134 |     **rl_model_kwargs
135 |   )
136 |   # Reset the model
137 |   training_env.reset()
138 |   # Create callbacks
139 |   checkpoint_callback = CheckpointCallback(
140 |     save_freq=100000,
141 |     save_path="./ddpg_dvrk_tensorboard/"
142 |   )  # save_path="./.model/model_checkpoint/") #save_freq=100000
143 |   # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
144 |   #                             log_path=log_dir, eval_freq=500)
145 |   callback = CallbackList([checkpoint_callback])  # , eval_callback])
146 |   # Train the model
147 |   model.learn(4000000, log_interval=100, callback=callback)
148 |   model.save("./her_robot_env")
149 | 
150 |   # NOTE:
151 |   # If continuing learning from previous checkpoint,
152 |   # Comment above chunk of code {model=HER(''') till model.save("./her_robot_env")} and uncomment below lines:
153 |   # Replace the XXXXX below with the largest number present in (rl_model_) directory ./ddpg_dvrk_tensorboard/
154 |   # remaining_training_steps = 4000000 - XXXXX
155 |   # model_log_dir = './ddpg_dvrk_tensorboard/rl_model_XXXXX_steps.zip'
156 |   # model = HER.load(model_log_dir, env=env)
157 |   # # Reset the model
158 |   # env.reset()
159 |   # model.learn(remaining_training_steps, log_interval=100,
160 |   #             callback=CheckpointCallback(save_freq=100000, save_path="./ddpg_dvrk_tensorboard/"))
161 |   # model.save("./her_robot_env")
162 | 
163 | 
164 | def load_model(eval_env):
165 |   # WARNING: you must pass an env
166 |   # or wrap your environment with HERGoalEnvWrapper to use the predict method
167 |   model = HER.load('./her_robot_env', env=eval_env)
168 |   count = 0
169 |   step_num_arr = []
170 |   for _ in range(20):
171 |     number_steps = 0
172 |     obs = eval_env.reset()
173 |     for _ in range(400):
174 |       action, _ = model.predict(obs)
175 |       obs, reward, done, _ = eval_env.step(action)
176 |       number_steps += 1
177 |       # print(obs['achieved_goal'][0:3], obs['desired_goal'][0:3], reward)
178 |       if done:
179 |         step_num_arr.append(number_steps)
180 |         count += 1
181 |         print("----------------It reached terminal state -------------------")
182 |         break
183 |   print(
184 |     "Robot reached the goal position successfully ",
185 |     count,
186 |     " times and the Average step count was ",
187 |     np.average(np.array(step_num_arr))
188 |   )
189 | 
190 | 
191 | if __name__ == '__main__':
192 |   # redirect_stdout()
193 |   ENV_NAME = 'psm/baselink'
194 |   env_kwargs = {
195 |     'action_space_limit': 0.05,
196 |     'goal_position_range': 0.05,
197 |     'position_error_threshold': 0.01,
198 |     'goal_error_margin': 0.0075,
199 |     'joint_limits':
200 |       {
201 |         'lower_limit': np.array([-0.2,
202 |                                  -0.2,
203 |                                  0.1,
204 |                                  -1.5,
205 |                                  -1.5,
206 |                                  -1.5,
207 |                                  -1.5]),
208 |         'upper_limit': np.array([0.2,
209 |                                  0.2,
210 |                                  0.24,
211 |                                  1.5,
212 |                                  1.5,
213 |                                  1.5,
214 |                                  1.5])
215 |       },
216 |     'workspace_limits':
217 |       {
218 |         'lower_limit': np.array([-0.04,
219 |                                  -0.03,
220 |                                  -0.2]),
221 |         'upper_limit': np.array([0.03,
222 |                                  0.04,
223 |                                  -0.091])
224 |       },
225 |     'enable_step_throttling': False,
226 |     'steps_to_print': 10000
227 |   }
228 |   # Training
229 |   print("Creating training_env")
230 |   ambf_env = PSMCartesianHERDDPGEnv(**env_kwargs)
231 |   time.sleep(5)
232 |   ambf_env.make(ENV_NAME)
233 |   ambf_env.reset()
234 | 
235 |   main(training_env=ambf_env)  #, eval_env=eval_env)
236 |   ambf_env.ambf_client.clean_up()
237 | 
238 |   # Evaluate learnt policy
239 |   print("Creating eval_env")
240 |   eval_env = PSMCartesianHERDDPGEnv(**env_kwargs)
241 |   time.sleep(5)
242 |   eval_env.make(ENV_NAME)
243 |   eval_env.reset()
244 | 
245 |   load_model(eval_env=eval_env)
246 |   eval_env.ambf_client.clean_up()
247 | 


--------------------------------------------------------------------------------
/scripts/dVRK/bagging.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | arg=$1
4 | 
5 | rosbag record -e "/ambf/env/(.*)/(.*)/Command" -O $arg
6 | 


--------------------------------------------------------------------------------
/scripts/dVRK/launch_ambf.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | source /opt/ros/melodic/setup.bash
4 | source $AMBF_WS/build/devel/setup.bash
5 | 
6 | pushd $AMBF_WS/bin/lin-x86_64/
7 | ./ambf_simulator -l 5,22 -p 2000 -t 1 -s 2 # -g false
8 | popd
9 | 


--------------------------------------------------------------------------------
/scripts/dVRK/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/local/bin/python
 2 | 
 3 | import os, sys, shutil
 4 | from multiprocessing import Pool
 5 | 
 6 | processes = (
 7 |   'roscore',
 8 |   'launch_ambf',
 9 |   'psmIK_service',
10 |   'training',
11 |   'tensorboard_launch',
12 |   # 'bagging',
13 |   'print_output'
14 | )
15 | 
16 | 
17 | def run_process(script_name):
18 |   print('Running process: {}'.format(script_name))
19 |   if script_name == 'print_output':
20 |     os.system('./{}.bash'.format(script_name))
21 |   elif script_name == 'training':
22 |     os.system(
23 |       './{}.bash 1> .logs/{}_out.txt  2> .logs/{}_err.txt'.format(
24 |         script_name,
25 |         script_name,
26 |         script_name
27 |       )
28 |     )
29 |   else:
30 |     os.system('./{}.bash >> .logs/{}.txt 2>&1'.format(script_name, script_name))  #
31 | 
32 | 
33 | if __name__ == '__main__':
34 | 
35 |   if os.path.isdir('.logs'):
36 |     print('Previous .logs dir exists, deleting it...')
37 |     shutil.rmtree('.logs')
38 | 
39 |   if os.path.isdir('.model'):
40 |     print('Previous .model dir exists, deleting it...')
41 |     shutil.rmtree('.model')
42 | 
43 |   if os.path.isdir('ddpg_dvrk_tensorboard'):
44 |     print('Previous tensorboard dir exists, deleting it...')
45 |     shutil.rmtree('ddpg_dvrk_tensorboard')
46 | 
47 |   print('Creating new .logs dir...')
48 |   os.mkdir('.logs')
49 | 
50 |   print('Running {} processes: {}'.format(len(processes), processes))
51 |   pool = Pool(processes=len(processes))
52 |   pool.map(run_process, processes)
53 | 


--------------------------------------------------------------------------------
/scripts/dVRK/print_output.bash:
--------------------------------------------------------------------------------
1 | if [ -f .logs/training_out.txt ]; then
2 |   tail -f .logs/training_out.txt
3 | else
4 |   echo "training_out.txt has not been generated yet, trying again in some time."
5 | fi
6 | 


--------------------------------------------------------------------------------
/scripts/dVRK/psmIK_service.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | source /opt/ros/melodic/setup.bash
4 | source $AMBF_WS/build/devel/setup.bash
5 | 
6 | rosrun dvrk_ambf_extensions psmIK_service.py


--------------------------------------------------------------------------------
/scripts/dVRK/roscore.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
4 | 
5 | source /opt/ros/melodic/setup.bash
6 | roscore
7 | 


--------------------------------------------------------------------------------
/scripts/dVRK/tensorboard_launch.bash:
--------------------------------------------------------------------------------
1 | # !/bin/bash
2 | 
3 | tensorboard --logdir $AMBF_RL_WS/scripts/dVRK/ddpg_dvrk_tensorboard --port 12346


--------------------------------------------------------------------------------
/scripts/dVRK/training.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source /opt/ros/melodic/setup.bash
3 | source $AMBF_WS/build/devel/setup.bash
4 | 
5 | python $AMBF_RL_WS/scripts/dVRK/PSM_cartesian_herddpg_algorithm.py
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from distutils.core import setup
 4 | 
 5 | setup(
 6 |   name='AMBF_RL',
 7 |   version='1.0.0',
 8 |   description='AMBF Reinforcement Learning Toolkit',
 9 |   author='Dhruv Kool Rajamani',
10 |   author_email='dkoolrajamani@wpi.edu',
11 |   package_dir={'': 'envs'},
12 |   packages=['arl',
13 |             'dVRK']
14 | )
15 | 


--------------------------------------------------------------------------------
/utilities/Create Expert Dataset/examples/dVRK/move_ambf_psm_from_rosbagdata.py:
--------------------------------------------------------------------------------
 1 | from ambf_client import Client
 2 | import numpy as np
 3 | import time
 4 | import pandas as pd
 5 | from psmFK import compute_FK
 6 | from psmIK import *
 7 | 
 8 | 
 9 | if __name__ == "__main__":
10 | 
11 |     csv_path = "path_to_csv_directory"
12 |     trajectory_data_values = pd.read_csv(csv_path).to_numpy()
13 |     data = trajectory_data_values
14 |     # data = np.load(file_dir + 'rosbag_data.npy')
15 |     print(data.shape)
16 |     print(data[0])
17 |     _client = Client()
18 |     _client.connect()
19 |     print(_client.get_obj_names())
20 |     psm_handle = _client.get_obj_handle('psm/baselink')
21 |     time.sleep(5)
22 |     joint_pos = np.zeros(7)
23 |     count = 0
24 |     error_threshold = 0.01
25 |     # desired_pos = np.array([[0.5, -0.33, 0.149, 0, 0, 0., 0.],
26 |     #                         [-1.605, 0.026, 0.1, 1.569, -1.572, 0., -0.]])
27 |     joint_thresh = joint_pos
28 |     counter = 0
29 |     # Example for dvrk robot
30 |     joints_to_control = np.array(['baselink-yawlink',
31 |                                   'yawlink-pitchbacklink',
32 |                                   'pitchendlink-maininsertionlink',
33 |                                   'maininsertionlink-toolrolllink',
34 |                                   'toolrolllink-toolpitchlink',
35 |                                   'toolpitchlink-toolgripper1link',
36 |                                   'toolpitchlink-toolgripper2link'])
37 | 
38 |     current_end_effector_frame = compute_FK(np.zeros(7))
39 |     desired_end_effector_frame = current_end_effector_frame
40 |     for i in range(0, data.shape[0]):
41 |         for j in range(3):
42 |             # print("data ij ", data[i, j])
43 |             desired_end_effector_frame[j, 3] = round(data[i, j], 3)
44 |         desired_joint_pos = np.around(compute_IK(convert_mat_to_frame(desired_end_effector_frame)), 3)
45 |         desired_joint_pos = np.append(desired_joint_pos, 0)
46 |         # print(desired_joint_pos)
47 |         count = 0
48 |         time.sleep(0.25)
49 |         while True:
50 | 
51 |             for joint_idx, jt_name in enumerate(joints_to_control):
52 |                 psm_handle.set_joint_pos(jt_name, desired_joint_pos[joint_idx])
53 |                 joint_pos[joint_idx] = round(psm_handle.get_joint_pos(jt_name), 3)
54 | 
55 |             error_in_pos = np.around(np.subtract(desired_joint_pos, joint_pos), decimals=3)
56 |             # print("Error in pos ", error_in_pos)
57 |             count += 1
58 |             if np.all(np.abs(error_in_pos) <= 0.01) or count > 75:
59 |                 break
60 | 


--------------------------------------------------------------------------------
/utilities/Create Expert Dataset/generate_expert_dataset.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import csv
  4 | import math
  5 | import rosbag
  6 | import rospy
  7 | from numpy import linalg as LA
  8 | 
  9 | 
 10 | def getData(rosbag_path, ros_topic_name, pos=True):
 11 |     cur_bag = rosbag.Bag(rosbag_path)
 12 | 
 13 |     t_start = rospy.Time(cur_bag.get_start_time())
 14 |     t_end = rospy.Time(cur_bag.get_end_time())
 15 | 
 16 |     num_msgs = cur_bag.get_message_count(ros_topic_name)
 17 |     # For position
 18 |     if pos:
 19 |         data = np.zeros(shape=(num_msgs, 3))
 20 |     # For orientation (quaternion)
 21 |     else:
 22 |         data = np.zeros(shape=(num_msgs, 4))
 23 |     idx = 0
 24 | 
 25 |     for top, msg, t in cur_bag.read_messages(topics=[ros_topic_name],
 26 |                                              start_time=t_start,
 27 |                                              end_time=t_end):
 28 |         if pos:
 29 |             cur = np.array([msg.pose.position.x, msg.pose.position.y, msg.pose.position.z])
 30 |         else:
 31 |             cur = np.array([msg.pose.orientation.x, msg.pose.orientation.y,
 32 |                             msg.pose.orientation.z, msg.pose.orientation.w])
 33 |         data[idx, :] = cur
 34 |         idx = idx + 1
 35 | 
 36 |     return data
 37 | 
 38 | 
 39 | def create_state_action_pairs(data):
 40 |     steps = []
 41 |     fixed = []
 42 |     curr = data[0]
 43 |     steps.append(curr-curr)
 44 |     fixed.append(np.round(curr, 3))
 45 |     for j in range(0, len(data)-1):
 46 |         curr = round_nearest(data[j], 0.005)
 47 |         nxt = round_nearest(data[j+1], 0.005)
 48 |         interval = np.round(nxt-curr, 3)
 49 |         if abs(interval[0]) > 0 or abs(interval[1]) > 0 or abs(interval[2]) > 0:
 50 |             steps.append(interval)
 51 |             fixed.append(curr)
 52 | 
 53 |     return np.array(fixed), steps
 54 | 
 55 | 
 56 | def make_file(robot_states, robot_actions, csv_name):
 57 |     with open(csv_name + '.csv', mode='w') as f:
 58 |         writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
 59 |         for robot_state, robot_action in zip(robot_states, robot_actions):
 60 |             writer.writerow([robot_state[0], robot_state[1],  robot_state[2], robot_action[0],
 61 |                              robot_action[1], robot_action[2]])
 62 | 
 63 | 
 64 | def round_nearest(x, a):
 65 |     return np.round(np.round(x / a) * a, -int(math.floor(math.log10(a))))
 66 | 
 67 | 
 68 | def create_reward(data_values, goal_pos, file_dir):
 69 |     total_reward = 0
 70 |     list_of_reward = []
 71 |     print(data_values.shape)
 72 |     for i in range(data_values.shape[0]):
 73 |         # dist_x = np.subtract(final_state, actual_val[i, 0])
 74 |         # dist_y = np.subtract(final_state, actual_val[i, 1])
 75 |         # dist_z = np.subtract(final_state, actual_val[i, 2])
 76 |         # reward = math.sqrt(math.pow(dist_x, 2) + math.pow(dist_y, 2) + math.pow(dist_z, 2))
 77 |         cur_dist = LA.norm(np.subtract(goal_pos, data_values[i, 0:3]))
 78 |         # reward = round(1 - float(abs(cur_dist)/0.05)*0.5, 5)
 79 |         reward = -cur_dist
 80 |         if abs(cur_dist) < 0.000001:
 81 |             reward = 1
 82 | 
 83 |         total_reward += reward
 84 |         list_of_reward.append(reward)
 85 |     print(total_reward)
 86 |     # np.savetxt(file_dir + 'episode_returns.csv', total_reward, delimiter=',')
 87 |     # np.savetxt(file_dir + 'rewards.csv', np.array(list_of_reward), delimiter=',')
 88 |     np.save(file_dir + 'rewards', np.array(list_of_reward))
 89 |     np.save(file_dir + 'episode_returns', total_reward)
 90 | 
 91 | 
 92 | def create_obs_action_func(data_values, file_dir):
 93 |     obs = []
 94 |     actions = []
 95 |     print(data_values.shape)
 96 |     for i in range(data_values.shape[0]):
 97 |         obs.append(data_values[i, 0:3])
 98 |         actions.append(data_values[i, 3:6])
 99 |     print(obs)
100 |     np.save(file_dir + 'observations', np.array(obs))
101 |     np.save(file_dir + 'actions', np.array(actions))
102 | 
103 | 
104 | if __name__ == "__main__":
105 | 
106 |     rosbag_file_dir = "path_to_rosbag file"
107 |     rostopic_name = "/dvrk/PSM2/position_cartesian_current"
108 |     csv_file_name = "path_to_csv_file"
109 |     # Read rosbag data
110 |     data = getData(rosbag_file_dir, rostopic_name)
111 |     # Create state action pairs
112 |     robot_state, robot_action = create_state_action_pairs(data)
113 |     # Write it to a csv file
114 |     make_file(robot_state, robot_action, csv_file_name)
115 |     # Assuming reward is reward = round(1 - float(abs(cur_dist)/0.3)*0.5, 5)
116 |     traj_csv_name = "path_to_where_rosbag_file_is_stored"
117 |     trajectory_data_values = pd.read_csv(traj_csv_name).to_numpy()
118 |     final_state = np.array([0.005, 0.054, -0.122])
119 |     # create_obs_action_func(trajectory_data_values, file_dir)
120 |     # create_reward(trajectory_data_values, final_state, file_dir)
121 |     epi_start = np.array([True])
122 |     for i in range(trajectory_data_values.shape[0]):
123 |         epi_start = np.append(epi_start, False)
124 |     np.save(file_dir + 'episode_starts', epi_start)
125 | 
126 | 
127 | 
128 | 


--------------------------------------------------------------------------------