├── .gitignore ├── .style.yapf ├── .vscode ├── launch.json └── settings.json ├── README.md ├── __init__.py ├── dockerfiles ├── devel-training-gpu.Dockerfile └── devel-training.Dockerfile ├── envs ├── __init__.py ├── arl │ ├── __init__.py │ └── arl_env.py └── dVRK │ ├── PSM_cartesian_ddpg_env.py │ ├── PSM_cartesian_env.py │ ├── PSM_cartesian_herddpg_env.py │ └── __init__.py ├── install ├── stable_baseline_fix │ └── ddpg.py └── training-pip-requirements.txt ├── media ├── AMBF_Robots.png └── architecture-overview.png ├── scripts └── dVRK │ ├── PSM_cartesian_ddpg_algorithm.py │ ├── PSM_cartesian_herddpg_algorithm.py │ ├── bagging.bash │ ├── launch_ambf.bash │ ├── main.py │ ├── print_output.bash │ ├── psmIK_service.bash │ ├── roscore.bash │ ├── tensorboard_launch.bash │ └── training.bash ├── setup.py └── utilities └── Create Expert Dataset ├── examples └── dVRK │ └── move_ambf_psm_from_rosbagdata.py └── generate_expert_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ### C ### 3 | # Prerequisites 4 | *.d 5 | 6 | # Object files 7 | *.o 8 | *.ko 9 | *.obj 10 | *.elf 11 | 12 | # Linker output 13 | *.ilk 14 | *.map 15 | *.exp 16 | 17 | # Precompiled Headers 18 | *.gch 19 | *.pch 20 | 21 | # Libraries 22 | *.lib 23 | *.a 24 | *.la 25 | *.lo 26 | 27 | # Shared objects (inc. Windows DLLs) 28 | *.dll 29 | *.so 30 | *.so.* 31 | *.dylib 32 | 33 | # Executables 34 | *.exe 35 | *.out 36 | *.app 37 | *.i*86 38 | *.x86_64 39 | *.hex 40 | 41 | # Debug files 42 | *.dSYM/ 43 | *.su 44 | *.idb 45 | *.pdb 46 | 47 | # Kernel Module Compile Results 48 | *.mod* 49 | *.cmd 50 | .tmp_versions/ 51 | modules.order 52 | Module.symvers 53 | Mkfile.old 54 | dkms.conf 55 | 56 | ### C++ ### 57 | # Prerequisites 58 | 59 | # Compiled Object files 60 | *.slo 61 | 62 | # Precompiled Headers 63 | 64 | # Compiled Dynamic libraries 65 | 66 | # Fortran module files 67 | *.mod 68 | *.smod 69 | 70 | # Compiled Static libraries 71 | *.lai 72 | 73 | # Executables 74 | 75 | ### Python ### 76 | # Byte-compiled / optimized / DLL files 77 | __pycache__/ 78 | *.py[cod] 79 | *$py.class 80 | 81 | # C extensions 82 | 83 | # Distribution / packaging 84 | .Python 85 | build/ 86 | develop-eggs/ 87 | dist/ 88 | downloads/ 89 | eggs/ 90 | .eggs/ 91 | lib/ 92 | lib64/ 93 | parts/ 94 | sdist/ 95 | var/ 96 | wheels/ 97 | pip-wheel-metadata/ 98 | share/python-wheels/ 99 | *.egg-info/ 100 | .installed.cfg 101 | *.egg 102 | MANIFEST 103 | 104 | # PyInstaller 105 | # Usually these files are written by a python script from a template 106 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
107 | *.manifest 108 | *.spec 109 | 110 | # Installer logs 111 | pip-log.txt 112 | pip-delete-this-directory.txt 113 | 114 | # Unit test / coverage reports 115 | htmlcov/ 116 | .tox/ 117 | .nox/ 118 | .coverage 119 | .coverage.* 120 | .cache 121 | nosetests.xml 122 | coverage.xml 123 | *.cover 124 | *.py,cover 125 | .hypothesis/ 126 | .pytest_cache/ 127 | pytestdebug.log 128 | 129 | # Translations 130 | *.mo 131 | *.pot 132 | 133 | # Django stuff: 134 | *.log 135 | local_settings.py 136 | db.sqlite3 137 | db.sqlite3-journal 138 | 139 | # Flask stuff: 140 | instance/ 141 | .webassets-cache 142 | 143 | # Scrapy stuff: 144 | .scrapy 145 | 146 | # Sphinx documentation 147 | docs/_build/ 148 | doc/_build/ 149 | 150 | # PyBuilder 151 | target/ 152 | 153 | # Jupyter Notebook 154 | .ipynb_checkpoints 155 | 156 | # IPython 157 | profile_default/ 158 | ipython_config.py 159 | 160 | # pyenv 161 | .python-version 162 | 163 | # pipenv 164 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 165 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 166 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 167 | # install all needed dependencies. 168 | #Pipfile.lock 169 | 170 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 171 | __pypackages__/ 172 | 173 | # Celery stuff 174 | celerybeat-schedule 175 | celerybeat.pid 176 | 177 | # SageMath parsed files 178 | *.sage.py 179 | 180 | # Environments 181 | .env 182 | .venv 183 | env/ 184 | venv/ 185 | ENV/ 186 | env.bak/ 187 | venv.bak/ 188 | pyenv_arl_training 189 | 190 | # Spyder project settings 191 | .spyderproject 192 | .spyproject 193 | 194 | # Rope project settings 195 | .ropeproject 196 | 197 | # mkdocs documentation 198 | /site 199 | 200 | # mypy 201 | .mypy_cache/ 202 | .dmypy.json 203 | dmypy.json 204 | 205 | # Pyre type checker 206 | .pyre/ 207 | 208 | # pytype static type analyzer 209 | .pytype/ 210 | 211 | # Flags for Clang Code Model 212 | *.cxxflags 213 | *.cflags 214 | 215 | # Ignoring generated logs and tensorboard folders 216 | .logs 217 | ddpg_dvrk_tensorboard/ 218 | training_scripts/ 219 | training-gpu.Dockerfile -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = pep8 3 | align_closing_bracket_with_visual_indent: true 4 | continuation_indent_width: 2 5 | use_tabs: false 6 | each_dict_entry_on_separate_line: false 7 | column_limit: 80 8 | dedent_closing_brackets: true 9 | blank_line_before_nested_class_or_def: true 10 | arithmetic_precedence_indication: true 11 | indent_dictionary_value: true 12 | indent_width: 2 13 | split_all_comma_separated_values: true 14 | split_before_logical_operator: true 15 | split_complex_comprehension: true 16 | split_before_named_assigns: false 17 | split_penalty_import_names=0 18 | split_penalty_for_added_line_split=30 -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "cwd": "${workspaceFolder}/scripts/dVRK/", 10 | "type": "python", 11 | "request": "launch", 12 | "program": "${file}", 13 | "console": "integratedTerminal" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.rulers": [ 3 | 80 4 | ], 5 | "githubIssues.createIssueTriggers": [ 6 | "feature", 7 | "bug", 8 | "documentation", 9 | "question" 10 | ], 11 | "githubIssues.issueCompletions.enabled": true, 12 | "githubIssues.useBranchForIssues": "prompt", 13 | "githubIssues.issueBranchTitle": "$feature/${issueNumber}-${sanitizedIssueTitle}", 14 | "githubIssues.queries": [ 15 | { 16 | "label": "My Open Issues", 17 | "query": "assignee:${user} state:open" 18 | }, 19 | { 20 | "label": "Issues", 21 | "query": "repo:${owner}/${repository} state:open" // sort:created-desc 22 | } 23 | ], 24 | "python.formatting.provider": "yapf", 25 | "python.autoComplete.extraPaths": [ 26 | "${env:AMBF_WS}/", 27 | "${env:AMBF_WS}/ambf_ros_modules/", 28 | "${env:AMBF_WS}/ambf_ros_modules/ambf_comm/", 29 | "${env:AMBF_WS}/ambf_ros_modules/ambf_comm/scripts/", 30 | "${env:AMBF_WS}/ambf_controller/dvrk_functions/scripts/", 31 | "${env:AMBF_RL_WS}/**", 32 | "${env:AMBF_RL_WS}/envs/**", 33 | "${env:AMBF_RL_WS}/envs/ambf_rl/**", 34 | "${env:AMBF_RL_WS}/envs/dVRK/**", 35 | "${env:AMBF_RL_WS}/scripts/**", 36 | ], 37 | "python.analysis.extraPaths": [ 38 | "${env:AMBF_WS}/**", 39 | "${env:AMBF_WS}/ambf_ros_modules/**", 40 | "${env:AMBF_WS}/ambf_ros_modules/ambf_comm/**", 41 | "${env:AMBF_WS}/ambf_ros_modules/ambf_comm/scripts/**", 42 | "${env:AMBF_WS}/ambf_controller/dvrk_functions/scripts/**", 43 | "${env:AMBF_RL_WS}/**", 44 | "${env:AMBF_RL_WS}/envs/**", 45 | "${env:AMBF_RL_WS}/envs/ambf_rl/**", 46 | "${env:AMBF_RL_WS}/envs/dVRK/**", 47 | "${env:AMBF_RL_WS}/scripts/**", 48 | ], 49 | "python.pythonPath": "pyenv_arl_training/bin/python" 50 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AMBF Reinforcement Learning (AMBF-RL) Toolkit 2 | 3 | ![ambf-robots](media/AMBF_Robots.png) 4 | 5 | The AMBF-RL (ARL) toolkit assists in designing control algorithms for medical 6 | robots by providing environments conducive to medical settings and accurate 7 | models of well-known medical robotic platforms. 8 | It wraps [OpenAI-gym](https://github.com/openai/gym) and 9 | [stable-baselines](https://github.com/hill-a/stable-baselines) to provide an 10 | interface to train state of the art RL algorithms for robots in the 11 | [Asynchronous Multibody Framework (AMBF)](https://github.com/WPI-AIM/ambf) 12 | simulation environment. 
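
To make the gym/stable-baselines interface concrete, here is a minimal, illustrative training sketch. It is not the maintained entry point (that is [scripts/dVRK/main.py](scripts/dVRK/main.py)); it assumes AMBF is already running with the PSM loaded and that this package is importable. The keyword arguments mirror the example at the bottom of [PSM_cartesian_ddpg_env.py](envs/dVRK/PSM_cartesian_ddpg_env.py); the import path and the save name are placeholders that may need adjusting for your install.

```python
import numpy as np
from stable_baselines import DDPG
# Import path is illustrative; adjust to how the package is installed.
from envs.dVRK.PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv

# Environment kwargs copied from the __main__ example in PSM_cartesian_ddpg_env.py.
env = PSMCartesianDDPGEnv(
  action_space_limit=0.05,
  goal_position_range=0.05,
  position_error_threshold=0.01,
  goal_error_margin=0.0075,
  joint_limits={
    'lower_limit': np.array([-0.2, -0.2, 0.1, -1.5, -1.5, -1.5, -1.5]),
    'upper_limit': np.array([0.2, 0.2, 0.24, 1.5, 1.5, 1.5, 1.5])
  },
  workspace_limits={
    'lower_limit': np.array([-0.04, -0.03, -0.2]),
    'upper_limit': np.array([0.03, 0.04, -0.091])
  },
  enable_step_throttling=False,
)
env.make('baselink')  # connect to the AMBF client and the PSM root link

model = DDPG('MlpPolicy', env, verbose=1)  # stable-baselines (TF1) DDPG
model.learn(total_timesteps=100000)
model.save('ddpg_psm_reach')  # arbitrary output name
```
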
13 | 
14 | ## Architecture
15 | 
16 | ![architecture-overview](media/architecture-overview.png)
17 | 
18 | ## Requirements
19 | 
20 | - Computer with Ubuntu Bionic (18.04) or Ubuntu Bionic Server (18.04)
21 | - [ROS Melodic - 1.14.*](http://wiki.ros.org/melodic/Installation/Ubuntu)
22 | - (Optional) NVIDIA GPU
23 |   - [CUDA - 10.0 (10.0.130)](https://developer.nvidia.com/cuda-10.0-download-archive)
24 |   - [CUDNN - 7.4.2](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v7.4.2/prod/10.0_20181213/cudnn-10.0-linux-x64-v7.4.2.24.tgz)
25 | - (Optional) Docker
26 | 
27 | ## Install
28 | ### 1. Docker method
29 | 
30 | If Docker is installed, the [devel-training-gpu](dockerfiles/devel-training-gpu.Dockerfile) dockerfile can be built and run:
31 | ```bash
32 | cd ~/
33 | git clone https://github.com/WPI-AIM/ambf_rl.git && cd ambf_rl
34 | sudo service docker start
35 | docker build --rm -f "dockerfiles/devel-training-gpu.Dockerfile" -t ambf_rl:devel-training-gpu "."
36 | docker run --rm -it ambf_rl:devel-training-gpu
37 | # Now inside the interactive container
38 | cd ~/ambf_rl
39 | python scripts/dVRK/main.py
40 | ```
41 | 
42 | ### 2. Install dependencies
43 | 
44 | Clone AMBF and install its requirements:
45 | ```bash
46 | source /opt/ros/melodic/setup.bash
47 | cd ~/
48 | git clone --branch ambf-1.0-python3-fix \
49 |   https://github.com/DhruvKoolRajamani/ambf.git
50 | cd ~/ambf
51 | sudo apt-get -y -qq install --fix-missing \
52 |   $(cat install/apt-requirements.txt)
53 | cat install/pip-requirements.txt | xargs -n 1 -L 1 pip install -U
54 | mkdir build && cd build
55 | cmake ../
56 | make -j$(nproc)
57 | ```
58 | 
59 | **Optional:** Install python-venv to use a virtual environment:
60 | ```bash
61 | sudo apt-get install python3-venv python-venv
62 | ```
63 | 
64 | Now clone this repository and install the training requirements:
65 | ```bash
66 | source ~/ambf/build/devel/setup.bash
67 | cd ~/
68 | git clone https://github.com/WPI-AIM/ambf_rl.git
69 | ```
70 | 
71 | **Optional:** If venv is already installed, create and source the virtual environment:
72 | ```bash
73 | cd ~/ambf_rl
74 | virtualenv --python=/usr/bin/python3 pyenv_arl_training
75 | source ~/ambf_rl/pyenv_arl_training/bin/activate
76 | ```
77 | 
78 | Continue with the installation of the training-specific requirements:
79 | ```bash
80 | cd ~/ambf_rl
81 | cat install/training-pip-requirements.txt | xargs -n 1 -L 1 pip3 install -U
82 | ```
83 | 
84 | Copy the fixed `ddpg.py` to the installed stable-baselines folder.
85 | If the virtual environment was installed, the site-packages path is
86 | `~/ambf_rl/pyenv_arl_training/lib/python3.6/site-packages/`, otherwise it is
87 | `~/.local/lib/python3.6/site-packages/`. Substitute it for `<site-packages-path>` below:
88 | ```bash
89 | mv <site-packages-path>/stable_baselines/ddpg/ddpg.py \
90 |   <site-packages-path>/stable_baselines/ddpg/ddpg_old.py && \
91 |   cp ${AMBF_RL_WS}/install/stable_baseline_fix/ddpg.py \
92 |   <site-packages-path>/stable_baselines/ddpg/
93 | ```
94 | 
95 | Example: if the venv exists
96 | ```bash
97 | mv ~/ambf_rl/pyenv_arl_training/lib/python3.6/site-packages/stable_baselines/ddpg/ddpg.py \
98 |   ~/ambf_rl/pyenv_arl_training/lib/python3.6/site-packages/stable_baselines/ddpg/ddpg_old.py && \
99 |   cp ${AMBF_RL_WS}/install/stable_baseline_fix/ddpg.py \
100 |   ~/ambf_rl/pyenv_arl_training/lib/python3.6/site-packages/stable_baselines/ddpg/
101 | ```
102 | 
103 | Finally, install AMBF-RL:
104 | ```bash
105 | cd ~/ambf_rl
106 | python setup.py install
107 | ```
108 | 
109 | Test the installation by running the example DDPG or HER+DDPG algorithms to train the dVRK PSM to reach a point.
110 | ```bash 111 | cd ~/ambf_rl 112 | source /opt/ros/melodic/setup.bash 113 | source ~/ambf/build/devel/setup.bash 114 | # If a virtual environment was created 115 | source ~/ambf_rl/pyenv_arl_training/bin/activate 116 | cd scripts/dVRK/ 117 | python main.py 118 | ``` 119 | An instance of AMBF will spawn and training should begin. 120 | 121 | ## Developer Guide 122 | 123 | If using .vscode as an IDE, it would be helpful to export the path to the AMBF-RL workspace to your .bashrc, or individual terminal session. 124 | ```bash 125 | # Individual terminal session 126 | export AMBF_WS=~/ambf 127 | export AMBF_RL_WS=~/ambf_rl 128 | 129 | # .bashrc 130 | echo "export AMBF_WS=~/ambf" >> ~/.bashrc 131 | echo "export AMBF_RL_WS=~/ambf_rl" >> ~/.bashrc 132 | source ~/.bashrc 133 | ``` 134 | 135 | ### Abstract AMBF-RL Base Class 136 | 137 | - [arl_env.py](envs/arl/arl_env.py) is an abstract base class that automatically connects to the `ambf_client` and objects spawned in ambf. 138 | ```python 139 | class ARLEnv(gym.Env, metaclass=ABCMeta): 140 | """Base class for the ARLEnv 141 | 142 | This class should not be instantiated on its own. It does not contain the 143 | desired RL Algorithm or the Robot. This should remain an abstract class and 144 | should be wrapped by at least an algorithmic and robot environment wrapper. 145 | An example of this can be PSM_cartesian_env_ddpg(ARLEnv). 146 | 147 | Attributes 148 | ---------- 149 | 150 | Methods 151 | ------- 152 | skip_skipped_sim_steps(num) 153 | Define number of steps to skip if step-throttling is enabled. 154 | 155 | set_throttling_enable(check): 156 | Set the step-throttling Boolean. 157 | """ 158 | 159 | def __init__( 160 | self, 161 | enable_step_throttling: bool, 162 | n_skip_steps: int, 163 | env_name: str = "arl_env" 164 | ) -> None: 165 | """Initialize the abstract class which handles all AMBF related interactions 166 | 167 | Parameters 168 | ---------- 169 | enable_step_throttling : bool 170 | Flag to enable throttling of the simulator 171 | n_skip_steps : int 172 | Number of steps to skip after an update step 173 | env_name : str 174 | Name of the environment to train 175 | """ 176 | ``` 177 | 178 | - For dVRK PSM environments that are based in the cartesian space, an abstract class [PSM_cartesian_env.py](envs/dVRK/PSM_cartesian_env.py) has been created and should be inherited. 179 | ```python 180 | class PSMCartesianEnv(ARLEnv, metaclass=ABCMeta): 181 | """Single task based environment for PSM to perform debris removal as shown in 182 | the paper: 183 | 184 | The environment performs actions in the cartesian space in R3, with translational 185 | movements in x, y, and z. 186 | """ 187 | 188 | def __init__( 189 | self, 190 | action_space_limit: float, 191 | goal_position_range: float, 192 | position_error_threshold: float, 193 | goal_error_margin: float, 194 | joint_limits: Dict[str, 195 | np.ndarray or List[str]], 196 | workspace_limits: Dict[str, 197 | np.ndarray or List[str]], 198 | enable_step_throttling: bool, 199 | joints_to_control: List[str] = [ 200 | 'baselink-yawlink', 201 | 'yawlink-pitchbacklink', 202 | 'pitchendlink-maininsertionlink', 203 | 'maininsertionlink-toolrolllink', 204 | 'toolrolllink-toolpitchlink', 205 | 'toolpitchlink-toolgripper1link', 206 | 'toolpitchlink-toolgripper2link' 207 | ], 208 | steps_to_print: int = 10000, 209 | n_actions: int = 3, 210 | n_skip_steps: int = 5, 211 | env_name: str = "PSM_cartesian_ddpgenv" 212 | ) -> None: 213 | """Initialize an object of the PSM robot in cartesian space. 
214 | 215 | Parameters 216 | ---------- 217 | action_space_limit : float 218 | Action space limit for cartesian actions 219 | goal_position_range : int, optional 220 | The variance in goal position 221 | position_error_threshold : float 222 | Maximum acceptable error in cartesian position 223 | goal_error_margin : float 224 | Maximum margin of error for an epoch to be considered successful 225 | joint_limits : Dict(str, List(float) | np.array(float)) 226 | Robot joint limits in radians 227 | workspace_limits : Dict(str, List(float) | np.array(float)) 228 | Workspace limits in x,y, and z for the robots workspace in Cartesian space 229 | enable_step_throttling : bool 230 | Flag to enable throttling of the simulator 231 | joints_to_control : np.array(str) | List(str) 232 | The list of joint links for the psm. 233 | steps_to_print : int 234 | Number of steps before model prints information to stdout 235 | n_actions : int 236 | Number of possible actions 237 | n_skip_steps : int 238 | Number of steps to skip after an update step 239 | env_name : str 240 | Name of the environment to train 241 | """ 242 | ``` 243 | 244 | ## Citation 245 | 246 | If this work is helpful for your research, please use the following reference for citation: 247 | ```latex 248 | @INPROCEEDINGS{9807609, author={Varier, Vignesh Manoj and Rajamani, Dhruv Kool and Tavakkolmoghaddam, Farid and Munawar, Adnan and Fischer, Gregory S}, booktitle={2022 International Symposium on Medical Robotics (ISMR)}, title={AMBF-RL: A real-time simulation based Reinforcement Learning toolkit for Medical Robotics}, year={2022}, volume={}, number={}, pages={1-8}, doi={10.1109/ISMR48347.2022.9807609}} 249 | ``` 250 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .envs.arl.arl_env import ARLEnv, Goal, Action, Observation 2 | from .envs.dVRK.PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv -------------------------------------------------------------------------------- /dockerfiles/devel-training-gpu.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.14.0-gpu-py3 2 | 3 | ENV USERNAME="admin" 4 | ENV NVIDIA_VISIBLE_DEVICES all 5 | ENV NVIDIA_DRIVER_CAPABILITIES graphics,utility,compute 6 | 7 | RUN useradd -ms /bin/bash ${USERNAME} 8 | RUN usermod -aG sudo ${USERNAME} 9 | 10 | ENV HOME="/home/${USERNAME}" \ 11 | AMBF_WS="/home/${USERNAME}/ambf" \ 12 | AMBF_RL_WS="/home/${USERNAME}/ambf_rl" 13 | 14 | # Add apt-utils 15 | RUN apt clean && \ 16 | rm -rf /var/lib/apt/lists/* && \ 17 | apt-get update && \ 18 | apt-get install apt-utils -q -y \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | RUN apt-get update && \ 22 | apt-get -y -qq install wget gdb 23 | 24 | # setup timezone 25 | RUN echo 'Etc/UTC' > /etc/timezone && \ 26 | ln -s /usr/share/zoneinfo/Etc/UTC /etc/localtime && \ 27 | apt-get update && apt-get install -q -y tzdata 28 | 29 | # install packages 30 | RUN apt-get update && apt-get install -q -y \ 31 | dirmngr \ 32 | gnupg2 \ 33 | && rm -rf /var/lib/apt/lists/* 34 | 35 | # setup keys 36 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654 37 | 38 | # setup sources.list 39 | RUN echo "deb http://packages.ros.org/ros/ubuntu bionic main" > /etc/apt/sources.list.d/ros1-latest.list 40 | 41 | # install bootstrap tools 42 | RUN apt-get update && apt-get install 
--no-install-recommends -y \ 43 | python-rosdep \ 44 | python-rosinstall \ 45 | python-vcstools \ 46 | && rm -rf /var/lib/apt/lists/* 47 | 48 | # setup environment 49 | ENV LANG C.UTF-8 50 | ENV LC_ALL C.UTF-8 51 | 52 | ENV ROS_DISTRO melodic 53 | # bootstrap rosdep 54 | RUN rosdep init && \ 55 | rosdep update --rosdistro $ROS_DISTRO 56 | 57 | RUN apt-get update && apt-get install -y \ 58 | ros-melodic-ros-base=1.4.1-0* apt-utils git \ 59 | && rm -rf /var/lib/apt/lists/* 60 | 61 | WORKDIR ${HOME} 62 | # Make Directory AMBF_WS 63 | RUN git clone --branch ambf-1.0-python3-fix https://github.com/DhruvKoolRajamani/ambf.git 64 | WORKDIR ${AMBF_WS} 65 | RUN cd ${AMBF_WS} && \ 66 | git submodule update --init --recursive 67 | 68 | # Install apt and pip packages listed in (*-requirements.txt) 69 | WORKDIR ${AMBF_WS} 70 | RUN apt-get update && \ 71 | apt-get -y -qq -o Dpkg::Use-Pty=0 install --no-install-recommends \ 72 | --fix-missing $(cat install/apt-requirements.txt) && \ 73 | cat install/pip-requirements.txt | xargs -n 1 -L 1 pip install -U && \ 74 | apt-get clean && \ 75 | rm -rf /var/lib/apt/lists/* 76 | 77 | # Build AMBF 78 | RUN . /opt/ros/melodic/setup.sh && \ 79 | mkdir -p ${AMBF_WS}/build && \ 80 | cd ${AMBF_WS}/build && \ 81 | cmake ../ -DCMAKE_BUILD_TYPE=Debug && \ 82 | make -j$(nproc) 83 | 84 | WORKDIR ${HOME} 85 | # Make Directory AMBF_RL_WS 86 | RUN git clone https://github.com/WPI-AIM/ambf_rl.git 87 | WORKDIR ${AMBF_RL_WS} 88 | RUN apt-get update && \ 89 | cat install/training-pip-requirements.txt | xargs -n 1 -L 1 pip3 install -U && \ 90 | apt-get clean && \ 91 | rm -rf /var/lib/apt/lists/* 92 | 93 | # Stable Baselines fix 94 | RUN mv /usr/local/lib/python3.6/dist-packages/stable_baselines/ddpg/ddpg.py \ 95 | /usr/local/lib/python3.6/dist-packages/stable_baselines/ddpg/ddpg_old.py && \ 96 | cp ${AMBF_RL_WS}/install/stable_baseline_fix/ddpg.py \ 97 | /usr/local/lib/python3.6/dist-packages/stable_baselines/ddpg/ 98 | 99 | RUN touch ${HOME}/.bashrc && \ 100 | echo "source /opt/ros/melodic/setup.bash" >> ${HOME}/.bashrc && \ 101 | echo "source /home/admin/ambf/build/devel/setup.bash" >> ${HOME}/.bashrc 102 | 103 | RUN . 
${HOME}/.bashrc 104 | 105 | WORKDIR ${AMBF_RL_WS} 106 | RUN python setup.py install 107 | 108 | ENV ROS_HOSTNAME="localhost" \ 109 | ROS_MASTER_URI="http://localhost:11311" 110 | 111 | WORKDIR ${AMBF_RL_WS} -------------------------------------------------------------------------------- /dockerfiles/devel-training.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.14.0-py3 2 | 3 | # setup timezone 4 | RUN echo 'Etc/UTC' > /etc/timezone && \ 5 | ln -s /usr/share/zoneinfo/Etc/UTC /etc/localtime && \ 6 | apt-get update && apt-get install -q -y tzdata && rm -rf /var/lib/apt/lists/* 7 | 8 | # install packages 9 | RUN apt-get update && apt-get install -q -y \ 10 | dirmngr \ 11 | gnupg2 \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | # setup keys 15 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654 16 | 17 | # setup sources.list 18 | RUN echo "deb http://packages.ros.org/ros/ubuntu bionic main" > /etc/apt/sources.list.d/ros1-latest.list 19 | 20 | # install bootstrap tools 21 | RUN apt-get update && apt-get install --no-install-recommends -y \ 22 | python-rosdep \ 23 | python-rosinstall \ 24 | python-vcstools \ 25 | && rm -rf /var/lib/apt/lists/* 26 | 27 | # setup environment 28 | ENV LANG C.UTF-8 29 | ENV LC_ALL C.UTF-8 30 | 31 | ENV ROS_DISTRO melodic 32 | # bootstrap rosdep 33 | RUN rosdep init && \ 34 | rosdep update --rosdistro $ROS_DISTRO 35 | 36 | RUN apt-get update && apt-get install -y \ 37 | ros-melodic-ros-base=1.4.1-0* apt-utils git \ 38 | && rm -rf /var/lib/apt/lists/* 39 | 40 | ENV HOME="/root" \ 41 | AMBF_WS="/root/ambf" 42 | 43 | WORKDIR ${HOME} 44 | # Make Directory AMBF_WS 45 | RUN git clone --single-branch --branch feat-rl https://github.com/WPI-AIM/ambf.git 46 | WORKDIR ${AMBF_WS} 47 | RUN cd ${AMBF_WS} && \ 48 | git submodule update --init --recursive 49 | 50 | # Install apt and pip packages listed in (*-requirements.txt) 51 | WORKDIR ${AMBF_WS} 52 | RUN apt-get update && \ 53 | apt-get -y -qq -o Dpkg::Use-Pty=0 install --no-install-recommends \ 54 | --fix-missing $(cat install/apt-requirements.txt) && \ 55 | cat install/pip-requirements.txt | xargs -n 1 -L 1 pip install -U && \ 56 | apt-get clean && \ 57 | rm -rf /var/lib/apt/lists/* 58 | 59 | # Build AMBF 60 | RUN . 
/opt/ros/melodic/setup.sh && \ 61 | mkdir -p ${AMBF_WS}/build && \ 62 | cd ${AMBF_WS}/build && \ 63 | cmake ../ && \ 64 | make -j$(nproc) 65 | 66 | RUN touch /root/.bashrc && \ 67 | echo "source /opt/ros/melodic/setup.bash" >> /root/.bashrc && \ 68 | echo "source /root/ambf/build/devel/setup.bash" >> /root/.bashrc 69 | 70 | WORKDIR ${AMBF_WS}/training_scripts 71 | # CMD ./wrapper_script.sh 72 | 73 | # CMD python3 -c "import tensorflow as tf; print(tf.reduce_sum(tf.random.normal([1000, 1000])))" -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- 1 | from .arl.arl_env import ARLEnv, Goal, Action, Observation 2 | from .dVRK.PSM_cartesian_env import PSMCartesianEnv, CartesianAction 3 | from .dVRK.PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv, DDPGObservation 4 | from .dVRK.PSM_cartesian_herddpg_env import PSMCartesianHERDDPGEnv, HERDDPGObservation -------------------------------------------------------------------------------- /envs/arl/__init__.py: -------------------------------------------------------------------------------- 1 | from .arl_env import ARLEnv, Goal, Action, Observation -------------------------------------------------------------------------------- /envs/arl/arl_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # //============================================================================ 4 | # /* 5 | # Software License Agreement (BSD License) 6 | # Copyright (c) 2019, AMBF 7 | # (www.aimlab.wpi.edu) 8 | 9 | # All rights reserved. 10 | 11 | # Redistribution and use in source and binary forms, with or without 12 | # modification, are permitted provided that the following conditions 13 | # are met: 14 | 15 | # * Redistributions of source code must retain the above copyright 16 | # notice, this list of conditions and the following disclaimer. 17 | 18 | # * Redistributions in binary form must reproduce the above 19 | # copyright notice, this list of conditions and the following 20 | # disclaimer in the documentation and/or other materials provided 21 | # with the distribution. 22 | 23 | # * Neither the name of authors nor the names of its contributors may 24 | # be used to endorse or promote products derived from this software 25 | # without specific prior written permission. 26 | 27 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 30 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 31 | # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 35 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 37 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 | # POSSIBILITY OF SUCH DAMAGE. 
39 | 40 | # \author 41 | # \author , , 42 | # \author Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar 43 | # \version 1.0.0 44 | # */ 45 | # //============================================================================ 46 | 47 | from typing import List, Set, Tuple, Dict, Any, Type 48 | from ambf_client import Client 49 | import ambf_client 50 | from gym import spaces 51 | import numpy as np 52 | import copy 53 | import time 54 | import gym 55 | from gym.utils import seeding 56 | from ambf_world import World 57 | from ambf_object import Object 58 | import sys 59 | from abc import ABC, abstractmethod, ABCMeta 60 | 61 | 62 | class Observation: 63 | 64 | def __init__( 65 | self, 66 | state: Any or np.ndarray or List[float] or Dict, 67 | dist: int = 0, 68 | reward: float = 0.0, 69 | prev_reward: float = 0.0, 70 | cur_reward: float = 0.0, 71 | is_done: bool = False, 72 | info: Dict = {}, 73 | sim_step_no: int = 0 74 | ) -> None: 75 | 76 | self.state = state 77 | self.dist = dist 78 | self.reward = reward 79 | self.prev_reward = prev_reward 80 | self.cur_reward = cur_reward 81 | self.is_done = is_done 82 | self.info = info 83 | self.sim_step_no = sim_step_no 84 | 85 | return 86 | 87 | def cur_observation(self) -> Tuple[Any or np.ndarray, float, bool, Dict]: 88 | return self.state, self.reward, self.is_done, self.info 89 | 90 | 91 | class ClientHandleError(BaseException): 92 | 93 | def __init__(self, *args: object) -> None: 94 | super().__init__(*args) 95 | 96 | 97 | class Action: 98 | 99 | def __init__( 100 | self, 101 | n_actions: int, 102 | action_space_limit: float, 103 | action_lims_low: List[float] = None, 104 | action_lims_high: List[float] = None 105 | ) -> None: 106 | self.n_actions = n_actions 107 | self.action = [0.0 for i in range(n_actions)] 108 | self.action_space_limit = action_space_limit 109 | if action_lims_low is None: 110 | self.action_lims_low = -action_space_limit * np.ones(self.n_actions) 111 | else: 112 | self.action_lims_low = action_lims_low 113 | 114 | if action_lims_high is None: 115 | self.action_lims_high = action_space_limit * np.ones(self.n_actions) 116 | else: 117 | self.action_lims_high = action_lims_high 118 | 119 | self.action_space = spaces.Space() 120 | 121 | return 122 | 123 | def check_if_valid_action(self): 124 | """Check if the length of the action matches the defined action in the action space 125 | 126 | Raises 127 | ------ 128 | TypeError 129 | If the length of action does not match the number of actions defined 130 | """ 131 | 132 | assert len(self.action) == self.n_actions, TypeError("Incorrect length of actions provided!") 133 | # Clips the action value between the upper and lower limits 134 | np.clip(self.action, self.action_lims_low, self.action_lims_high, out=self.action) 135 | 136 | return 137 | 138 | def apply(self, state: np.ndarray, out: np.ndarray = None) -> np.ndarray: 139 | """Apply the action to the given state. 140 | 141 | This function can be overrided if the application of the action to state corresponds 142 | to a complex relationship to obtain the next state. 
143 | 144 | Parameter 145 | --------- 146 | state : np.ndarray 147 | The state (s) to which the action should be added 148 | 149 | out : np.ndarray, optional 150 | The output parameter, passed by reference if the return argument is not used 151 | 152 | Raises 153 | ------ 154 | Exception 155 | 156 | """ 157 | if out is None: 158 | result = np.add(state, self.action) 159 | else: 160 | result = np.add(state, self.action, out=out) 161 | 162 | return result 163 | 164 | 165 | class Goal: 166 | 167 | def __init__( 168 | self, 169 | goal_position_range: float, 170 | goal_error_margin: float, 171 | goal: Any or np.ndarray 172 | ) -> None: 173 | 174 | self.goal_position_range = goal_position_range 175 | self.goal_error_margin = goal_error_margin 176 | self.goal = goal 177 | 178 | return 179 | 180 | def _sample_goal(self) -> Any or np.ndarray: 181 | 182 | return self.goal 183 | 184 | 185 | class ARLEnv(gym.Env, metaclass=ABCMeta): 186 | """Base class for the ARLEnv 187 | 188 | This class should not be instantiated on its own. It does not contain the 189 | desired RL Algorithm or the Robot. This should remain an abstract class and 190 | should be wrapped by at least an algorithmic and robot environment wrapper. 191 | An example of this can be PSM_cartesian_env_ddpg(ARLEnv). 192 | 193 | Attributes 194 | ---------- 195 | 196 | Methods 197 | ------- 198 | skip_skipped_sim_steps(num) 199 | Define number of steps to skip if step-throttling is enabled. 200 | 201 | set_throttling_enable(check): 202 | Set the step-throttling Boolean. 203 | """ 204 | 205 | def __init__( 206 | self, 207 | enable_step_throttling: bool, 208 | n_skip_steps: int, 209 | env_name: str = "arl_env" 210 | ) -> None: 211 | """Initialize the abstract class which handles all AMBF related interactions 212 | 213 | Parameters 214 | ---------- 215 | enable_step_throttling : bool 216 | Flag to enable throttling of the simulator 217 | n_skip_steps : int 218 | Number of steps to skip after an update step 219 | env_name : str 220 | Name of the environment to train 221 | """ 222 | super(ARLEnv, self).__init__() 223 | 224 | # Set the environment name 225 | self._env_name = env_name 226 | self._client_name = self._env_name + '_client' 227 | 228 | # Set environment and task parameters 229 | self._n_skip_steps = n_skip_steps 230 | self._enable_step_throttling = enable_step_throttling 231 | 232 | # Initialize sim steps 233 | self._skipped_sim_steps = 0 234 | self._prev_sim_step = 0 235 | self._count_for_print = 0 236 | 237 | # AMBF Sim Environment Declarations 238 | self._obj_handle = None # Object 239 | self._world_handle = None # World 240 | self._ambf_client = None # Client(client_name=self._client_name) 241 | # Initialize the ambf client 242 | self.ambf_client = Client(client_name=self._client_name) 243 | 244 | # Set default Observation, Goal, and Action 245 | self._goal = Goal(0.0, 0.0, None) 246 | self._obs = Observation(None) 247 | self._action = Action(0, 0.0) 248 | 249 | # Set action space and observation space 250 | self.action_space = self.action.action_space 251 | self.observation_space = spaces.Space() 252 | 253 | # Sleep to allow the handle to connect to the AMBF server 254 | time.sleep(1) 255 | 256 | # Random seed the environment 257 | self.seed(5) 258 | 259 | return 260 | 261 | # Properties 262 | 263 | @property 264 | def env_name(self) -> str: 265 | """Return the name of the environment 266 | """ 267 | return self._env_name 268 | 269 | @env_name.setter 270 | def env_name(self, value: str): 271 | """Set the environment name 272 | """ 
273 | self._env_name = value 274 | 275 | @property 276 | def client_name(self) -> str: 277 | """Return the AMBF Client Name 278 | """ 279 | return self._client_name 280 | 281 | @client_name.setter 282 | def client_name(self, value: str): 283 | """Sets the AMBF Client Name 284 | """ 285 | self._client_name = value 286 | return 287 | 288 | @property 289 | def n_skip_steps(self) -> int: 290 | """Return number of steps to skip. 291 | 292 | TODO: Provide reference for step-throttling 293 | """ 294 | return self._n_skip_steps 295 | 296 | @n_skip_steps.setter 297 | def n_skip_steps(self, value: int): 298 | """Define number of steps to skip if step-throttling is enabled. 299 | """ 300 | self._n_skip_steps = value 301 | self._world_handle.set_num_step_skips(value) 302 | return 303 | 304 | @property 305 | def enable_step_throttling(self) -> bool: 306 | """Return the step-throttling state 307 | 308 | TODO: Provide reference for step-throttling 309 | """ 310 | return self._enable_step_throttling 311 | 312 | @enable_step_throttling.setter 313 | def enable_step_throttling(self, value: bool): 314 | """Set the step-throttling state 315 | """ 316 | self._enable_step_throttling = value 317 | self._world_handle.enable_throttling(self._enable_step_throttling) 318 | return 319 | 320 | @property 321 | def skipped_sim_steps(self) -> int: 322 | """Return the simulation steps skipped 323 | """ 324 | skipped_steps = 0 325 | if self.enable_step_throttling: 326 | while skipped_steps < self.n_skip_steps: 327 | skipped_steps = self.obj_handle.get_sim_step() - self.prev_sim_step 328 | time.sleep(1e-5) 329 | self.prev_sim_step = self.obj_handle.get_sim_step() 330 | if skipped_steps > self.n_skip_steps: 331 | print( 332 | 'WARN: Skipped {} steps, Default skip limit {} Steps'.format( 333 | skipped_steps, 334 | self.n_skip_steps 335 | ) 336 | ) 337 | else: 338 | skipped_steps = self.obj_handle.get_sim_step() - self.prev_sim_step 339 | self.prev_sim_step = self.obj_handle.get_sim_step() 340 | 341 | self._skipped_sim_steps = skipped_steps 342 | 343 | return self._skipped_sim_steps 344 | 345 | @property 346 | def prev_sim_step(self) -> int: 347 | """Return the previous simulation step number 348 | """ 349 | return self._prev_sim_step 350 | 351 | @prev_sim_step.setter 352 | def prev_sim_step(self, value: int): 353 | """Set the previous simulation step number 354 | """ 355 | self._prev_sim_step = value 356 | return 357 | 358 | @property 359 | def count_for_print(self) -> int: 360 | """Return the number of counts to print 361 | """ 362 | return self._count_for_print 363 | 364 | @count_for_print.setter 365 | def count_for_print(self, value: int): 366 | """Set the number of counts to print 367 | """ 368 | self._count_for_print = value 369 | return 370 | 371 | @property 372 | def obj_handle(self) -> Object: 373 | """Return the AMBF Object Handle 374 | """ 375 | return self._obj_handle 376 | 377 | @obj_handle.setter 378 | def obj_handle(self, value: Object): 379 | """Set the AMBF Object Handle 380 | """ 381 | self._obj_handle = value 382 | return 383 | 384 | @property 385 | def world_handle(self) -> World: 386 | """Return the AMBF World Handle 387 | """ 388 | return self._world_handle 389 | 390 | @world_handle.setter 391 | def world_handle(self, value: World): 392 | """Set the AMBF World Handle 393 | """ 394 | self._world_handle = value 395 | return 396 | 397 | @property 398 | def ambf_client(self) -> Client: 399 | """Return the AMBF Client 400 | """ 401 | return self._ambf_client 402 | 403 | @ambf_client.setter 404 | def 
ambf_client(self, value: Client): 405 | """Set the AMBF Client 406 | """ 407 | self._ambf_client = value 408 | self._ambf_client.connect() 409 | time.sleep(1) 410 | self._ambf_client.create_objs_from_rostopics() 411 | return 412 | 413 | @property 414 | def goal(self) -> Goal: 415 | """Return the goal object 416 | """ 417 | return self._goal 418 | 419 | @goal.setter 420 | def goal(self, value: Goal): 421 | """Set the goal object 422 | """ 423 | self._goal = value 424 | return 425 | 426 | @property 427 | def obs(self) -> Observation: 428 | """Return the Observation object 429 | """ 430 | return self._obs 431 | 432 | @obs.setter 433 | def obs(self, value: Observation): 434 | """Set the Observation object 435 | """ 436 | self._obs = value 437 | return 438 | 439 | @property 440 | def action(self) -> Action: 441 | """Return the action object 442 | """ 443 | return self._action 444 | 445 | @action.setter 446 | def action(self, value: Action): 447 | """Set the action object 448 | """ 449 | self._action = value 450 | return 451 | 452 | @property 453 | def action_space(self) -> spaces.Space: 454 | """Return the action_space 455 | """ 456 | return self._action_space 457 | 458 | @action_space.setter 459 | def action_space(self, value: spaces.Space): 460 | """Set the action_space 461 | """ 462 | self._action_space = value 463 | return 464 | 465 | @property 466 | def observation_space(self) -> spaces.Space: 467 | """Return the observation_space 468 | """ 469 | return self._observation_space 470 | 471 | @observation_space.setter 472 | def observation_space(self, value: spaces.Space): 473 | """Set the observation_space 474 | """ 475 | self._observation_space = value 476 | return 477 | 478 | 479 | # Gym requirements 480 | 481 | def make(self, robot_root_link: str): 482 | """Creates an object handle of the robot and world in AMBF 483 | 484 | Parameters 485 | ---------- 486 | robot_root_link : string 487 | 488 | Name of the root link of the robot. 489 | 490 | eg. for the dVRK PSM: robot_root_link = 'psm/baselink' 491 | 492 | Raises 493 | ------ 494 | ClientHandleError 495 | If obj handle or world handle are None, then a ClientHandleError is 496 | raised. 497 | """ 498 | 499 | self.world_handle = self.ambf_client.get_world_handle() 500 | if self.world_handle is None: 501 | raise ClientHandleError("World handle not found, please make sure AMBF is running") 502 | self.obj_handle = self.ambf_client.get_obj_handle(robot_root_link) 503 | if self.obj_handle is None: 504 | raise ClientHandleError("Object handle not found, please make sure robot is loaded in AMBF") 505 | time.sleep(2) 506 | self.world_handle.enable_throttling(self.enable_step_throttling) 507 | self.world_handle.set_num_step_skips(self.n_skip_steps) 508 | 509 | # Verify obj_handle 510 | if self.obj_handle.get_num_joints() == 0: 511 | raise ClientHandleError( 512 | "Object handle returned {} objects, please make sure robot is loaded in AMBF".format( 513 | self.obj_handle.get_num_joints() 514 | ) 515 | ) 516 | 517 | return 518 | 519 | def seed(self, seed: int) -> List[int]: 520 | """Randomize the environment 521 | """ 522 | self.np_random, seed = seeding.np_random(seed) 523 | 524 | return [seed] 525 | 526 | @abstractmethod 527 | def reset(self) -> np.ndarray or List[float] or Dict: 528 | """Reset the robot environment 529 | 530 | Raises 531 | ------ 532 | NotImplementedError 533 | If function is not overridden, a Not Implemented error is raised if the 534 | reset function from the base class is called. 
535 | """ 536 | raise NotImplementedError("Abstract method reset needs to be overridden") 537 | return Observation(None) 538 | 539 | def render(self, mode): 540 | return 541 | 542 | @abstractmethod 543 | def step( 544 | self, 545 | action #: Action 546 | ) -> Tuple[List[Any] or np.ndarray, 547 | float, 548 | bool, 549 | Dict[str, 550 | bool]]: 551 | """Performs the update step for the algorithm and dynamics 552 | """ 553 | return [], 0.0, False, {'': False} 554 | 555 | @abstractmethod 556 | def compute_reward(self, reached_goal: Goal, desired_goal: Goal, info: Dict[str, bool]) -> float: 557 | """Function to compute reward received by the agent 558 | """ 559 | return 0.0 560 | 561 | @abstractmethod 562 | def _sample_goal(self, observation: Observation) -> Goal: 563 | """Function to samples new goal positions and ensures its within the workspace of PSM 564 | """ 565 | return Goal(0.0, 0.0, None) 566 | 567 | def _check_if_done(self) -> bool: 568 | """Function to check if the episode was successful 569 | """ 570 | if abs(self.obs.dist) < self.goal.goal_error_margin: 571 | return True 572 | else: 573 | return False 574 | 575 | # @abstractmethod 576 | def _update_info(self): 577 | """Can be used to Provide information for debugging purpose 578 | 579 | TODO: Should this function be made abstract? 580 | """ 581 | info = {'is_success': self._is_success()} 582 | return info 583 | 584 | # @abstractmethod 585 | def _is_success(self): 586 | """Function to check if the robot reached the desired goal within a predefined error margin 587 | 588 | TODO: Should this function be made abstract? 589 | """ 590 | return self._check_if_done() 591 | 592 | @abstractmethod 593 | def send_cmd(self, cmd: np.ndarray or List[float]): 594 | """Send the command to the robot in the AMBF Simulation. 595 | """ 596 | return 597 | 598 | if __name__ == '__main__': 599 | 600 | print(sys.path) -------------------------------------------------------------------------------- /envs/dVRK/PSM_cartesian_ddpg_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # //============================================================================ 4 | # /* 5 | # Software License Agreement (BSD License) 6 | # Copyright (c) 2019, AMBF 7 | # (www.aimlab.wpi.edu) 8 | 9 | # All rights reserved. 10 | 11 | # Redistribution and use in source and binary forms, with or without 12 | # modification, are permitted provided that the following conditions 13 | # are met: 14 | 15 | # * Redistributions of source code must retain the above copyright 16 | # notice, this list of conditions and the following disclaimer. 17 | 18 | # * Redistributions in binary form must reproduce the above 19 | # copyright notice, this list of conditions and the following 20 | # disclaimer in the documentation and/or other materials provided 21 | # with the distribution. 22 | 23 | # * Neither the name of authors nor the names of its contributors may 24 | # be used to endorse or promote products derived from this software 25 | # without specific prior written permission. 26 | 27 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 30 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 31 | # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 35 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 37 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 | # POSSIBILITY OF SUCH DAMAGE. 39 | 40 | # \author 41 | # \author , , 42 | # \author Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar 43 | # \version 1.0.0 44 | # */ 45 | # //============================================================================ 46 | 47 | import sys, os, copy, time 48 | 49 | from typing import Iterable, List, Set, Tuple, Dict, Any, Type 50 | 51 | from arl.arl_env import Action, Goal, Observation, ARLEnv 52 | from .PSM_cartesian_env import PSMCartesianEnv, CartesianAction 53 | 54 | import numpy as np 55 | from numpy import linalg as LA 56 | 57 | import gym 58 | from gym import spaces 59 | from gym.utils import seeding 60 | 61 | from psmFK import compute_FK 62 | from transformations import euler_from_matrix 63 | from dvrk_functions.msg import HomogenousTransform 64 | import rospy 65 | from dvrk_functions.srv import * 66 | 67 | 68 | class DDPGObservation(Observation): 69 | 70 | def __init__( 71 | self, 72 | state: Any or np.ndarray, 73 | dist: int = 0, 74 | reward: float = 0.0, 75 | prev_reward: float = 0.0, 76 | cur_reward: float = 0.0, 77 | is_done: bool = False, 78 | info: Dict = {}, 79 | sim_step_no: int = 0 80 | ) -> None: 81 | 82 | super(DDPGObservation, 83 | self).__init__(state, 84 | dist, 85 | reward, 86 | prev_reward, 87 | cur_reward, 88 | is_done, 89 | info, 90 | sim_step_no) 91 | return 92 | 93 | 94 | class PSMCartesianDDPGEnv(PSMCartesianEnv): 95 | """Single task based environment for PSM to perform debris removal as shown in 96 | the paper: 97 | 98 | TODO: Enter paper doi and name 99 | 100 | The environment performs actions in the cartesian space in R3, with translational 101 | movements in x, y, and z. 102 | """ 103 | 104 | def __init__( 105 | self, 106 | action_space_limit: float, 107 | goal_position_range: float, 108 | position_error_threshold: float, 109 | goal_error_margin: float, 110 | joint_limits: Dict[str, 111 | np.ndarray or List[str]], 112 | workspace_limits: Dict[str, 113 | np.ndarray or List[str]], 114 | enable_step_throttling: bool, 115 | joints_to_control: List[str] = [ 116 | 'baselink-yawlink', 117 | 'yawlink-pitchbacklink', 118 | 'pitchendlink-maininsertionlink', 119 | 'maininsertionlink-toolrolllink', 120 | 'toolrolllink-toolpitchlink', 121 | 'toolpitchlink-toolgripper1link', 122 | 'toolpitchlink-toolgripper2link' 123 | ], 124 | steps_to_print: int = 10000, 125 | n_actions: int = 3, 126 | n_skip_steps: int = 5, 127 | env_name: str = "PSM_cartesian_ddpg_env" 128 | ) -> None: 129 | """Initialize an object to train with DDPG on the PSM robot. 
130 | 131 | Parameters 132 | ---------- 133 | action_space_limit : float 134 | Action space limit for cartesian actions 135 | goal_position_range : int, optional 136 | The variance in goal position 137 | position_error_threshold : float 138 | Maximum acceptable error in cartesian position 139 | goal_error_margin : float 140 | Maximum margin of error for an epoch to be considered successful 141 | joint_limits : Dict(str, List(float) | np.array(float)) 142 | Robot joint limits in radians 143 | workspace_limits : Dict(str, List(float) | np.array(float)) 144 | Workspace limits in x,y, and z for the robots workspace in Cartesian space 145 | enable_step_throttling : bool 146 | Flag to enable throttling of the simulator 147 | joints_to_control : np.array(str) | List(str) 148 | The list of joint links for the psm. 149 | steps_to_print : int 150 | Number of steps before model prints information to stdout 151 | n_actions : int 152 | Number of possible actions 153 | n_skip_steps : int 154 | Number of steps to skip after an update step 155 | env_name : str 156 | Name of the environment to train 157 | """ 158 | super(PSMCartesianDDPGEnv, 159 | self).__init__( 160 | action_space_limit, 161 | goal_position_range, 162 | position_error_threshold, 163 | goal_error_margin, 164 | joint_limits, 165 | workspace_limits, 166 | enable_step_throttling, 167 | joints_to_control, 168 | steps_to_print, 169 | n_actions, 170 | n_skip_steps, 171 | env_name 172 | ) 173 | 174 | # Set observation space and constraints 175 | self.obs = DDPGObservation(state=np.zeros(20)) 176 | self.initial_pos = copy.deepcopy(self.obs.cur_observation()[0]) 177 | self.observation_space = spaces.Box( 178 | -np.inf, 179 | np.inf, 180 | shape=np.array(self.initial_pos).shape, 181 | dtype="float32" 182 | ) 183 | 184 | return 185 | 186 | def compute_reward(self, reached_goal: Goal, desired_goal: Goal, info: Dict[str, bool]) -> float: 187 | """Function to compute reward received by the agent 188 | """ 189 | # Find the distance between goal and achieved goal 190 | cur_dist = LA.norm(np.subtract(desired_goal.goal[0:3], reached_goal.goal[0:3])) 191 | # Continuous reward 192 | reward = round(1 - float(abs(cur_dist) / 0.05) * 0.5, 5) 193 | # Sparse reward 194 | # if abs(cur_dist) < self.goal_error_margin: 195 | # reward = 1 196 | # else: 197 | # reward = -1 198 | self.obs.dist = cur_dist 199 | return reward 200 | 201 | def _sample_goal(self, observation: Observation) -> Goal: 202 | """Function to samples new goal positions and ensures its within the workspace of PSM 203 | """ 204 | rand_val_pos = np.around( 205 | np.add( 206 | observation.state[0:3], 207 | self.np_random.uniform( 208 | -self.goal.goal_position_range, 209 | self.goal.goal_position_range, 210 | size=3 211 | ) 212 | ), 213 | decimals=4 214 | ) 215 | rand_val_pos[0] = np.around(np.clip(rand_val_pos[0], -0.04, 0.03), decimals=4) 216 | rand_val_pos[1] = np.around(np.clip(rand_val_pos[1], -0.03, 0.04), decimals=4) 217 | rand_val_pos[2] = np.around(np.clip(rand_val_pos[2], -0.20, -0.09), decimals=4) 218 | # Uncomment below lines if individual limits need to be set for generating desired goal state 219 | ''' 220 | rand_val_pos = self.np_random.uniform(-0.1935, 0.1388, size=3) 221 | rand_val_pos[0] = np.around(np.clip(rand_val_pos[0], -0.1388, 0.1319), decimals=4) 222 | rand_val_pos[1] = np.around(np.clip(rand_val_pos[1], -0.1319, 0.1388), decimals=4) 223 | rand_val_pos[2] = np.around(np.clip(rand_val_pos[2], -0.1935, -0.0477), decimals=4) 224 | rand_val_angle[0] = np.clip(rand_val_angle[0], 
-0.15, 0.15) 225 | rand_val_angle[1] = np.clip(rand_val_angle[1], -0.15, 0.15) 226 | rand_val_angle[2] = np.clip(rand_val_angle[2], -0.15, 0.15) 227 | ''' 228 | # Provide the range for generating the desired orientation at the terminal state 229 | rand_val_angle = self.np_random.uniform(-1.5, 1.5, size=3) 230 | goal = Goal( 231 | goal=np.concatenate((rand_val_pos, 232 | rand_val_angle), 233 | axis=None), 234 | goal_error_margin=self.goal.goal_error_margin, 235 | goal_position_range=self.goal.goal_position_range 236 | ) 237 | 238 | return goal 239 | 240 | def _update_observation( 241 | self, 242 | end_effector_frame: Any or np.ndarray, 243 | joint_pos: Any or np.ndarray, 244 | joint_vel: Any or np.ndarray 245 | ): 246 | """Update the observation object in this class 247 | """ 248 | # Function ensuring skipped steps based on step throttling 249 | skipped_steps = self.skipped_sim_steps 250 | 251 | # Robot tip cartesian position and orientation 252 | end_effector_pos = end_effector_frame[0:3, 3] 253 | end_effector_euler = np.array(euler_from_matrix(end_effector_frame[0:3, 254 | 0:3], 255 | axes='szyx')).reshape((3, 256 | 1)) 257 | # State vec is 20x1 258 | # [x, y, z, ez, ey, ex, j1, j2, j3, j4, j5, j6, j7, w1, w2, w3, w4, w5, w6, w7] 259 | 260 | obs = np.asarray( 261 | np.concatenate((end_effector_pos, 262 | end_effector_euler, 263 | joint_pos.reshape((7, 264 | 1))), 265 | axis=0) 266 | ) 267 | obs = np.concatenate((obs, joint_vel), axis=None) 268 | # Update the observation object 269 | self.obs.state = obs.copy() 270 | # Update the obs info 271 | self.obs.info = self._update_info() 272 | # Compute the reward 273 | reached_goal = copy.deepcopy(self.goal) 274 | reached_goal.goal = self.obs.state 275 | self.obs.reward = self.compute_reward(reached_goal, self.goal, self.obs.info) 276 | self.obs.is_done = self._check_if_done() 277 | 278 | return 279 | 280 | 281 | if __name__ == "__main__": 282 | # Create object of this class 283 | root_link = 'baselink' 284 | env_kwargs = { 285 | 'action_space_limit': 0.05, 286 | 'goal_position_range': 0.05, 287 | 'position_error_threshold': 0.01, 288 | 'goal_error_margin': 0.0075, 289 | 'joint_limits': 290 | { 291 | 'lower_limit': np.array([-0.2, 292 | -0.2, 293 | 0.1, 294 | -1.5, 295 | -1.5, 296 | -1.5, 297 | -1.5]), 298 | 'upper_limit': np.array([0.2, 299 | 0.2, 300 | 0.24, 301 | 1.5, 302 | 1.5, 303 | 1.5, 304 | 1.5]) 305 | }, 306 | 'workspace_limits': 307 | { 308 | 'lower_limit': np.array([-0.04, 309 | -0.03, 310 | -0.2]), 311 | 'upper_limit': np.array([0.03, 312 | 0.04, 313 | -0.091]) 314 | }, 315 | 'enable_step_throttling': False, 316 | 'steps_to_print': 10 317 | } 318 | psmEnv = PSMCartesianDDPGEnv(**env_kwargs) 319 | psmEnv.make(root_link) 320 | # psmEnv.world_handle = psmEnv.ambf_client.get_world_handle() 321 | # psmEnv.world_handle.enable_throttling(False) -------------------------------------------------------------------------------- /envs/dVRK/PSM_cartesian_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # //============================================================================ 4 | # /* 5 | # Software License Agreement (BSD License) 6 | # Copyright (c) 2019, AMBF 7 | # (www.aimlab.wpi.edu) 8 | 9 | # All rights reserved. 
10 | 11 | # Redistribution and use in source and binary forms, with or without 12 | # modification, are permitted provided that the following conditions 13 | # are met: 14 | 15 | # * Redistributions of source code must retain the above copyright 16 | # notice, this list of conditions and the following disclaimer. 17 | 18 | # * Redistributions in binary form must reproduce the above 19 | # copyright notice, this list of conditions and the following 20 | # disclaimer in the documentation and/or other materials provided 21 | # with the distribution. 22 | 23 | # * Neither the name of authors nor the names of its contributors may 24 | # be used to endorse or promote products derived from this software 25 | # without specific prior written permission. 26 | 27 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 30 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 31 | # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 35 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 37 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 | # POSSIBILITY OF SUCH DAMAGE. 39 | 40 | # \author 41 | # \author , , 42 | # \author Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar 43 | # \version 1.0.0 44 | # */ 45 | # //============================================================================ 46 | 47 | from abc import ABCMeta, abstractmethod 48 | import sys, os, copy, time 49 | 50 | from typing import Iterable, List, Set, Tuple, Dict, Any, Type 51 | 52 | from arl.arl_env import Action, Goal, Observation, ARLEnv 53 | 54 | import numpy as np 55 | from numpy import linalg as LA 56 | 57 | import gym 58 | from gym import spaces 59 | from gym.utils import seeding 60 | 61 | from psmFK import compute_FK 62 | from transformations import euler_from_matrix 63 | from dvrk_functions.msg import HomogenousTransform 64 | import rospy 65 | from dvrk_functions.srv import * 66 | 67 | 68 | class CartesianAction(Action): 69 | 70 | def __init__( 71 | self, 72 | n_actions: int, 73 | action_space_limit: float, 74 | action_lims_low: List[float] = None, 75 | action_lims_high: List[float] = None 76 | ) -> None: 77 | super(CartesianAction, 78 | self).__init__(n_actions, 79 | action_space_limit, 80 | action_lims_low, 81 | action_lims_high) 82 | 83 | self.action_space = spaces.Box( 84 | low=-action_space_limit, 85 | high=action_space_limit, 86 | shape=(self.n_actions, 87 | ), 88 | dtype="float32" 89 | ) 90 | 91 | 92 | class PSMCartesianEnv(ARLEnv, metaclass=ABCMeta): 93 | """Single task based environment for PSM to perform debris removal as shown in 94 | the paper: 95 | 96 | TODO: Enter paper doi and name 97 | 98 | The environment performs actions in the cartesian space in R3, with translational 99 | movements in x, y, and z. 
100 | """ 101 | 102 | def __init__( 103 | self, 104 | action_space_limit: float, 105 | goal_position_range: float, 106 | position_error_threshold: float, 107 | goal_error_margin: float, 108 | joint_limits: Dict[str, 109 | np.ndarray or List[str]], 110 | workspace_limits: Dict[str, 111 | np.ndarray or List[str]], 112 | enable_step_throttling: bool, 113 | joints_to_control: List[str] = [ 114 | 'baselink-yawlink', 115 | 'yawlink-pitchbacklink', 116 | 'pitchendlink-maininsertionlink', 117 | 'maininsertionlink-toolrolllink', 118 | 'toolrolllink-toolpitchlink', 119 | 'toolpitchlink-toolgripper1link', 120 | 'toolpitchlink-toolgripper2link' 121 | ], 122 | steps_to_print: int = 10000, 123 | n_actions: int = 3, 124 | n_skip_steps: int = 5, 125 | env_name: str = "PSM_cartesian_ddpgenv" 126 | ) -> None: 127 | """Initialize an object of the PSM robot in cartesian space. 128 | 129 | Parameters 130 | ---------- 131 | action_space_limit : float 132 | Action space limit for cartesian actions 133 | goal_position_range : int, optional 134 | The variance in goal position 135 | position_error_threshold : float 136 | Maximum acceptable error in cartesian position 137 | goal_error_margin : float 138 | Maximum margin of error for an epoch to be considered successful 139 | joint_limits : Dict(str, List(float) | np.array(float)) 140 | Robot joint limits in radians 141 | workspace_limits : Dict(str, List(float) | np.array(float)) 142 | Workspace limits in x,y, and z for the robots workspace in Cartesian space 143 | enable_step_throttling : bool 144 | Flag to enable throttling of the simulator 145 | joints_to_control : np.array(str) | List(str) 146 | The list of joint links for the psm. 147 | steps_to_print : int 148 | Number of steps before model prints information to stdout 149 | n_actions : int 150 | Number of possible actions 151 | n_skip_steps : int 152 | Number of steps to skip after an update step 153 | env_name : str 154 | Name of the environment to train 155 | """ 156 | super(PSMCartesianEnv, self).__init__(enable_step_throttling, n_skip_steps, env_name) 157 | 158 | # Set environment limits 159 | self._position_error_threshold = position_error_threshold 160 | self._joint_limits = joint_limits 161 | self._workspace_limits = workspace_limits 162 | 163 | # Store controllable joints 164 | self._joints_to_control = joints_to_control 165 | 166 | # Steps to print 167 | self._steps_to_print = steps_to_print 168 | 169 | # Set environment and task parameters 170 | self._n_actions = n_actions 171 | 172 | ## Set task constraints 173 | # Set action space limits 174 | self.action = CartesianAction(self._n_actions, action_space_limit) 175 | self.action_space = self.action.action_space 176 | 177 | # Set goal position and constraints 178 | # TODO: Consider using args/extra args in init() to specify goal. 
179 | self.goal = Goal( 180 | goal_position_range, 181 | goal_error_margin, 182 | np.array([0.0, 183 | 0.0, 184 | -0.1, 185 | 0.0, 186 | 0.0, 187 | 0.0]) 188 | ) 189 | 190 | return 191 | 192 | # Properties 193 | 194 | @property 195 | def position_error_threshold(self) -> float: 196 | """Returns the position error threshold 197 | """ 198 | return self._position_error_threshold 199 | 200 | @position_error_threshold.setter 201 | def position_error_threshold(self, value: float): 202 | """Set the position error threshold 203 | """ 204 | self._position_error_threshold = value 205 | return 206 | 207 | @property 208 | def joint_limits(self) -> Dict[str, np.ndarray or List[float]]: 209 | """Return the joint limits dictionary 210 | """ 211 | return self._joint_limits 212 | 213 | @joint_limits.setter 214 | def joint_limits(self, value: Dict[str, np.ndarray or List[float]]): 215 | """Set the joint limits dictionary 216 | """ 217 | self._joint_limits = value 218 | return 219 | 220 | @property 221 | def workspace_limits(self) -> Dict[str, np.ndarray or List[float]]: 222 | """Returns the workspace limits dictionary 223 | """ 224 | return self._workspace_limits 225 | 226 | @workspace_limits.setter 227 | def workspace_limits(self, value: Dict[str, np.ndarray or List[float]]): 228 | """Set the workspace limits dictionary 229 | """ 230 | self._workspace_limits = value 231 | return 232 | 233 | @property 234 | def joints_to_control(self) -> Any or List[str]: 235 | """Returns a np.array or List of joint object handles 236 | """ 237 | return self._joints_to_control 238 | 239 | @joints_to_control.setter 240 | def joints_to_control(self, value: Any or List[str]): 241 | """Set a np.array or List of joint object handles 242 | """ 243 | self._joints_to_control = value 244 | return 245 | 246 | @property 247 | def steps_to_print(self) -> int: 248 | """Returns the number of steps to print 249 | """ 250 | return self._steps_to_print 251 | 252 | @steps_to_print.setter 253 | def steps_to_print(self, value: int): 254 | """Set the number of steps to print 255 | """ 256 | self._steps_to_print = value 257 | return 258 | 259 | @property 260 | def n_actions(self) -> int: 261 | """Returns the number of actions possible 262 | """ 263 | return self._n_actions 264 | 265 | @n_actions.setter 266 | def n_actions(self, value: int): 267 | """Set the number of actions possible 268 | """ 269 | self._n_actions = value 270 | return 271 | 272 | @property 273 | def initial_pos(self) -> Any or np.ndarray or List[float] or Dict: 274 | """Returns the initial position (state) of the environment 275 | """ 276 | # if type(self._initial_pos) == type(np.ndarray): 277 | # return self._initial_pos 278 | # elif type(self._initial_pos) == type(dict): 279 | # return self._initial_pos 280 | # else: 281 | # return np.array(self._initial_pos) 282 | return self._initial_pos 283 | 284 | @initial_pos.setter 285 | def initial_pos(self, value: Any or np.ndarray or List[float] or Dict or float): 286 | """Set the initial position (state) of the environment 287 | """ 288 | if type(value) == type(float): 289 | # Set default values 290 | for joint_idx, jt_name in enumerate(self.joints_to_control): 291 | # Prismatic joint is set to different value to ensure at least some part of robot tip 292 | # goes past the cannula 293 | if joint_idx == 2: 294 | self.obj_handle.set_joint_pos(jt_name, self.joint_limits['lower_limit'][2]) 295 | else: 296 | self.obj_handle.set_joint_pos(jt_name, value) 297 | time.sleep(0.5) 298 | else: 299 | self._initial_pos = value 300 | return 301 | 
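  # Usage note (descriptive, not part of the original source): assigning a float to
  # initial_pos drives every controllable joint to that value, except the prismatic
  # insertion joint, which is sent to its lower limit so the tool tip clears the
  # cannula; assigning anything else (array or dict) is simply stored as the initial
  # state, which is how the HER environment records its initial observation dict.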
302 | # Overriding ARLEnv functions 303 | 304 | def reset(self) -> np.ndarray or List[float] or Dict: 305 | """Reset the robot environment 306 | 307 | Type 1 Reset : Uses the previous reached state as the initial state for 308 | next iteration 309 | 310 | action = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 311 | 312 | observation, _, _, _ = self.step(action) 313 | 314 | Type 2 Reset : Sets the robot to a predefined initial state for each 315 | iteration. 316 | """ 317 | 318 | # Set the initial Position of the PSM 319 | self.initial_pos = 0.0 320 | initial_joint_pos, initial_joint_vel = self.get_joint_states() 321 | end_effector_frame = compute_FK(initial_joint_pos) 322 | # Updates the observation to the initialized position 323 | self._update_observation(end_effector_frame, initial_joint_pos, initial_joint_vel) 324 | # Samples a goal 325 | self.goal = self._sample_goal(self.obs) 326 | 327 | return self.obs.state 328 | 329 | def step( 330 | self, 331 | action #: np.ndarray or List[float] 332 | ) -> Tuple[np.ndarray or List[float] or Dict, 333 | float, 334 | bool, 335 | Dict[str, 336 | bool]]: 337 | """Performs the update step for the algorithm and dynamics 338 | """ 339 | # Set incoming action to Action member variable 340 | self.action.action = action 341 | 342 | # Get the current state 343 | cur_state = copy.deepcopy(self.obs.state) 344 | 345 | # Make sure the action is valid 346 | self.action.check_if_valid_action() 347 | 348 | # counter for printing position, action, and reward 349 | self.count_for_print += 1 350 | 351 | # Compute the End Effector frame from the current joint states 352 | cur_joint_pos, cur_joint_vel = self.get_joint_states() 353 | cur_end_effector_frame = compute_FK(cur_joint_pos) 354 | 355 | # Get the positional component from end effector Homogenous Transform 356 | cur_end_effector_pos = np.asarray(cur_end_effector_frame[0:3, 3]).reshape(-1) 357 | # Apply the action and compute the resulting state 358 | next_end_effector_pos = self.action.apply(cur_end_effector_pos) 359 | # Clip the next state to ensure joint limits aren't broken 360 | next_end_effector_pos = self.check_if_valid_state(next_end_effector_pos) 361 | 362 | # Create a frame and maintain previous orientation 363 | next_end_effector_frame = cur_end_effector_frame 364 | for i in range(3): 365 | next_end_effector_frame[i, 3] = next_end_effector_pos[i] 366 | 367 | # Create a Homogenous Transform Message 368 | msg = HomogenousTransform() 369 | msg.data = np.array(next_end_effector_frame).flatten() 370 | 371 | rospy.wait_for_service('compute_IK') 372 | computed_joint_state = None 373 | try: 374 | compute_IK_service = rospy.ServiceProxy('compute_IK', ComputeIK) 375 | compute_IK_response = compute_IK_service.call(ComputeIKRequest(msg)) 376 | computed_joint_state = list(compute_IK_response.q_des) 377 | for i in range(0, 6): 378 | computed_joint_state[i] = round(computed_joint_state[i], 4) 379 | except rospy.ServiceException as e: 380 | print("Service call failed: %s" % e, file=sys.stderr) 381 | # Ensure the computed joint positions are within the limit of user set joint positions 382 | next_joint_state = self.check_if_valid_joint_state(computed_joint_state) 383 | # Ensures that PSM joints reach the desired joint positions 384 | self.send_cmd(cmd=next_joint_state) 385 | 386 | # Update state, reward, done, and world values in the code 387 | self._update_observation(next_end_effector_frame, next_joint_state, cur_joint_vel) 388 | # Update the world handle 389 | self.world_handle.update() 390 | 391 | # Print function for 
viewing the output intermittently 392 | if self.count_for_print % self.steps_to_print == 0: 393 | print("Count: {} Goal: {}".format(self.count_for_print, self.goal.goal)) 394 | print("\tState: {}".format(cur_state)) 395 | print("\tAction: {}".format(self.action.action)) 396 | print("\tReward: {}".format(self.obs.reward)) 397 | 398 | return self.obs.state, self.obs.reward, self.obs.is_done, self.obs.info 399 | 400 | def send_cmd(self, cmd: Any or np.ndarray or List[float]): 401 | """Ensure the robot tip reaches the desired goal position before moving on to next iteration 402 | """ 403 | 404 | count_for_joint_pos = 0 405 | while True: 406 | # Command joints to reach position 407 | for joint_idx, joint_name in enumerate(self.joints_to_control): 408 | self.obj_handle.set_joint_pos(joint_name, cmd[joint_idx]) 409 | 410 | reached_joint_pos = np.zeros(7) 411 | # Check to see if desired joint positions have been reached 412 | for joint_idx, joint_name in enumerate(self.joints_to_control): 413 | reached_joint_pos[joint_idx] = self.obj_handle.get_joint_pos(joint_name) 414 | 415 | # Compare the error between desired and reached pos and allow acceptable margin 416 | error = np.around(np.subtract(cmd, reached_joint_pos), decimals=3) 417 | # Since Prismatic joint limits are smaller compared to the limits of other joints 418 | error[2] = np.around(np.subtract(cmd[2], reached_joint_pos[2]), decimals=4) 419 | # Create error margin vector 420 | error_margin = np.array([self.position_error_threshold] * len(self.joints_to_control)) 421 | error_margin[2] = 0.5 * self.position_error_threshold 422 | 423 | # Check to ensure the error margins have been reached 424 | if (np.all(np.abs(error) <= error_margin)) or count_for_joint_pos > 75: 425 | break 426 | 427 | # Increment counter for iterations of checks 428 | count_for_joint_pos += 1 429 | 430 | return 431 | 432 | @abstractmethod 433 | def compute_reward(self, reached_goal: Goal, desired_goal: Goal, info: Dict[str, bool]) -> float: 434 | """Function to compute reward received by the agent 435 | """ 436 | reward = 0.0 437 | return reward 438 | 439 | @abstractmethod 440 | def _sample_goal(self, observation: Observation) -> Goal: 441 | """Function to samples new goal positions and ensures its within the workspace of PSM 442 | """ 443 | goal = Goal(0.0, 0.0, None) 444 | return goal 445 | 446 | 447 | # PSM functions, can be imitated for other robots 448 | 449 | @abstractmethod 450 | def _update_observation( 451 | self, 452 | end_effector_frame: Any or np.ndarray, 453 | joint_pos: Any or np.ndarray, 454 | joint_vel: Any or np.ndarray 455 | ): 456 | """Update the observation object in this class 457 | """ 458 | return 459 | 460 | def check_if_valid_state(self, state: np.ndarray or List[float]) -> np.ndarray or List[float]: 461 | """Clips the state if it goes beyond the cartesian limits 462 | """ 463 | clipped_state = np.zeros(3) 464 | cartesian_pos_lower_limit = self.workspace_limits['lower_limit'] 465 | cartesian_pos_upper_limit = self.workspace_limits['upper_limit'] 466 | for axis in range(3): 467 | clipped_state[axis] = np.clip( 468 | state[axis], 469 | cartesian_pos_lower_limit[axis], 470 | cartesian_pos_upper_limit[axis] 471 | ) 472 | return clipped_state 473 | 474 | def check_if_valid_joint_state(self, 475 | state: Any or np.ndarray 476 | or List[float]) -> Any or np.ndarray or List[float]: 477 | """Limits the joint states if it goes beyond the joint limits 478 | """ 479 | # dvrk_limits_low = np.array([-1.605, -0.93556, -0.002444, -3.0456, -3.0414, -3.0481, 
-3.0498]) 480 | # dvrk_limits_high = np.array([1.5994, 0.94249, 0.24001, 3.0485, 3.0528, 3.0376, 3.0399]) 481 | # Note: Joint 5 and 6, joint pos = 0, 0 is closed jaw and 0.5, 0.5 is open 482 | limit_joint_values = np.zeros(7) 483 | joint_lower_limit = self.joint_limits['lower_limit'] 484 | joint_upper_limit = self.joint_limits['upper_limit'] 485 | for joint_idx in range(len(state)): 486 | limit_joint_values[joint_idx] = np.clip( 487 | state[joint_idx], 488 | joint_lower_limit[joint_idx], 489 | joint_upper_limit[joint_idx] 490 | ) 491 | 492 | return limit_joint_values 493 | 494 | def get_joint_states(self) -> Tuple[List[float], List[float]]: 495 | """Computes the joint position and velocities 496 | """ 497 | joint_positions = np.zeros(7) 498 | joint_velocities = np.zeros(7) 499 | 500 | for joint_idx, joint_name in enumerate(self.joints_to_control): 501 | joint_positions[joint_idx] = self.obj_handle.get_joint_pos(joint_name) 502 | joint_velocities[joint_idx] = self.obj_handle.get_joint_vel(joint_name) 503 | 504 | return joint_positions, joint_velocities 505 | 506 | if __name__ == "__main__": 507 | # Create object of this class 508 | root_link = 'psm/baselink' 509 | 510 | # 'joints_to_control': 511 | # np.array( 512 | # [ 513 | # 'baselink-yawlink', 514 | # 'yawlink-pitchbacklink', 515 | # 'pitchendlink-maininsertionlink', 516 | # 'maininsertionlink-toolrolllink', 517 | # 'toolrolllink-toolpitchlink', 518 | # 'toolpitchlink-toolgripper1link', 519 | # 'toolpitchlink-toolgripper2link' 520 | # ] 521 | # ), 522 | env_kwargs = { 523 | 'action_space_limit': 0.05, 524 | 'goal_position_range': 0.05, 525 | 'position_error_threshold': 0.01, 526 | 'goal_error_margin': 0.0075, 527 | 'joint_limits': 528 | { 529 | 'lower_limit': np.array([-0.2, 530 | -0.2, 531 | 0.1, 532 | -1.5, 533 | -1.5, 534 | -1.5, 535 | -1.5]), 536 | 'upper_limit': np.array([0.2, 537 | 0.2, 538 | 0.24, 539 | 1.5, 540 | 1.5, 541 | 1.5, 542 | 1.5]) 543 | }, 544 | 'workspace_limits': 545 | { 546 | 'lower_limit': np.array([-0.04, 547 | -0.03, 548 | -0.2]), 549 | 'upper_limit': np.array([0.03, 550 | 0.04, 551 | -0.091]) 552 | }, 553 | 'enable_step_throttling': False, 554 | } 555 | psmEnv = PSMCartesianDDPGEnv(**env_kwargs) 556 | psmEnv.make(root_link) 557 | # psmEnv.world_handle = psmEnv.ambf_client.get_world_handle() 558 | # psmEnv.world_handle.enable_throttling(False) -------------------------------------------------------------------------------- /envs/dVRK/PSM_cartesian_herddpg_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # //============================================================================ 4 | # /* 5 | # Software License Agreement (BSD License) 6 | # Copyright (c) 2019, AMBF 7 | # (www.aimlab.wpi.edu) 8 | 9 | # All rights reserved. 10 | 11 | # Redistribution and use in source and binary forms, with or without 12 | # modification, are permitted provided that the following conditions 13 | # are met: 14 | 15 | # * Redistributions of source code must retain the above copyright 16 | # notice, this list of conditions and the following disclaimer. 17 | 18 | # * Redistributions in binary form must reproduce the above 19 | # copyright notice, this list of conditions and the following 20 | # disclaimer in the documentation and/or other materials provided 21 | # with the distribution. 
22 | 23 | # * Neither the name of authors nor the names of its contributors may 24 | # be used to endorse or promote products derived from this software 25 | # without specific prior written permission. 26 | 27 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 30 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 31 | # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 35 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 37 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 | # POSSIBILITY OF SUCH DAMAGE. 39 | 40 | # \author 41 | # \author , , 42 | # \author Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar 43 | # \version 1.0.0 44 | # */ 45 | # //============================================================================ 46 | 47 | import sys, os, copy, time 48 | 49 | from typing import Iterable, List, Set, Tuple, Dict, Any, Type 50 | 51 | from gym.logger import error 52 | from arl.arl_env import Action, Goal, Observation, ARLEnv 53 | from .PSM_cartesian_env import PSMCartesianEnv, CartesianAction 54 | 55 | import numpy as np 56 | from numpy import linalg as LA 57 | 58 | import gym 59 | from gym import spaces 60 | from gym.utils import seeding 61 | 62 | from psmFK import compute_FK 63 | from transformations import euler_from_matrix 64 | from dvrk_functions.msg import HomogenousTransform 65 | import rospy 66 | from dvrk_functions.srv import * 67 | 68 | 69 | class HERDDPGObservation(Observation): 70 | 71 | def __init__( 72 | self, 73 | state: Dict, 74 | dist: int = 0, 75 | reward: float = 0.0, 76 | prev_reward: float = 0.0, 77 | cur_reward: float = 0.0, 78 | is_done: bool = False, 79 | info: Dict = {}, 80 | sim_step_no: int = 0 81 | ) -> None: 82 | 83 | super(HERDDPGObservation, 84 | self).__init__(state, 85 | dist, 86 | reward, 87 | prev_reward, 88 | cur_reward, 89 | is_done, 90 | info, 91 | sim_step_no) 92 | return 93 | 94 | 95 | class PSMCartesianHERDDPGEnv(PSMCartesianEnv, gym.GoalEnv): 96 | """Single task based environment for PSM to perform debris removal as shown in 97 | the paper: 98 | 99 | TODO: Enter paper doi and name 100 | 101 | The environment performs actions in the cartesian space in R3, with translational 102 | movements in x, y, and z. 
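  Unlike the plain DDPG variant, observations follow the gym.GoalEnv convention: the
  state is a dictionary with 'observation', 'achieved_goal' and 'desired_goal' entries
  so that Hindsight Experience Replay can relabel goals, and the reward is sparse
  (+1 when the achieved goal lies within the goal error margin, -1 otherwise).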
103 | """ 104 | 105 | def __init__( 106 | self, 107 | action_space_limit: float, 108 | goal_position_range: float, 109 | position_error_threshold: float, 110 | goal_error_margin: float, 111 | joint_limits: Dict[str, 112 | np.ndarray or List[str]], 113 | workspace_limits: Dict[str, 114 | np.ndarray or List[str]], 115 | enable_step_throttling: bool, 116 | joints_to_control: List[str] = [ 117 | 'baselink-yawlink', 118 | 'yawlink-pitchbacklink', 119 | 'pitchendlink-maininsertionlink', 120 | 'maininsertionlink-toolrolllink', 121 | 'toolrolllink-toolpitchlink', 122 | 'toolpitchlink-toolgripper1link', 123 | 'toolpitchlink-toolgripper2link' 124 | ], 125 | steps_to_print: int = 10000, 126 | n_actions: int = 3, 127 | n_skip_steps: int = 5, 128 | env_name: str = "PSM_cartesian_ddpg_env" 129 | ) -> None: 130 | """Initialize an object to train with DDPG on the PSM robot. 131 | 132 | Parameters 133 | ---------- 134 | action_space_limit : float 135 | Action space limit for cartesian actions 136 | goal_position_range : int, optional 137 | The variance in goal position 138 | position_error_threshold : float 139 | Maximum acceptable error in cartesian position 140 | goal_error_margin : float 141 | Maximum margin of error for an epoch to be considered successful 142 | joint_limits : Dict(str, List(float) | np.array(float)) 143 | Robot joint limits in radians 144 | workspace_limits : Dict(str, List(float) | np.array(float)) 145 | Workspace limits in x,y, and z for the robots workspace in Cartesian space 146 | enable_step_throttling : bool 147 | Flag to enable throttling of the simulator 148 | joints_to_control : np.array(str) | List(str) 149 | The list of joint links for the psm. 150 | steps_to_print : int 151 | Number of steps before model prints information to stdout 152 | n_actions : int 153 | Number of possible actions 154 | n_skip_steps : int 155 | Number of steps to skip after an update step 156 | env_name : str 157 | Name of the environment to train 158 | """ 159 | super(PSMCartesianHERDDPGEnv, 160 | self).__init__( 161 | action_space_limit, 162 | goal_position_range, 163 | position_error_threshold, 164 | goal_error_margin, 165 | joint_limits, 166 | workspace_limits, 167 | enable_step_throttling, 168 | joints_to_control, 169 | steps_to_print, 170 | n_actions, 171 | n_skip_steps, 172 | env_name 173 | ) 174 | 175 | # Set observation space and constraints 176 | self.obs = HERDDPGObservation( 177 | state={ 178 | 'observation': np.zeros(20), 179 | # Prismatic joint is set to 0.1 to ensure at least some part of robot tip goes past the cannula 180 | 'achieved_goal': np.array([0.0, 181 | 0.0, 182 | 0.1, 183 | 0.0, 184 | 0.0, 185 | 0.0]), 186 | 'desired_goal': np.zeros(6) 187 | } 188 | ) 189 | self.initial_pos = copy.deepcopy(self.obs.cur_observation()[0]) 190 | self.observation_space = spaces.Dict( 191 | dict( 192 | desired_goal=spaces.Box( 193 | -np.inf, 194 | np.inf, 195 | shape=self.initial_pos['achieved_goal'].shape, 196 | dtype='float32' 197 | ), 198 | achieved_goal=spaces.Box( 199 | -np.inf, 200 | np.inf, 201 | shape=self.initial_pos['achieved_goal'].shape, 202 | dtype='float32' 203 | ), 204 | observation=spaces.Box( 205 | -np.inf, 206 | np.inf, 207 | shape=self.initial_pos['observation'].shape, 208 | dtype='float32' 209 | ), 210 | ) 211 | ) 212 | 213 | return 214 | 215 | def compute_reward(self, reached_goal: Any, desired_goal: Any, info: Dict[str, bool]) -> float: 216 | """Function to compute reward received by the agent 217 | """ 218 | # Find the distance between goal and achieved goal 219 | cur_dist 
= None 220 | if isinstance(desired_goal, Goal) and isinstance(reached_goal, Goal): 221 | cur_dist = LA.norm(np.subtract(desired_goal.goal[0:3], reached_goal.goal[0:3])) 222 | elif isinstance(desired_goal, np.ndarray) and isinstance(reached_goal, np.ndarray): 223 | cur_dist = LA.norm(np.subtract(desired_goal[0:3], reached_goal[0:3])) 224 | else: 225 | error_string = 'reached_goal: {}\tdesired_goal: {}'.format( 226 | type(reached_goal), 227 | type(desired_goal) 228 | ) 229 | raise Exception(error_string) 230 | # Continuous reward 231 | # reward = round(1 - float(abs(cur_dist) / 0.05) * 0.5, 5) 232 | # Sparse reward 233 | if abs(cur_dist) < self.goal.goal_error_margin: 234 | reward = 1 235 | else: 236 | reward = -1 237 | self.obs.dist = cur_dist 238 | return reward 239 | 240 | def _sample_goal(self, observation: Observation) -> Goal: 241 | """Function to samples new goal positions and ensures its within the workspace of PSM 242 | """ 243 | rand_val_pos = np.around( 244 | np.add( 245 | observation.state['achieved_goal'][0:3], 246 | self.np_random.uniform( 247 | -self.goal.goal_position_range, 248 | self.goal.goal_position_range, 249 | size=3 250 | ) 251 | ), 252 | decimals=4 253 | ) 254 | rand_val_pos[0] = np.around( 255 | np.clip( 256 | rand_val_pos[0], 257 | self.workspace_limits['lower_limit'][0], 258 | self.workspace_limits['upper_limit'][0] 259 | ), 260 | decimals=4 261 | ) 262 | rand_val_pos[1] = np.around( 263 | np.clip( 264 | rand_val_pos[1], 265 | self.workspace_limits['lower_limit'][1], 266 | self.workspace_limits['upper_limit'][1] 267 | ), 268 | decimals=4 269 | ) 270 | rand_val_pos[2] = np.around( 271 | np.clip( 272 | rand_val_pos[2], 273 | self.workspace_limits['lower_limit'][2], 274 | self.workspace_limits['upper_limit'][2] 275 | ), 276 | decimals=4 277 | ) 278 | # Uncomment below lines if individual limits need to be set for generating desired goal state 279 | ''' 280 | rand_val_pos = self.np_random.uniform(-0.1935, 0.1388, size=3) 281 | rand_val_pos[0] = np.around(np.clip(rand_val_pos[0], -0.1388, 0.1319), decimals=4) 282 | rand_val_pos[1] = np.around(np.clip(rand_val_pos[1], -0.1319, 0.1388), decimals=4) 283 | rand_val_pos[2] = np.around(np.clip(rand_val_pos[2], -0.1935, -0.0477), decimals=4) 284 | rand_val_angle[0] = np.clip(rand_val_angle[0], -0.15, 0.15) 285 | rand_val_angle[1] = np.clip(rand_val_angle[1], -0.15, 0.15) 286 | rand_val_angle[2] = np.clip(rand_val_angle[2], -0.15, 0.15) 287 | ''' 288 | # Provide the range for generating the desired orientation at the terminal state 289 | rand_val_angle = self.np_random.uniform(-1.5, 1.5, size=3) 290 | goal = Goal( 291 | goal=np.concatenate((rand_val_pos, 292 | rand_val_angle), 293 | axis=None), 294 | goal_error_margin=self.goal.goal_error_margin, 295 | goal_position_range=self.goal.goal_position_range 296 | ) 297 | 298 | return goal 299 | 300 | def _update_observation( 301 | self, 302 | end_effector_frame: Any or np.ndarray, 303 | joint_pos: Any or np.ndarray, 304 | joint_vel: Any or np.ndarray 305 | ): 306 | """Update the observation object in this class 307 | """ 308 | # Function ensuring skipped steps based on step throttling 309 | skipped_steps = self.skipped_sim_steps 310 | 311 | # Robot tip cartesian position and orientation 312 | end_effector_pos = end_effector_frame[0:3, 3] 313 | end_effector_euler = np.array(euler_from_matrix(end_effector_frame[0:3, 314 | 0:3], 315 | axes='szyx')).reshape((3, 316 | 1)) 317 | # State vec is 20x1 318 | # [x, y, z, ez, ey, ex, j1, j2, j3, j4, j5, j6, j7, w1, w2, w3, w4, w5, w6, w7] 319 | 
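    # The first six entries (tip position x, y, z and ZYX Euler angles ez, ey, ex)
    # double as the achieved goal below; the remaining entries are the seven joint
    # positions followed by the seven joint velocities.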
achieved_goal = np.asarray( 320 | np.concatenate((end_effector_pos.copy(), 321 | end_effector_euler.copy()), 322 | axis=0) 323 | ).reshape(-1) 324 | obs = np.asarray( 325 | np.concatenate((end_effector_pos, 326 | end_effector_euler, 327 | joint_pos.reshape((7, 328 | 1))), 329 | axis=0) 330 | ) 331 | obs = np.concatenate((obs, joint_vel), axis=None) 332 | # Update the observation object 333 | self.obs.state.update( 334 | observation=obs.copy(), 335 | achieved_goal=achieved_goal.copy(), 336 | desired_goal=self.goal.goal.copy() 337 | ) 338 | # Update the obs info 339 | self.obs.info = self._update_info() 340 | # Compute the reward 341 | achieved_goal = copy.deepcopy(self.goal) 342 | achieved_goal.goal = self.obs.state['achieved_goal'] 343 | self.obs.reward = self.compute_reward(achieved_goal, self.goal, self.obs.info) 344 | self.obs.is_done = self._check_if_done() 345 | 346 | return 347 | 348 | 349 | if __name__ == "__main__": 350 | # Create object of this class 351 | root_link = 'baselink' 352 | env_kwargs = { 353 | 'action_space_limit': 0.05, 354 | 'goal_position_range': 0.05, 355 | 'position_error_threshold': 0.01, 356 | 'goal_error_margin': 0.0075, 357 | 'joint_limits': 358 | { 359 | 'lower_limit': np.array([-0.2, 360 | -0.2, 361 | 0.1, 362 | -1.5, 363 | -1.5, 364 | -1.5, 365 | -1.5]), 366 | 'upper_limit': np.array([0.2, 367 | 0.2, 368 | 0.24, 369 | 1.5, 370 | 1.5, 371 | 1.5, 372 | 1.5]) 373 | }, 374 | 'workspace_limits': 375 | { 376 | 'lower_limit': np.array([-0.04, 377 | -0.03, 378 | -0.2]), 379 | 'upper_limit': np.array([0.03, 380 | 0.04, 381 | -0.091]) 382 | }, 383 | 'enable_step_throttling': False, 384 | } 385 | psmEnv = PSMCartesianHERDDPGEnv(**env_kwargs) 386 | psmEnv.make(root_link) 387 | # psmEnv.world_handle = psmEnv.ambf_client.get_world_handle() 388 | # psmEnv.world_handle.enable_throttling(False) -------------------------------------------------------------------------------- /envs/dVRK/__init__.py: -------------------------------------------------------------------------------- 1 | # from ..arl.arl_env import ARLEnv, Goal, Action, Observation 2 | from .PSM_cartesian_env import PSMCartesianEnv, CartesianAction 3 | from .PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv, DDPGObservation 4 | from .PSM_cartesian_herddpg_env import PSMCartesianHERDDPGEnv, HERDDPGObservation -------------------------------------------------------------------------------- /install/stable_baseline_fix/ddpg.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import os 3 | import time 4 | from collections import deque 5 | import pickle 6 | import warnings 7 | 8 | import gym 9 | import numpy as np 10 | import tensorflow as tf 11 | import tensorflow.contrib as tc 12 | from mpi4py import MPI 13 | 14 | from stable_baselines import logger 15 | from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter 16 | from stable_baselines.common.vec_env import VecEnv 17 | from stable_baselines.common.mpi_adam import MpiAdam 18 | from stable_baselines.common.buffers import ReplayBuffer 19 | from stable_baselines.common.math_util import unscale_action, scale_action 20 | from stable_baselines.common.mpi_running_mean_std import RunningMeanStd 21 | from stable_baselines.ddpg.policies import DDPGPolicy 22 | 23 | 24 | def normalize(tensor, stats): 25 | """ 26 | normalize a tensor using a running mean and std 27 | 28 | :param tensor: (TensorFlow Tensor) the input tensor 29 | :param stats: (RunningMeanStd) the running 
mean and std of the input to normalize 30 | :return: (TensorFlow Tensor) the normalized tensor 31 | """ 32 | if stats is None: 33 | return tensor 34 | return (tensor - stats.mean) / stats.std 35 | 36 | 37 | def denormalize(tensor, stats): 38 | """ 39 | denormalize a tensor using a running mean and std 40 | 41 | :param tensor: (TensorFlow Tensor) the normalized tensor 42 | :param stats: (RunningMeanStd) the running mean and std of the input to normalize 43 | :return: (TensorFlow Tensor) the restored tensor 44 | """ 45 | if stats is None: 46 | return tensor 47 | return tensor * stats.std + stats.mean 48 | 49 | 50 | def reduce_std(tensor, axis=None, keepdims=False): 51 | """ 52 | get the standard deviation of a Tensor 53 | 54 | :param tensor: (TensorFlow Tensor) the input tensor 55 | :param axis: (int or [int]) the axis to itterate the std over 56 | :param keepdims: (bool) keep the other dimensions the same 57 | :return: (TensorFlow Tensor) the std of the tensor 58 | """ 59 | return tf.sqrt(reduce_var(tensor, axis=axis, keepdims=keepdims)) 60 | 61 | 62 | def reduce_var(tensor, axis=None, keepdims=False): 63 | """ 64 | get the variance of a Tensor 65 | 66 | :param tensor: (TensorFlow Tensor) the input tensor 67 | :param axis: (int or [int]) the axis to itterate the variance over 68 | :param keepdims: (bool) keep the other dimensions the same 69 | :return: (TensorFlow Tensor) the variance of the tensor 70 | """ 71 | tensor_mean = tf.reduce_mean(tensor, axis=axis, keepdims=True) 72 | devs_squared = tf.square(tensor - tensor_mean) 73 | return tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims) 74 | 75 | 76 | def get_target_updates(_vars, target_vars, tau, verbose=0): 77 | """ 78 | get target update operations 79 | 80 | :param _vars: ([TensorFlow Tensor]) the initial variables 81 | :param target_vars: ([TensorFlow Tensor]) the target variables 82 | :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) 83 | :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug 84 | :return: (TensorFlow Operation, TensorFlow Operation) initial update, soft update 85 | """ 86 | if verbose >= 2: 87 | logger.info('setting up target updates ...') 88 | soft_updates = [] 89 | init_updates = [] 90 | assert len(_vars) == len(target_vars) 91 | for var, target_var in zip(_vars, target_vars): 92 | if verbose >= 2: 93 | logger.info(' {} <- {}'.format(target_var.name, var.name)) 94 | init_updates.append(tf.assign(target_var, var)) 95 | soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var)) 96 | assert len(init_updates) == len(_vars) 97 | assert len(soft_updates) == len(_vars) 98 | return tf.group(*init_updates), tf.group(*soft_updates) 99 | 100 | 101 | def get_perturbable_vars(scope): 102 | """ 103 | Get the trainable variables that can be perturbed when using 104 | parameter noise. 105 | 106 | :param scope: (str) tensorflow scope of the variables 107 | :return: ([tf.Variables]) 108 | """ 109 | return [var for var in tf_util.get_trainable_vars(scope) if 'LayerNorm' not in var.name] 110 | 111 | 112 | def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verbose=0): 113 | """ 114 | Get the actor update, with noise. 
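    The perturbed actor is assigned the main actor's weights plus zero-mean Gaussian
    noise of standard deviation param_noise_stddev on every perturbable variable;
    non-perturbable variables (e.g. LayerNorm parameters) are copied unchanged.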
115 | 116 | :param actor: (str) the actor 117 | :param perturbed_actor: (str) the pertubed actor 118 | :param param_noise_stddev: (float) the std of the parameter noise 119 | :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug 120 | :return: (TensorFlow Operation) the update function 121 | """ 122 | assert len(tf_util.get_globals_vars(actor)) == len(tf_util.get_globals_vars(perturbed_actor)) 123 | assert len(get_perturbable_vars(actor)) == len(get_perturbable_vars(perturbed_actor)) 124 | 125 | updates = [] 126 | for var, perturbed_var in zip(tf_util.get_globals_vars(actor), tf_util.get_globals_vars(perturbed_actor)): 127 | if var in get_perturbable_vars(actor): 128 | if verbose >= 2: 129 | logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) 130 | # Add Gaussian noise to the parameter 131 | updates.append(tf.assign(perturbed_var, 132 | var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev))) 133 | else: 134 | if verbose >= 2: 135 | logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) 136 | updates.append(tf.assign(perturbed_var, var)) 137 | assert len(updates) == len(tf_util.get_globals_vars(actor)) 138 | return tf.group(*updates) 139 | 140 | 141 | class DDPG(OffPolicyRLModel): 142 | """ 143 | Deep Deterministic Policy Gradient (DDPG) model 144 | 145 | DDPG: https://arxiv.org/pdf/1509.02971.pdf 146 | 147 | :param policy: (DDPGPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) 148 | :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) 149 | :param gamma: (float) the discount factor 150 | :param memory_policy: (ReplayBuffer) the replay buffer 151 | (if None, default to baselines.deepq.replay_buffer.ReplayBuffer) 152 | 153 | .. deprecated:: 2.6.0 154 | This parameter will be removed in a future version 155 | 156 | :param eval_env: (Gym Environment) the evaluation environment (can be None) 157 | :param nb_train_steps: (int) the number of training steps 158 | :param nb_rollout_steps: (int) the number of rollout steps 159 | :param nb_eval_steps: (int) the number of evaluation steps 160 | :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) 161 | :param action_noise: (ActionNoise) the action noise type (can be None) 162 | :param param_noise_adaption_interval: (int) apply param noise every N steps 163 | :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) 164 | :param normalize_returns: (bool) should the critic output be normalized 165 | :param enable_popart: (bool) enable pop-art normalization of the critic output 166 | (https://arxiv.org/pdf/1602.07714.pdf), normalize_returns must be set to True. 
167 | :param normalize_observations: (bool) should the observation be normalized 168 | :param batch_size: (int) the size of the batch for learning the policy 169 | :param observation_range: (tuple) the bounding values for the observation 170 | :param return_range: (tuple) the bounding values for the critic output 171 | :param critic_l2_reg: (float) l2 regularizer coefficient 172 | :param actor_lr: (float) the actor learning rate 173 | :param critic_lr: (float) the critic learning rate 174 | :param clip_norm: (float) clip the gradients (disabled if None) 175 | :param reward_scale: (float) the value the reward should be scaled by 176 | :param render: (bool) enable rendering of the environment 177 | :param render_eval: (bool) enable rendering of the evaluation environment 178 | :param memory_limit: (int) the max number of transitions to store, size of the replay buffer 179 | 180 | .. deprecated:: 2.6.0 181 | Use `buffer_size` instead. 182 | 183 | :param buffer_size: (int) the max number of transitions to store, size of the replay buffer 184 | :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy) 185 | This is not needed for DDPG normally but can help exploring when using HER + DDPG. 186 | This hack was present in the original OpenAI Baselines repo (DDPG + HER) 187 | :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug 188 | :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) 189 | :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance 190 | :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation 191 | :param full_tensorboard_log: (bool) enable additional logging when using tensorboard 192 | WARNING: this logging can take a lot of space quickly 193 | :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). 194 | If None (default), use random seed. Note that if you want completely deterministic 195 | results, you must set `n_cpu_tf_sess` to 1. 196 | :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations 197 | If None, the number of cpu of the current machine will be used. 198 | """ 199 | def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, nb_train_steps=50, 200 | nb_rollout_steps=100, nb_eval_steps=100, param_noise=None, action_noise=None, 201 | normalize_observations=False, tau=0.001, batch_size=128, param_noise_adaption_interval=50, 202 | normalize_returns=False, enable_popart=False, observation_range=(-5., 5.), critic_l2_reg=0., 203 | return_range=(-np.inf, np.inf), actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., 204 | render=False, render_eval=False, memory_limit=None, buffer_size=50000, random_exploration=0.0, 205 | verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, 206 | full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1): 207 | 208 | super(DDPG, self).__init__(policy=policy, env=env, replay_buffer=None, 209 | verbose=verbose, policy_base=DDPGPolicy, 210 | requires_vec_env=False, policy_kwargs=policy_kwargs, 211 | seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) 212 | 213 | # Parameters. 
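        # Typical construction of this patched DDPG class (illustrative sketch only;
        # the policies, wrappers and hyper-parameters actually used for training live
        # in the scripts/dVRK entry points):
        #   from stable_baselines.ddpg.policies import MlpPolicy
        #   model = DDPG(MlpPolicy, env, buffer_size=50000, verbose=1)
        #   model.learn(total_timesteps=100000)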
214 | self.gamma = gamma 215 | self.tau = tau 216 | 217 | # TODO: remove this param in v3.x.x 218 | if memory_policy is not None: 219 | warnings.warn("memory_policy will be removed in a future version (v3.x.x) " 220 | "it is now ignored and replaced with ReplayBuffer", DeprecationWarning) 221 | 222 | if memory_limit is not None: 223 | warnings.warn("memory_limit will be removed in a future version (v3.x.x) " 224 | "use buffer_size instead", DeprecationWarning) 225 | buffer_size = memory_limit 226 | 227 | self.normalize_observations = normalize_observations 228 | self.normalize_returns = normalize_returns 229 | self.action_noise = action_noise 230 | self.param_noise = param_noise 231 | self.return_range = return_range 232 | self.observation_range = observation_range 233 | self.actor_lr = actor_lr 234 | self.critic_lr = critic_lr 235 | self.clip_norm = clip_norm 236 | self.enable_popart = enable_popart 237 | self.reward_scale = reward_scale 238 | self.batch_size = batch_size 239 | self.critic_l2_reg = critic_l2_reg 240 | self.eval_env = eval_env 241 | self.render = render 242 | self.render_eval = render_eval 243 | self.nb_eval_steps = nb_eval_steps 244 | self.param_noise_adaption_interval = param_noise_adaption_interval 245 | self.nb_train_steps = nb_train_steps 246 | self.nb_rollout_steps = nb_rollout_steps 247 | self.memory_limit = memory_limit 248 | self.buffer_size = buffer_size 249 | self.tensorboard_log = tensorboard_log 250 | self.full_tensorboard_log = full_tensorboard_log 251 | self.random_exploration = random_exploration 252 | 253 | # init 254 | self.graph = None 255 | self.stats_sample = None 256 | self.replay_buffer = None 257 | self.policy_tf = None 258 | self.target_init_updates = None 259 | self.target_soft_updates = None 260 | self.critic_loss = None 261 | self.critic_grads = None 262 | self.critic_optimizer = None 263 | self.sess = None 264 | self.stats_ops = None 265 | self.stats_names = None 266 | self.perturbed_actor_tf = None 267 | self.perturb_policy_ops = None 268 | self.perturb_adaptive_policy_ops = None 269 | self.adaptive_policy_distance = None 270 | self.actor_loss = None 271 | self.actor_grads = None 272 | self.actor_optimizer = None 273 | self.old_std = None 274 | self.old_mean = None 275 | self.renormalize_q_outputs_op = None 276 | self.obs_rms = None 277 | self.ret_rms = None 278 | self.target_policy = None 279 | self.actor_tf = None 280 | self.normalized_critic_tf = None 281 | self.critic_tf = None 282 | self.normalized_critic_with_actor_tf = None 283 | self.critic_with_actor_tf = None 284 | self.target_q = None 285 | self.obs_train = None 286 | self.action_train_ph = None 287 | self.obs_target = None 288 | self.action_target = None 289 | self.obs_noise = None 290 | self.action_noise_ph = None 291 | self.obs_adapt_noise = None 292 | self.action_adapt_noise = None 293 | self.terminals_ph = None 294 | self.rewards = None 295 | self.actions = None 296 | self.critic_target = None 297 | self.param_noise_stddev = None 298 | self.param_noise_actor = None 299 | self.adaptive_param_noise_actor = None 300 | self.params = None 301 | self.summary = None 302 | self.tb_seen_steps = None 303 | 304 | self.target_params = None 305 | self.obs_rms_params = None 306 | self.ret_rms_params = None 307 | 308 | if _init_setup_model: 309 | self.setup_model() 310 | 311 | def _get_pretrain_placeholders(self): 312 | policy = self.policy_tf 313 | # Rescale 314 | deterministic_action = unscale_action(self.action_space, self.actor_tf) 315 | return policy.obs_ph, self.actions, 
deterministic_action 316 | 317 | def setup_model(self): 318 | with SetVerbosity(self.verbose): 319 | 320 | assert isinstance(self.action_space, gym.spaces.Box), \ 321 | "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space) 322 | assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \ 323 | "an instance of DDPGPolicy." 324 | 325 | self.graph = tf.Graph() 326 | with self.graph.as_default(): 327 | self.set_random_seed(self.seed) 328 | self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) 329 | 330 | self.replay_buffer = ReplayBuffer(self.buffer_size) 331 | 332 | with tf.variable_scope("input", reuse=False): 333 | # Observation normalization. 334 | if self.normalize_observations: 335 | with tf.variable_scope('obs_rms'): 336 | self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) 337 | else: 338 | self.obs_rms = None 339 | 340 | # Return normalization. 341 | if self.normalize_returns: 342 | with tf.variable_scope('ret_rms'): 343 | self.ret_rms = RunningMeanStd() 344 | else: 345 | self.ret_rms = None 346 | 347 | self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, 348 | **self.policy_kwargs) 349 | 350 | # Create target networks. 351 | self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, 352 | **self.policy_kwargs) 353 | self.obs_target = self.target_policy.obs_ph 354 | self.action_target = self.target_policy.action_ph 355 | 356 | normalized_obs = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms), 357 | self.observation_range[0], self.observation_range[1]) 358 | normalized_next_obs = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms), 359 | self.observation_range[0], self.observation_range[1]) 360 | 361 | if self.param_noise is not None: 362 | # Configure perturbed actor. 363 | self.param_noise_actor = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, 364 | None, **self.policy_kwargs) 365 | self.obs_noise = self.param_noise_actor.obs_ph 366 | self.action_noise_ph = self.param_noise_actor.action_ph 367 | 368 | # Configure separate copy for stddev adoption. 369 | self.adaptive_param_noise_actor = self.policy(self.sess, self.observation_space, 370 | self.action_space, 1, 1, None, 371 | **self.policy_kwargs) 372 | self.obs_adapt_noise = self.adaptive_param_noise_actor.obs_ph 373 | self.action_adapt_noise = self.adaptive_param_noise_actor.action_ph 374 | 375 | # Inputs. 376 | self.obs_train = self.policy_tf.obs_ph 377 | self.action_train_ph = self.policy_tf.action_ph 378 | self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') 379 | self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') 380 | self.actions = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') 381 | self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') 382 | self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') 383 | 384 | # Create networks and core TF parts that are shared across setup parts. 
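                # Under the "model" scope: the actor pi(s), a critic Q(s, a) fed with the
                # action placeholder, and a second critic head Q(s, pi(s)) that reuses the
                # same weights and is maximised by the actor loss; the "target" scope holds
                # the slowly-updated copies used to form the bootstrap target.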
385 | with tf.variable_scope("model", reuse=False): 386 | self.actor_tf = self.policy_tf.make_actor(normalized_obs) 387 | self.normalized_critic_tf = self.policy_tf.make_critic(normalized_obs, self.actions) 388 | self.normalized_critic_with_actor_tf = self.policy_tf.make_critic(normalized_obs, 389 | self.actor_tf, 390 | reuse=True) 391 | # Noise setup 392 | if self.param_noise is not None: 393 | self._setup_param_noise(normalized_obs) 394 | 395 | with tf.variable_scope("target", reuse=False): 396 | critic_target = self.target_policy.make_critic(normalized_next_obs, 397 | self.target_policy.make_actor(normalized_next_obs)) 398 | 399 | with tf.variable_scope("loss", reuse=False): 400 | self.critic_tf = denormalize( 401 | tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), 402 | self.ret_rms) 403 | 404 | self.critic_with_actor_tf = denormalize( 405 | tf.clip_by_value(self.normalized_critic_with_actor_tf, 406 | self.return_range[0], self.return_range[1]), 407 | self.ret_rms) 408 | 409 | q_next_obs = denormalize(critic_target, self.ret_rms) 410 | self.target_q = self.rewards + (1. - self.terminals_ph) * self.gamma * q_next_obs 411 | 412 | tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target)) 413 | if self.full_tensorboard_log: 414 | tf.summary.histogram('critic_target', self.critic_target) 415 | 416 | # Set up parts. 417 | if self.normalize_returns and self.enable_popart: 418 | self._setup_popart() 419 | self._setup_stats() 420 | self._setup_target_network_updates() 421 | 422 | with tf.variable_scope("input_info", reuse=False): 423 | tf.summary.scalar('rewards', tf.reduce_mean(self.rewards)) 424 | tf.summary.scalar('param_noise_stddev', tf.reduce_mean(self.param_noise_stddev)) 425 | 426 | if self.full_tensorboard_log: 427 | tf.summary.histogram('rewards', self.rewards) 428 | tf.summary.histogram('param_noise_stddev', self.param_noise_stddev) 429 | if len(self.observation_space.shape) == 3 and self.observation_space.shape[0] in [1, 3, 4]: 430 | tf.summary.image('observation', self.obs_train) 431 | else: 432 | tf.summary.histogram('observation', self.obs_train) 433 | 434 | with tf.variable_scope("Adam_mpi", reuse=False): 435 | self._setup_actor_optimizer() 436 | self._setup_critic_optimizer() 437 | tf.summary.scalar('actor_loss', self.actor_loss) 438 | tf.summary.scalar('critic_loss', self.critic_loss) 439 | 440 | self.params = tf_util.get_trainable_vars("model") \ 441 | + tf_util.get_trainable_vars('noise/') + tf_util.get_trainable_vars('noise_adapt/') 442 | 443 | self.target_params = tf_util.get_trainable_vars("target") 444 | self.obs_rms_params = [var for var in tf.global_variables() 445 | if "obs_rms" in var.name] 446 | self.ret_rms_params = [var for var in tf.global_variables() 447 | if "ret_rms" in var.name] 448 | 449 | with self.sess.as_default(): 450 | self._initialize(self.sess) 451 | 452 | self.summary = tf.summary.merge_all() 453 | 454 | def _setup_target_network_updates(self): 455 | """ 456 | set the target update operations 457 | """ 458 | init_updates, soft_updates = get_target_updates(tf_util.get_trainable_vars('model/'), 459 | tf_util.get_trainable_vars('target/'), self.tau, 460 | self.verbose) 461 | self.target_init_updates = init_updates 462 | self.target_soft_updates = soft_updates 463 | 464 | def _setup_param_noise(self, normalized_obs): 465 | """ 466 | Setup the parameter noise operations 467 | 468 | :param normalized_obs: (TensorFlow Tensor) the normalized observation 469 | """ 470 | assert self.param_noise is not None 
471 | 472 | with tf.variable_scope("noise", reuse=False): 473 | self.perturbed_actor_tf = self.param_noise_actor.make_actor(normalized_obs) 474 | 475 | with tf.variable_scope("noise_adapt", reuse=False): 476 | adaptive_actor_tf = self.adaptive_param_noise_actor.make_actor(normalized_obs) 477 | 478 | with tf.variable_scope("noise_update_func", reuse=False): 479 | if self.verbose >= 2: 480 | logger.info('setting up param noise') 481 | self.perturb_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise/pi/', self.param_noise_stddev, 482 | verbose=self.verbose) 483 | 484 | self.perturb_adaptive_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise_adapt/pi/', 485 | self.param_noise_stddev, 486 | verbose=self.verbose) 487 | self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) 488 | 489 | def _setup_actor_optimizer(self): 490 | """ 491 | setup the optimizer for the actor 492 | """ 493 | if self.verbose >= 2: 494 | logger.info('setting up actor optimizer') 495 | self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) 496 | actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')] 497 | actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) 498 | if self.verbose >= 2: 499 | logger.info(' actor shapes: {}'.format(actor_shapes)) 500 | logger.info(' actor params: {}'.format(actor_nb_params)) 501 | self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'), 502 | clip_norm=self.clip_norm) 503 | self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999, 504 | epsilon=1e-08) 505 | 506 | def _setup_critic_optimizer(self): 507 | """ 508 | setup the optimizer for the critic 509 | """ 510 | if self.verbose >= 2: 511 | logger.info('setting up critic optimizer') 512 | normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), 513 | self.return_range[0], self.return_range[1]) 514 | self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) 515 | if self.critic_l2_reg > 0.: 516 | critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/') 517 | if 'bias' not in var.name and 'qf_output' not in var.name and 'b' not in var.name] 518 | if self.verbose >= 2: 519 | for var in critic_reg_vars: 520 | logger.info(' regularizing: {}'.format(var.name)) 521 | logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) 522 | critic_reg = tc.layers.apply_regularization( 523 | tc.layers.l2_regularizer(self.critic_l2_reg), 524 | weights_list=critic_reg_vars 525 | ) 526 | self.critic_loss += critic_reg 527 | critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')] 528 | critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) 529 | if self.verbose >= 2: 530 | logger.info(' critic shapes: {}'.format(critic_shapes)) 531 | logger.info(' critic params: {}'.format(critic_nb_params)) 532 | self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'), 533 | clip_norm=self.clip_norm) 534 | self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999, 535 | epsilon=1e-08) 536 | 537 | def _setup_popart(self): 538 | """ 539 | setup pop-art normalization of the critic output 540 | 541 | See https://arxiv.org/pdf/1602.07714.pdf for details. 
542 | Preserving Outputs Precisely, while Adaptively Rescaling Targets”. 543 | """ 544 | self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') 545 | new_std = self.ret_rms.std 546 | self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') 547 | new_mean = self.ret_rms.mean 548 | 549 | self.renormalize_q_outputs_op = [] 550 | for out_vars in [[var for var in tf_util.get_trainable_vars('model/qf/') if 'qf_output' in var.name], 551 | [var for var in tf_util.get_trainable_vars('target/qf/') if 'qf_output' in var.name]]: 552 | assert len(out_vars) == 2 553 | # wieght and bias of the last layer 554 | weight, bias = out_vars 555 | assert 'kernel' in weight.name 556 | assert 'bias' in bias.name 557 | assert weight.get_shape()[-1] == 1 558 | assert bias.get_shape()[-1] == 1 559 | self.renormalize_q_outputs_op += [weight.assign(weight * self.old_std / new_std)] 560 | self.renormalize_q_outputs_op += [bias.assign((bias * self.old_std + self.old_mean - new_mean) / new_std)] 561 | 562 | def _setup_stats(self): 563 | """ 564 | Setup the stat logger for DDPG. 565 | """ 566 | ops = [ 567 | tf.reduce_mean(self.critic_tf), 568 | reduce_std(self.critic_tf), 569 | tf.reduce_mean(self.critic_with_actor_tf), 570 | reduce_std(self.critic_with_actor_tf), 571 | tf.reduce_mean(self.actor_tf), 572 | reduce_std(self.actor_tf) 573 | ] 574 | names = [ 575 | 'reference_Q_mean', 576 | 'reference_Q_std', 577 | 'reference_actor_Q_mean', 578 | 'reference_actor_Q_std', 579 | 'reference_action_mean', 580 | 'reference_action_std' 581 | ] 582 | 583 | if self.normalize_returns: 584 | ops += [self.ret_rms.mean, self.ret_rms.std] 585 | names += ['ret_rms_mean', 'ret_rms_std'] 586 | 587 | if self.normalize_observations: 588 | ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] 589 | names += ['obs_rms_mean', 'obs_rms_std'] 590 | 591 | if self.param_noise: 592 | ops += [tf.reduce_mean(self.perturbed_actor_tf), reduce_std(self.perturbed_actor_tf)] 593 | names += ['reference_perturbed_action_mean', 'reference_perturbed_action_std'] 594 | 595 | self.stats_ops = ops 596 | self.stats_names = names 597 | 598 | def _policy(self, obs, apply_noise=True, compute_q=True): 599 | """ 600 | Get the actions and critic output, from a given observation 601 | 602 | :param obs: ([float] or [int]) the observation 603 | :param apply_noise: (bool) enable the noise 604 | :param compute_q: (bool) compute the critic output 605 | :return: ([float], float) the action and critic value 606 | """ 607 | obs = np.array(obs).reshape((-1,) + self.observation_space.shape) 608 | feed_dict = {self.obs_train: obs} 609 | if self.param_noise is not None and apply_noise: 610 | actor_tf = self.perturbed_actor_tf 611 | feed_dict[self.obs_noise] = obs 612 | else: 613 | actor_tf = self.actor_tf 614 | 615 | if compute_q: 616 | action, q_value = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) 617 | else: 618 | action = self.sess.run(actor_tf, feed_dict=feed_dict) 619 | q_value = None 620 | 621 | action = action.flatten() 622 | if self.action_noise is not None and apply_noise: 623 | noise = self.action_noise() 624 | action += noise 625 | action = np.clip(action, -1, 1) 626 | return action, q_value 627 | 628 | def _store_transition(self, obs, action, reward, next_obs, done): 629 | """ 630 | Store a transition in the replay buffer 631 | 632 | :param obs: ([float] or [int]) the last observation 633 | :param action: ([float]) the action 634 | :param reward: (float] the reward 635 | :param next_obs: 
([float] or [int]) the current observation 636 | :param done: (bool) Whether the episode is over 637 | """ 638 | reward *= self.reward_scale 639 | self.replay_buffer.add(obs, action, reward, next_obs, float(done)) 640 | if self.normalize_observations: 641 | self.obs_rms.update(np.array([obs])) 642 | 643 | def _train_step(self, step, writer, log=False): 644 | """ 645 | run a step of training from batch 646 | 647 | :param step: (int) the current step iteration 648 | :param writer: (TensorFlow Summary.writer) the writer for tensorboard 649 | :param log: (bool) whether or not to log to metadata 650 | :return: (float, float) critic loss, actor loss 651 | """ 652 | # Get a batch 653 | obs, actions, rewards, next_obs, terminals = self.replay_buffer.sample(batch_size=self.batch_size, 654 | env=self._vec_normalize_env) 655 | # Reshape to match previous behavior and placeholder shape 656 | rewards = rewards.reshape(-1, 1) 657 | terminals = terminals.reshape(-1, 1) 658 | 659 | if self.normalize_returns and self.enable_popart: 660 | old_mean, old_std, target_q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_q], 661 | feed_dict={ 662 | self.obs_target: next_obs, 663 | self.rewards: rewards, 664 | self.terminals_ph: terminals 665 | }) 666 | self.ret_rms.update(target_q.flatten()) 667 | self.sess.run(self.renormalize_q_outputs_op, feed_dict={ 668 | self.old_std: np.array([old_std]), 669 | self.old_mean: np.array([old_mean]), 670 | }) 671 | 672 | else: 673 | target_q = self.sess.run(self.target_q, feed_dict={ 674 | self.obs_target: next_obs, 675 | self.rewards: rewards, 676 | self.terminals_ph: terminals 677 | }) 678 | 679 | # Get all gradients and perform a synced update. 680 | ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] 681 | td_map = { 682 | self.obs_train: obs, 683 | self.actions: actions, 684 | self.action_train_ph: actions, 685 | self.rewards: rewards, 686 | self.critic_target: target_q, 687 | self.param_noise_stddev: 0 if self.param_noise is None else self.param_noise.current_stddev 688 | } 689 | if writer is not None: 690 | # run loss backprop with summary if the step_id was not already logged (can happen with the right 691 | # parameters as the step value is only an estimate) 692 | if self.full_tensorboard_log and log and step not in self.tb_seen_steps: 693 | run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 694 | run_metadata = tf.RunMetadata() 695 | summary, actor_grads, actor_loss, critic_grads, critic_loss = \ 696 | self.sess.run([self.summary] + ops, td_map, options=run_options, run_metadata=run_metadata) 697 | 698 | writer.add_run_metadata(run_metadata, 'step%d' % step) 699 | self.tb_seen_steps.append(step) 700 | else: 701 | summary, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run([self.summary] + ops, 702 | td_map) 703 | writer.add_summary(summary, step) 704 | else: 705 | actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, td_map) 706 | 707 | self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr) 708 | self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr) 709 | 710 | return critic_loss, actor_loss 711 | 712 | def _initialize(self, sess): 713 | """ 714 | initialize the model parameters and optimizers 715 | 716 | :param sess: (TensorFlow Session) the current TensorFlow session 717 | """ 718 | self.sess = sess 719 | self.sess.run(tf.global_variables_initializer()) 720 | self.actor_optimizer.sync() 721 | self.critic_optimizer.sync() 722 | 
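When `normalize_returns` and `enable_popart` are both set, `_train_step` above updates the return statistics and then runs `renormalize_q_outputs_op` so the critic's unnormalized predictions are unchanged by the new mean/std (the Pop-Art rescaling built in `_setup_popart`). A small self-contained NumPy check of that invariant, using made-up layer values:

import numpy as np

# Hypothetical last-layer parameters and return statistics (illustrative only).
weight, bias = np.array([[2.0]]), np.array([0.5])
old_mean, old_std = 1.0, 4.0
new_mean, new_std = 2.0, 3.0

features = np.array([[0.7]])  # output of the penultimate critic layer
q_before = (features @ weight + bias) * old_std + old_mean  # de-normalized Q

# Same rescaling as renormalize_q_outputs_op above
weight = weight * old_std / new_std
bias = (bias * old_std + old_mean - new_mean) / new_std
q_after = (features @ weight + bias) * new_std + new_mean

assert np.allclose(q_before, q_after)  # unnormalized prediction is preserved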
self.sess.run(self.target_init_updates) 723 | 724 | def _update_target_net(self): 725 | """ 726 | run target soft update operation 727 | """ 728 | self.sess.run(self.target_soft_updates) 729 | 730 | def _get_stats(self): 731 | """ 732 | Get the mean and standard deviation of the model's inputs and outputs 733 | 734 | :return: (dict) the means and stds 735 | """ 736 | if self.stats_sample is None: 737 | # Get a sample and keep that fixed for all further computations. 738 | # This allows us to estimate the change in value for the same set of inputs. 739 | obs, actions, rewards, next_obs, terminals = self.replay_buffer.sample(batch_size=self.batch_size, 740 | env=self._vec_normalize_env) 741 | self.stats_sample = { 742 | 'obs': obs, 743 | 'actions': actions, 744 | 'rewards': rewards, 745 | 'next_obs': next_obs, 746 | 'terminals': terminals 747 | } 748 | 749 | feed_dict = { 750 | self.actions: self.stats_sample['actions'] 751 | } 752 | 753 | for placeholder in [self.action_train_ph, self.action_target, self.action_adapt_noise, self.action_noise_ph]: 754 | if placeholder is not None: 755 | feed_dict[placeholder] = self.stats_sample['actions'] 756 | 757 | for placeholder in [self.obs_train, self.obs_target, self.obs_adapt_noise, self.obs_noise]: 758 | if placeholder is not None: 759 | feed_dict[placeholder] = self.stats_sample['obs'] 760 | 761 | values = self.sess.run(self.stats_ops, feed_dict=feed_dict) 762 | 763 | names = self.stats_names[:] 764 | assert len(names) == len(values) 765 | stats = dict(zip(names, values)) 766 | 767 | if self.param_noise is not None: 768 | stats = {**stats, **self.param_noise.get_stats()} 769 | 770 | return stats 771 | 772 | def _adapt_param_noise(self): 773 | """ 774 | calculate the adaptation for the parameter noise 775 | 776 | :return: (float) the mean distance for the parameter noise 777 | """ 778 | if self.param_noise is None: 779 | return 0. 780 | 781 | # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. 782 | obs, *_ = self.replay_buffer.sample(batch_size=self.batch_size, env=self._vec_normalize_env) 783 | self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ 784 | self.param_noise_stddev: self.param_noise.current_stddev, 785 | }) 786 | distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ 787 | self.obs_adapt_noise: obs, self.obs_train: obs, 788 | self.param_noise_stddev: self.param_noise.current_stddev, 789 | }) 790 | 791 | mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() 792 | self.param_noise.adapt(mean_distance) 793 | return mean_distance 794 | 795 | def _reset(self): 796 | """ 797 | Reset internal state after an episode is complete. 
798 | """ 799 | if self.action_noise is not None: 800 | self.action_noise.reset() 801 | if self.param_noise is not None: 802 | self.sess.run(self.perturb_policy_ops, feed_dict={ 803 | self.param_noise_stddev: self.param_noise.current_stddev, 804 | }) 805 | 806 | def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DDPG", 807 | reset_num_timesteps=True, replay_wrapper=None): 808 | 809 | new_tb_log = self._init_num_timesteps(reset_num_timesteps) 810 | callback = self._init_callback(callback) 811 | 812 | if replay_wrapper is not None: 813 | self.replay_buffer = replay_wrapper(self.replay_buffer) 814 | 815 | with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ 816 | as writer: 817 | self._setup_learn() 818 | 819 | # a list for tensorboard logging, to prevent logging with the same step number, if it already occured 820 | self.tb_seen_steps = [] 821 | 822 | rank = MPI.COMM_WORLD.Get_rank() 823 | 824 | if self.verbose >= 2: 825 | logger.log('Using agent with the following configuration:') 826 | logger.log(str(self.__dict__.items())) 827 | 828 | eval_episode_rewards_history = deque(maxlen=100) 829 | episode_rewards_history = deque(maxlen=100) 830 | episode_successes = [] 831 | 832 | 833 | with self.sess.as_default(), self.graph.as_default(): 834 | # Prepare everything. 835 | self._reset() 836 | obs = self.env.reset() 837 | # Retrieve unnormalized observation for saving into the buffer 838 | if self._vec_normalize_env is not None: 839 | obs_ = self._vec_normalize_env.get_original_obs().squeeze() 840 | eval_obs = None 841 | if self.eval_env is not None: 842 | eval_obs = self.eval_env.reset() 843 | episode_reward = 0. 844 | episode_step = 0 845 | episodes = 0 846 | step = 0 847 | total_steps = 0 848 | 849 | start_time = time.time() 850 | 851 | epoch_episode_rewards = [] 852 | epoch_episode_steps = [] 853 | epoch_actor_losses = [] 854 | epoch_critic_losses = [] 855 | epoch_adaptive_distances = [] 856 | eval_episode_rewards = [] 857 | eval_qs = [] 858 | epoch_actions = [] 859 | epoch_qs = [] 860 | epoch_episodes = 0 861 | epoch = 0 862 | 863 | callback.on_training_start(locals(), globals()) 864 | 865 | while True: 866 | for _ in range(log_interval): 867 | callback.on_rollout_start() 868 | # Perform rollouts. 869 | for _ in range(self.nb_rollout_steps): 870 | 871 | if total_steps >= total_timesteps: 872 | callback.on_training_end() 873 | return self 874 | 875 | # Predict next action. 876 | action, q_value = self._policy(obs, apply_noise=True, compute_q=True) 877 | assert action.shape == self.env.action_space.shape 878 | 879 | # Execute next action. 
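When `param_noise` is enabled, the action returned by `_policy` above comes from the perturbed copy of the actor that `_reset` re-perturbs at the start of each episode, and `_adapt_param_noise` later tunes the perturbation scale from the distance between perturbed and unperturbed actions (`self.param_noise.adapt(mean_distance)`). Roughly, the adaptation rule behind stable-baselines' `AdaptiveParamNoiseSpec` behaves like the following sketch; the class name and default values here are illustrative, not the library's exact implementation:

class ParamNoiseSketch:
  """Illustrative only: grow/shrink the weight-noise stddev toward a target action change."""

  def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, adoption_coefficient=1.01):
    self.current_stddev = initial_stddev
    self.desired_action_stddev = desired_action_stddev
    self.adoption_coefficient = adoption_coefficient

  def adapt(self, distance):
    if distance > self.desired_action_stddev:
      self.current_stddev /= self.adoption_coefficient  # actions moved too much -> less noise
    else:
      self.current_stddev *= self.adoption_coefficient  # actions barely moved -> more noise

noise = ParamNoiseSketch()
noise.adapt(0.35)  # shrinks current_stddev
noise.adapt(0.05)  # grows current_stddev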
880 | if rank == 0 and self.render: 881 | self.env.render() 882 | 883 | # Randomly sample actions from a uniform distribution 884 | # with a probability self.random_exploration (used in HER + DDPG) 885 | if np.random.rand() < self.random_exploration: 886 | # actions sampled from action space are from range specific to the environment 887 | # but algorithm operates on tanh-squashed actions therefore simple scaling is used 888 | unscaled_action = self.action_space.sample() 889 | action = scale_action(self.action_space, unscaled_action) 890 | else: 891 | # inferred actions need to be transformed to environment action_space before stepping 892 | unscaled_action = unscale_action(self.action_space, action) 893 | 894 | new_obs, reward, done, info = self.env.step(unscaled_action) 895 | 896 | self.num_timesteps += 1 897 | 898 | if callback.on_step() is False: 899 | callback.on_training_end() 900 | return self 901 | 902 | step += 1 903 | total_steps += 1 904 | if rank == 0 and self.render: 905 | self.env.render() 906 | 907 | # Book-keeping. 908 | epoch_actions.append(action) 909 | epoch_qs.append(q_value) 910 | 911 | # Store only the unnormalized version 912 | if self._vec_normalize_env is not None: 913 | new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() 914 | reward_ = self._vec_normalize_env.get_original_reward().squeeze() 915 | else: 916 | # Avoid changing the original ones 917 | obs_, new_obs_, reward_ = obs, new_obs, reward 918 | 919 | self._store_transition(obs_, action, reward_, new_obs_, done) 920 | obs = new_obs 921 | # Save the unnormalized observation 922 | if self._vec_normalize_env is not None: 923 | obs_ = new_obs_ 924 | 925 | episode_reward += reward_ 926 | episode_step += 1 927 | 928 | if writer is not None: 929 | ep_rew = np.array([reward_]).reshape((1, -1)) 930 | ep_done = np.array([done]).reshape((1, -1)) 931 | tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, 932 | writer, self.num_timesteps) 933 | 934 | if done: 935 | # Episode done. 936 | epoch_episode_rewards.append(episode_reward) 937 | episode_rewards_history.append(episode_reward) 938 | epoch_episode_steps.append(episode_step) 939 | episode_reward = 0. 940 | episode_step = 0 941 | epoch_episodes += 1 942 | episodes += 1 943 | 944 | maybe_is_success = info.get('is_success') 945 | if maybe_is_success is not None: 946 | episode_successes.append(float(maybe_is_success)) 947 | 948 | self._reset() 949 | if not isinstance(self.env, VecEnv): 950 | obs = self.env.reset() 951 | 952 | callback.on_rollout_end() 953 | # Train. 954 | epoch_actor_losses = [] 955 | epoch_critic_losses = [] 956 | epoch_adaptive_distances = [] 957 | for t_train in range(self.nb_train_steps): 958 | # Not enough samples in the replay buffer 959 | if not self.replay_buffer.can_sample(self.batch_size): 960 | break 961 | 962 | # Adapt param noise, if necessary. 
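The comments above note that the actor works in the tanh-squashed range [-1, 1] while the environment expects actions in its own Box range; `scale_action`/`unscale_action` are simply the affine maps between the two. A small self-contained sketch, with an assumed symmetric action range standing in for the environment's limits:

import numpy as np

low, high = np.array([-0.05, -0.05, -0.05]), np.array([0.05, 0.05, 0.05])  # assumed Box limits

def to_unit_range(action):        # ~ scale_action: env range -> [-1, 1]
  return 2.0 * (action - low) / (high - low) - 1.0

def to_env_range(scaled_action):  # ~ unscale_action: [-1, 1] -> env range
  return low + 0.5 * (scaled_action + 1.0) * (high - low)

a = np.array([0.01, -0.02, 0.0])
assert np.allclose(to_env_range(to_unit_range(a)), a)  # the two maps are inverses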
963 | if len(self.replay_buffer) >= self.batch_size and \ 964 | t_train % self.param_noise_adaption_interval == 0: 965 | distance = self._adapt_param_noise() 966 | epoch_adaptive_distances.append(distance) 967 | 968 | # weird equation to deal with the fact the nb_train_steps will be different 969 | # to nb_rollout_steps 970 | step = (int(t_train * (self.nb_rollout_steps / self.nb_train_steps)) + 971 | self.num_timesteps - self.nb_rollout_steps) 972 | 973 | critic_loss, actor_loss = self._train_step(step, writer, log=t_train == 0) 974 | epoch_critic_losses.append(critic_loss) 975 | epoch_actor_losses.append(actor_loss) 976 | self._update_target_net() 977 | 978 | # Evaluate. 979 | eval_episode_rewards = [] 980 | eval_qs = [] 981 | if self.eval_env is not None: 982 | eval_episode_reward = 0. 983 | for _ in range(self.nb_eval_steps): 984 | if total_steps >= total_timesteps: 985 | return self 986 | 987 | eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True) 988 | unscaled_action = unscale_action(self.action_space, eval_action) 989 | eval_obs, eval_r, eval_done, _ = self.eval_env.step(unscaled_action) 990 | if self.render_eval: 991 | self.eval_env.render() 992 | eval_episode_reward += eval_r 993 | 994 | eval_qs.append(eval_q) 995 | if eval_done: 996 | if not isinstance(self.env, VecEnv): 997 | eval_obs = self.eval_env.reset() 998 | eval_episode_rewards.append(eval_episode_reward) 999 | eval_episode_rewards_history.append(eval_episode_reward) 1000 | eval_episode_reward = 0. 1001 | 1002 | mpi_size = MPI.COMM_WORLD.Get_size() 1003 | # Not enough samples in the replay buffer 1004 | if not self.replay_buffer.can_sample(self.batch_size): 1005 | continue 1006 | # Log stats. 1007 | # XXX shouldn't call np.mean on variable length lists 1008 | duration = time.time() - start_time 1009 | stats = self._get_stats() 1010 | combined_stats = stats.copy() 1011 | combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) 1012 | combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) 1013 | combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) 1014 | combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) 1015 | combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) 1016 | combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) 1017 | combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) 1018 | if len(epoch_adaptive_distances) != 0: 1019 | combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) 1020 | combined_stats['total/duration'] = duration 1021 | combined_stats['total/steps_per_second'] = float(step) / float(duration) 1022 | combined_stats['total/episodes'] = episodes 1023 | combined_stats['rollout/episodes'] = epoch_episodes 1024 | combined_stats['rollout/actions_std'] = np.std(epoch_actions) 1025 | # Evaluation statistics. 
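The "weird equation" above spreads the `nb_train_steps` gradient updates evenly over the `nb_rollout_steps` environment steps collected in the current cycle, so the TensorBoard x-axis stays in environment steps. A worked example with assumed values in the spirit of the dVRK scripts (`nb_rollout_steps=150`, `nb_train_steps=300`, and `num_timesteps=1500` at the end of a rollout):

nb_rollout_steps, nb_train_steps, num_timesteps = 150, 300, 1500
steps = [int(t * (nb_rollout_steps / nb_train_steps)) + num_timesteps - nb_rollout_steps
         for t in range(nb_train_steps)]
# steps runs from 1350 up to 1499: two updates are logged per environment step,
# covering exactly the window of steps gathered in this rollout.
print(steps[0], steps[-1])  # 1350 1499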
1026 | if self.eval_env is not None: 1027 | combined_stats['eval/return'] = np.mean(eval_episode_rewards) 1028 | combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) 1029 | combined_stats['eval/Q'] = np.mean(eval_qs) 1030 | combined_stats['eval/episodes'] = len(eval_episode_rewards) 1031 | 1032 | def as_scalar(scalar): 1033 | """ 1034 | check and return the input if it is a scalar, otherwise raise ValueError 1035 | 1036 | :param scalar: (Any) the object to check 1037 | :return: (Number) the scalar if x is a scalar 1038 | """ 1039 | if isinstance(scalar, np.ndarray): 1040 | assert scalar.size == 1 1041 | return scalar[0] 1042 | elif np.isscalar(scalar): 1043 | return scalar 1044 | else: 1045 | raise ValueError('expected scalar, got %s' % scalar) 1046 | 1047 | combined_stats_sums = MPI.COMM_WORLD.allreduce( 1048 | np.array([as_scalar(x) for x in combined_stats.values()])) 1049 | combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} 1050 | 1051 | # Total statistics. 1052 | combined_stats['total/epochs'] = epoch + 1 1053 | combined_stats['total/steps'] = step 1054 | 1055 | for key in sorted(combined_stats.keys()): 1056 | logger.record_tabular(key, combined_stats[key]) 1057 | if len(episode_successes) > 0: 1058 | logger.logkv("success rate", np.mean(episode_successes[-100:])) 1059 | logger.dump_tabular() 1060 | logger.info('') 1061 | logdir = logger.get_dir() 1062 | if rank == 0 and logdir: 1063 | if hasattr(self.env, 'get_state'): 1064 | with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: 1065 | pickle.dump(self.env.get_state(), file_handler) 1066 | if self.eval_env and hasattr(self.eval_env, 'get_state'): 1067 | with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: 1068 | pickle.dump(self.eval_env.get_state(), file_handler) 1069 | 1070 | def predict(self, observation, state=None, mask=None, deterministic=True): 1071 | observation = np.array(observation) 1072 | vectorized_env = self._is_vectorized_observation(observation, self.observation_space) 1073 | 1074 | observation = observation.reshape((-1,) + self.observation_space.shape) 1075 | actions, _, = self._policy(observation, apply_noise=not deterministic, compute_q=False) 1076 | actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape 1077 | actions = unscale_action(self.action_space, actions) # scale the output for the prediction 1078 | 1079 | if not vectorized_env: 1080 | actions = actions[0] 1081 | 1082 | return actions, None 1083 | 1084 | def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): 1085 | _ = np.array(observation) 1086 | 1087 | if actions is not None: 1088 | raise ValueError("Error: DDPG does not have action probabilities.") 1089 | 1090 | # here there are no action probabilities, as DDPG does not use a probability distribution 1091 | warnings.warn("Warning: action probability is meaningless for DDPG. 
Returning None") 1092 | return None 1093 | 1094 | def get_parameter_list(self): 1095 | return (self.params + 1096 | self.target_params + 1097 | self.obs_rms_params + 1098 | self.ret_rms_params) 1099 | 1100 | def save(self, save_path, cloudpickle=False): 1101 | data = { 1102 | "observation_space": self.observation_space, 1103 | "action_space": self.action_space, 1104 | "nb_eval_steps": self.nb_eval_steps, 1105 | "param_noise_adaption_interval": self.param_noise_adaption_interval, 1106 | "nb_train_steps": self.nb_train_steps, 1107 | "nb_rollout_steps": self.nb_rollout_steps, 1108 | "verbose": self.verbose, 1109 | "param_noise": self.param_noise, 1110 | "action_noise": self.action_noise, 1111 | "gamma": self.gamma, 1112 | "tau": self.tau, 1113 | "normalize_returns": self.normalize_returns, 1114 | "enable_popart": self.enable_popart, 1115 | "normalize_observations": self.normalize_observations, 1116 | "batch_size": self.batch_size, 1117 | "observation_range": self.observation_range, 1118 | "return_range": self.return_range, 1119 | "critic_l2_reg": self.critic_l2_reg, 1120 | "actor_lr": self.actor_lr, 1121 | "critic_lr": self.critic_lr, 1122 | "clip_norm": self.clip_norm, 1123 | "reward_scale": self.reward_scale, 1124 | "memory_limit": self.memory_limit, 1125 | "buffer_size": self.buffer_size, 1126 | "random_exploration": self.random_exploration, 1127 | "policy": self.policy, 1128 | "n_envs": self.n_envs, 1129 | "n_cpu_tf_sess": self.n_cpu_tf_sess, 1130 | "seed": self.seed, 1131 | "_vectorize_action": self._vectorize_action, 1132 | "policy_kwargs": self.policy_kwargs 1133 | } 1134 | 1135 | params_to_save = self.get_parameters() 1136 | 1137 | self._save_to_file(save_path, 1138 | data=data, 1139 | params=params_to_save, 1140 | cloudpickle=cloudpickle) 1141 | 1142 | @classmethod 1143 | def load(cls, load_path, env=None, custom_objects=None, **kwargs): 1144 | data, params = cls._load_from_file(load_path, custom_objects=custom_objects) 1145 | 1146 | if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']: 1147 | raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. 
" 1148 | "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'], 1149 | kwargs['policy_kwargs'])) 1150 | 1151 | model = cls(None, env, _init_setup_model=False) 1152 | model.__dict__.update(data) 1153 | model.__dict__.update(kwargs) 1154 | model.set_env(env) 1155 | model.setup_model() 1156 | # Patch for version < v2.6.0, duplicated keys where saved 1157 | if len(params) > len(model.get_parameter_list()): 1158 | n_params = len(model.params) 1159 | n_target_params = len(model.target_params) 1160 | n_normalisation_params = len(model.obs_rms_params) + len(model.ret_rms_params) 1161 | # Check that the issue is the one from 1162 | # https://github.com/hill-a/stable-baselines/issues/363 1163 | assert len(params) == 2 * (n_params + n_target_params) + n_normalisation_params,\ 1164 | "The number of parameter saved differs from the number of parameters"\ 1165 | " that should be loaded: {}!={}".format(len(params), len(model.get_parameter_list())) 1166 | # Remove duplicates 1167 | params_ = params[:n_params + n_target_params] 1168 | if n_normalisation_params > 0: 1169 | params_ += params[-n_normalisation_params:] 1170 | params = params_ 1171 | model.load_parameters(params) 1172 | 1173 | return model 1174 | -------------------------------------------------------------------------------- /install/training-pip-requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.14.0 2 | tensorboard==1.14.0 3 | cloudpickle==1.3.0 4 | pyKDL 5 | rospkg 6 | catkin_pkg 7 | matplotlib 8 | stable-baselines[mpi]==2.10.0 -------------------------------------------------------------------------------- /media/AMBF_Robots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WPI-AIM/ambf_rl/79154bc3b1c6743752851829bf04bd6bd9431d24/media/AMBF_Robots.png -------------------------------------------------------------------------------- /media/architecture-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WPI-AIM/ambf_rl/79154bc3b1c6743752851829bf04bd6bd9431d24/media/architecture-overview.png -------------------------------------------------------------------------------- /scripts/dVRK/PSM_cartesian_ddpg_algorithm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # //============================================================================ 4 | # /* 5 | # Software License Agreement (BSD License) 6 | # Copyright (c) 2019, AMBF 7 | # (www.aimlab.wpi.edu) 8 | 9 | # All rights reserved. 10 | 11 | # Redistribution and use in source and binary forms, with or without 12 | # modification, are permitted provided that the following conditions 13 | # are met: 14 | 15 | # * Redistributions of source code must retain the above copyright 16 | # notice, this list of conditions and the following disclaimer. 17 | 18 | # * Redistributions in binary form must reproduce the above 19 | # copyright notice, this list of conditions and the following 20 | # disclaimer in the documentation and/or other materials provided 21 | # with the distribution. 22 | 23 | # * Neither the name of authors nor the names of its contributors may 24 | # be used to endorse or promote products derived from this software 25 | # without specific prior written permission. 
26 | 27 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 30 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 31 | # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 35 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 37 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 | # POSSIBILITY OF SUCH DAMAGE. 39 | 40 | # \author 41 | # \author , , 42 | # \author Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar 43 | # \version 1.0.0 44 | # */ 45 | # //============================================================================ 46 | 47 | # System imports 48 | import os, sys, time, datetime 49 | 50 | # Additional packages 51 | import numpy as np 52 | 53 | # ARL Env 54 | from dVRK.PSM_cartesian_ddpg_env import PSMCartesianDDPGEnv 55 | 56 | # Stable baselines algorithms 57 | from stable_baselines.ddpg.policies import MlpPolicy 58 | from stable_baselines import HER, DDPG 59 | from stable_baselines.common.noise import NormalActionNoise 60 | from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec 61 | from stable_baselines.common.callbacks import CheckpointCallback 62 | 63 | 64 | def redirect_stdout(filepath: str = None): 65 | """Redirect the ouput stream to a file. Also redirect error output stream 66 | """ 67 | cdir = os.getcwd() 68 | basepath = os.path.join(cdir, '.logs') 69 | 70 | if not os.path.exists(basepath): 71 | os.makedirs(basepath) 72 | 73 | if filepath is None: 74 | now = datetime.datetime.now() 75 | filepath = 'log_' + now.strftime("%Y_%m_%d-%H_%M_%S.txt") 76 | filepath = os.path.join(basepath, filepath) 77 | 78 | err_filepath = filepath[:-4] + '_err.txt' 79 | 80 | if os.path.exists(filepath): 81 | filepath = filepath[:-4] 82 | filepath += now.strftime("_%H_%M_%S") + '.txt' 83 | 84 | sys.stdout = open(filepath, 'w') 85 | sys.stderr = open(err_filepath, 'w') 86 | print("Began logging") 87 | return 88 | 89 | 90 | def main(env: PSMCartesianDDPGEnv): 91 | # the noise objects for DDPG 92 | n_actions = env.action.action_space.shape[0] 93 | param_noise = None 94 | action_noise = OrnsteinUhlenbeckActionNoise( 95 | mean=np.zeros(n_actions), 96 | sigma=float(0.5) * np.ones(n_actions) 97 | ) 98 | 99 | model = DDPG( 100 | MlpPolicy, 101 | env, 102 | gamma=0.95, 103 | verbose=1, 104 | nb_train_steps=300, 105 | nb_rollout_steps=150, 106 | param_noise=param_noise, 107 | batch_size=128, 108 | action_noise=action_noise, 109 | random_exploration=0.05, 110 | normalize_observations=True, 111 | tensorboard_log="./ddpg_dvrk_tensorboard/", 112 | observation_range=(-1.5, 113 | 1.5), 114 | critic_l2_reg=0.01 115 | ) 116 | 117 | model.learn( 118 | total_timesteps=4000000, 119 | log_interval=100, 120 | callback=CheckpointCallback(save_freq=100000, 121 | save_path="./ddpg_dvrk_tensorboard/") 122 | ) 123 | model.save("./ddpg_robot_env") 124 | 125 | # NOTE: 126 | # If continuing learning from previous checkpoint, 127 | # Comment above chunk of code {model=DDPG(''') till model.save("./her_robot_env")} and uncomment below lines: 128 | # 
Replace the XXXXX below with the largest number present in (rl_model_) directory ./ddpg_dvrk_tensorboard/ 129 | # remaining_training_steps = 4000000 - XXXXX 130 | # model_log_dir = './ddpg_dvrk_tensorboard/rl_model_XXXXX_steps.zip' 131 | # model = DDPG.load(model_log_dir, env=env) 132 | # # Reset the model 133 | # env.reset() 134 | # model.learn(remaining_training_steps, log_interval=100, 135 | # callback=CheckpointCallback(save_freq=100000, save_path="./ddpg_dvrk_tensorboard/")) 136 | # model.save("./ddpg_robot_env") 137 | 138 | 139 | def load_model(eval_env): 140 | model = DDPG.load('./ddpg_robot_env', env=eval_env) 141 | count = 0 142 | step_num_arr = [] 143 | for _ in range(20): 144 | number_steps = 0 145 | obs = eval_env.reset() 146 | for _ in range(400): 147 | action, _ = model.predict(obs) 148 | obs, reward, done, _ = eval_env.step(action) 149 | number_steps += 1 150 | if done: 151 | step_num_arr.append(number_steps) 152 | count += 1 153 | print("----------------It reached terminal state -------------------") 154 | break 155 | print( 156 | "Robot reached the goal position successfully ", 157 | count, 158 | " times and the Average step count was ", 159 | np.average(np.array(step_num_arr)) 160 | ) 161 | 162 | 163 | if __name__ == '__main__': 164 | # redirect_stdout() 165 | root_link_name = 'baselink' 166 | env_kwargs = { 167 | 'action_space_limit': 0.05, 168 | 'goal_position_range': 0.05, 169 | 'position_error_threshold': 0.01, 170 | 'goal_error_margin': 0.0075, 171 | 'joint_limits': 172 | { 173 | 'lower_limit': np.array([-0.2, 174 | -0.2, 175 | 0.1, 176 | -1.5, 177 | -1.5, 178 | -1.5, 179 | -1.5]), 180 | 'upper_limit': np.array([0.2, 181 | 0.2, 182 | 0.24, 183 | 1.5, 184 | 1.5, 185 | 1.5, 186 | 1.5]) 187 | }, 188 | 'workspace_limits': 189 | { 190 | 'lower_limit': np.array([-0.04, 191 | -0.03, 192 | -0.2]), 193 | 'upper_limit': np.array([0.03, 194 | 0.04, 195 | -0.091]) 196 | }, 197 | 'enable_step_throttling': False, 198 | 'steps_to_print': 10000 199 | } 200 | # Training 201 | ambf_env = PSMCartesianDDPGEnv(**env_kwargs) 202 | time.sleep(5) 203 | ambf_env.make(root_link_name) 204 | ambf_env.reset() 205 | main(env=ambf_env) 206 | ambf_env.ambf_client.clean_up() 207 | # Evaluate learnt policy 208 | eval_env = PSMCartesianDDPGEnv(**env_kwargs) 209 | time.sleep(5) 210 | eval_env.make(root_link_name) 211 | eval_env.reset() 212 | load_model(eval_env=eval_env) 213 | eval_env.ambf_client.clean_up() 214 | -------------------------------------------------------------------------------- /scripts/dVRK/PSM_cartesian_herddpg_algorithm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # //============================================================================ 4 | # /* 5 | # Software License Agreement (BSD License) 6 | # Copyright (c) 2019, AMBF 7 | # (www.aimlab.wpi.edu) 8 | 9 | # All rights reserved. 10 | 11 | # Redistribution and use in source and binary forms, with or without 12 | # modification, are permitted provided that the following conditions 13 | # are met: 14 | 15 | # * Redistributions of source code must retain the above copyright 16 | # notice, this list of conditions and the following disclaimer. 17 | 18 | # * Redistributions in binary form must reproduce the above 19 | # copyright notice, this list of conditions and the following 20 | # disclaimer in the documentation and/or other materials provided 21 | # with the distribution. 
22 | 23 | # * Neither the name of authors nor the names of its contributors may 24 | # be used to endorse or promote products derived from this software 25 | # without specific prior written permission. 26 | 27 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 30 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 31 | # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 35 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 37 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 | # POSSIBILITY OF SUCH DAMAGE. 39 | 40 | # \author 41 | # \author , , 42 | # \author Dhruv Kool Rajamani, Vignesh Manoj Varier, and Adnan Munawar 43 | # \version 1.0.0 44 | # */ 45 | # //============================================================================ 46 | 47 | # System imports 48 | import os, sys, time, datetime 49 | 50 | # Additional packages 51 | import numpy as np 52 | 53 | # ARL Env 54 | from dVRK.PSM_cartesian_herddpg_env import PSMCartesianHERDDPGEnv 55 | 56 | # Stable baselines algorithms 57 | from stable_baselines.ddpg.policies import MlpPolicy 58 | from stable_baselines import HER, DDPG 59 | from stable_baselines.common.noise import NormalActionNoise 60 | from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec 61 | from stable_baselines.bench import Monitor 62 | from stable_baselines.common.callbacks import CallbackList, CheckpointCallback, EvalCallback 63 | 64 | 65 | def redirect_stdout(filepath=None): 66 | # os.makedirs('/home/admin/ambf/.logs') 67 | cdir = os.getcwd() 68 | basepath = os.path.join(cdir, '.logs') 69 | 70 | if not os.path.exists(basepath): 71 | os.makedirs(basepath) 72 | 73 | if filepath is None: 74 | now = datetime.datetime.now() 75 | filepath = 'log_' + now.strftime("%Y_%m_%d-%H_%M_%S.txt") 76 | filepath = os.path.join(basepath, filepath) 77 | 78 | err_filepath = filepath[:-4] + '_err.txt' 79 | 80 | if os.path.exists(filepath): 81 | filepath = filepath[:-4] 82 | filepath += now.strftime("_%H_%M_%S") + '.txt' 83 | 84 | sys.stdout = open(filepath, 'w') 85 | sys.stderr = open(err_filepath, 'w') 86 | print("Began logging") 87 | return 88 | 89 | 90 | def main( 91 | training_env: PSMCartesianHERDDPGEnv, 92 | eval_env: PSMCartesianHERDDPGEnv = None, 93 | log_dir='./.logs/results' 94 | ): 95 | 96 | os.makedirs(log_dir, exist_ok=True) 97 | 98 | # training_env = Monitor(training_env, log_dir) 99 | 100 | n_actions = training_env.action_space.shape[0] 101 | noise_std = 0.2 102 | # Currently using OU noise 103 | action_noise = OrnsteinUhlenbeckActionNoise( 104 | mean=np.zeros(n_actions), 105 | sigma=noise_std * np.ones(n_actions) 106 | ) 107 | model_class = DDPG # works also with SAC, DDPG and TD3 108 | 109 | rl_model_kwargs = { 110 | 'actor_lr': 1e-3, 111 | 'critic_lr': 1e-3, 112 | 'action_noise': action_noise, 113 | 'nb_train_steps': 300, 114 | 'nb_rollout_steps': 100, 115 | 'gamma': 0.95, 116 | 'observation_range': (-1.5, 117 | 1.5), 118 | 'random_exploration': 0.05, 119 | 'normalize_observations': True, 120 | 'critic_l2_reg': 0.01
121 | } 122 | 123 | # Available strategies (cf paper): future, final, episode, random 124 | model = HER( 125 | 'MlpPolicy', 126 | training_env, 127 | model_class, 128 | verbose=1, 129 | n_sampled_goal=4, 130 | goal_selection_strategy='future', 131 | buffer_size=int(1e5), 132 | batch_size=128, 133 | tensorboard_log="./ddpg_dvrk_tensorboard/", 134 | **rl_model_kwargs 135 | ) 136 | # Reset the model 137 | training_env.reset() 138 | # Create callbacks 139 | checkpoint_callback = CheckpointCallback( 140 | save_freq=100000, 141 | save_path="./ddpg_dvrk_tensorboard/" 142 | ) # save_path="./.model/model_checkpoint/") #save_freq=100000 143 | # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model', 144 | # log_path=log_dir, eval_freq=500) 145 | callback = CallbackList([checkpoint_callback]) # , eval_callback]) 146 | # Train the model 147 | model.learn(4000000, log_interval=100, callback=callback) 148 | model.save("./her_robot_env") 149 | 150 | # NOTE: 151 | # If continuing learning from previous checkpoint, 152 | # Comment above chunk of code {model=HER(''') till model.save("./her_robot_env")} and uncomment below lines: 153 | # Replace the XXXXX below with the largest number present in (rl_model_) directory ./ddpg_dvrk_tensorboard/ 154 | # remaining_training_steps = 4000000 - XXXXX 155 | # model_log_dir = './ddpg_dvrk_tensorboard/rl_model_XXXXX_steps.zip' 156 | # model = HER.load(model_log_dir, env=env) 157 | # # Reset the model 158 | # env.reset() 159 | # model.learn(remaining_training_steps, log_interval=100, 160 | # callback=CheckpointCallback(save_freq=100000, save_path="./ddpg_dvrk_tensorboard/")) 161 | # model.save("./her_robot_env") 162 | 163 | 164 | def load_model(eval_env): 165 | # WARNING: you must pass an env 166 | # or wrap your environment with HERGoalEnvWrapper to use the predict method 167 | model = HER.load('./her_robot_env', env=eval_env) 168 | count = 0 169 | step_num_arr = [] 170 | for _ in range(20): 171 | number_steps = 0 172 | obs = eval_env.reset() 173 | for _ in range(400): 174 | action, _ = model.predict(obs) 175 | obs, reward, done, _ = eval_env.step(action) 176 | number_steps += 1 177 | # print(obs['achieved_goal'][0:3], obs['desired_goal'][0:3], reward) 178 | if done: 179 | step_num_arr.append(number_steps) 180 | count += 1 181 | print("----------------It reached terminal state -------------------") 182 | break 183 | print( 184 | "Robot reached the goal position successfully ", 185 | count, 186 | " times and the Average step count was ", 187 | np.average(np.array(step_num_arr)) 188 | ) 189 | 190 | 191 | if __name__ == '__main__': 192 | # redirect_stdout() 193 | ENV_NAME = 'psm/baselink' 194 | env_kwargs = { 195 | 'action_space_limit': 0.05, 196 | 'goal_position_range': 0.05, 197 | 'position_error_threshold': 0.01, 198 | 'goal_error_margin': 0.0075, 199 | 'joint_limits': 200 | { 201 | 'lower_limit': np.array([-0.2, 202 | -0.2, 203 | 0.1, 204 | -1.5, 205 | -1.5, 206 | -1.5, 207 | -1.5]), 208 | 'upper_limit': np.array([0.2, 209 | 0.2, 210 | 0.24, 211 | 1.5, 212 | 1.5, 213 | 1.5, 214 | 1.5]) 215 | }, 216 | 'workspace_limits': 217 | { 218 | 'lower_limit': np.array([-0.04, 219 | -0.03, 220 | -0.2]), 221 | 'upper_limit': np.array([0.03, 222 | 0.04, 223 | -0.091]) 224 | }, 225 | 'enable_step_throttling': False, 226 | 'steps_to_print': 10000 227 | } 228 | # Training 229 | print("Creating training_env") 230 | ambf_env = PSMCartesianHERDDPGEnv(**env_kwargs) 231 | time.sleep(5) 232 | ambf_env.make(ENV_NAME) 233 | ambf_env.reset() 234 | 
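As an alternative to hand-editing the XXXXX placeholders in the NOTE above, the newest `rl_model_<N>_steps.zip` checkpoint written by `CheckpointCallback` can be located programmatically. A hedged sketch follows; the directory and file pattern mirror the defaults used in this script, and the helper name is illustrative:

import glob, os, re

def latest_checkpoint(ckpt_dir="./ddpg_dvrk_tensorboard/"):
  """Return (path, step_count) of the newest rl_model_<N>_steps.zip, or (None, 0)."""
  candidates = []
  for path in glob.glob(os.path.join(ckpt_dir, "rl_model_*_steps.zip")):
    match = re.search(r"rl_model_(\d+)_steps\.zip$", path)
    if match:
      candidates.append((int(match.group(1)), path))
  if not candidates:
    return None, 0
  steps, path = max(candidates)
  return path, steps

# e.g. resume HER training for the remaining budget:
# ckpt_path, done_steps = latest_checkpoint()
# model = HER.load(ckpt_path, env=ambf_env)
# model.learn(4000000 - done_steps, log_interval=100,
#             callback=CheckpointCallback(save_freq=100000, save_path="./ddpg_dvrk_tensorboard/"))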
235 | main(training_env=ambf_env) #, eval_env=eval_env) 236 | ambf_env.ambf_client.clean_up() 237 | 238 | # Evaluate learnt policy 239 | print("Creating eval_env") 240 | eval_env = PSMCartesianHERDDPGEnv(**env_kwargs) 241 | time.sleep(5) 242 | eval_env.make(ENV_NAME) 243 | eval_env.reset() 244 | 245 | load_model(eval_env=eval_env) 246 | eval_env.ambf_client.clean_up() 247 | -------------------------------------------------------------------------------- /scripts/dVRK/bagging.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | arg=$1 4 | 5 | rosbag record -e "/ambf/env/(.*)/(.*)/Command" -O $arg 6 | -------------------------------------------------------------------------------- /scripts/dVRK/launch_ambf.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /opt/ros/melodic/setup.bash 4 | source $AMBF_WS/build/devel/setup.bash 5 | 6 | pushd $AMBF_WS/bin/lin-x86_64/ 7 | ./ambf_simulator -l 5,22 -p 2000 -t 1 -s 2 # -g false 8 | popd 9 | -------------------------------------------------------------------------------- /scripts/dVRK/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python 2 | 3 | import os, sys, shutil 4 | from multiprocessing import Pool 5 | 6 | processes = ( 7 | 'roscore', 8 | 'launch_ambf', 9 | 'psmIK_service', 10 | 'training', 11 | 'tensorboard_launch', 12 | # 'bagging', 13 | 'print_output' 14 | ) 15 | 16 | 17 | def run_process(script_name): 18 | print('Running process: {}'.format(script_name)) 19 | if script_name == 'print_output': 20 | os.system('./{}.bash'.format(script_name)) 21 | elif script_name == 'training': 22 | os.system( 23 | './{}.bash 1> .logs/{}_out.txt 2> .logs/{}_err.txt'.format( 24 | script_name, 25 | script_name, 26 | script_name 27 | ) 28 | ) 29 | else: 30 | os.system('./{}.bash >> .logs/{}.txt 2>&1'.format(script_name, script_name)) # 31 | 32 | 33 | if __name__ == '__main__': 34 | 35 | if os.path.isdir('.logs'): 36 | print('Previous .logs dir exists, deleting it...') 37 | shutil.rmtree('.logs') 38 | 39 | if os.path.isdir('.model'): 40 | print('Previous .model dir exists, deleting it...') 41 | shutil.rmtree('.model') 42 | 43 | if os.path.isdir('ddpg_dvrk_tensorboard'): 44 | print('Previous tensorboard dir exists, deleting it...') 45 | shutil.rmtree('ddpg_dvrk_tensorboard') 46 | 47 | print('Creating new .logs dir...') 48 | os.mkdir('.logs') 49 | 50 | print('Running {} processes: {}'.format(len(processes), processes)) 51 | pool = Pool(processes=len(processes)) 52 | pool.map(run_process, processes) 53 | -------------------------------------------------------------------------------- /scripts/dVRK/print_output.bash: -------------------------------------------------------------------------------- 1 | if [ -f .logs/training_out.txt ]; then 2 | tail -f .logs/training_out.txt 3 | else 4 | echo "training_out.txt has not been generated yet, trying again in some time." 
5 | fi 6 | -------------------------------------------------------------------------------- /scripts/dVRK/psmIK_service.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /opt/ros/melodic/setup.bash 4 | source $AMBF_WS/build/devel/setup.bash 5 | 6 | rosrun dvrk_ambf_extensions psmIK_service.py -------------------------------------------------------------------------------- /scripts/dVRK/roscore.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 4 | 5 | source /opt/ros/melodic/setup.bash 6 | roscore 7 | -------------------------------------------------------------------------------- /scripts/dVRK/tensorboard_launch.bash: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | tensorboard --logdir $AMBF_RL_WS/scripts/dVRK/ddpg_dvrk_tensorboard --port 12346 -------------------------------------------------------------------------------- /scripts/dVRK/training.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /opt/ros/melodic/setup.bash 3 | source $AMBF_WS/build/devel/setup.bash 4 | 5 | python $AMBF_RL_WS/scripts/dVRK/PSM_cartesian_herddpg_algorithm.py 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | setup( 6 | name='AMBF_RL', 7 | version='1.0.0', 8 | description='AMBF Reinforcement Learning Toolkit', 9 | author='Dhruv Kool Rajamani', 10 | author_email='dkoolrajamani@wpi.edu', 11 | package_dir={'': 'envs'}, 12 | packages=['arl', 13 | 'dVRK'] 14 | ) 15 | -------------------------------------------------------------------------------- /utilities/Create Expert Dataset/examples/dVRK/move_ambf_psm_from_rosbagdata.py: -------------------------------------------------------------------------------- 1 | from ambf_client import Client 2 | import numpy as np 3 | import time 4 | import pandas as pd 5 | from psmFK import compute_FK 6 | from psmIK import * 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | csv_path = "path_to_csv_directory" 12 | trajectory_data_values = pd.read_csv(csv_path).to_numpy() 13 | data = trajectory_data_values 14 | # data = np.load(file_dir + 'rosbag_data.npy') 15 | print(data.shape) 16 | print(data[0]) 17 | _client = Client() 18 | _client.connect() 19 | print(_client.get_obj_names()) 20 | psm_handle = _client.get_obj_handle('psm/baselink') 21 | time.sleep(5) 22 | joint_pos = np.zeros(7) 23 | count = 0 24 | error_threshold = 0.01 25 | # desired_pos = np.array([[0.5, -0.33, 0.149, 0, 0, 0., 0.], 26 | # [-1.605, 0.026, 0.1, 1.569, -1.572, 0., -0.]]) 27 | joint_thresh = joint_pos 28 | counter = 0 29 | # Example for dvrk robot 30 | joints_to_control = np.array(['baselink-yawlink', 31 | 'yawlink-pitchbacklink', 32 | 'pitchendlink-maininsertionlink', 33 | 'maininsertionlink-toolrolllink', 34 | 'toolrolllink-toolpitchlink', 35 | 'toolpitchlink-toolgripper1link', 36 | 'toolpitchlink-toolgripper2link']) 37 | 38 | current_end_effector_frame = compute_FK(np.zeros(7)) 39 | desired_end_effector_frame = current_end_effector_frame 40 | for i in range(0, data.shape[0]): 41 | for j in range(3): 42 | # print("data ij ", data[i, j]) 43 | desired_end_effector_frame[j, 3] = round(data[i, j], 3) 44 | 
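For context on the indexing above: `compute_FK` returns a 4x4 homogeneous transform, so writing `frame[j, 3]` for j in 0..2 replaces only the translation column while keeping the rotation block of the current pose. A standalone sketch with an identity rotation and made-up Cartesian values:

import numpy as np

desired_frame = np.eye(4)                       # 3x3 rotation block plus translation column
desired_frame[0:3, 3] = [0.01, -0.02, -0.12]    # tool-tip goal position in metres (illustrative)
# desired_frame would then be passed to compute_IK via convert_mat_to_frame,
# exactly as the loop does on the next line.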
desired_joint_pos = np.around(compute_IK(convert_mat_to_frame(desired_end_effector_frame)), 3) 45 | desired_joint_pos = np.append(desired_joint_pos, 0) 46 | # print(desired_joint_pos) 47 | count = 0 48 | time.sleep(0.25) 49 | while True: 50 | 51 | for joint_idx, jt_name in enumerate(joints_to_control): 52 | psm_handle.set_joint_pos(jt_name, desired_joint_pos[joint_idx]) 53 | joint_pos[joint_idx] = round(psm_handle.get_joint_pos(jt_name), 3) 54 | 55 | error_in_pos = np.around(np.subtract(desired_joint_pos, joint_pos), decimals=3) 56 | # print("Error in pos ", error_in_pos) 57 | count += 1 58 | if np.all(np.abs(error_in_pos) <= 0.01) or count > 75: 59 | break 60 | -------------------------------------------------------------------------------- /utilities/Create Expert Dataset/generate_expert_dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import csv 4 | import math 5 | import rosbag 6 | import rospy 7 | from numpy import linalg as LA 8 | 9 | 10 | def getData(rosbag_path, ros_topic_name, pos=True): 11 | cur_bag = rosbag.Bag(rosbag_path) 12 | 13 | t_start = rospy.Time(cur_bag.get_start_time()) 14 | t_end = rospy.Time(cur_bag.get_end_time()) 15 | 16 | num_msgs = cur_bag.get_message_count(ros_topic_name) 17 | # For position 18 | if pos: 19 | data = np.zeros(shape=(num_msgs, 3)) 20 | # For orientation (quaternion) 21 | else: 22 | data = np.zeros(shape=(num_msgs, 4)) 23 | idx = 0 24 | 25 | for top, msg, t in cur_bag.read_messages(topics=[ros_topic_name], 26 | start_time=t_start, 27 | end_time=t_end): 28 | if pos: 29 | cur = np.array([msg.pose.position.x, msg.pose.position.y, msg.pose.position.z]) 30 | else: 31 | cur = np.array([msg.pose.orientation.x, msg.pose.orientation.y, 32 | msg.pose.orientation.z, msg.pose.orientation.w]) 33 | data[idx, :] = cur 34 | idx = idx + 1 35 | 36 | return data 37 | 38 | 39 | def create_state_action_pairs(data): 40 | steps = [] 41 | fixed = [] 42 | curr = data[0] 43 | steps.append(curr-curr) 44 | fixed.append(np.round(curr, 3)) 45 | for j in range(0, len(data)-1): 46 | curr = round_nearest(data[j], 0.005) 47 | nxt = round_nearest(data[j+1], 0.005) 48 | interval = np.round(nxt-curr, 3) 49 | if abs(interval[0]) > 0 or abs(interval[1]) > 0 or abs(interval[2]) > 0: 50 | steps.append(interval) 51 | fixed.append(curr) 52 | 53 | return np.array(fixed), steps 54 | 55 | 56 | def make_file(robot_states, robot_actions, csv_name): 57 | with open(csv_name + '.csv', mode='w') as f: 58 | writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 59 | for robot_state, robot_action in zip(robot_states, robot_actions): 60 | writer.writerow([robot_state[0], robot_state[1], robot_state[2], robot_action[0], 61 | robot_action[1], robot_action[2]]) 62 | 63 | 64 | def round_nearest(x, a): 65 | return np.round(np.round(x / a) * a, -int(math.floor(math.log10(a)))) 66 | 67 | 68 | def create_reward(data_values, goal_pos, file_dir): 69 | total_reward = 0 70 | list_of_reward = [] 71 | print(data_values.shape) 72 | for i in range(data_values.shape[0]): 73 | # dist_x = np.subtract(final_state, actual_val[i, 0]) 74 | # dist_y = np.subtract(final_state, actual_val[i, 1]) 75 | # dist_z = np.subtract(final_state, actual_val[i, 2]) 76 | # reward = math.sqrt(math.pow(dist_x, 2) + math.pow(dist_y, 2) + math.pow(dist_z, 2)) 77 | cur_dist = LA.norm(np.subtract(goal_pos, data_values[i, 0:3])) 78 | # reward = round(1 - float(abs(cur_dist)/0.05)*0.5, 5) 79 | reward = -cur_dist 80 | if 
abs(cur_dist) < 0.000001: 81 | reward = 1 82 | 83 | total_reward += reward 84 | list_of_reward.append(reward) 85 | print(total_reward) 86 | # np.savetxt(file_dir + 'episode_returns.csv', total_reward, delimiter=',') 87 | # np.savetxt(file_dir + 'rewards.csv', np.array(list_of_reward), delimiter=',') 88 | np.save(file_dir + 'rewards', np.array(list_of_reward)) 89 | np.save(file_dir + 'episode_returns', total_reward) 90 | 91 | 92 | def create_obs_action_func(data_values, file_dir): 93 | obs = [] 94 | actions = [] 95 | print(data_values.shape) 96 | for i in range(data_values.shape[0]): 97 | obs.append(data_values[i, 0:3]) 98 | actions.append(data_values[i, 3:6]) 99 | print(obs) 100 | np.save(file_dir + 'observations', np.array(obs)) 101 | np.save(file_dir + 'actions', np.array(actions)) 102 | 103 | 104 | if __name__ == "__main__": 105 | file_dir = "path_to_output_directory"  # placeholder (assumed): prefix for the generated .npy files; was previously undefined but used below 106 | rosbag_file_dir = "path_to_rosbag file" 107 | rostopic_name = "/dvrk/PSM2/position_cartesian_current" 108 | csv_file_name = "path_to_csv_file" 109 | # Read rosbag data 110 | data = getData(rosbag_file_dir, rostopic_name) 111 | # Create state action pairs 112 | robot_state, robot_action = create_state_action_pairs(data) 113 | # Write it to a csv file 114 | make_file(robot_state, robot_action, csv_file_name) 115 | # Assuming reward is reward = round(1 - float(abs(cur_dist)/0.3)*0.5, 5) 116 | traj_csv_name = "path_to_where_rosbag_file_is_stored" 117 | trajectory_data_values = pd.read_csv(traj_csv_name).to_numpy() 118 | final_state = np.array([0.005, 0.054, -0.122]) 119 | # create_obs_action_func(trajectory_data_values, file_dir) 120 | # create_reward(trajectory_data_values, final_state, file_dir) 121 | epi_start = np.array([True]) 122 | for i in range(trajectory_data_values.shape[0]): 123 | epi_start = np.append(epi_start, False) 124 | np.save(file_dir + 'episode_starts', epi_start) 125 | 126 | 127 | 128 | --------------------------------------------------------------------------------
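The arrays saved by this script (observations, actions, rewards, episode_returns, episode_starts) line up with the fields expected by stable-baselines' expert-dataset utilities. A hedged sketch of packing them into a single `.npz` for behaviour-cloning pretraining, assuming the stable-baselines 2.10 key names ('obs', 'actions', 'rewards', 'episode_returns', 'episode_starts') and reusing the placeholder output directory from above:

import numpy as np
from stable_baselines.gail import ExpertDataset

file_dir = "path_to_output_directory"  # same placeholder prefix used by the generator script

numpy_dict = {
    'obs': np.load(file_dir + 'observations.npy'),
    'actions': np.load(file_dir + 'actions.npy'),
    'rewards': np.load(file_dir + 'rewards.npy'),
    'episode_returns': np.atleast_1d(np.load(file_dir + 'episode_returns.npy')),
    'episode_starts': np.load(file_dir + 'episode_starts.npy')
}
np.savez(file_dir + 'expert_psm_trajectory.npz', **numpy_dict)

# dataset = ExpertDataset(expert_path=file_dir + 'expert_psm_trajectory.npz', traj_limitation=-1)
# model.pretrain(dataset, n_epochs=1000)  # e.g. warm-start a DDPG/HER model before RL training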