├── .ipynb_checkpoints └── plot-checkpoint.ipynb ├── README.md ├── __pycache__ ├── reacher_sawyer_env_boundingbox.cpython-36.pyc └── wrapper2.cpython-36.pyc ├── arms └── Sawyer.ttm ├── data ├── reward_log_dense.npy ├── reward_log_dense_aug.npy └── reward_log_sparse.npy ├── figures ├── comparison.pdf ├── comparison.png ├── reacher.gif ├── reacher.png └── training.png ├── hands ├── BaxterGripper.ttm └── JacoHand.ttm ├── model ├── sac_multi_policy ├── sac_multi_q1 ├── sac_multi_q2 └── trained_model │ ├── augmented_dense_reward │ ├── sac_multi_policy │ ├── sac_multi_q1 │ └── sac_multi_q2 │ └── dense_reward │ ├── sac_multi_policy │ ├── sac_multi_q1 │ └── sac_multi_q2 ├── objects └── table.ttm ├── plot.ipynb ├── reward_log.npy ├── sac_learn.py ├── sawyer_grasp_env_boundingbox.py ├── scenes ├── sawyer_reacher_rl_new.ttt └── sawyer_reacher_rl_new_ik.ttt └── training.pdf /.ipynb_checkpoints/plot-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Display Training Curve" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 10, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "image/png": "\n", 18 | "text/plain": [ 19 | "
" 20 | ] 21 | }, 22 | "metadata": { 23 | "needs_background": "light" 24 | }, 25 | "output_type": "display_data" 26 | } 27 | ], 28 | "source": [ 29 | "import numpy as np\n", 30 | "from matplotlib import pyplot as plt\n", 31 | "reward = np.load('reward_log.npy')\n", 32 | "\n", 33 | "def smooth(y, radius=100, mode='two_sided'):\n", 34 | " if len(y) < 2*radius+1:\n", 35 | " return np.ones_like(y) * y.mean()\n", 36 | " elif mode == 'two_sided':\n", 37 | " convkernel = np.ones(2 * radius+1)\n", 38 | " return np.convolve(y, convkernel, mode='same') / \\\n", 39 | " np.convolve(np.ones_like(y), convkernel, mode='same')\n", 40 | " elif mode == 'causal':\n", 41 | " convkernel = np.ones(radius)\n", 42 | " out = np.convolve(y, convkernel,mode='full') / \\\n", 43 | " np.convolve(np.ones_like(y), convkernel, mode='full')\n", 44 | " return out[:-radius+1]\n", 45 | "\n", 46 | " \n", 47 | "def moving_sum(y, window=100):\n", 48 | " c = y.cumsum()\n", 49 | " c[window:] = c[window:] - c[:-window]\n", 50 | " return c/float(window)\n", 51 | " \n", 52 | "success_list=np.zeros(len(reward))\n", 53 | "success_list[np.where(reward>-0)]=1 # reward larger than 0 indicates successful grasping\n", 54 | "\n", 55 | "early_stop=4500\n", 56 | "\n", 57 | "fig, axs = plt.subplots(2)\n", 58 | "# plot smoothed reward curve\n", 59 | "axs[0].plot(smooth(reward[:early_stop], radius=100))\n", 60 | "axs[0].set_title('Learning Curve')\n", 61 | "axs[0].set_ylabel('Smoothed Reward')\n", 62 | "axs[0].grid()\n", 63 | "\n", 64 | "axs[1].plot(moving_sum(success_list[:early_stop]))\n", 65 | "axs[1].set_xlabel('Training Episode')\n", 66 | "axs[1].set_ylabel('Success Rate')\n", 67 | "axs[1].grid()\n", 68 | "plt.tight_layout()\n", 69 | "plt.savefig('training.pdf')\n", 70 | "plt.show()\n" 71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 2 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython2", 90 | "version": "2.7.15" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chapter 16: Robot Learning in Simulation (Project 4) 2 | ## Description: 3 | Example of Sawyer robot learning to reach the target with paralleled Soft Actor-Critic (SAC) algorithm, using PyRep for Sawyer robot simulation and game building. The environment is wrapped into OpenAI Gym format. 4 |

5 | 6 |

7 | 8 | ## Dependencies: 9 | * [V-REP 3.6.2](http://www.coppeliarobotics.com/previousVersions) 10 | * [PyRep](https://github.com/deep-reinforcement-learning-book/PyRep) 11 | * PyTorch 12 | 13 | Note: 14 | * The later version of V-REP 3.6.2 is renamed CoppeliaSim after verison 4.0.0, which may have some incompatible issues with PyRep during the process of this project, so we suggest to use V-REP 3.6.2 [here](http://www.coppeliarobotics.com/previousVersions) and the maintained PyRep in our repository. 15 | * The official repository of PyRep is [here](https://github.com/stepjam/PyRep), but we maintain a stable version [here](https://github.com/deep-reinforcement-learning-book/PyRep) in our repository for supporting V-REP 3.6.2, please use the version we provide ([here](https://github.com/deep-reinforcement-learning-book/PyRep)) for avoiding unnecessary incompatibility. 16 | 17 | ## Contents: 18 | * `arms/`: object models of arms; 19 | * `hands/`: object models of grippers; 20 | * `objects/`: models of other objects in the scene; 21 | * `scenes/`: built scenes for Sawyer robot grasping; 22 | * `figures/`: figures for displaying; 23 | * `model/`: the model after training, and two pre-trained models with different reward functions; 24 | * `data/`: reward logs of with different reward functions; 25 | * `sawyer_grasp_env_boundingbox.py`: script of Sawyer robot grasping environment; 26 | * `sac_learn.py`: pralleled Soft Actor-Critic algorithm for solving Sawyer robot grasping task; 27 | * `reward_log.npy`: log of episode reward during training; 28 | * `plot.ipynb`: displaying the learning curves. 29 | 30 | 31 | ## Usage: 32 | 0. First check the environment can run successfully: 33 | 34 | `$ python sawyer_grasp_env_boundingbox.py` 35 | 36 | If it works properly with VRep called to run a scene, with Sawyer robot arm moving randomly, then go to next step; otherwise check the dependencies for necessary packages and versions. 37 | 1. Run `$ python sac_learn.py --train` for training the policy 38 | 39 | 2. Run `$ python sac_learn.py --test` for testing the trained policy, remember to change the `trained_model_path`, which is default to be the trained model we provided. 40 | 41 | 3. The training process will provide a `reward_log.npy` file for recording the reward value during training, which can be displayed with `$ jupyter notebook` in a new terminal, choose `plot.ipynb`and Shift+Enter to run the first cell, shown as follows: 42 |

43 | 44 |

45 | 46 | ## Authors: 47 | [Zihan Ding](https://github.com/quantumiracle), [Yanhua Huang](https://github.com/Officium) 48 | 49 | 50 | ## Citing: 51 | 52 | ``` 53 | @misc{DeepReinforcementLearning-Chapter16-RobotLearninginSimulation, 54 | author = {Zihan Ding, Yanhua Huang}, 55 | title = {Chapter16-RobotLearninginSimulation}, 56 | year = {2019}, 57 | publisher = {GitHub}, 58 | journal = {GitHub repository}, 59 | howpublished = {\url{https://github.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation}}, 60 | } 61 | ``` 62 | 63 | or 64 | 65 | ``` 66 | @book{deepRL-2020, 67 | title={Deep Reinforcement Learning: Fundamentals, Research, and Applications}, 68 | editor={Hao Dong, Zihan Ding, Shanghang Zhang}, 69 | author={Hao Dong, Zihan Ding, Shanghang Zhang, Hang Yuan, Hongming Zhang, Jingqing Zhang, Yanhua Huang, Tianyang Yu, Huaqing Zhang, Ruitong Huang}, 70 | publisher={Springer Nature}, 71 | note={\url{http://www.deepreinforcementlearningbook.org}}, 72 | year={2020} 73 | } 74 | ``` 75 | 76 | -------------------------------------------------------------------------------- /__pycache__/reacher_sawyer_env_boundingbox.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/__pycache__/reacher_sawyer_env_boundingbox.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/wrapper2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/__pycache__/wrapper2.cpython-36.pyc -------------------------------------------------------------------------------- /arms/Sawyer.ttm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/arms/Sawyer.ttm -------------------------------------------------------------------------------- /data/reward_log_dense.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/data/reward_log_dense.npy -------------------------------------------------------------------------------- /data/reward_log_dense_aug.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/data/reward_log_dense_aug.npy -------------------------------------------------------------------------------- /data/reward_log_sparse.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/data/reward_log_sparse.npy -------------------------------------------------------------------------------- /figures/comparison.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/figures/comparison.pdf -------------------------------------------------------------------------------- /figures/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/figures/comparison.png -------------------------------------------------------------------------------- /figures/reacher.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/figures/reacher.gif -------------------------------------------------------------------------------- /figures/reacher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/figures/reacher.png -------------------------------------------------------------------------------- /figures/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/figures/training.png -------------------------------------------------------------------------------- /hands/BaxterGripper.ttm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/hands/BaxterGripper.ttm -------------------------------------------------------------------------------- /hands/JacoHand.ttm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/hands/JacoHand.ttm -------------------------------------------------------------------------------- /model/sac_multi_policy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/sac_multi_policy -------------------------------------------------------------------------------- /model/sac_multi_q1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/sac_multi_q1 -------------------------------------------------------------------------------- /model/sac_multi_q2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/sac_multi_q2 -------------------------------------------------------------------------------- /model/trained_model/augmented_dense_reward/sac_multi_policy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/trained_model/augmented_dense_reward/sac_multi_policy -------------------------------------------------------------------------------- /model/trained_model/augmented_dense_reward/sac_multi_q1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/trained_model/augmented_dense_reward/sac_multi_q1 -------------------------------------------------------------------------------- /model/trained_model/augmented_dense_reward/sac_multi_q2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/trained_model/augmented_dense_reward/sac_multi_q2 -------------------------------------------------------------------------------- /model/trained_model/dense_reward/sac_multi_policy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/trained_model/dense_reward/sac_multi_policy -------------------------------------------------------------------------------- /model/trained_model/dense_reward/sac_multi_q1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/trained_model/dense_reward/sac_multi_q1 -------------------------------------------------------------------------------- /model/trained_model/dense_reward/sac_multi_q2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/model/trained_model/dense_reward/sac_multi_q2 -------------------------------------------------------------------------------- /objects/table.ttm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/objects/table.ttm -------------------------------------------------------------------------------- /plot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Display Training Curve" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 85, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "image/png": "\n", 18 | "text/plain": [ 19 | "
" 20 | ] 21 | }, 22 | "metadata": { 23 | "needs_background": "light" 24 | }, 25 | "output_type": "display_data" 26 | } 27 | ], 28 | "source": [ 29 | "import numpy as np\n", 30 | "from matplotlib import pyplot as plt\n", 31 | "reward = np.load('reward_log.npy')\n", 32 | "\n", 33 | "def smooth(y, radius=10, mode='two_sided'):\n", 34 | " if len(y) < 2*radius+1:\n", 35 | " return np.ones_like(y) * y.mean()\n", 36 | " elif mode == 'two_sided':\n", 37 | " convkernel = np.ones(2 * radius+1)\n", 38 | " return np.convolve(y, convkernel, mode='same') / \\\n", 39 | " np.convolve(np.ones_like(y), convkernel, mode='same')\n", 40 | " elif mode == 'causal':\n", 41 | " convkernel = np.ones(radius)\n", 42 | " out = np.convolve(y, convkernel,mode='full') / \\\n", 43 | " np.convolve(np.ones_like(y), convkernel, mode='full')\n", 44 | " return out[:-radius+1]\n", 45 | "\n", 46 | " \n", 47 | "def moving_sum(y, window=100):\n", 48 | " c = y.cumsum()\n", 49 | " c[window:] = c[window:] - c[:-window]\n", 50 | " return c/float(window)\n", 51 | "\n", 52 | "def success_filter(r, threshold=4):\n", 53 | " success_list=np.zeros(len(r))\n", 54 | " success_list[np.where(r>threshold)]=1 # reward larger than threshold indicates successful grasping\n", 55 | " return success_list\n", 56 | " \n", 57 | "success_list=np.zeros(len(reward))\n", 58 | "success_list[np.where(reward>4)]=1 # reward larger than 0 indicates successful grasping\n", 59 | "\n", 60 | "early_stop=400000\n", 61 | "\n", 62 | "fig, axs = plt.subplots(2)\n", 63 | "# plot smoothed reward curve\n", 64 | "axs[0].plot(smooth(reward[:early_stop], radius=100))\n", 65 | "axs[0].set_title('Learning Curve')\n", 66 | "axs[0].set_ylabel('Smoothed Reward')\n", 67 | "axs[0].grid()\n", 68 | "\n", 69 | "axs[1].plot(moving_sum(success_filter(reward)[:early_stop]))\n", 70 | "axs[1].set_xlabel('Training Episode')\n", 71 | "axs[1].set_ylabel('Success Rate')\n", 72 | "axs[1].grid()\n", 73 | "plt.tight_layout()\n", 74 | "plt.savefig('training.pdf')\n", 75 | "plt.show()\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Comparision with different reward functions" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 81, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "image/png": "\n", 93 | "text/plain": [ 94 | "
" 95 | ] 96 | }, 97 | "metadata": { 98 | "needs_background": "light" 99 | }, 100 | "output_type": "display_data" 101 | } 102 | ], 103 | "source": [ 104 | "import numpy as np\n", 105 | "from matplotlib import pyplot as plt\n", 106 | "file='./data/'\n", 107 | "reward_sparse = np.load(file+'reward_log_sparse.npy')\n", 108 | "reward_dense = np.load(file+'reward_log_dense.npy')\n", 109 | "reward_dense_aug = np.load(file+'reward_log_dense_aug.npy')\n", 110 | "\n", 111 | "def smooth(y, radius=100, mode='two_sided'):\n", 112 | " if len(y) < 2*radius+1:\n", 113 | " return np.ones_like(y) * y.mean()\n", 114 | " elif mode == 'two_sided':\n", 115 | " convkernel = np.ones(2 * radius+1)\n", 116 | " return np.convolve(y, convkernel, mode='same') / \\\n", 117 | " np.convolve(np.ones_like(y), convkernel, mode='same')\n", 118 | " elif mode == 'causal':\n", 119 | " convkernel = np.ones(radius)\n", 120 | " out = np.convolve(y, convkernel,mode='full') / \\\n", 121 | " np.convolve(np.ones_like(y), convkernel, mode='full')\n", 122 | " return out[:-radius+1]\n", 123 | "\n", 124 | " \n", 125 | "def moving_sum(y, window=1000):\n", 126 | " c = y.cumsum()\n", 127 | " c[window:] = c[window:] - c[:-window]\n", 128 | " return c/float(window)\n", 129 | "\n", 130 | "def success_filter(r, threshold=4):\n", 131 | " success_list=np.zeros(len(r))\n", 132 | " success_list[np.where(r>threshold)]=1 # reward larger than threshold indicates successful grasping\n", 133 | " return success_list\n", 134 | "\n", 135 | "fig, axs = plt.subplots(4, figsize=(8,6))\n", 136 | "axs[0].plot(reward_sparse, c='g', label='Sparse Reward')\n", 137 | "axs[0].set_title('Learning Curve (Sparse Reward)')\n", 138 | "axs[0].set_ylabel('Reward')\n", 139 | "axs[0].grid()\n", 140 | "\n", 141 | "axs[1].plot(smooth(reward_dense), c='r', label='Dense Reward')\n", 142 | "axs[1].set_title('Learning Curve (Dense Reward)')\n", 143 | "axs[1].set_ylabel('Smoothed Reward')\n", 144 | "axs[1].grid()\n", 145 | "\n", 146 | "axs[2].plot(smooth(reward_dense_aug), c='b', label='(Augmented) Dense Reward')\n", 147 | "axs[2].set_title('Learning Curve (Augmented Reward)')\n", 148 | "axs[2].set_ylabel('Smoothed Reward')\n", 149 | "axs[2].grid()\n", 150 | "\n", 151 | "axs[3].plot(moving_sum(success_filter(reward_sparse)), c='g', label='Sparse Reward')\n", 152 | "axs[3].plot(moving_sum(success_filter(reward_dense)), c='r', label='Dense Reward')\n", 153 | "axs[3].plot(moving_sum(success_filter(reward_dense_aug)), c='b', label='(Augmented) Dense Reward')\n", 154 | "axs[3].set_title('Success')\n", 155 | "axs[3].set_xlabel('Training Episode')\n", 156 | "axs[3].set_ylabel('Success Rate')\n", 157 | "axs[3].grid()\n", 158 | "axs[3].legend()\n", 159 | "\n", 160 | "plt.tight_layout()\n", 161 | "plt.savefig('comparison.png')\n", 162 | "plt.show()\n" 163 | ] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 2 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython2", 182 | "version": "2.7.15" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 2 187 | } 188 | -------------------------------------------------------------------------------- /reward_log.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-reinforcement-learning-book/Chapter16-Robot-Learning-in-Simulation/604f06384106e365d28bb51b1280d884c09ead9e/reward_log.npy -------------------------------------------------------------------------------- /sac_learn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | import gym 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from torch.distributions import Normal 12 | # this line is critical to ensure torch initialization correctly for multi-processing 13 | torch.multiprocessing.set_start_method('forkserver', force=True) 14 | 15 | 16 | from IPython.display import clear_output 17 | import matplotlib.pyplot as plt 18 | from matplotlib import animation 19 | from IPython.display import display 20 | 21 | from sawyer_grasp_env_boundingbox import GraspEnv 22 | import argparse 23 | import time 24 | import pickle 25 | 26 | import torch.multiprocessing as mp 27 | from torch.multiprocessing import Process 28 | 29 | from multiprocessing import Process, Manager 30 | from multiprocessing.managers import BaseManager 31 | 32 | 33 | GPU = True 34 | device_idx = 0 35 | if GPU: 36 | device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu") 37 | else: 38 | device = torch.device("cpu") 39 | print(device) 40 | 41 | 42 | parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') 43 | parser.add_argument('--train', dest='train', action='store_true', default=False) 44 | parser.add_argument('--test', dest='test', action='store_true', default=False) 45 | 46 | args = parser.parse_args() 47 | 48 | class ReplayBuffer: 49 | def __init__(self, capacity): 50 | self.capacity = capacity 51 | self.buffer = [] 52 | self.position = 0 53 | 54 | def push(self, state, action, reward, next_state, done): 55 | if len(self.buffer) < self.capacity: 56 | self.buffer.append(None) 57 | self.buffer[self.position] = (state, action, reward, next_state, done) 58 | self.position = int((self.position + 1) % self.capacity) # as a ring buffer 59 | 60 | def sample(self, batch_size): 61 | batch = random.sample(self.buffer, batch_size) 62 | state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element 63 | ''' 64 | the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; 65 | zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; 66 | the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; 67 | np.stack((1,2)) => array([1, 2]) 68 | ''' 69 | return state, action, reward, next_state, done 70 | 71 | def __len__(self): # cannot work in multiprocessing case, len(replay_buffer) is not available in proxy of manager! 72 | return len(self.buffer) 73 | 74 | def get_length(self): 75 | return len(self.buffer) 76 | 77 | class NormalizedActions(gym.ActionWrapper): 78 | def _action(self, action): 79 | low = self.action_space.low 80 | high = self.action_space.high 81 | 82 | action = low + (action + 1.0) * 0.5 * (high - low) 83 | action = np.clip(action, low, high) 84 | 85 | return action 86 | 87 | def _reverse_action(self, action): 88 | low = self.action_space.low 89 | high = self.action_space.high 90 | 91 | action = 2 * (action - low) / (high - low) - 1 92 | action = np.clip(action, low, high) 93 | 94 | return action 95 | 96 | class ValueNetwork(nn.Module): 97 | def __init__(self, state_dim, hidden_dim, init_w=3e-3): 98 | super(ValueNetwork, self).__init__() 99 | 100 | self.linear1 = nn.Linear(state_dim, hidden_dim) 101 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 102 | self.linear3 = nn.Linear(hidden_dim, hidden_dim) 103 | self.linear4 = nn.Linear(hidden_dim, 1) 104 | # weights initialization 105 | self.linear4.weight.data.uniform_(-init_w, init_w) 106 | self.linear4.bias.data.uniform_(-init_w, init_w) 107 | 108 | def forward(self, state): 109 | x = F.relu(self.linear1(state)) 110 | x = F.relu(self.linear2(x)) 111 | x = F.relu(self.linear3(x)) 112 | x = self.linear4(x) 113 | return x 114 | 115 | 116 | class SoftQNetwork(nn.Module): 117 | def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3): 118 | super(SoftQNetwork, self).__init__() 119 | 120 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size) 121 | self.linear2 = nn.Linear(hidden_size, hidden_size) 122 | self.linear3 = nn.Linear(hidden_size, hidden_size) 123 | self.linear4 = nn.Linear(hidden_size, 1) 124 | 125 | self.linear4.weight.data.uniform_(-init_w, init_w) 126 | self.linear4.bias.data.uniform_(-init_w, init_w) 127 | 128 | def forward(self, state, action): 129 | x = torch.cat([state, action], 1) # the dim 0 is number of samples 130 | x = F.relu(self.linear1(x)) 131 | x = F.relu(self.linear2(x)) 132 | x = F.relu(self.linear3(x)) 133 | x = self.linear4(x) 134 | return x 135 | 136 | 137 | class PolicyNetwork(nn.Module): 138 | def __init__(self, num_inputs, num_actions, hidden_size, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2): 139 | super(PolicyNetwork, self).__init__() 140 | 141 | self.log_std_min = log_std_min 142 | self.log_std_max = log_std_max 143 | 144 | self.linear1 = nn.Linear(num_inputs, hidden_size) 145 | self.linear2 = nn.Linear(hidden_size, hidden_size) 146 | self.linear3 = nn.Linear(hidden_size, hidden_size) 147 | self.linear4 = nn.Linear(hidden_size, hidden_size) 148 | 149 | self.mean_linear = nn.Linear(hidden_size, num_actions) 150 | self.mean_linear.weight.data.uniform_(-init_w, init_w) 151 | self.mean_linear.bias.data.uniform_(-init_w, init_w) 152 | 153 | self.log_std_linear = nn.Linear(hidden_size, num_actions) 154 | self.log_std_linear.weight.data.uniform_(-init_w, init_w) 155 | self.log_std_linear.bias.data.uniform_(-init_w, init_w) 156 | 157 | self.action_range = action_range 158 | self.num_actions = num_actions 159 | 160 | 161 | def forward(self, state): 162 | x = F.relu(self.linear1(state)) 163 | x = F.relu(self.linear2(x)) 164 | x = F.relu(self.linear3(x)) 165 | x = F.relu(self.linear4(x)) 166 | 167 | mean = (self.mean_linear(x)) 168 | log_std = self.log_std_linear(x) 169 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 170 | 171 | return mean, log_std 172 | 173 | def evaluate(self, state, epsilon=1e-6): 174 | ''' 175 | generate sampled action with state as input wrt the policy network; 176 | ''' 177 | mean, log_std = self.forward(state) 178 | std = log_std.exp() # no clip in evaluation, clip affects gradients flow 179 | 180 | normal = Normal(0, 1) 181 | z = normal.sample() 182 | action_0 = torch.tanh(mean + std*z.to(device)) # TanhNormal distribution as actions; reparameterization trick 183 | action = self.action_range*action_0 184 | log_prob = Normal(mean, std).log_prob(mean+ std*z.to(device)) - torch.log(1. - action_0.pow(2) + epsilon) - np.log(self.action_range) 185 | # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); 186 | # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, 187 | # needs sum up across the features dim to get 1 dim prob; or else use Multivariate Normal. 188 | log_prob = log_prob.sum(dim=1, keepdim=True) 189 | return action, log_prob, z, mean, log_std 190 | 191 | 192 | def get_action(self, state, deterministic): 193 | state = torch.FloatTensor(state).unsqueeze(0).to(device) 194 | mean, log_std = self.forward(state) 195 | std = log_std.exp() 196 | 197 | normal = Normal(0, 1) 198 | z = normal.sample().to(device) 199 | action = self.action_range* torch.tanh(mean + std*z) 200 | 201 | action = self.action_range* torch.tanh(mean).detach().cpu().numpy()[0] if deterministic else action.detach().cpu().numpy()[0] 202 | return action 203 | 204 | 205 | def sample_action(self,): 206 | a=torch.FloatTensor(self.num_actions).uniform_(-1, 1) 207 | return self.action_range*a.numpy() 208 | 209 | 210 | class SAC_Trainer(): 211 | def __init__(self, replay_buffer, hidden_dim, action_range): 212 | self.replay_buffer = replay_buffer 213 | 214 | self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) 215 | self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) 216 | self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) 217 | self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) 218 | self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range).to(device) 219 | self.log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True, device=device) 220 | print('Soft Q Network (1,2): ', self.soft_q_net1) 221 | print('Policy Network: ', self.policy_net) 222 | 223 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 224 | target_param.data.copy_(param.data) 225 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 226 | target_param.data.copy_(param.data) 227 | 228 | self.soft_q_criterion1 = nn.MSELoss() 229 | self.soft_q_criterion2 = nn.MSELoss() 230 | 231 | soft_q_lr = 3e-4 232 | policy_lr = 3e-4 233 | alpha_lr = 3e-4 234 | 235 | self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=soft_q_lr) 236 | self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=soft_q_lr) 237 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) 238 | self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) 239 | 240 | 241 | def update(self, batch_size, reward_scale=10., auto_entropy=True, use_demons=False, target_entropy=-2, gamma=0.99,soft_tau=1e-2): 242 | state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) 243 | 244 | if use_demons==True: # using demonstration data by feeding into training buffer 245 | data_file=open('./demons_data/demon_data.pickle', "rb") 246 | demons_data = pickle.load(data_file) 247 | state_, action_, reward_, next_state_, done_=map(np.stack, zip(*demons_data)) 248 | state = np.concatenate((state, state_), axis=0) 249 | action = np.concatenate((action, action_), axis=0) 250 | reward = np.concatenate((reward, reward_), axis=0) 251 | next_state = np.concatenate((next_state, next_state_), axis=0) 252 | done = np.concatenate((done, done_), axis=0) 253 | 254 | 255 | state = torch.FloatTensor(state).to(device) 256 | next_state = torch.FloatTensor(next_state).to(device) 257 | action = torch.FloatTensor(action).to(device) 258 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 259 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) 260 | 261 | predicted_q_value1 = self.soft_q_net1(state, action) 262 | predicted_q_value2 = self.soft_q_net2(state, action) 263 | new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) 264 | new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) 265 | reward = reward_scale * (reward - reward.mean(dim=0)) / (reward.std(dim=0) + 1e-6) # normalize with batch mean and std; plus a small number to prevent numerical problem 266 | # Updating alpha wrt entropy 267 | # alpha = 0.0 # trade-off between exploration (max entropy) and exploitation (max Q) 268 | if auto_entropy is True: 269 | alpha_loss = -(self.log_alpha * (log_prob + target_entropy).detach()).mean() 270 | self.alpha_optimizer.zero_grad() 271 | alpha_loss.backward() 272 | self.alpha_optimizer.step() 273 | self.alpha = self.log_alpha.exp() 274 | else: 275 | self.alpha = 1. 276 | alpha_loss = 0 277 | 278 | # Training Q Function 279 | target_q_min = torch.min(self.target_soft_q_net1(next_state, new_next_action),self.target_soft_q_net2(next_state, new_next_action)) - self.alpha * next_log_prob 280 | target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward 281 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()) # detach: no gradients for the variable 282 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()) 283 | 284 | 285 | self.soft_q_optimizer1.zero_grad() 286 | q_value_loss1.backward() 287 | self.soft_q_optimizer1.step() 288 | self.soft_q_optimizer2.zero_grad() 289 | q_value_loss2.backward() 290 | self.soft_q_optimizer2.step() 291 | 292 | # Training Policy Function 293 | predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),self.soft_q_net2(state, new_action)) 294 | policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean() 295 | 296 | self.policy_optimizer.zero_grad() 297 | policy_loss.backward() 298 | self.policy_optimizer.step() 299 | 300 | # Soft update the target value net 301 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 302 | target_param.data.copy_( # copy data value into target parameters 303 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 304 | ) 305 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 306 | target_param.data.copy_( # copy data value into target parameters 307 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 308 | ) 309 | return predicted_new_q_value.mean() 310 | 311 | def save_model(self, path): 312 | torch.save(self.soft_q_net1.state_dict(), path+'_q1') # have to specify different path name here! 313 | torch.save(self.soft_q_net2.state_dict(), path+'_q2') 314 | torch.save(self.policy_net.state_dict(), path+'_policy') 315 | 316 | def load_model(self, path): 317 | self.soft_q_net1.load_state_dict(torch.load(path+'_q1')) 318 | self.soft_q_net2.load_state_dict(torch.load(path+'_q2')) 319 | self.policy_net.load_state_dict(torch.load(path+'_policy')) 320 | 321 | self.soft_q_net1.eval() 322 | self.soft_q_net2.eval() 323 | self.policy_net.eval() 324 | 325 | 326 | def worker(id, sac_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size, explore_steps, \ 327 | update_itr, AUTO_ENTROPY, DETERMINISTIC, USE_DEMONS, hidden_dim, model_path, headless): 328 | ''' 329 | the function for sampling with multi-processing 330 | ''' 331 | 332 | print(sac_trainer, replay_buffer) # sac_tainer are not the same, but all networks and optimizers in it are the same; replay buffer is the same one. 333 | env = GraspEnv(headless=headless) # no need to configure different port_number for calling different Vrep env at the same time 334 | 335 | action_dim = env.action_space.shape[0] 336 | state_dim = env.observation_space.shape[0] 337 | 338 | frame_idx=0 339 | # training loop 340 | for eps in range(max_episodes): 341 | 342 | episode_reward = 0 343 | state = env.reset() 344 | 345 | # reinitialize the environment every fixed interval during training, 346 | # to avoid the problems in environment, e.g. broken gripper, etc 347 | if eps%20==0 and eps>0: 348 | env.reinit() 349 | 350 | for step in range(max_steps): 351 | if frame_idx > explore_steps: 352 | action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) 353 | else: 354 | action = sac_trainer.policy_net.sample_action() 355 | 356 | try: 357 | next_state, reward, done, _ = env.step(action) 358 | except KeyboardInterrupt: 359 | print('Finished') 360 | sac_trainer.save_model(model_path) 361 | replay_buffer.push(state, action, reward, next_state, done) 362 | 363 | state = next_state 364 | episode_reward += reward 365 | frame_idx += 1 366 | 367 | # if len(replay_buffer) > batch_size: 368 | if replay_buffer.get_length() > batch_size: 369 | for i in range(update_itr): 370 | _=sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY, use_demons=USE_DEMONS, target_entropy=-1.*action_dim) 371 | 372 | if eps % 10 == 0 and eps>0: 373 | sac_trainer.save_model(model_path) 374 | 375 | if done: 376 | break 377 | print('Episode: ', eps, '| Episode Reward: ', episode_reward) 378 | rewards_queue.put(episode_reward) 379 | 380 | sac_trainer.save_model(model_path) 381 | env.shutdown() 382 | 383 | 384 | 385 | def ShareParameters(adamoptim): 386 | ''' share parameters of Adamoptimizers for multiprocessing ''' 387 | for group in adamoptim.param_groups: 388 | for p in group['params']: 389 | state = adamoptim.state[p] 390 | # initialize: have to initialize here, or else cannot find 391 | state['step'] = 0 392 | state['exp_avg'] = torch.zeros_like(p.data) 393 | state['exp_avg_sq'] = torch.zeros_like(p.data) 394 | 395 | # share in memory 396 | state['exp_avg'].share_memory_() 397 | state['exp_avg_sq'].share_memory_() 398 | 399 | def plot(rewards): 400 | clear_output(True) 401 | # plt.figure(figsize=(20,5)) 402 | plt.plot(rewards) 403 | plt.savefig('sac_multi.png') 404 | # plt.show() 405 | plt.clf() 406 | 407 | 408 | if __name__ == '__main__': 409 | 410 | replay_buffer_size = 1e6 411 | # the replay buffer is a class, have to use torch manager to make it a proxy for sharing across processes 412 | BaseManager.register('ReplayBuffer', ReplayBuffer) 413 | manager = BaseManager() 414 | manager.start() 415 | replay_buffer = manager.ReplayBuffer(replay_buffer_size) # share the replay buffer through manager 416 | 417 | # hyper-parameters for RL training, no need for sharing across processes 418 | max_episodes = 500000 419 | max_steps = 30 420 | explore_steps = 0 # for random action sampling in the beginning of training 421 | batch_size=128 422 | update_itr = 1 423 | AUTO_ENTROPY=True 424 | DETERMINISTIC=False 425 | USE_DEMONS = False # using demonstrations 426 | hidden_dim = 512 427 | model_path = './model/sac_multi' 428 | num_workers=6 # or: mp.cpu_count() 429 | headless = True # if headless is True, no visualization 430 | 431 | # choose env 432 | env = GraspEnv(headless=headless) 433 | 434 | action_dim = env.action_space.shape[0] 435 | state_dim = env.observation_space.shape[0] 436 | action_range=1. # 0.1 for end_position control and 1. for joint_velocity control 437 | env.shutdown() 438 | 439 | 440 | sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range ) 441 | 442 | if args.train: 443 | #sac_trainer.load_model(model_path) # if using pre-training 444 | 445 | # share the global parameters in multiprocessing 446 | sac_trainer.soft_q_net1.share_memory() 447 | sac_trainer.soft_q_net2.share_memory() 448 | sac_trainer.target_soft_q_net1.share_memory() 449 | sac_trainer.target_soft_q_net2.share_memory() 450 | sac_trainer.policy_net.share_memory() 451 | sac_trainer.log_alpha.share_memory_() 452 | ShareParameters(sac_trainer.soft_q_optimizer1) 453 | ShareParameters(sac_trainer.soft_q_optimizer2) 454 | ShareParameters(sac_trainer.policy_optimizer) 455 | ShareParameters(sac_trainer.alpha_optimizer) 456 | 457 | rewards_queue=mp.Queue() # used for get rewards from all processes and plot the curve 458 | processes=[] 459 | rewards=[] 460 | 461 | for i in range(num_workers): 462 | process = Process(target=worker, args=(i, sac_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, \ 463 | batch_size, explore_steps, update_itr, AUTO_ENTROPY, DETERMINISTIC, USE_DEMONS, hidden_dim, model_path, headless)) # the args contain shared and not shared 464 | process.daemon=True # all processes closed when the main stops 465 | processes.append(process) 466 | 467 | [p.start() for p in processes] 468 | while True: # keep geting the episode reward from the queue 469 | r = rewards_queue.get() 470 | if r is not None: 471 | rewards.append(r) 472 | else: 473 | break 474 | 475 | if len(rewards)%20==0 and len(rewards)>0: 476 | # plot(rewards) 477 | np.save('reward_log', rewards) 478 | 479 | 480 | [p.join() for p in processes] # finished at the same time 481 | 482 | sac_trainer.save_model(model_path) 483 | 484 | if args.test: 485 | env = GraspEnv(headless=False, control_mode='joint_velocity') # for visualizing in test 486 | trained_model_path1 = './model/trained_model/augmented_dense_reward/sac_multi' # pre-trained model with augmented dense reward 487 | trained_model_path2 = './model/trained_model/dense_reward/sac_multi' # pre-trained model with dense reward 488 | sac_trainer.load_model(model_path) # new model after training 489 | for eps in range(30): 490 | state = env.reset() 491 | episode_reward = 0 492 | 493 | for step in range(max_steps): 494 | action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) 495 | next_state, reward, done, _ = env.step(action) 496 | episode_reward += reward 497 | state=next_state 498 | 499 | print('Episode: ', eps, '| Episode Reward: ', episode_reward) 500 | 501 | env.shutdown() 502 | -------------------------------------------------------------------------------- /sawyer_grasp_env_boundingbox.py: -------------------------------------------------------------------------------- 1 | """ 2 | The environment of Sawyer Arm + Baxter Gripper for graping object. 3 | With a bounding box of the arange that the gripper cannot move outside. 4 | """ 5 | from os.path import dirname, join, abspath 6 | from pyrep import PyRep 7 | from pyrep.robots.arms.sawyer import Sawyer 8 | from pyrep.robots.end_effectors.baxter_gripper import BaxterGripper 9 | from pyrep.objects.proximity_sensor import ProximitySensor 10 | from pyrep.objects.vision_sensor import VisionSensor 11 | from pyrep.objects.shape import Shape 12 | from pyrep.objects.dummy import Dummy 13 | from pyrep.const import JointType, JointMode 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | import math 17 | 18 | POS_MIN, POS_MAX = [0.1, -0.3, 1.], [0.45, 0.3, 1.] # valid position range of target object 19 | 20 | 21 | class GraspEnv(object): 22 | ''' Sawyer robot grasping a cuboid ''' 23 | def __init__(self, headless, control_mode='joint_velocity'): 24 | ''' 25 | parameters: 26 | :headless: bool, if True, no visualization, else with visualization. 27 | :control mode: str, 'end_position' or 'joint_velocity'. 28 | ''' 29 | # set public variables 30 | self.headless = headless # if headless is True, no visualization 31 | self.reward_offset = 10.0 # reward of achieving the grasping object 32 | self.reward_range = self.reward_offset # reward range for register gym env when using vectorized env wrapper 33 | self.penalty_offset = 1. # penalty value for undesired cases 34 | self.fall_down_offset = 0.1 # distance for judging the target object fall off the table 35 | self.metadata=[] # gym env argument 36 | self.control_mode = control_mode # the control mode of robotic arm: 'end_position' or 'joint_velocity' 37 | 38 | # launch and set up the scene, and set the proxy variables in represent of the counterparts in the scene 39 | self.pr = PyRep() # call the PyRep 40 | if control_mode == 'end_position': # need to use different scene, the one with all joints in inverse kinematics mode 41 | SCENE_FILE = join(dirname(abspath(__file__)), './scenes/sawyer_reacher_rl_new_ik.ttt') # scene with joints controlled by ik (inverse kinematics) 42 | elif control_mode == 'joint_velocity': # the scene with all joints in force/torch mode for forward kinematics 43 | SCENE_FILE = join(dirname(abspath(__file__)), './scenes/sawyer_reacher_rl_new.ttt') # scene with joints controlled by forward kinematics 44 | self.pr.launch(SCENE_FILE, headless=headless) # lunch the scene, headless means no visualization 45 | self.pr.start() # start the scene 46 | self.agent = Sawyer() # get the robot arm in the scene 47 | self.gripper = BaxterGripper() # get the gripper in the scene 48 | self.gripper_left_pad = Shape('BaxterGripper_leftPad') # the left pad on the gripper finger 49 | self.proximity_sensor = ProximitySensor('BaxterGripper_attachProxSensor') # need the name of the sensor here 50 | self.vision_sensor = VisionSensor('Vision_sensor') # need the name of the sensor here 51 | self.table = Shape('diningTable') # the table in the scene for checking collision 52 | if control_mode == 'end_position': # control the robot arm by the position of its end using inverse kinematics 53 | self.agent.set_control_loop_enabled(True) # if false, inverse kinematics won't work 54 | self.action_space = np.zeros(4) # 3 DOF end position control + 1 rotation of gripper 55 | elif control_mode == 'joint_velocity': # control the robot arm by directly setting velocity values on each joint, using forward kinematics 56 | self.agent.set_control_loop_enabled(False) 57 | self.action_space = np.zeros(7) # 7 DOF velocity control, no need for extra control of end rotation, the 7th joint controls it. 58 | else: 59 | raise NotImplementedError 60 | self.observation_space = np.zeros(17) # position and velocity of 7 joints + position of the target 61 | self.agent.set_motor_locked_at_zero_velocity(True) 62 | self.target = Shape('target') # get the target object 63 | self.agent_ee_tip = self.agent.get_tip() # a part of robot as the end of inverse kinematics chain for controlling 64 | self.tip_target = Dummy('Sawyer_target') # the target point of the tip (end of the robot arm) to move towards 65 | self.tip_pos = self.agent_ee_tip.get_position() # tip x,y,z position 66 | 67 | # set a proper initial robot gesture or tip position 68 | if control_mode == 'end_position': 69 | initial_pos = [0.3, 0.1, 0.9] 70 | self.tip_target.set_position(initial_pos) 71 | # one big step for rotation setting is enough, with reset_dynamics=True, set the rotation instantaneously 72 | self.tip_target.set_orientation([0,np.pi,np.pi/2], reset_dynamics=True) # first two dimensions along x and y axis make gripper face downwards 73 | elif control_mode == 'joint_velocity': 74 | self.initial_joint_positions = [0.001815199851989746, -1.4224984645843506, \ 75 | 0.704303503036499, 2.54307222366333, 2.972468852996826, -0.4989511966705322, 4.105560302734375] # a proper initial gesture 76 | self.agent.set_joint_positions(self.initial_joint_positions) 77 | self.pr.step() 78 | self.initial_tip_positions = self.agent_ee_tip.get_position() 79 | self.initial_target_positions = self.target.get_position() 80 | 81 | def _get_state(self): 82 | ''' 83 | Return state containing arm joint positions/velocities & target position. 84 | ''' 85 | return np.array(self.agent.get_joint_positions() + # list, dim=7 86 | self.agent.get_joint_velocities() + # list, dim=7 87 | self.target.get_position()) # list, dim=3 88 | 89 | def _is_holding(self): 90 | ''' 91 | Return the state of holding the target or not, return bool. 92 | ''' 93 | # Note that the collision check is not always accurate all the time, 94 | # for continuous collision frames, maybe only the first 4-5 frames of collision can be detected. 95 | pad_collide_object = self.gripper_left_pad.check_collision(self.target) 96 | if pad_collide_object and self.proximity_sensor.is_detected(self.target)==True: 97 | return True 98 | else: 99 | return False 100 | 101 | 102 | def _move(self, action, bounding_offset=0.15, step_factor=0.2, max_itr=20, max_error=0.05, rotation_norm =5.): 103 | ''' 104 | Move the end effector on robot arm according to the action with inverse kinematics for 'end_position' control mode; 105 | Inverse kinematics mode control is achieved through setting the tip target instead of using .solve_ik(), 106 | because sometimes the .solve_ik() does not function correctly. 107 | Mode: a close-loop proportional control, using ik. 108 | 109 | parameters: 110 | :bounding_offset: offset of bounding box outside the valid target position range, as valid and safe range of action 111 | :step_factor: small step factor mulitplied on the difference of current and desired position, i.e. proportional factor 112 | :max_itr: maximum moving iterations 113 | :max_error: upper bound of distance error for movement at each call 114 | :rotation_norm: factor for normalization of rotation values, since the action are of the same scale for each dimension 115 | ''' 116 | pos=self.gripper.get_position() 117 | 118 | # check if state+action will be within of the bounding box, if so, move normally; otherwise the action is not conducted. 119 | # i.e. x_min < x < x_max and y_min < y < y_max and z > z_min 120 | if pos[0]+action[0]>POS_MIN[0]-bounding_offset and pos[0]+action[0] POS_MIN[1]-bounding_offset and pos[1]+action[1] < POS_MAX[1]+2*bounding_offset \ 122 | and pos[2]+action[2] > POS_MIN[2]-2*bounding_offset: # larger offset in z axis 123 | 124 | # there is a mismatch between the object set_orientation() and get_orientation(): 125 | # the (x,y,z) in set_orientation() will be (y,x,-z) in get_orientation(). 126 | ori_z=-self.agent_ee_tip.get_orientation()[2] # the minus is because the mismatch between the set_orientation() and get_orientation() 127 | target_pos = np.array(self.agent_ee_tip.get_position())+np.array(action[:3]) 128 | diff=1 # intialization 129 | itr=0 130 | while np.sum(np.abs(diff))>max_error and itr0.1 and itr