├── .gitignore ├── DO ├── __init__.py ├── design_optimization.py ├── pso_batch.py └── pso_sim.py ├── Environments ├── __init__.py ├── evoenvs.py └── pybullet_evo │ ├── README.md │ ├── __init__.py │ ├── gym_locomotion_envs.py │ ├── half_cheetah.xml │ ├── robot_bases.py │ └── robot_locomotors.py ├── LICENSE ├── README.md ├── RL ├── __init__.py ├── evoreplay.py ├── rl_algorithm.py └── soft_actor.py ├── __init__.py ├── coadapt.py ├── data_notebook.ipynb ├── experiment_configs.py ├── main.py ├── requirements.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Data 107 | data_*/ 108 | data_* 109 | -------------------------------------------------------------------------------- /DO/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksluck/Coadaptation/c45d1d682262ef04cb797122d6103050b8d8b826/DO/__init__.py -------------------------------------------------------------------------------- /DO/design_optimization.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Design_Optimization(object): 4 | 5 | def __init__(self): 6 | pass 7 | 8 | def optimize_design(self): 9 | pass 10 | -------------------------------------------------------------------------------- /DO/pso_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import rlkit.torch.pytorch_util as ptu 4 | import pyswarms as ps 5 | from .design_optimization import Design_Optimization 6 | 7 | class PSO_batch(Design_Optimization): 8 | 9 | def __init__(self, config, replay, env): 10 | self._config = config 11 | self._replay = replay 12 | self._env = env 13 | 14 | if 'state_batch_size' in self._config.keys(): 
15 | self._state_batch_size = self._config['state_batch_size'] 16 | else: 17 | self._state_batch_size = 32 18 | 19 | 20 | def optimize_design(self, design, q_network, policy_network): 21 | self._replay.set_mode('start') 22 | initial_state = self._replay.random_batch(self._state_batch_size) 23 | initial_state = initial_state['observations'] 24 | design_idxs = self._env.get_design_dimensions() 25 | # initial_state = initial_state[:,:-len(design)] 26 | # state_tensor = torch.from_numpy(initial_state).to(device=ptu.device, dtype=torch.float32) 27 | 28 | # initial_design = np.array(self._current_design) 29 | # initial_design = np.array(design) 30 | 31 | def f_qval(x_input, **kwargs): 32 | shape = x_input.shape 33 | cost = np.zeros((shape[0],)) 34 | with torch.no_grad(): 35 | for i in range(shape[0]): 36 | x = x_input[i:i+1,:] 37 | # X = ( 38 | # torch.from_numpy(x) 39 | # .to(device=ptu.device, dtype=torch.float32) 40 | # .contiguous() 41 | # .requires_grad_(False) 42 | # ) 43 | # X_expand = X.expand(self._state_batch_size, -1) 44 | # network_input = torch.cat((state_tensor,X_expand), -1) 45 | state_batch = initial_state.copy() 46 | state_batch[:,design_idxs] = x 47 | network_input = torch.from_numpy(state_batch).to(device=ptu.device, dtype=torch.float32) 48 | action, _, _, _, _, _, _, _, = policy_network(network_input, deterministic=True) 49 | output = q_network(network_input, action) 50 | #output = self._vf_pop.forward(input) 51 | loss = -output.mean().sum() 52 | fval = float(loss.item()) 53 | cost[i] = fval 54 | return cost 55 | 56 | lower_bounds = [l for l, _ in self._env.design_params_bounds] 57 | lower_bounds = np.array(lower_bounds) 58 | upper_bounds = [u for _, u in self._env.design_params_bounds] 59 | upper_bounds = np.array(upper_bounds) 60 | bounds = (lower_bounds, upper_bounds) 61 | options = {'c1': 0.5, 'c2': 0.3, 'w':0.9} 62 | #options = {'c1': 0.5, 'c2': 0.3, 'w': 0.9, 'k': 3, 'p': 2} 63 | optimizer = ps.single.GlobalBestPSO(n_particles=700, dimensions=len(design), bounds=bounds, options=options) 64 | 65 | # Perform optimization 66 | cost, new_design = optimizer.optimize(f_qval, print_step=100, iters=250, verbose=3) #, n_processes=2) 67 | return new_design 68 | -------------------------------------------------------------------------------- /DO/pso_sim.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import rlkit.torch.pytorch_util as ptu 4 | import pyswarms as ps 5 | from .design_optimization import Design_Optimization 6 | 7 | class PSO_simulation(Design_Optimization): 8 | 9 | def __init__(self, config, replay, env): 10 | self._config = config 11 | self._replay = replay 12 | self._env = env 13 | 14 | # TODO Make this independent of rl_algo config 15 | self._episode_length = self._config['rl_algorithm_config']['algo_params']['num_steps_per_epoch'] 16 | self._reward_scale = self._config['rl_algorithm_config']['algo_params']['reward_scale'] 17 | 18 | 19 | def optimize_design(self, design, q_network, policy_network): 20 | # Important: We reset the design of the environment. 
Previous design 21 | # will be lost 22 | 23 | def get_reward_for_design(design): 24 | self._env.set_new_design(design) 25 | state = self._env.reset() 26 | reward_episode = [] 27 | done = False 28 | nmbr_of_steps = 0 29 | while not(done) and nmbr_of_steps <= self._episode_length: 30 | nmbr_of_steps += 1 31 | action, _ = policy_network.get_action(state, deterministic=True) 32 | new_state, reward, done, info = self._env.step(action) 33 | reward = reward * self._reward_scale 34 | reward_episode.append(float(reward)) 35 | state = new_state 36 | reward_mean = np.mean(reward_episode) 37 | return reward_mean 38 | 39 | def f_qval(x_input, **kwargs): 40 | shape = x_input.shape 41 | cost = np.zeros((shape[0],)) 42 | for i in range(shape[0]): 43 | x = x_input[i,:] 44 | reward = get_reward_for_design(x) 45 | cost[i] = -reward 46 | return cost 47 | 48 | lower_bounds = [l for l, _ in self._env.design_params_bounds] 49 | lower_bounds = np.array(lower_bounds) 50 | upper_bounds = [u for _, u in self._env.design_params_bounds] 51 | upper_bounds = np.array(upper_bounds) 52 | bounds = (lower_bounds, upper_bounds) 53 | options = {'c1': 0.5, 'c2': 0.3, 'w':0.9} 54 | optimizer = ps.single.GlobalBestPSO(n_particles=35, dimensions=len(design), bounds=bounds, options=options) 55 | 56 | # Perform optimization 57 | cost, new_design = optimizer.optimize(f_qval, print_step=100, iters=30, verbose=3) 58 | return new_design 59 | -------------------------------------------------------------------------------- /Environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksluck/Coadaptation/c45d1d682262ef04cb797122d6103050b8d8b826/Environments/__init__.py -------------------------------------------------------------------------------- /Environments/evoenvs.py: -------------------------------------------------------------------------------- 1 | from gym import spaces 2 | import numpy as np 3 | from .pybullet_evo.gym_locomotion_envs import HalfCheetahBulletEnv 4 | import copy 5 | from utils import BestEpisodesVideoRecorder 6 | 7 | class HalfCheetahEnv(object): 8 | def __init__(self, config = {'env' : {'render' : True, 'record_video': False}}): 9 | self._config = config 10 | self._render = self._config['env']['render'] 11 | self._record_video = self._config['env']['record_video'] 12 | self._current_design = [1.0] * 6 13 | self._config_numpy = np.array(self._current_design) 14 | self.design_params_bounds = [(0.8, 2.0)] * 6 15 | self._env = HalfCheetahBulletEnv(render=self._render, design=self._current_design) 16 | self.init_sim_params = [ 17 | [1.0] * 6, 18 | [1.41, 0.96, 1.97, 1.73, 1.97, 1.17], 19 | [1.52, 1.07, 1.11, 1.97, 1.51, 0.99], 20 | [1.08, 1.18, 1.39, 1.76 , 1.85, 0.92], 21 | [0.85, 1.54, 0.97, 1.38, 1.10, 1.49], 22 | ] 23 | self.observation_space = spaces.Box(-np.inf, np.inf, shape=[self._env.observation_space.shape[0] + 6], dtype=np.float32)#env.observation_space 24 | self.action_space = self._env.action_space 25 | self._initial_state = self._env.reset() 26 | 27 | if self._record_video: 28 | self._video_recorder = BestEpisodesVideoRecorder(path=config['data_folder_experiment'], max_videos=5) 29 | 30 | # Which dimensions in the state vector are design parameters? 
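# Note: step() and reset() below append the current design vector to the raw simulator
# state, so the design parameters occupy the last len(self._current_design) entries of
# the observation; that is exactly the index range collected here.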
31 | self._design_dims = list(range(self.observation_space.shape[0] - len(self._current_design), self.observation_space.shape[0])) 32 | assert len(self._design_dims) == 6 33 | 34 | def render(self): 35 | pass 36 | 37 | def step(self, a): 38 | info = {} 39 | state, reward, done, _ = self._env.step(a) 40 | state = np.append(state, self._config_numpy) 41 | info['orig_action_cost'] = 0.1 * np.mean(np.square(a)) 42 | info['orig_reward'] = reward 43 | 44 | if self._record_video: 45 | self._video_recorder.step(env=self._env, state=state, reward=reward, done=done) 46 | 47 | return state, reward, False, info 48 | 49 | 50 | def reset(self): 51 | state = self._env.reset() 52 | self._initial_state = state 53 | state = np.append(state, self._config_numpy) 54 | 55 | if self._record_video: 56 | self._video_recorder.reset(env=self._env, state=state, reward=0, done=False) 57 | 58 | return state 59 | 60 | def set_new_design(self, vec): 61 | self._env.reset_design(vec) 62 | self._current_design = vec 63 | self._config_numpy = np.array(vec) 64 | 65 | if self._record_video: 66 | self._video_recorder.increase_folder_counter() 67 | 68 | def get_random_design(self): 69 | optimized_params = np.random.uniform(low=0.8, high=2.0, size=6) 70 | return optimized_params 71 | 72 | def get_current_design(self): 73 | return copy.copy(self._current_design) 74 | 75 | def get_design_dimensions(self): 76 | return copy.copy(self._design_dims) 77 | -------------------------------------------------------------------------------- /Environments/pybullet_evo/README.md: -------------------------------------------------------------------------------- 1 | The files in this directory belong to the pybullet project and were changed to enable the automatic design adaptation of agents during the training process. 
2 | 3 | You can find the original files here: https://github.com/bulletphysics/bullet3/tree/master/examples/pybullet/gym 4 | -------------------------------------------------------------------------------- /Environments/pybullet_evo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksluck/Coadaptation/c45d1d682262ef04cb797122d6103050b8d8b826/Environments/pybullet_evo/__init__.py -------------------------------------------------------------------------------- /Environments/pybullet_evo/gym_locomotion_envs.py: -------------------------------------------------------------------------------- 1 | from pybullet_envs.scene_stadium import SinglePlayerStadiumScene 2 | from pybullet_envs.env_bases import MJCFBaseBulletEnv 3 | import numpy as np 4 | import pybullet 5 | from .robot_locomotors import HalfCheetah 6 | import os, inspect 7 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 8 | from gym import spaces 9 | 10 | class WalkerBaseBulletEnv(MJCFBaseBulletEnv): 11 | def __init__(self, robot, render=False): 12 | # print("WalkerBase::__init__ start") 13 | MJCFBaseBulletEnv.__init__(self, robot, render) 14 | 15 | self.camera_x = 0 16 | self.walk_target_x = 1e3 # kilometer away 17 | self.walk_target_y = 0 18 | self.stateId=-1 19 | self._projectM = None 20 | self._param_init_camera_width = 320 21 | self._param_init_camera_height = 200 22 | self._param_camera_distance = 2.0 23 | 24 | 25 | def create_single_player_scene(self, bullet_client): 26 | self.stadium_scene = SinglePlayerStadiumScene(bullet_client, gravity=9.8, timestep=0.0165/4, frame_skip=4) 27 | return self.stadium_scene 28 | 29 | def reset(self): 30 | if (self.stateId>=0): 31 | # print("restoreState self.stateId:",self.stateId) 32 | self._p.restoreState(self.stateId) 33 | 34 | r = MJCFBaseBulletEnv.reset(self) 35 | self._p.configureDebugVisualizer(pybullet.COV_ENABLE_RENDERING,0) 36 | 37 | self.parts, self.jdict, self.ordered_joints, self.robot_body = self.robot.addToScene(self._p, 38 | self.stadium_scene.ground_plane_mjcf) 39 | # self.ground_ids = set([(self.parts[f].bodies[self.parts[f].bodyIndex], self.parts[f].bodyPartIndex) for f in 40 | # self.foot_ground_object_names]) 41 | self._p.configureDebugVisualizer(pybullet.COV_ENABLE_RENDERING,1) 42 | if (self.stateId<0): 43 | self.stateId=self._p.saveState() 44 | #print("saving state self.stateId:",self.stateId) 45 | self._p.setGravity(0,0,-.5) 46 | for _ in range(200): 47 | self.robot.reset_position() 48 | self.scene.global_step() 49 | self.robot.reset_position_final() 50 | self._p.setGravity(0,0,-9.81) 51 | r = self.robot.calc_state() 52 | self.robot._initial_z = r[-1] 53 | # for _ in range(20): 54 | # self.scene.global_step() 55 | # self.robot.reset_position_final() 56 | # time.sleep(0.1) 57 | # self.robot.reset_position() 58 | # self.scene.global_step() 59 | self.robot.initial_z = None 60 | r = self.robot.calc_state() 61 | 62 | return r 63 | 64 | def _isDone(self): 65 | return self._alive < 0 66 | 67 | def move_robot(self, init_x, init_y, init_z): 68 | "Used by multiplayer stadium to move sideways, to another running lane." 
69 | self.cpp_robot.query_position() 70 | pose = self.cpp_robot.root_part.pose() 71 | pose.move_xyz(init_x, init_y, init_z) # Works because robot loads around (0,0,0), and some robots have z != 0 that is left intact 72 | self.cpp_robot.set_pose(pose) 73 | 74 | electricity_cost = -2.0 # cost for using motors -- this parameter should be carefully tuned against reward for making progress, other values less improtant 75 | stall_torque_cost = -0.1 # cost for running electric current through a motor even at zero rotational speed, small 76 | foot_collision_cost = -1.0 # touches another leg, or other objects, that cost makes robot avoid smashing feet into itself 77 | foot_ground_object_names = set(["floor"]) # to distinguish ground and other objects 78 | joints_at_limit_cost = -0.1 # discourage stuck joints 79 | 80 | def step(self, a): 81 | if not self.scene.multiplayer: # if multiplayer, action first applied to all robots, then global step() called, then _step() for all robots with the same actions 82 | self.robot.apply_action(a) 83 | self.scene.global_step() 84 | 85 | state = self.robot.calc_state() # also calculates self.joints_at_limit 86 | 87 | self._alive = float(self.robot.alive_bonus(state[0]+self.robot.initial_z, self.robot.body_rpy[1])) # state[0] is body height above ground, body_rpy[1] is pitch 88 | done = self._isDone() 89 | if not np.isfinite(state).all(): 90 | print("~INF~", state) 91 | done = True 92 | 93 | potential_old = self.potential 94 | self.potential = self.robot.calc_potential() 95 | progress = float(self.potential - potential_old) 96 | 97 | feet_collision_cost = 0.0 98 | for i,f in enumerate(self.robot.feet): # TODO: Maybe calculating feet contacts could be done within the robot code 99 | contact_ids = set((x[2], x[4]) for x in f.contact_list()) 100 | #print("CONTACT OF '%d' WITH %d" % (contact_ids, ",".join(contact_names)) ) 101 | if (self.ground_ids & contact_ids): 102 | #see Issue 63: https://github.com/openai/roboschool/issues/63 103 | #feet_collision_cost += self.foot_collision_cost 104 | self.robot.feet_contact[i] = 1.0 105 | else: 106 | self.robot.feet_contact[i] = 0.0 107 | 108 | 109 | electricity_cost = self.electricity_cost * float(np.abs(a*self.robot.joint_speeds).mean()) # let's assume we have DC motor with controller, and reverse current braking 110 | electricity_cost += self.stall_torque_cost * float(np.square(a).mean()) 111 | 112 | joints_at_limit_cost = float(self.joints_at_limit_cost * self.robot.joints_at_limit) 113 | debugmode=0 114 | if(debugmode): 115 | print("alive=") 116 | print(self._alive) 117 | print("progress") 118 | print(progress) 119 | print("electricity_cost") 120 | print(electricity_cost) 121 | print("joints_at_limit_cost") 122 | print(joints_at_limit_cost) 123 | print("feet_collision_cost") 124 | print(feet_collision_cost) 125 | 126 | self.rewards = [ 127 | self._alive, 128 | progress, 129 | electricity_cost, 130 | joints_at_limit_cost, 131 | feet_collision_cost 132 | ] 133 | if (debugmode): 134 | print("rewards=") 135 | print(self.rewards) 136 | print("sum rewards") 137 | print(sum(self.rewards)) 138 | self.HUD(state, a, done) 139 | self.reward += sum(self.rewards) 140 | 141 | return state, sum(self.rewards), bool(done), {} 142 | 143 | # def camera_adjust(self): 144 | # x, y, z = self.robot.body_xyz 145 | # self.camera._p = self._p 146 | # self.camera_x = 0.98*self.camera_x + (1-0.98)*x 147 | # self.camera.move_and_look_at(self.camera_x, y-1.0, 1.4, x, y, 1.0) 148 | def camera_adjust(self): 149 | if self._p is None : 150 | return 151 | 
self.camera._p = self._p 152 | x, y, z = self.robot.body_xyz 153 | if self.camera_x is not None: 154 | self.camera_x = x # 0.98*self.camera_x + (1-0.98)*x 155 | else: 156 | self.camera_x = x 157 | # self.camera.move_and_look_at(self.camera_x, y-2.0, 1.4, x, y, 1.0) 158 | lookat = [self.camera_x, y, z] 159 | distance = self._param_camera_distance 160 | yaw = 10 161 | self._p.resetDebugVisualizerCamera(distance, yaw, -20, lookat) 162 | 163 | def render_camera_image(self, pixelWidthHeight = None): 164 | if pixelWidthHeight is not None or self._projectM is None: 165 | if self._projectM is None: 166 | self._pixelWidth = self._param_init_camera_width 167 | self._pixelHeight = self._param_init_camera_height 168 | else: 169 | self._pixelWidth = pixelWidthHeight[0] 170 | self._pixelHeight = pixelWidthHeight[1] 171 | nearPlane = 0.01 172 | farPlane = 10 173 | aspect = self._pixelWidth / self._pixelHeight 174 | fov = 60 175 | self._projectM = self._p.computeProjectionMatrixFOV(fov, aspect, 176 | nearPlane, farPlane) 177 | 178 | x, y, z = self.robot.robot_body.pose().xyz() 179 | lookat = [x, y, 0.5] 180 | distance = 2.0 181 | yaw = -20 182 | viewM = self._p.computeViewMatrixFromYawPitchRoll( 183 | cameraTargetPosition=lookat, 184 | distance=distance, 185 | yaw=10., 186 | pitch=yaw, 187 | roll=0.0, 188 | upAxisIndex=2) 189 | 190 | # img_arr = pybullet.getCameraImage(self._pixelWidth, self._pixelHeight, viewM, self._projectM, shadow=1,lightDirection=[1,1,1],renderer=pybullet.ER_BULLET_HARDWARE_OPENGL) 191 | img_arr = pybullet.getCameraImage(self._pixelWidth, self._pixelHeight, viewM, self._projectM, shadow=False, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL, flags=pybullet.ER_NO_SEGMENTATION_MASK) 192 | 193 | w=img_arr[0] #width of the image, in pixels 194 | h=img_arr[1] #height of the image, in pixels 195 | rgb=img_arr[2] #color data RGB 196 | 197 | image = np.reshape(rgb, (h, w, 4)) #Red, Green, Blue, Alpha 198 | image = image * (1./255.) 
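# after scaling pixel values to [0, 1], keep only the RGB planes and drop the alpha channel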
199 | image = image[:,:,0:3] 200 | return image 201 | 202 | class HalfCheetahBulletEnv(WalkerBaseBulletEnv): 203 | def __init__(self, render=False, design = None): 204 | self.robot = HalfCheetah(design) 205 | WalkerBaseBulletEnv.__init__(self, self.robot, render) 206 | self.observation_space = spaces.Box(-np.inf, np.inf, shape=[17], dtype=np.float32) 207 | 208 | def _isDone(self): 209 | return False 210 | 211 | def disconnect(self): 212 | self._p.disconnect() 213 | 214 | def step(self, a): 215 | if not self.scene.multiplayer: # if multiplayer, action first applied to all robots, then global step() called, then _step() for all robots with the same actions 216 | self.robot.apply_action(a) 217 | self.scene.global_step() 218 | 219 | state = self.robot.calc_state() # also calculates self.joints_at_limit 220 | 221 | done = self._isDone() 222 | if not np.isfinite(state).all(): 223 | print("~INF~", state) 224 | done = 0 225 | reward = max(state[-5]/10.0, 0.0) 226 | 227 | return state, reward, bool(done), {} 228 | 229 | def reset_design(self, design): 230 | self.stateId = -1 231 | self.scene = None 232 | self.robot.reset_design(self._p, design) 233 | -------------------------------------------------------------------------------- /Environments/pybullet_evo/half_cheetah.xml: -------------------------------------------------------------------------------- 1 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 83 | -------------------------------------------------------------------------------- /Environments/pybullet_evo/robot_bases.py: -------------------------------------------------------------------------------- 1 | import pybullet 2 | import gym, gym.spaces, gym.utils 3 | import numpy as np 4 | import os, inspect 5 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 6 | parentdir = os.path.dirname(currentdir) 7 | os.sys.path.insert(0,parentdir) 8 | import pybullet_data 9 | 10 | 11 | class XmlBasedRobot: 12 | """ 13 | Base class for mujoco .xml based agents. 
14 | """ 15 | 16 | self_collision = True 17 | def __init__(self, robot_name, action_dim, obs_dim, self_collision): 18 | self.parts = None 19 | self.objects = [] 20 | self.jdict = None 21 | self.ordered_joints = None 22 | self.robot_body = None 23 | 24 | high = np.ones([action_dim]) 25 | self.action_space = gym.spaces.Box(-high, high) 26 | high = np.inf * np.ones([obs_dim]) 27 | self.observation_space = gym.spaces.Box(-high, high) 28 | 29 | #self.model_xml = model_xml 30 | self.robot_name = robot_name 31 | self.self_collision = self_collision 32 | 33 | def addToScene(self, bullet_client, bodies): 34 | self._p = bullet_client 35 | 36 | if self.parts is not None: 37 | parts = self.parts 38 | else: 39 | parts = {} 40 | 41 | if self.jdict is not None: 42 | joints = self.jdict 43 | else: 44 | joints = {} 45 | 46 | if self.ordered_joints is not None: 47 | ordered_joints = self.ordered_joints 48 | else: 49 | ordered_joints = [] 50 | 51 | if np.isscalar(bodies): # streamline the case where bodies is actually just one body 52 | bodies = [bodies] 53 | 54 | dump = 0 55 | for i in range(len(bodies)): 56 | if self._p.getNumJoints(bodies[i]) == 0: 57 | part_name, robot_name = self._p.getBodyInfo(bodies[i]) 58 | self.robot_name = robot_name.decode("utf8") 59 | part_name = part_name.decode("utf8") 60 | parts[part_name] = BodyPart(self._p, part_name, bodies, i, -1) 61 | for j in range(self._p.getNumJoints(bodies[i])): 62 | self._p.setJointMotorControl2(bodies[i],j,pybullet.POSITION_CONTROL,positionGain=0.1,velocityGain=0.1,force=0) 63 | jointInfo = self._p.getJointInfo(bodies[i], j) 64 | joint_name=jointInfo[1] 65 | part_name=jointInfo[12] 66 | 67 | joint_name = joint_name.decode("utf8") 68 | part_name = part_name.decode("utf8") 69 | 70 | if dump: print("ROBOT PART '%s'" % part_name) 71 | if dump: print("ROBOT JOINT '%s'" % joint_name) # limits = %+0.2f..%+0.2f effort=%0.3f speed=%0.3f" % ((joint_name,) + j.limits()) ) 72 | 73 | parts[part_name] = BodyPart(self._p, part_name, bodies, i, j) 74 | 75 | if part_name == self.robot_name: 76 | self.robot_body = parts[part_name] 77 | 78 | if i == 0 and j == 0 and self.robot_body is None: # if nothing else works, we take this as robot_body 79 | parts[self.robot_name] = BodyPart(self._p, self.robot_name, bodies, 0, -1) 80 | self.robot_body = parts[self.robot_name] 81 | 82 | if joint_name[:6] == "ignore": 83 | Joint(self._p, joint_name, bodies, i, j).disable_motor() 84 | continue 85 | 86 | if joint_name[:8] != "jointfix": 87 | joints[joint_name] = Joint(self._p, joint_name, bodies, i, j) 88 | ordered_joints.append(joints[joint_name]) 89 | 90 | joints[joint_name].power_coef = 100.0 91 | 92 | # TODO: Maybe we need this 93 | # joints[joint_name].power_coef, joints[joint_name].max_velocity = joints[joint_name].limits()[2:4] 94 | # self.ordered_joints.append(joints[joint_name]) 95 | # self.jdict[joint_name] = joints[joint_name] 96 | 97 | return parts, joints, ordered_joints, self.robot_body 98 | 99 | def reset_pose(self, position, orientation): 100 | self.parts[self.robot_name].reset_pose(position, orientation) 101 | 102 | class MJCFBasedRobot(XmlBasedRobot): 103 | """ 104 | Base class for mujoco .xml based agents. 
105 | """ 106 | 107 | def __init__(self, model_xml, robot_name, action_dim, obs_dim, self_collision=True): 108 | XmlBasedRobot.__init__(self, robot_name, action_dim, obs_dim, self_collision) 109 | self.model_xml = model_xml 110 | self.doneLoading=0 111 | def reset(self, bullet_client): 112 | # print("Load") 113 | # print(self.doneLoading) 114 | # print(self.ordered_joints) 115 | self._p = bullet_client 116 | #print("Created bullet_client with id=", self._p._client) 117 | if (self.doneLoading==0): 118 | self.ordered_joints = [] 119 | self.doneLoading=1 120 | file_path = self.model_xml #os.path.join(currentdir, self.model_xml) 121 | if self.self_collision: 122 | self.objects = self._p.loadMJCF(file_path, flags=pybullet.URDF_USE_SELF_COLLISION|pybullet.URDF_USE_SELF_COLLISION_EXCLUDE_ALL_PARENTS) 123 | self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene(self._p, self.objects ) 124 | else: 125 | self.objects = self._p.loadMJCF(file_path) 126 | self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene(self._p, self.objects) 127 | self.robot_specific_reset(self._p) 128 | 129 | s = self.calc_state() # optimization: calc_state() can calculate something in self.* for calc_potential() to use 130 | 131 | return s 132 | 133 | def calc_potential(self): 134 | return 0 135 | 136 | 137 | class URDFBasedRobot(XmlBasedRobot): 138 | """ 139 | Base class for URDF .xml based robots. 140 | """ 141 | 142 | def __init__(self, model_urdf, robot_name, action_dim, obs_dim, basePosition=[0, 0, 0], baseOrientation=[0, 0, 0, 1], fixed_base=False, self_collision=False): 143 | XmlBasedRobot.__init__(self, robot_name, action_dim, obs_dim, self_collision) 144 | 145 | self.model_urdf = model_urdf 146 | self.basePosition = basePosition 147 | self.baseOrientation = baseOrientation 148 | self.fixed_base = fixed_base 149 | 150 | def reset(self, bullet_client): 151 | self._p = bullet_client 152 | self.ordered_joints = [] 153 | 154 | print(os.path.join(os.path.dirname(__file__), "data", self.model_urdf)) 155 | 156 | if self.self_collision: 157 | self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene(self._p, 158 | self._p.loadURDF(os.path.join(pybullet_data.getDataPath(), self.model_urdf), 159 | basePosition=self.basePosition, 160 | baseOrientation=self.baseOrientation, 161 | useFixedBase=self.fixed_base, 162 | flags=pybullet.URDF_USE_SELF_COLLISION)) 163 | else: 164 | self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene(self._p, 165 | self._p.loadURDF(os.path.join(pybullet_data.getDataPath(), self.model_urdf), 166 | basePosition=self.basePosition, 167 | baseOrientation=self.baseOrientation, 168 | useFixedBase=self.fixed_base)) 169 | 170 | self.robot_specific_reset(self._p) 171 | 172 | s = self.calc_state() # optimization: calc_state() can calculate something in self.* for calc_potential() to use 173 | self.potential = self.calc_potential() 174 | 175 | return s 176 | 177 | def calc_potential(self): 178 | return 0 179 | 180 | 181 | class SDFBasedRobot(XmlBasedRobot): 182 | """ 183 | Base class for SDF robots in a Scene. 
184 | """ 185 | 186 | def __init__(self, model_sdf, robot_name, action_dim, obs_dim, basePosition=[0, 0, 0], baseOrientation=[0, 0, 0, 1], fixed_base=False, self_collision=False): 187 | XmlBasedRobot.__init__(self, robot_name, action_dim, obs_dim, self_collision) 188 | 189 | self.model_sdf = model_sdf 190 | self.fixed_base = fixed_base 191 | 192 | def reset(self, bullet_client): 193 | self._p = bullet_client 194 | 195 | self.ordered_joints = [] 196 | 197 | self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene(self._p, # TODO: Not sure if this works, try it with kuka 198 | self._p.loadSDF(os.path.join("models_robot", self.model_sdf))) 199 | 200 | self.robot_specific_reset(self._p) 201 | 202 | s = self.calc_state() # optimization: calc_state() can calculate something in self.* for calc_potential() to use 203 | self.potential = self.calc_potential() 204 | 205 | return s 206 | 207 | def calc_potential(self): 208 | return 0 209 | 210 | 211 | class Pose_Helper: # dummy class to comply to original interface 212 | def __init__(self, body_part): 213 | self.body_part = body_part 214 | 215 | def xyz(self): 216 | return self.body_part.current_position() 217 | 218 | def rpy(self): 219 | return pybullet.getEulerFromQuaternion(self.body_part.current_orientation()) 220 | 221 | def orientation(self): 222 | return self.body_part.current_orientation() 223 | 224 | class BodyPart: 225 | def __init__(self, bullet_client, body_name, bodies, bodyIndex, bodyPartIndex): 226 | self.bodies = bodies 227 | self._p = bullet_client 228 | self.bodyIndex = bodyIndex 229 | self.bodyPartIndex = bodyPartIndex 230 | self.initialPosition = self.current_position() 231 | self.initialOrientation = self.current_orientation() 232 | self.bp_pose = Pose_Helper(self) 233 | 234 | def state_fields_of_pose_of(self, body_id, link_id=-1): # a method you will most probably need a lot to get pose and orientation 235 | if link_id == -1: 236 | (x, y, z), (a, b, c, d) = self._p.getBasePositionAndOrientation(body_id) 237 | else: 238 | (x, y, z), (a, b, c, d), _, _, _, _ = self._p.getLinkState(body_id, link_id) 239 | return np.array([x, y, z, a, b, c, d]) 240 | 241 | def get_position(self): return self.current_position() 242 | 243 | def get_pose(self): 244 | return self.state_fields_of_pose_of(self.bodies[self.bodyIndex], self.bodyPartIndex) 245 | 246 | def speed(self): 247 | if self.bodyPartIndex == -1: 248 | (vx, vy, vz), _ = self._p.getBaseVelocity(self.bodies[self.bodyIndex]) 249 | else: 250 | (x,y,z), (a,b,c,d), _,_,_,_, (vx, vy, vz), (vr,vp,vy) = self._p.getLinkState(self.bodies[self.bodyIndex], self.bodyPartIndex, computeLinkVelocity=1) 251 | return np.array([vx, vy, vz]) 252 | 253 | def speed_angular(self): 254 | if self.bodyPartIndex == -1: 255 | (vx, vy, vz), (vr, vp, vy) = self._p.getBaseVelocity(self.bodies[self.bodyIndex]) 256 | else: 257 | (x,y,z), (a,b,c,d), _,_,_,_, (vx, vy, vz), (vr,vp,vy) = self._p.getLinkState(self.bodies[self.bodyIndex], self.bodyPartIndex, computeLinkVelocity=1) 258 | return np.array([vr, vp, vy]) 259 | 260 | def current_position(self): 261 | return self.get_pose()[:3] 262 | 263 | def current_orientation(self): 264 | return self.get_pose()[3:] 265 | 266 | def get_orientation(self): 267 | return self.current_orientation() 268 | 269 | def reset_position(self, position): 270 | self._p.resetBasePositionAndOrientation(self.bodies[self.bodyIndex], position, self.get_orientation()) 271 | 272 | def reset_orientation(self, orientation): 273 | 
self._p.resetBasePositionAndOrientation(self.bodies[self.bodyIndex], self.get_position(), orientation) 274 | 275 | def reset_velocity(self, linearVelocity=[0,0,0], angularVelocity =[0,0,0]): 276 | self._p.resetBaseVelocity(self.bodies[self.bodyIndex], linearVelocity, angularVelocity) 277 | 278 | def reset_pose(self, position, orientation): 279 | self._p.resetBasePositionAndOrientation(self.bodies[self.bodyIndex], position, orientation) 280 | 281 | def pose(self): 282 | return self.bp_pose 283 | 284 | def contact_list(self): 285 | return self._p.getContactPoints(self.bodies[self.bodyIndex], -1, self.bodyPartIndex, -1) 286 | 287 | 288 | class Joint: 289 | def __init__(self, bullet_client, joint_name, bodies, bodyIndex, jointIndex): 290 | self.bodies = bodies 291 | self._p = bullet_client 292 | self.bodyIndex = bodyIndex 293 | self.jointIndex = jointIndex 294 | self.joint_name = joint_name 295 | 296 | jointInfo = self._p.getJointInfo(self.bodies[self.bodyIndex], self.jointIndex) 297 | self.lowerLimit = jointInfo[8] 298 | self.upperLimit = jointInfo[9] 299 | 300 | self.power_coeff = 0 301 | 302 | def set_state(self, x, vx): 303 | self._p.resetJointState(self.bodies[self.bodyIndex], self.jointIndex, x, vx) 304 | 305 | def current_position(self): # just some synonyme method 306 | return self.get_state() 307 | 308 | def current_relative_position(self): 309 | pos, vel = self.get_state() 310 | pos_mid = 0.5 * (self.lowerLimit + self.upperLimit); 311 | return ( 312 | 2 * (pos - pos_mid) / (self.upperLimit - self.lowerLimit), 313 | 0.1 * vel 314 | ) 315 | 316 | def get_state(self): 317 | x, vx,_,_ = self._p.getJointState(self.bodies[self.bodyIndex],self.jointIndex) 318 | return x, vx 319 | 320 | def get_position(self): 321 | x, _ = self.get_state() 322 | return x 323 | 324 | def get_orientation(self): 325 | _,r = self.get_state() 326 | return r 327 | 328 | def get_velocity(self): 329 | _, vx = self.get_state() 330 | return vx 331 | 332 | def set_position(self, position): 333 | self._p.setJointMotorControl2(self.bodies[self.bodyIndex],self.jointIndex,pybullet.POSITION_CONTROL, targetPosition=position) 334 | 335 | def set_velocity(self, velocity): 336 | self._p.setJointMotorControl2(self.bodies[self.bodyIndex],self.jointIndex,pybullet.VELOCITY_CONTROL, targetVelocity=velocity) 337 | 338 | def set_motor_torque(self, torque): # just some synonyme method 339 | self.set_torque(torque) 340 | 341 | def set_torque(self, torque): 342 | self._p.setJointMotorControl2(bodyIndex=self.bodies[self.bodyIndex], jointIndex=self.jointIndex, controlMode=pybullet.TORQUE_CONTROL, force=torque) #, positionGain=0.1, velocityGain=0.1) 343 | 344 | def reset_current_position(self, position, velocity): # just some synonyme method 345 | self.reset_position(position, velocity) 346 | 347 | def reset_position(self, position, velocity): 348 | self._p.resetJointState(self.bodies[self.bodyIndex],self.jointIndex,targetValue=position, targetVelocity=velocity) 349 | self.disable_motor() 350 | 351 | def disable_motor(self): 352 | self._p.setJointMotorControl2(self.bodies[self.bodyIndex],self.jointIndex,controlMode=pybullet.POSITION_CONTROL, targetPosition=0, targetVelocity=0, positionGain=0.1, velocityGain=0.1, force=0) 353 | -------------------------------------------------------------------------------- /Environments/pybullet_evo/robot_locomotors.py: -------------------------------------------------------------------------------- 1 | from .robot_bases import XmlBasedRobot, MJCFBasedRobot, URDFBasedRobot 2 | import numpy as np 3 | import 
pybullet 4 | import os, inspect 5 | import pybullet_data 6 | from .robot_bases import BodyPart 7 | import tempfile 8 | import atexit 9 | import xmltodict 10 | 11 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 12 | 13 | def cleanup_func_for_tmp(filepath): 14 | os.remove(filepath) 15 | 16 | class WalkerBase(MJCFBasedRobot): 17 | def __init__(self, fn, robot_name, action_dim, obs_dim, power, self_collision=True): 18 | MJCFBasedRobot.__init__(self, fn, robot_name, action_dim, obs_dim, self_collision=True) 19 | self.power = power 20 | self.camera_x = 0 21 | self.start_pos_x, self.start_pos_y, self.start_pos_z = 0, 0, 0 22 | self.walk_target_x = 1e3 # kilometer away 23 | self.walk_target_y = 0 24 | self.body_xyz=[0,0,0] 25 | 26 | def robot_specific_reset(self, bullet_client): 27 | self._p = bullet_client 28 | self._reset_position = [] 29 | for j in self.ordered_joints: 30 | pos = self.np_random.uniform(low=-0.5, high=0.5) 31 | j.reset_current_position(pos, 0) 32 | self._reset_position.append(pos) 33 | 34 | self.feet = [self.parts[f] for f in self.foot_list] 35 | self.feet_contact = np.array([0.0 for f in self.foot_list], dtype=np.float32) 36 | self.scene.actor_introduce(self) 37 | self.initial_z = None 38 | 39 | def reset_position(self): 40 | for j, pos in zip(self.ordered_joints, self._reset_position): 41 | j.set_velocity(0) 42 | # j.disable_motor() 43 | 44 | def reset_position_final(self): 45 | for j, pos in zip(self.ordered_joints, self._reset_position): 46 | # j.reset_current_position(pos, 0) 47 | j.disable_motor() 48 | 49 | 50 | def apply_action(self, a): 51 | assert (np.isfinite(a).all()) 52 | for n, j in enumerate(self.ordered_joints): 53 | j.set_motor_torque(self.power * j.power_coef * float(np.clip(a[n], -1, +1))) 54 | 55 | def calc_state(self): 56 | j = np.array([j.current_relative_position() for j in self.ordered_joints], dtype=np.float32).flatten() 57 | # even elements [0::2] position, scaled to -1..+1 between limits 58 | # odd elements [1::2] angular speed, scaled to show -1..+1 59 | self.joint_speeds = j[1::2] 60 | self.joints_at_limit = np.count_nonzero(np.abs(j[0::2]) > 0.99) 61 | 62 | body_pose = self.robot_body.pose() 63 | parts_xyz = np.array([p.pose().xyz() for p in self.parts.values()]).flatten() 64 | self.body_xyz = ( 65 | parts_xyz[0::3].mean(), parts_xyz[1::3].mean(), body_pose.xyz()[2]) # torso z is more informative than mean z 66 | self.body_rpy = body_pose.rpy() 67 | z = self.body_xyz[2] 68 | if self.initial_z == None: 69 | self.initial_z = z 70 | r, p, yaw = self.body_rpy 71 | self.walk_target_theta = np.arctan2(self.walk_target_y - self.body_xyz[1], 72 | self.walk_target_x - self.body_xyz[0]) 73 | self.walk_target_dist = np.linalg.norm( 74 | [self.walk_target_y - self.body_xyz[1], self.walk_target_x - self.body_xyz[0]]) 75 | angle_to_target = self.walk_target_theta - yaw 76 | 77 | rot_speed = np.array( 78 | [[np.cos(-yaw), -np.sin(-yaw), 0], 79 | [np.sin(-yaw), np.cos(-yaw), 0], 80 | [ 0, 0, 1]] 81 | ) 82 | vx, vy, vz = np.dot(rot_speed, self.robot_body.speed()) # rotate speed back to body point of view 83 | 84 | more = np.array([ z-self.initial_z, 85 | np.sin(angle_to_target), np.cos(angle_to_target), 86 | 0.3* vx , 0.3* vy , 0.3* vz , # 0.3 is just scaling typical speed into -1..+1, no physical sense here 87 | r, p], dtype=np.float32) 88 | return np.clip( np.concatenate([more] + [j] + [self.feet_contact]), -5, +5) 89 | 90 | def calc_potential(self): 91 | # progress in potential field is speed*dt, typical speed is about 2-3 
meter per second, this potential will change 2-3 per frame (not per second), 92 | # all rewards have rew/frame units and close to 1.0 93 | debugmode=0 94 | if (debugmode): 95 | print("calc_potential: self.walk_target_dist") 96 | print(self.walk_target_dist) 97 | print("self.scene.dt") 98 | print(self.scene.dt) 99 | print("self.scene.frame_skip") 100 | print(self.scene.frame_skip) 101 | print("self.scene.timestep") 102 | print(self.scene.timestep) 103 | return - self.walk_target_dist / self.scene.dt 104 | 105 | class HalfCheetah(WalkerBase): 106 | foot_list = ["ffoot", "fshin", "fthigh", "bfoot", "bshin", "bthigh"] # track these contacts with ground 107 | 108 | def __init__(self, design = None): 109 | self._adapted_xml_file = tempfile.NamedTemporaryFile(delete=False, prefix='halfcheetah_', suffix='.xml') 110 | self._adapted_xml_filepath = self._adapted_xml_file.name 111 | file = self._adapted_xml_filepath 112 | self._adapted_xml_file.close() 113 | atexit.register(cleanup_func_for_tmp, self._adapted_xml_filepath) 114 | self.adapt_xml(self._adapted_xml_filepath, design) 115 | #WalkerBase.__init__(self, self._adapted_xml_filepath, "torso", action_dim=6, obs_dim=26, power=0.90) 116 | WalkerBase.__init__(self, file, "torso", action_dim=6, obs_dim=26, power=0.90) 117 | 118 | def adapt_xml(self, file, design = None): 119 | with open(os.path.join(currentdir, 'half_cheetah.xml'), 'r') as fd: 120 | xml_string = fd.read() 121 | height = 1.0 122 | bth_r = 1.0 123 | bsh_r = 1.0 124 | bfo_r = 1.0 125 | fth_r = 1.0 126 | fsh_r = 1.0 127 | ffo_r = 1.0 128 | if design is None: 129 | bth_r, bsh_r, bfo_r, fth_r, fsh_r, ffo_r = np.random.uniform(low=0.5, high=1.5, size=6) 130 | else: 131 | bth_r, bsh_r, bfo_r, fth_r, fsh_r, ffo_r = design 132 | height = max(.145 * bth_r + .15 * bsh_r + .094 * bfo_r, .133 * fth_r + .106 * fsh_r + .07 * ffo_r) 133 | height *= 2.0 + 0.01 134 | 135 | xml_dict = xmltodict.parse(xml_string) 136 | xml_dict['mujoco']['worldbody']['body']['@pos'] = "0 0 {}".format(height) 137 | 138 | ### Btigh 139 | 140 | xml_dict['mujoco']['worldbody']['body']['body'][0]['geom']['@pos'] = '.1 0 -.13' 141 | xml_dict['mujoco']['worldbody']['body']['body'][0]['geom']['@pos'] = '{} 0 {}'.format(.1 * bth_r, -.13 * bth_r) 142 | 143 | xml_dict['mujoco']['worldbody']['body']['body'][0]['geom']['@size'] = '0.046 .145' 144 | xml_dict['mujoco']['worldbody']['body']['body'][0]['geom']['@size'] = '0.046 {}'.format(.145 * bth_r) 145 | 146 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['@pos'] = '.16 0 -.25' 147 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['@pos'] = '{} 0 {}'.format(.16 * bth_r, -.25 * bth_r) 148 | 149 | ### bshin 150 | 151 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['geom']['@pos'] = '-.14 0 -.07' 152 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['geom']['@pos'] = '{} 0 {}'.format(-.14 * bsh_r, -.07 * bsh_r) 153 | 154 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['geom']['@size'] = '0.046 .15' 155 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['geom']['@size'] = '0.046 {}'.format(.15 * bsh_r) 156 | 157 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['body']['@pos'] = '-.28 0 -.14' 158 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['body']['@pos'] = '{} 0 {}'.format(-.28 * bsh_r, -.14 * bsh_r) 159 | 160 | ### bfoot 161 | 162 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['body']['geom']['@pos'] = '.03 0 -.097' 163 | 
xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['body']['geom']['@pos'] = '{} 0 {}'.format(.03 * bfo_r, -.097 * bfo_r) 164 | 165 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['body']['geom']['@size'] = '0.046 .094' 166 | xml_dict['mujoco']['worldbody']['body']['body'][0]['body']['body']['geom']['@size'] = '0.046 {}'.format(.094 * bfo_r) 167 | 168 | ### fthigh 169 | 170 | xml_dict['mujoco']['worldbody']['body']['body'][1]['geom']['@pos'] = '-.07 0 -.12' 171 | xml_dict['mujoco']['worldbody']['body']['body'][1]['geom']['@pos'] = '{} 0 {}'.format(-.07 * fth_r, -.12 * fth_r) 172 | 173 | xml_dict['mujoco']['worldbody']['body']['body'][1]['geom']['@size'] = '0.046 .133' 174 | xml_dict['mujoco']['worldbody']['body']['body'][1]['geom']['@size'] = '0.046 {}'.format(.133 * fth_r) 175 | 176 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['@pos'] = '-.14 0 -.24' 177 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['@pos'] = '{} 0 {}'.format(-.14 *fth_r, -.24 * fth_r) 178 | 179 | ### fshin 180 | 181 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['geom']['@pos'] = '.065 0 -.09' 182 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['geom']['@pos'] = '{} 0 {}'.format(.065 * fsh_r, -.09 * fsh_r) 183 | 184 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['geom']['@size'] = '0.046 .106' 185 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['geom']['@size'] = '0.046 {}'.format(.106 * fsh_r) 186 | 187 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['body']['@pos'] = '.13 0 -.18' 188 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['body']['@pos'] = '{} 0 {}'.format(.13 * fsh_r, -.18 * fsh_r) 189 | 190 | ### ffoot 191 | 192 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['body']['geom']['@pos'] = '.045 0 -.07' 193 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['body']['geom']['@pos'] = '{} 0 {}'.format(.045 * ffo_r, -.07 * ffo_r) 194 | 195 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['body']['geom']['@size'] = '0.046 .07' 196 | xml_dict['mujoco']['worldbody']['body']['body'][1]['body']['body']['geom']['@size'] = '0.046 {}'.format(.07 * ffo_r) 197 | 198 | xml_string = xmltodict.unparse(xml_dict, pretty=True) 199 | with open(file, 'w') as fd: 200 | fd.write(xml_string) 201 | 202 | def alive_bonus(self, z, pitch): 203 | # Use contact other than feet to terminate episode: due to a lot of strange walks using knees 204 | return 0 #+1 if np.abs(pitch) < 1.0 and not self.feet_contact[1] and not self.feet_contact[2] and not self.feet_contact[4] and not self.feet_contact[5] else -1 205 | 206 | def robot_specific_reset(self, bullet_client): 207 | WalkerBase.robot_specific_reset(self, bullet_client) 208 | self.jdict["bthigh"].power_coef = 120.0 209 | self.jdict["bshin"].power_coef = 90.0 210 | self.jdict["bfoot"].power_coef = 60.0 211 | self.jdict["fthigh"].power_coef = 90.0 #140? 
212 | self.jdict["fshin"].power_coef = 60.0 213 | self.jdict["ffoot"].power_coef = 30.0 214 | body_pose = self.robot_body.pose() 215 | x, y, z = body_pose.xyz() 216 | self._initial_z = z 217 | 218 | def reset_design(self, bullet_client, design): 219 | self._adapted_xml_file = tempfile.NamedTemporaryFile(delete=False, prefix='halfcheetah_', suffix='.xml') 220 | self._adapted_xml_filepath = self._adapted_xml_file.name 221 | file = self._adapted_xml_filepath 222 | self._adapted_xml_file.close() 223 | atexit.register(cleanup_func_for_tmp, self._adapted_xml_filepath) 224 | 225 | self.adapt_xml(file, design) 226 | self.model_xml = file 227 | 228 | self.doneLoading = 0 229 | self.parts = None 230 | self.objects = [] 231 | self.jdict = None 232 | self.ordered_joints = None 233 | self.robot_body = None 234 | self.robot_name = "torso" 235 | bullet_client.resetSimulation() 236 | self.reset(bullet_client) 237 | # self.reset(self._p) 238 | 239 | def calc_potential(self): 240 | return 0 241 | 242 | def calc_state(self): 243 | j = np.array([j.current_relative_position() for j in self.ordered_joints], dtype=np.float32).flatten() 244 | # even elements [0::2] position, scaled to -1..+1 between limits 245 | # odd elements [1::2] angular speed, scaled to show -1..+1 246 | self.joint_pos = j[0::2] 247 | self.joint_speeds = j[1::2] 248 | body_pose = self.robot_body.pose() 249 | x, y, z = body_pose.xyz() 250 | z = z - self._initial_z 251 | r, p, yaw = self.body_rpy = body_pose.rpy() 252 | xv, yv, zv = self.robot_body.speed() 253 | vr, vp, vy = self.robot_body.speed_angular() 254 | state = np.array([xv, vp, zv, p, z]) 255 | state = np.append(self.joint_speeds, state) 256 | state = np.append(self.joint_pos, state) 257 | return state 258 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Kevin Sebastian Luck 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast Evolution through Actor-Critic Reinforcement Learning 2 | This is the official repository providing a refactored implementation of the data-driven 3 | design optimization method presented in the paper [**Data-efficient Co-Adaptation of Morphology and Behaviour with Deep Reinforcement Learning**](https://research.fb.com/publications/data-efficient-co-adaptation-of-morphology-and-behaviour-with-deep-reinforcement-learning/). 4 | This paper was presented on the Conference on Robot Learning in 2019. 5 | The website for this project can be found [here](https://sites.google.com/view/drl-coadaptation/home). 6 | 7 | At the moment, the repository contains a basic implementation of the proposed algorithm and its baseline. We use particle swarm optimization on the Q-function, which is used as a surrogate function predicting the performance of design candidates and, thus, avoiding the necessity to simulate/evaluate design candidates. The baseline uses also particle swarm optimization but evaluates design candidates in simulation instead. 8 | 9 | The current environment provided is Half-Cheetah, using pybullet, for which we have to learn effective movement strategies and the optimal leg lengths, maximizing the performance of the agent. 10 | 11 | Additional methods and environments which are shown in the paper will be added over time and the structure of the repository might change in the future. 12 | 13 | ## Citation 14 | If you use this code in your research, please cite 15 | ``` 16 | @inproceedings{luck2019coadapt, 17 | title={Data-efficient Co-Adaptation of Morphology and Behaviour with Deep Reinforcement Learning}, 18 | author={Luck, Kevin Sebastian and Ben Amor, Heni and Calandra, Roberto}, 19 | booktitle={Conference on Robot Learning}, 20 | year={2019} 21 | } 22 | ``` 23 | 24 | ## Acknowledgements of Previous Work 25 | This project would have been harder to implement without the great work of 26 | the developers behind rlkit and pybullet. 27 | 28 | The reinforcement learning loop makes extensive use of rlkit, a framework developed 29 | and maintained by Vitchyr Pong. You find this repository [here](https://github.com/vitchyr/rlkit). 30 | We made slight adaptations to the Soft-Actor-Critic algorithm used in this repository. 31 | 32 | Tasks were simulated in [PyBullet](https://pybullet.org/wordpress/), the 33 | repository can be found [here](https://github.com/bulletphysics/bullet3/tree/master/examples/pybullet). 34 | Adaptations were made to the files found in pybullet_evo to enable the dynamic adaptation 35 | of design parameters during the training process. 36 | 37 | ## Installation 38 | 39 | Make sure that PyTorch is installed. You find more information here: https://pytorch.org/ 40 | 41 | First, clone this repository to your local computer as usual. 42 | Then, install the required packages via pip by executing `pip3 install -r requirements.txt`. 43 | 44 | The current version of Coadapt is using now the latest version of rlkit by [Vitchyr Pong](https://github.com/vitchyr/rlkit). 45 | 46 | Clone the rlkit with 47 | ```bash 48 | git clone https://github.com/vitchyr/rlkit.git 49 | ``` 50 | Now, set in your terminal the environment variable PYTHONPATH with 51 | ```bash 52 | export PYTHONPATH=/path/to/rlkit/ 53 | ``` 54 | where the folder `/path/to/rlkit` contains the folder `rlkit`. 
This enables us 55 | to import rlkit with `import rlkit`. 56 | Alternatively, follow the installations guidelines you can find in the rlkit repository. 57 | 58 | You may have to set the environmental variable every time you open a new terminal. 59 | 60 | ## Starting experiments 61 | 62 | After setting the environmental variable and installing the packages you can 63 | proceed to run the experiments. 64 | There are two experimental configurations already set up for you in `experiment_configs.py`. 65 | You can execute them with 66 | ```bash 67 | python3 main.py sac_pso_batch 68 | ``` 69 | and 70 | ```bash 71 | python3 main.py sac_pso_sim 72 | ``` 73 | 74 | You may change the configs or add new ones. Make sure to add new configurations to 75 | the `config_dict` in `experiment_configs.py`. 76 | 77 | ## Data logging 78 | If you execute these commands, they will automatically create directories in which 79 | the performance and achieved rewards will be stored. Each experiment creates 80 | a specific folder with the current date/time and a random string as name. 81 | You can find in this folder a copy of the config you executed and one csv file 82 | for each design on which the reinforcement learning algorithm was executed. 83 | Each csv file contains three rows: The type of the design (either 'Initial', 'Optimized' or 'Random'); 84 | The design vector; And the subsequent, cumulative rewards for each episode/trial. 85 | 86 | The file `ADDVIZFILE` provieds a basic jupyter notebook to visualize the collected 87 | data. 88 | 89 | ## Changelog 90 | - 20 July 2022: 91 | - Fixed an issue where the species/individual replay buffer was never reset. 92 | - 9 August 2020: 93 | - We use now the current version of rlkit. Alpha parameter can now be set via the experiment_config file. 94 | - 19 June 2020: 95 | - Updated the repository to use the current version of rlkit. However, we cannot set our own alpha parameter for SAC, so it might come with some initial performance decrease. 
96 | - Added a basic Video recorder which retains the best 5 episodes recorded 97 | - Fixed a bug which was introduced when refactoring the code 98 | -------------------------------------------------------------------------------- /RL/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksluck/Coadaptation/c45d1d682262ef04cb797122d6103050b8d8b826/RL/__init__.py -------------------------------------------------------------------------------- /RL/evoreplay.py: -------------------------------------------------------------------------------- 1 | from rlkit.data_management.replay_buffer import ReplayBuffer 2 | from rlkit.data_management.env_replay_buffer import EnvReplayBuffer 3 | import numpy as np 4 | 5 | class EvoReplayLocalGlobalStart(ReplayBuffer): 6 | def __init__(self, env, max_replay_buffer_size_species, max_replay_buffer_size_population): 7 | self._species_buffer = EnvReplayBuffer(env=env, max_replay_buffer_size=max_replay_buffer_size_species) 8 | self._population_buffer = EnvReplayBuffer(env=env, max_replay_buffer_size=max_replay_buffer_size_population) 9 | self._init_state_buffer = EnvReplayBuffer(env=env, max_replay_buffer_size=max_replay_buffer_size_population) 10 | self._env = env 11 | self._max_replay_buffer_size_species = max_replay_buffer_size_species 12 | self._mode = "species" 13 | self._ep_counter = 0 14 | self._expect_init_state = True 15 | print("Use EvoReplayLocalGlobalStart replay buffer") 16 | 17 | def add_sample(self, observation, action, reward, next_observation, 18 | terminal, **kwargs): 19 | """ 20 | Add a transition tuple. 21 | """ 22 | if True or self._mode == "species": # This is deactivated now 23 | self._species_buffer.add_sample(observation=observation, action=action, reward=reward, next_observation=next_observation, 24 | terminal=terminal, env_info={}, **kwargs) 25 | if self._expect_init_state: 26 | self._init_state_buffer.add_sample(observation=observation, action=action, reward=reward, next_observation=next_observation, 27 | terminal=terminal, env_info={}, **kwargs) 28 | self._init_state_buffer.terminate_episode() 29 | self._expect_init_state = False 30 | 31 | if self._ep_counter >= 0: 32 | self._population_buffer.add_sample(observation=observation, action=action, reward=reward, next_observation=next_observation, 33 | terminal=terminal, env_info={}, **kwargs) 34 | 35 | def terminate_episode(self): 36 | """ 37 | Let the replay buffer know that the episode has terminated in case some 38 | special book-keeping has to happen. 39 | :return: 40 | """ 41 | if self._mode == "species": 42 | self._species_buffer.terminate_episode() 43 | self._population_buffer.terminate_episode() 44 | self._ep_counter += 1 45 | self._expect_init_state = True 46 | 47 | def num_steps_can_sample(self, **kwargs): 48 | """ 49 | :return: # of unique items that can be sampled. 50 | """ 51 | if self._mode == "species": 52 | return self._species_buffer.num_steps_can_sample(**kwargs) 53 | elif self._mode == "population": 54 | return self._population_buffer.num_steps_can_sample(**kwargs) 55 | else: 56 | pass 57 | 58 | def random_batch(self, batch_size): 59 | """ 60 | Return a batch of size `batch_size`. 
61 | :param batch_size: 62 | :return: 63 | """ 64 | if self._mode == "species": 65 | # return self._species_buffer.random_batch(batch_size) 66 | species_batch_size = int(np.floor(batch_size * 0.9)) 67 | pop_batch_size = int(np.ceil(batch_size * 0.1)) 68 | pop = self._population_buffer.random_batch(pop_batch_size) 69 | spec = self._species_buffer.random_batch(species_batch_size) 70 | for key, item in pop.items(): 71 | pop[key] = np.concatenate([pop[key], spec[key]], axis=0) 72 | return pop 73 | elif self._mode == "population": 74 | return self._population_buffer.random_batch(batch_size) 75 | elif self._mode == "start": 76 | return self._init_state_buffer.random_batch(batch_size) 77 | else: 78 | pass 79 | 80 | def set_mode(self, mode): 81 | if mode == "species": 82 | self._mode = mode 83 | elif mode == "population": 84 | self._mode = mode 85 | elif mode == "start": 86 | self._mode = mode 87 | else: 88 | print("No known mode :(") 89 | 90 | def reset_species_buffer(self): 91 | self._species_buffer = EnvReplayBuffer(env = self._env, max_replay_buffer_size=self._max_replay_buffer_size_species) 92 | self._ep_counter = 0 93 | -------------------------------------------------------------------------------- /RL/rl_algorithm.py: -------------------------------------------------------------------------------- 1 | import rlkit.torch.pytorch_util as ptu 2 | 3 | class RL_algorithm(object): 4 | 5 | def __init__(self, config, env, replay, networks): 6 | self._config = config 7 | self.file_str = config['data_folder_experiment'] 8 | 9 | self._env = env 10 | self._networks = networks 11 | self._replay = replay 12 | 13 | if 'use_only_global_networks' in config.keys(): 14 | self._use_only_global_networks = config['use_only_global_networks'] 15 | else: 16 | self._use_only_global_networks = False 17 | 18 | def episode_init(self): 19 | pass 20 | -------------------------------------------------------------------------------- /RL/soft_actor.py: -------------------------------------------------------------------------------- 1 | from rlkit.torch.sac.policies import TanhGaussianPolicy 2 | # from rlkit.torch.sac.sac import SoftActorCritic 3 | from rlkit.torch.networks import FlattenMlp 4 | import numpy as np 5 | from .rl_algorithm import RL_algorithm 6 | from rlkit.torch.sac.sac import SACTrainer as SoftActorCritic_rlkit 7 | import rlkit.torch.pytorch_util as ptu 8 | import torch 9 | import utils 10 | 11 | # networks = {individual:, population:} 12 | class SoftActorCritic(RL_algorithm): 13 | 14 | def __init__(self, config, env, replay, networks): 15 | """ Bascally a wrapper class for SAC from rlkit. 16 | 17 | Args: 18 | config: Configuration dictonary 19 | env: Environment 20 | replay: Replay buffer 21 | networks: dict containing two sub-dicts, 'individual' and 'population' 22 | which contain the networks. 
23 | 24 | """ 25 | super().__init__(config, env, replay, networks) 26 | 27 | self._variant_pop = config['rl_algorithm_config']['algo_params_pop'] 28 | self._variant_spec = config['rl_algorithm_config']['algo_params'] 29 | 30 | self._ind_qf1 = networks['individual']['qf1'] 31 | self._ind_qf2 = networks['individual']['qf2'] 32 | self._ind_qf1_target = networks['individual']['qf1_target'] 33 | self._ind_qf2_target = networks['individual']['qf2_target'] 34 | self._ind_policy = networks['individual']['policy'] 35 | 36 | self._pop_qf1 = networks['population']['qf1'] 37 | self._pop_qf2 = networks['population']['qf2'] 38 | self._pop_qf1_target = networks['population']['qf1_target'] 39 | self._pop_qf2_target = networks['population']['qf2_target'] 40 | self._pop_policy = networks['population']['policy'] 41 | 42 | self._batch_size = config['rl_algorithm_config']['batch_size'] 43 | self._nmbr_indiv_updates = config['rl_algorithm_config']['indiv_updates'] 44 | self._nmbr_pop_updates = config['rl_algorithm_config']['pop_updates'] 45 | 46 | self._algorithm_ind = SoftActorCritic_rlkit( 47 | env=self._env, 48 | policy=self._ind_policy, 49 | qf1=self._ind_qf1, 50 | qf2=self._ind_qf2, 51 | target_qf1=self._ind_qf1_target, 52 | target_qf2=self._ind_qf2_target, 53 | use_automatic_entropy_tuning = False, 54 | **self._variant_spec 55 | ) 56 | 57 | self._algorithm_pop = SoftActorCritic_rlkit( 58 | env=self._env, 59 | policy=self._pop_policy, 60 | qf1=self._pop_qf1, 61 | qf2=self._pop_qf2, 62 | target_qf1=self._pop_qf1_target, 63 | target_qf2=self._pop_qf2_target, 64 | use_automatic_entropy_tuning = False, 65 | **self._variant_pop 66 | ) 67 | 68 | # self._algorithm_ind.to(ptu.device) 69 | # self._algorithm_pop.to(ptu.device) 70 | 71 | def episode_init(self): 72 | """ Initializations to be done before the first episode. 73 | 74 | In this case basically creates a fresh instance of SAC for the 75 | individual networks and copies the values of the target network. 76 | """ 77 | self._algorithm_ind = SoftActorCritic_rlkit( 78 | env=self._env, 79 | policy=self._ind_policy, 80 | qf1=self._ind_qf1, 81 | qf2=self._ind_qf2, 82 | target_qf1=self._ind_qf1_target, 83 | target_qf2=self._ind_qf2_target, 84 | use_automatic_entropy_tuning = False, 85 | # alt_alpha = self._alt_alpha, 86 | **self._variant_spec 87 | ) 88 | if self._config['rl_algorithm_config']['copy_from_gobal']: 89 | utils.copy_pop_to_ind(networks_pop=self._networks['population'], networks_ind=self._networks['individual']) 90 | # We have only to do this becasue the version of rlkit which we use 91 | # creates internally a target network 92 | # vf_dict = self._algorithm_pop.target_vf.state_dict() 93 | # self._algorithm_ind.target_vf.load_state_dict(vf_dict) 94 | # self._algorithm_ind.target_vf.eval() 95 | # self._algorithm_ind.to(ptu.device) 96 | 97 | def single_train_step(self, train_ind=True, train_pop=False): 98 | """ A single trianing step. 99 | 100 | Args: 101 | train_ind: Boolean. If true the individual networks will be trained. 102 | train_pop: Boolean. If true the population networks will be trained. 
103 | """ 104 | if train_ind: 105 | # Get only samples from the species buffer 106 | self._replay.set_mode('species') 107 | # self._algorithm_ind.num_updates_per_train_call = self._variant_spec['num_updates_per_epoch'] 108 | # self._algorithm_ind._try_to_train() 109 | for _ in range(self._nmbr_indiv_updates): 110 | batch = self._replay.random_batch(self._batch_size) 111 | self._algorithm_ind.train(batch) 112 | 113 | if train_pop: 114 | # Get only samples from the population buffer 115 | self._replay.set_mode('population') 116 | # self._algorithm_pop.num_updates_per_train_call = self._variant_pop['num_updates_per_epoch'] 117 | # self._algorithm_pop._try_to_train() 118 | for _ in range(self._nmbr_pop_updates): 119 | batch = self._replay.random_batch(self._batch_size) 120 | self._algorithm_pop.train(batch) 121 | 122 | @staticmethod 123 | def create_networks(env, config): 124 | """ Creates all networks necessary for SAC. 125 | 126 | These networks have to be created before instantiating this class and 127 | used in the constructor. 128 | 129 | Args: 130 | config: A configuration dictonary containing population and 131 | individual networks 132 | 133 | Returns: 134 | A dictonary which contains the networks. 135 | """ 136 | network_dict = { 137 | 'individual' : SoftActorCritic._create_networks(env=env, config=config), 138 | 'population' : SoftActorCritic._create_networks(env=env, config=config), 139 | } 140 | return network_dict 141 | 142 | @staticmethod 143 | def _create_networks(env, config): 144 | """ Creates all networks necessary for SAC. 145 | 146 | These networks have to be created before instantiating this class and 147 | used in the constructor. 148 | 149 | TODO: Maybe this should be reworked one day... 150 | 151 | Args: 152 | config: A configuration dictonary. 153 | 154 | Returns: 155 | A dictonary which contains the networks. 156 | """ 157 | obs_dim = int(np.prod(env.observation_space.shape)) 158 | action_dim = int(np.prod(env.action_space.shape)) 159 | net_size = config['rl_algorithm_config']['net_size'] 160 | hidden_sizes = [net_size] * config['rl_algorithm_config']['network_depth'] 161 | # hidden_sizes = [net_size, net_size, net_size] 162 | qf1 = FlattenMlp( 163 | hidden_sizes=hidden_sizes, 164 | input_size=obs_dim + action_dim, 165 | output_size=1, 166 | ).to(device=ptu.device) 167 | qf2 = FlattenMlp( 168 | hidden_sizes=hidden_sizes, 169 | input_size=obs_dim + action_dim, 170 | output_size=1, 171 | ).to(device=ptu.device) 172 | qf1_target = FlattenMlp( 173 | hidden_sizes=hidden_sizes, 174 | input_size=obs_dim + action_dim, 175 | output_size=1, 176 | ).to(device=ptu.device) 177 | qf2_target = FlattenMlp( 178 | hidden_sizes=hidden_sizes, 179 | input_size=obs_dim + action_dim, 180 | output_size=1, 181 | ).to(device=ptu.device) 182 | policy = TanhGaussianPolicy( 183 | hidden_sizes=hidden_sizes, 184 | obs_dim=obs_dim, 185 | action_dim=action_dim, 186 | ).to(device=ptu.device) 187 | 188 | clip_value = 1.0 189 | for p in qf1.parameters(): 190 | p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value)) 191 | for p in qf2.parameters(): 192 | p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value)) 193 | for p in policy.parameters(): 194 | p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value)) 195 | 196 | return {'qf1' : qf1, 'qf2' : qf2, 'qf1_target' : qf1_target, 'qf2_target' : qf2_target, 'policy' : policy} 197 | 198 | @staticmethod 199 | def get_q_network(networks): 200 | """ Returns the q network from a dict of networks. 
201 | 202 | This method extracts the q-network from the dictonary of networks 203 | created by the function create_networks. 204 | 205 | Args: 206 | networks: Dict containing the networks. 207 | 208 | Returns: 209 | The q-network as torch object. 210 | """ 211 | return networks['qf1'] 212 | 213 | @staticmethod 214 | def get_policy_network(networks): 215 | """ Returns the policy network from a dict of networks. 216 | 217 | This method extracts the policy network from the dictonary of networks 218 | created by the function create_networks. 219 | 220 | Args: 221 | networks: Dict containing the networks. 222 | 223 | Returns: 224 | The policy network as torch object. 225 | """ 226 | return networks['policy'] 227 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksluck/Coadaptation/c45d1d682262ef04cb797122d6103050b8d8b826/__init__.py -------------------------------------------------------------------------------- /coadapt.py: -------------------------------------------------------------------------------- 1 | from RL.soft_actor import SoftActorCritic 2 | from DO.pso_batch import PSO_batch 3 | from DO.pso_sim import PSO_simulation 4 | from Environments import evoenvs as evoenvs 5 | import utils 6 | import time 7 | from RL.evoreplay import EvoReplayLocalGlobalStart 8 | import numpy as np 9 | import os 10 | import csv 11 | import torch 12 | 13 | def select_design_opt_alg(alg_name): 14 | """ Selects the design optimization method. 15 | 16 | Args: 17 | alg_name: String which states the design optimization method. Can be 18 | `pso_batch` or `pso_sim`. 19 | 20 | Returns: 21 | The class of a design optimization method. 22 | 23 | Raises: 24 | ValueError: If the string alg_name is unknown. 25 | """ 26 | if alg_name == "pso_batch": 27 | return PSO_batch 28 | elif alg_name == "pso_sim": 29 | return PSO_simulation 30 | else: 31 | raise ValueError("Design Optimization method not found.") 32 | 33 | def select_environment(env_name): 34 | """ Selects an environment. 35 | 36 | Args: 37 | env_name: Name (string) of the environment on which the experiment is to 38 | be executed. Can be `HalfCheetah`. 39 | 40 | Returns: 41 | The class of an environment 42 | 43 | Raises: 44 | ValueError: If the string env_name is unknown. 45 | 46 | """ 47 | if env_name == 'HalfCheetah': 48 | return evoenvs.HalfCheetahEnv 49 | else: 50 | raise ValueError("Environment class not found.") 51 | 52 | def select_rl_alg(rl_name): 53 | """ Selectes the reinforcement learning method. 54 | 55 | Args: 56 | rl_name: Name (string) of the rl method. 57 | 58 | Returns: 59 | The class of a reinforcement learning method. 60 | 61 | Raises: 62 | ValueError: If the string rl_name is unknown. 63 | """ 64 | if rl_name == 'SoftActorCritic': 65 | return SoftActorCritic 66 | else: 67 | raise ValueError('RL method not fund.') 68 | 69 | class Coadaptation(object): 70 | """ Basic Co-Adaptaton class. 71 | 72 | """ 73 | 74 | def __init__(self, config): 75 | """ 76 | Args: 77 | config: A config dictonary. 
78 | 79 | """ 80 | 81 | self._config = config 82 | utils.move_to_cuda(self._config) 83 | 84 | # TODO This should not depend on rl_algorithm_config in the future 85 | self._episode_length = self._config['steps_per_episodes'] 86 | self._reward_scale = 1.0 #self._config['rl_algorithm_config']['algo_params']['reward_scale'] 87 | 88 | self._env_class = select_environment(self._config['env']['env_name']) 89 | self._env = evoenvs.HalfCheetahEnv(config=self._config) 90 | 91 | self._replay = EvoReplayLocalGlobalStart(self._env, 92 | max_replay_buffer_size_species=int(1e6), 93 | max_replay_buffer_size_population=int(1e7)) 94 | 95 | self._rl_alg_class = select_rl_alg(self._config['rl_method']) 96 | 97 | self._networks = self._rl_alg_class.create_networks(env=self._env, config=config) 98 | 99 | self._rl_alg = self._rl_alg_class(config=self._config, env=self._env , replay=self._replay, networks=self._networks) 100 | 101 | self._do_alg_class = select_design_opt_alg(self._config['design_optim_method']) 102 | self._do_alg = self._do_alg_class(config=self._config, replay=self._replay, env=self._env) 103 | 104 | # if self._config['use_cpu_for_rollout']: 105 | # utils.move_to_cpu() 106 | # else: 107 | # utils.move_to_cuda(self._config) 108 | # # TODO this is a temp fix - should be cleaned up, not so hppy with it atm 109 | # self._policy_cpu = self._rl_alg_class.get_policy_network(SoftActorCritic.create_networks(env=self._env, config=config)['individual']) 110 | utils.move_to_cuda(self._config) 111 | 112 | self._last_single_iteration_time = 0 113 | self._design_counter = 0 114 | self._episode_counter = 0 115 | self._data_design_type = 'Initial' 116 | 117 | def initialize_episode(self): 118 | """ Initializations required before the first episode. 119 | 120 | Should be called before the first episode of a new design is 121 | executed. Resets variables such as _data_rewards for logging purposes 122 | etc. 123 | 124 | """ 125 | # self._rl_alg.initialize_episode(init_networks = True, copy_from_gobal = True) 126 | self._rl_alg.episode_init() 127 | self._replay.reset_species_buffer() 128 | 129 | self._data_rewards = [] 130 | self._episode_counter = 0 131 | 132 | 133 | def single_iteration(self): 134 | """ A single iteration. 135 | 136 | Makes all necessary function calls for a single iterations such as: 137 | - Collecting training data 138 | - Executing a training step 139 | - Evaluate the current policy 140 | - Log data 141 | 142 | """ 143 | print("Time for one iteration: {}".format(time.time() - self._last_single_iteration_time)) 144 | self._last_single_iteration_time = time.time() 145 | self._replay.set_mode("species") 146 | self.collect_training_experience() 147 | # TODO Change here to train global only after five designs 148 | train_pop = self._design_counter > 3 149 | if self._episode_counter >= self._config['initial_episodes']: 150 | self._rl_alg.single_train_step(train_ind=True, train_pop=train_pop) 151 | self._episode_counter += 1 152 | self.execute_policy() 153 | self.save_logged_data() 154 | self.save_networks() 155 | 156 | def collect_training_experience(self): 157 | """ Collect training data. 158 | 159 | This function executes a single episode in the environment using the 160 | exploration strategy/mechanism and the policy. 161 | The data, i.e. state-action-reward-nextState, is stored in the replay 162 | buffer. 
163 | 164 | """ 165 | state = self._env.reset() 166 | nmbr_of_steps = 0 167 | done = False 168 | 169 | if self._episode_counter < self._config['initial_episodes']: 170 | policy_gpu_ind = self._rl_alg_class.get_policy_network(self._networks['population']) 171 | else: 172 | policy_gpu_ind = self._rl_alg_class.get_policy_network(self._networks['individual']) 173 | # self._policy_cpu = utils.copy_network(network_to=self._policy_cpu, network_from=policy_gpu_ind, config=self._config, force_cpu=self._config['use_cpu_for_rollout']) 174 | self._policy_cpu = policy_gpu_ind 175 | 176 | if self._config['use_cpu_for_rollout']: 177 | utils.move_to_cpu() 178 | else: 179 | utils.move_to_cuda(self._config) 180 | 181 | while not(done) and nmbr_of_steps <= self._episode_length: 182 | nmbr_of_steps += 1 183 | action, _ = self._policy_cpu.get_action(state) 184 | new_state, reward, done, info = self._env.step(action) 185 | # TODO this has to be fixed _variant_spec 186 | reward = reward * self._reward_scale 187 | terminal = np.array([done]) 188 | reward = np.array([reward]) 189 | self._replay.add_sample(observation=state, action=action, reward=reward, next_observation=new_state, 190 | terminal=terminal) 191 | state = new_state 192 | self._replay.terminate_episode() 193 | utils.move_to_cuda(self._config) 194 | 195 | def execute_policy(self): 196 | """ Evaluates the current deterministic policy. 197 | 198 | Evaluates the current policy in the environment by unrolling a single 199 | episode in the environment. 200 | The achieved cumulative reward is logged. 201 | 202 | """ 203 | state = self._env.reset() 204 | done = False 205 | reward_ep = 0.0 206 | reward_original = 0.0 207 | action_cost = 0.0 208 | nmbr_of_steps = 0 209 | 210 | if self._episode_counter < self._config['initial_episodes']: 211 | policy_gpu_ind = self._rl_alg_class.get_policy_network(self._networks['population']) 212 | else: 213 | policy_gpu_ind = self._rl_alg_class.get_policy_network(self._networks['individual']) 214 | # self._policy_cpu = utils.copy_network(network_to=self._policy_cpu, network_from=policy_gpu_ind, config=self._config, force_cpu=self._config['use_cpu_for_rollout']) 215 | self._policy_cpu = policy_gpu_ind 216 | 217 | if self._config['use_cpu_for_rollout']: 218 | utils.move_to_cpu() 219 | else: 220 | utils.move_to_cuda(self._config) 221 | 222 | while not(done) and nmbr_of_steps <= self._episode_length: 223 | nmbr_of_steps += 1 224 | action, _ = self._policy_cpu.get_action(state, deterministic=True) 225 | new_state, reward, done, info = self._env.step(action) 226 | action_cost += info['orig_action_cost'] 227 | reward_ep += float(reward) 228 | reward_original += float(info['orig_reward']) 229 | state = new_state 230 | utils.move_to_cuda(self._config) 231 | # Do something here to log the results 232 | self._data_rewards.append(reward_ep) 233 | 234 | def save_networks(self): 235 | """ Saves the networks on the disk. 
236 | """ 237 | if not self._config['save_networks']: 238 | return 239 | 240 | checkpoints_pop = {} 241 | for key, net in self._networks['population'].items(): 242 | checkpoints_pop[key] = net.state_dict() 243 | 244 | checkpoints_ind = {} 245 | for key, net in self._networks['individual'].items(): 246 | checkpoints_ind[key] = net.state_dict() 247 | 248 | checkpoint = { 249 | 'population' : checkpoints_pop, 250 | 'individual' : checkpoints_ind, 251 | } 252 | file_path = os.path.join(self._config['data_folder_experiment'], 'checkpoints') 253 | if not os.path.exists(file_path): 254 | os.makedirs(file_path) 255 | torch.save(checkpoint, os.path.join(file_path, 'checkpoint_design_{}.chk'.format(self._design_counter))) 256 | 257 | def load_networks(self, path): 258 | """ Loads netwokrs from the disk. 259 | """ 260 | model_data = torch.load(path) #, map_location=ptu.device) 261 | 262 | model_data_pop = model_data['population'] 263 | for key, net in self._networks['population'].items(): 264 | params = model_data_pop[key] 265 | net.load_state_dict(params) 266 | 267 | model_data_ind = model_data['individual'] 268 | for key, net in self._networks['individual'].items(): 269 | params = model_data_ind[key] 270 | net.load_state_dict(params) 271 | 272 | def save_logged_data(self): 273 | """ Saves the logged data to the disk as csv files. 274 | 275 | This function creates a log-file in csv format on the disk. For each 276 | design an individual log-file is creates in the experient-directory. 277 | The first row states if the design was one of the initial designs 278 | (as given by the environment), a random design or an optimized design. 279 | The second row gives the design parameters (eta). The third row 280 | contains all subsequent cumulative rewards achieved by the policy 281 | throughout the reinforcement learning process on the current design. 282 | """ 283 | file_path = self._config['data_folder_experiment'] 284 | current_design = self._env.get_current_design() 285 | 286 | with open( 287 | os.path.join(file_path, 288 | 'data_design_{}.csv'.format(self._design_counter) 289 | ), 'w') as fd: 290 | cwriter = csv.writer(fd) 291 | cwriter.writerow(['Design Type:', self._data_design_type]) 292 | cwriter.writerow(current_design) 293 | cwriter.writerow(self._data_rewards) 294 | 295 | def run(self): 296 | """ Runs the Fast Evolution through Actor-Critic RL algorithm. 297 | 298 | First the initial design loop is executed in which the rl-algorithm 299 | is exeuted on the initial designs. Then the design-optimization 300 | process starts. 301 | It is possible to have different numbers of iterations for initial 302 | designs and the design optimization process. 303 | """ 304 | iterations_init = self._config['iterations_init'] 305 | iterations = self._config['iterations'] 306 | design_cycles = self._config['design_cycles'] 307 | exploration_strategy = self._config['exploration_strategy'] 308 | 309 | self._intial_design_loop(iterations_init) 310 | self._training_loop(iterations, design_cycles, exploration_strategy) 311 | 312 | def _training_loop(self, iterations, design_cycles, exploration_strategy): 313 | """ The trianing process which optimizes designs and policies. 314 | 315 | The function executes the reinforcement learning loop and the design 316 | optimization process. 317 | 318 | Args: 319 | iterations: An integer stating the number of iterations/episodes 320 | to be used per design during the reinforcement learning loop. 321 | design_cycles: Integer stating how many designs to evaluate. 
322 | exploration_strategy: String which describes which 323 | design exploration strategy to use. Is not used at the moment, 324 | i.e. only the (uniform) random exploration strategy is used. 325 | 326 | """ 327 | self.initialize_episode() 328 | # TODO fix the following 329 | initial_state = self._env._env.reset() 330 | 331 | self._data_design_type = 'Optimized' 332 | 333 | optimized_params = self._env.get_random_design() 334 | q_network = self._rl_alg_class.get_q_network(self._networks['population']) 335 | policy_network = self._rl_alg_class.get_policy_network(self._networks['population']) 336 | optimized_params = self._do_alg.optimize_design(design=optimized_params, q_network=q_network, policy_network=policy_network) 337 | optimized_params = list(optimized_params) 338 | 339 | for i in range(design_cycles): 340 | self._design_counter += 1 341 | self._env.set_new_design(optimized_params) 342 | 343 | # Reinforcement Learning 344 | for _ in range(iterations): 345 | self.single_iteration() 346 | 347 | # Design Optimization 348 | if i % 2 == 1: 349 | self._data_design_type = 'Optimized' 350 | q_network = self._rl_alg_class.get_q_network(self._networks['population']) 351 | policy_network = self._rl_alg_class.get_policy_network(self._networks['population']) 352 | optimized_params = self._do_alg.optimize_design(design=optimized_params, q_network=q_network, policy_network=policy_network) 353 | optimized_params = list(optimized_params) 354 | else: 355 | self._data_design_type = 'Random' 356 | optimized_params = self._env.get_random_design() 357 | optimized_params = list(optimized_params) 358 | self.initialize_episode() 359 | 360 | def _intial_design_loop(self, iterations): 361 | """ The initial training loop for initial designs. 362 | 363 | The initial training loop in which no designs are optimized but only 364 | initial designs, provided by the environment, are used. 365 | 366 | Args: 367 | iterations: Integer stating how many training iterations/episodes 368 | to use per design. 
369 | 370 | """ 371 | self._data_design_type = 'Initial' 372 | for params in self._env.init_sim_params: 373 | self._design_counter += 1 374 | self._env.set_new_design(params) 375 | self.initialize_episode() 376 | 377 | # Reinforcement Learning 378 | for _ in range(iterations): 379 | self.single_iteration() 380 | -------------------------------------------------------------------------------- /data_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import Libraries and define functions\n", 10 | "import csv\n", 11 | "import numpy as np\n", 12 | "from matplotlib import pyplot as plt\n", 13 | "plt.rcParams['figure.figsize'] = [15, 5]\n", 14 | "plt.rcParams.update({'font.size': 22})\n", 15 | "\n", 16 | "def set_style():\n", 17 | " plt.minorticks_on()\n", 18 | " plt.grid(b=True, which='minor', color='white', linestyle='-', alpha=0.4)\n", 19 | " plt.style.use('ggplot')\n", 20 | " plt.rcParams.update({'font.size': 22})\n", 21 | " legend = plt.gca().legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode=\"expand\", borderaxespad=0., frameon = 1)\n", 22 | " frame = legend.get_frame()\n", 23 | " frame.set_facecolor('white')\n", 24 | " frame.set_edgecolor('white')\n", 25 | "\n", 26 | "\n", 27 | "def save_fig(file_name):\n", 28 | " plt.savefig(file_name, dpi=300, facecolor='w', edgecolor='w',\n", 29 | " orientation='portrait', papertype=None, format=None,\n", 30 | " transparent=False, bbox_inches='tight', pad_inches=0.02,\n", 31 | " frameon=None, metadata=None)\n", 32 | "\n", 33 | "\n", 34 | "def read_file(path):\n", 35 | " with open(path, 'r') as fd:\n", 36 | " cwriter = csv.reader(fd)\n", 37 | " data = []\n", 38 | " skipped_first_row = False\n", 39 | " for row in cwriter:\n", 40 | " if not skipped_first_row:\n", 41 | " skipped_first_row = True\n", 42 | " continue\n", 43 | " row_tmp = row\n", 44 | " for idx in range(len(row_tmp)):\n", 45 | " row_tmp[idx] = float(row_tmp[idx])\n", 46 | " data.append(row_tmp)\n", 47 | " config = data[0]\n", 48 | " data = data[1]\n", 49 | " return data, config\n", 50 | "\n", 51 | "\n", 52 | "def read_data_folder(path):\n", 53 | " data_max = []\n", 54 | " data_all = []\n", 55 | " configs = []\n", 56 | " for idx in range(1, DESIGNS+1):\n", 57 | " data, config = read_file(path + '{}.csv'.format(idx))\n", 58 | " for d in data:\n", 59 | " data_all.append(d)\n", 60 | " data_max.append(np.amax(data))\n", 61 | " data = np.array(data)\n", 62 | " data_all.append(data)\n", 63 | " configs.append(config)\n", 64 | " return data_all, data_max, configs\n", 65 | " \n", 66 | "\n", 67 | "def compute_mean_std(paths):\n", 68 | " data_max_ar = []\n", 69 | " data_top_mean_ar = []\n", 70 | " configs_ar = []\n", 71 | " data_all_ar = []\n", 72 | " for p in paths:\n", 73 | " data_all, data_max, configs = read_data_folder(p)\n", 74 | " data_all_ar.append(data_all)\n", 75 | " data_max_ar.append(data_max)\n", 76 | " configs_ar.append(configs)\n", 77 | " data_max_mean = np.mean(data_max_ar, axis=0)\n", 78 | " data_max_std = np.std(data_max_ar, axis=0)\n", 79 | " return data_max_mean, data_max_std\n", 80 | "\n", 81 | "\n", 82 | "def plot_data_means(mean_data, std_data, color, label='', plot_random_std=False):\n", 83 | " x = np.arange(1, len(mean_data) + 1)\n", 84 | " init_data_mean = mean_data[0:INITIAL_DESIGNS+1]\n", 85 | " init_data_std = std_data[0:INITIAL_DESIGNS+1]\n", 86 | " init_x = 
x[0:INITIAL_DESIGNS+1]\n", 87 | " optim_data_mean = mean_data[INITIAL_DESIGNS::2]\n", 88 | " optim_data_std = std_data[INITIAL_DESIGNS::2]\n", 89 | " optim_x = x[INITIAL_DESIGNS::2]\n", 90 | " random_data_mean = mean_data[INITIAL_DESIGNS+1::2]\n", 91 | " random_data_std = std_data[INITIAL_DESIGNS+1::2]\n", 92 | " random_x = x[INITIAL_DESIGNS+1::2]\n", 93 | " plt.plot(init_x, init_data_mean, color=color, alpha=0.3, linewidth=2.0)\n", 94 | " plt.fill_between(init_x, init_data_mean - init_data_std, init_data_mean + init_data_std, facecolor=color, alpha=0.1)\n", 95 | " plt.plot(optim_x, optim_data_mean, color=color, label=label, linewidth=2.0)\n", 96 | " plt.fill_between(optim_x, optim_data_mean - optim_data_std, optim_data_mean + optim_data_std, facecolor=color, alpha=0.2)\n", 97 | " #plt.plot(random_x, random_data_mean, color=color, linestyle='--')\n", 98 | " if plot_random_std:\n", 99 | " plt.fill_between(random_x, random_data_mean - random_data_std, random_data_mean + random_data_std, facecolor=color, alpha=0.2)\n", 100 | " set_style()\n", 101 | "\n", 102 | " \n", 103 | "def plot_data_means_optim_random(mean_data, std_data, color, label=''):\n", 104 | " x = np.arange(1, len(mean_data) + 1)\n", 105 | " init_data_mean = mean_data[0:INITIAL_DESIGNS+1]\n", 106 | " init_data_std = std_data[0:INITIAL_DESIGNS+1]\n", 107 | " init_x = x[0:INITIAL_DESIGNS+1]\n", 108 | " optim_data_mean = mean_data[INITIAL_DESIGNS::2]\n", 109 | " optim_data_std = std_data[INITIAL_DESIGNS::2]\n", 110 | " optim_x = x[INITIAL_DESIGNS::2]\n", 111 | " random_data_mean = mean_data[INITIAL_DESIGNS+1::2]\n", 112 | " random_data_std = std_data[INITIAL_DESIGNS+1::2]\n", 113 | " random_x = x[INITIAL_DESIGNS+1::2]\n", 114 | " plt.plot(init_x, init_data_mean, color=color, alpha=0.3, linewidth=2.0)\n", 115 | " plt.fill_between(init_x, init_data_mean - init_data_std, init_data_mean + init_data_std, facecolor=color, alpha=0.1)\n", 116 | " plt.plot(optim_x, optim_data_mean, color=color, linewidth=2.0, label=label)\n", 117 | " plt.fill_between(optim_x, optim_data_mean - optim_data_std, optim_data_mean + optim_data_std, facecolor=color, alpha=0.2)\n", 118 | " plt.plot(random_x, random_data_mean, color=color, label=label + ' Random Exploration', linestyle='--', linewidth=2.0)\n", 119 | " plt.fill_between(random_x, random_data_mean - random_data_std, random_data_mean + random_data_std, facecolor=color, alpha=0.2)\n", 120 | " set_style()\n", 121 | "\n", 122 | "\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# Change these parameters as required\n", 132 | "INITIAL_DESIGNS = 5\n", 133 | "DESIGNS = 50\n", 134 | "\n", 135 | "# Proposed method using data-driven design optimization\n", 136 | "# Change folder names\n", 137 | "EXPERIMENT_FOLDERS_1 = [\n", 138 | " 'data_exp_sac_pso_batch/Mon_Oct_21_05:47:55_2019__e37381f2',\n", 139 | " 'data_exp_sac_pso_batch/Mon_Oct_21_05:48:30_2019__0041f8c5',\n", 140 | " 'data_exp_sac_pso_batch/Mon_Oct_21_05:49:00_2019__5e2e3460',\n", 141 | " 'data_exp_sac_pso_batch/Sun_Oct_20_23:36:49_2019__5db36c30',\n", 142 | " 'data_exp_sac_pso_batch/Sun_Oct_20_23:37:06_2019__30302df7',\n", 143 | "]\n", 144 | "\n", 145 | "# Baseline using simulations for the evaluation of design candidates\n", 146 | "# Change folder names\n", 147 | "EXPERIMENTS_FOLDERS_2 = [\n", 148 | " 'data_exp_sac_pso_sim/Wed_Oct_23_05:49:05_2019__32509883',\n", 149 | " 'data_exp_sac_pso_sim/Wed_Oct_23_05:49:22_2019__6878474d',\n", 150 | " 
'data_exp_sac_pso_sim/Wed_Oct_23_05:51:21_2019__36c0fafc',\n", 151 | " 'data_exp_sac_pso_sim/Wed_Oct_23_05:51:33_2019__b21f861b',\n", 152 | " 'data_exp_sac_pso_sim/Wed_Oct_23_05:51:45_2019__cc74a513',\n", 153 | "]\n" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "Using matplotlib backend: TkAgg\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "%matplotlib" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 8, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "exp_files = ['{}/data_design_'.format(folder) for folder in EXPERIMENT_FOLDERS] # Novelty Search + PSO on Q\n", 180 | "exp_mean, exp_std = compute_mean_std(exp_files)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 22, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# Plot performance of optimized designs and randomly selected designs\n", 190 | "plot_data_means_optim_random(exp_mean, exp_std, color='red', label='Proposed Method')\n", 191 | "plt.ylabel('Cum. Episodic Reward')\n", 192 | "plt.xlabel('Designs')\n", 193 | "plt.ylim([150,600])\n", 194 | "plt.xlim([1,DESIGNS])\n", 195 | "plt.show()\n", 196 | "#save_fig('plots_HalfCheetah_random_vs_novelty_search.pdf')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 23, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Compare two experiments/methods against each other\n", 206 | "exp_files = ['{}/data_design_'.format(folder) for folder in EXPERIMENT_FOLDERS_1] # Novelty Search + PSO on Q\n", 207 | "exp_mean, exp_std = compute_mean_std(exp_files)\n", 208 | "plot_data_means(exp_mean, exp_std, color='red', label='Proposed Method')\n", 209 | "\n", 210 | "exp_files = ['{}/data_design_'.format(folder) for folder in EXPERIMENTS_FOLDERS_2] # Novelty Search + PSO on Q\n", 211 | "exp_mean, exp_std = compute_mean_std(exp_files)\n", 212 | "plot_data_means(exp_mean, exp_std, color='blue', label='Using Simulations')\n", 213 | "plt.ylabel('Cum. Episodic Reward')\n", 214 | "plt.xlabel('Designs')\n", 215 | "plt.ylim([150,600])\n", 216 | "plt.xlim([1,DESIGNS])\n", 217 | "plt.show()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.6.8" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 2 249 | } 250 | -------------------------------------------------------------------------------- /experiment_configs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | sac_pso_batch = { 4 | 'name' : 'Experiment 1: PSO Batch', # Name of the experiment. 
Can be used just for reference later 5 | 'data_folder' : 'data_exp_sac_pso_batch', # Name of the folder in which data generated during the experiments are saved 6 | 'nmbr_random_designs' : 0, # Number of random designs to run after the initial design TODO: Not used atm 7 | 'iterations_init' : 300, # Number of episodes for all initial designs as provided by the environment class 8 | 'iterations_random': 100, # Number of episodes for all random designs TODO: Not used atm 9 | 'iterations' : 100, # Number of episodes to run for all designs after the initial 10 | 'design_cycles' : 55, # Number of design adaptations after the initial designs 11 | 'state_batch_size' : 32, # Size of the batch used during the design optimization process to estimate fitness of design 12 | 'initial_episodes' : 3, # Number of initial episodes for each design before we start the training of the individual networks. Useful if steps per episode is low and we want to fill the replay. 13 | 'use_gpu' : True, # Use True when GPU should be used for training and inference 14 | 'use_cpu_for_rollout': False, # TODO: Not used 15 | 'cuda_device': 0, # Which cuda device to use. Only relevant if you have more than one GPU 16 | 'exploration_strategy': 'random', # Type of design exploration to use - we do usually one greedy design optimization and one random selection of a design 17 | 'design_optim_method' : 'pso_batch', # Which design optimization method to use 18 | 'steps_per_episodes' : 1000, # Number of steps per episode 19 | 'save_networks' : True, # If True networks are checkpointed and saved for each design 20 | 'rl_method' : 'SoftActorCritic', # Which reinforcement learning method to use. 21 | 'rl_algorithm_config' : dict( # Dictonary which contains the parameters for the RL algorithm 22 | algo_params=dict( # Parameters for the RL learner for the individual networks 23 | # num_epochs=int(1), 24 | # num_steps_per_epoch=1000, 25 | # num_steps_per_eval=1, 26 | # num_updates_per_env_step=10, 27 | # num_updates_per_epoch=1000, 28 | # batch_size=256, 29 | discount=0.99, 30 | reward_scale=1.0, 31 | 32 | soft_target_tau=0.005, 33 | target_update_period=1, 34 | policy_lr=3E-4, 35 | qf_lr=3E-4, 36 | alpha=0.01, 37 | ), 38 | algo_params_pop=dict( # Parameters for the RL learner for the individual networks 39 | # num_epochs=int(1), 40 | # num_steps_per_epoch=1, 41 | # num_steps_per_eval=1, 42 | # num_updates_per_env_step=10, 43 | # num_updates_per_epoch=250, 44 | # batch_size=256, 45 | discount=0.99, 46 | reward_scale=1.0, 47 | 48 | soft_target_tau=0.005, 49 | target_update_period=1, 50 | policy_lr=3E-4, 51 | qf_lr=3E-4, 52 | alpha=0.01, 53 | ), 54 | net_size=200, # Number of neurons in hidden layer 55 | network_depth=3, # Number of hidden layers 56 | copy_from_gobal=True, # Shall we pre-initialize the individual network with the global network? 
57 | indiv_updates=1000, # Number of training updates per episode for individual networks 58 | pop_updates=250, # Number of training updates per episode for population networks 59 | batch_size=256, # Batch size 60 | ), 61 | 'env' : dict( # Parameters and which environment to use 62 | env_name='HalfCheetah', # Name of environment 63 | render=False, # Use True if you want to visualize/render the environment 64 | record_video=False, # Use True if you want to record videos 65 | ), 66 | } 67 | 68 | sac_pso_sim = { 69 | 'name' : 'Experiment 2: PSO using Simulations', 70 | 'data_folder' : 'data_exp_sac_pso_sim', 71 | 'nmbr_random_designs' : 0, 72 | 'iterations_init' : 300, 73 | 'iterations_random': 100, 74 | 'iterations' : 100, 75 | 'design_cycles' : 55, 76 | 'state_batch_size' : 32, 77 | 'initial_episodes' : 3, 78 | 'use_gpu' : True, 79 | 'use_cpu_for_rollout': False, 80 | 'cuda_device': 0, 81 | 'exploration_strategy': 'random', 82 | 'design_optim_method' : 'pso_sim', 83 | 'save_networks' : True, 84 | 'rl_method' : 'SoftActorCritic', 85 | 'rl_algorithm_config' : dict( 86 | algo_params=dict( 87 | # num_epochs=int(1), 88 | # num_steps_per_epoch=1000, 89 | # num_steps_per_eval=1, 90 | # num_updates_per_env_step=10, 91 | # num_updates_per_epoch=1000, 92 | # batch_size=256, 93 | discount=0.99, 94 | reward_scale=1.0, 95 | 96 | soft_target_tau=0.005, 97 | target_update_period=1, 98 | policy_lr=3E-4, 99 | qf_lr=3E-4, 100 | alpha=0.01, 101 | ), 102 | algo_params_pop=dict( 103 | # num_epochs=int(1), 104 | # num_steps_per_epoch=1, 105 | # num_steps_per_eval=1, 106 | # num_updates_per_env_step=10, 107 | # num_updates_per_epoch=250, 108 | # batch_size=256, 109 | discount=0.99, 110 | reward_scale=1.0, 111 | 112 | soft_target_tau=0.005, 113 | target_update_period=1, 114 | policy_lr=3E-4, 115 | qf_lr=3E-4, 116 | alpha=0.01, 117 | ), 118 | net_size=200, 119 | network_depth=3, 120 | copy_from_gobal=True, 121 | indiv_updates=1000, 122 | pop_updates=250, 123 | batch_size=256, 124 | ), 125 | 'env' : dict( 126 | env_name='HalfCheetah', 127 | render=True, 128 | record_video=False, 129 | ), 130 | } 131 | 132 | config_dict = { 133 | 'sac_pso_batch' : sac_pso_batch, 134 | 'sac_pso_sim' : sac_pso_sim, 135 | } 136 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os, sys, time 2 | import hashlib 3 | import coadapt 4 | import experiment_configs as cfg 5 | import json 6 | 7 | def main(config): 8 | 9 | # Create foldr in which to save results 10 | folder = config['data_folder'] 11 | #generate random hash string - unique identifier if we start 12 | # multiple experiments at the same time 13 | rand_id = hashlib.md5(os.urandom(128)).hexdigest()[:8] 14 | file_str = './' + folder + '/' + time.ctime().replace(' ', '_') + '__' + rand_id 15 | config['data_folder_experiment'] = file_str 16 | 17 | # Create experiment folder 18 | if not os.path.exists(file_str): 19 | os.makedirs(file_str) 20 | 21 | # Store config 22 | with open(os.path.join(file_str, 'config.json'), 'w') as fd: 23 | fd.write(json.dumps(config,indent=2)) 24 | 25 | co = coadapt.Coadaptation(config) 26 | co.run() 27 | 28 | 29 | 30 | if __name__ == "__main__": 31 | # We assume we call the program only with the name of the config we want to run 32 | # nothing too complex 33 | if len(sys.argv) > 1: 34 | config = cfg.config_dict[sys.argv[1]] 35 | else: 36 | config = cfg.config_dict['base'] 37 | main(config) 38 | 
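As a quick illustration of the configuration workflow in `experiment_configs.py` and `main.py` above, a new experiment can be registered by copying one of the existing config dictionaries, changing a few fields, and adding it to `config_dict`. The sketch below is not part of the repository; the config name `sac_pso_batch_short` and the shortened values are hypothetical.

```python
# Hypothetical addition at the bottom of experiment_configs.py (not part of the
# original repository): derive a shorter run from the existing sac_pso_batch config.
import copy

sac_pso_batch_short = copy.deepcopy(sac_pso_batch)
sac_pso_batch_short['name'] = 'Experiment 3: Short PSO batch run'
sac_pso_batch_short['data_folder'] = 'data_exp_sac_pso_batch_short'
sac_pso_batch_short['iterations_init'] = 30  # fewer episodes per initial design
sac_pso_batch_short['iterations'] = 10       # fewer episodes per adapted design
sac_pso_batch_short['design_cycles'] = 5     # fewer design adaptations overall

# Register it so that `python3 main.py sac_pso_batch_short` can select it by name.
config_dict['sac_pso_batch_short'] = sac_pso_batch_short
```

With this in place, `main.py` looks the new name up in `config_dict`, creates the experiment folder, dumps the config to `config.json`, and starts `Coadaptation(config).run()` exactly as it does for the two built-in configurations.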
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gtimer 2 | gym 3 | joblib 4 | numpy 5 | pickleshare 6 | ptyprocess 7 | pybullet 8 | pyswarms 9 | PyYAML 10 | torch 11 | torchvision 12 | xmltodict 13 | xmlutils 14 | opencv-python 15 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import rlkit.torch.pytorch_util as ptu 2 | import cv2 3 | import os 4 | from shutil import copyfile, move 5 | import time 6 | import numpy as np 7 | 8 | def move_to_cpu(): 9 | """ Set device to cpu for torch. 10 | """ 11 | ptu.set_gpu_mode(False) 12 | 13 | def move_to_cuda(config): 14 | """ Set device to CUDA and which GPU, or CPU if not set. 15 | 16 | Args: 17 | config: Dict containing the configuration. 18 | """ 19 | if config['use_gpu']: 20 | if 'cuda_device' in config: 21 | cuda_device = config['cuda_device'] 22 | else: 23 | cuda_device = 0 24 | ptu.set_gpu_mode(True, cuda_device) 25 | 26 | def copy_pop_to_ind(networks_pop, networks_ind): 27 | """ Function used to copy params from pop. networks to individual networks. 28 | 29 | The parameters of all networks in network_ind will be set to the parameters 30 | of the networks in networks_ind. 31 | 32 | Args: 33 | networks_pop: Dictonary containing the population networks. 34 | networks_ind: Dictonary containing the individual networks. These 35 | networks will be updated. 36 | """ 37 | for key in networks_pop: 38 | state_dict = networks_pop[key].state_dict() 39 | networks_ind[key].load_state_dict(state_dict) 40 | networks_ind[key].eval() 41 | 42 | def copy_policie_to_cpu(policy_cpu, policy_gpu): 43 | # not used anymore 44 | policy_dict = policy_gpu.state_dict() 45 | for key, val in policy_dict.items(): 46 | policy_dict[key] = val.cpu() 47 | policy_cpu.load_state_dict(policy_dict) 48 | policy_cpu = policy_cpu.cpu() 49 | policy_cpu.eval() 50 | return policy_cpu 51 | 52 | def copy_network(network_to, network_from, config, force_cpu=False): 53 | """ Copies networks and set them to device or cpu. 54 | 55 | Args: 56 | networks_to: Netwoks to which we want to copy (destination). 57 | networks_from: Networks from which we want to copy (source). These 58 | networks will be changed. 59 | force_cpu: Boolean, if True the desitnation nateworks will be placed on 60 | the cpu. If not the current device will be used. 
61 | """ 62 | network_from_dict = network_from.state_dict() 63 | if force_cpu: 64 | for key, val in network_from_dict.items(): 65 | network_from_dict[key] = val.cpu() 66 | else: 67 | move_to_cuda(config) 68 | network_to.load_state_dict(network_from_dict) 69 | if force_cpu: 70 | network_to = network_to.to('cpu') 71 | else: 72 | network_to.to(ptu.device) 73 | network_to.eval() 74 | return network_to 75 | 76 | class BestEpisodesVideoRecorder(object): 77 | def __init__(self, path=None, max_videos=1): 78 | self._vid_path = '/tmp/videos' if path is None else path 79 | 80 | self._folder_counter = 0 81 | self._keep_n_best = max(max_videos, 1) 82 | self._record_evy_n_episodes = 5 83 | 84 | self._frame_width = 200 85 | self._frame_height = 200 86 | self._fps_per_frame = 0 87 | 88 | self.increase_folder_counter() 89 | self._create_vid_stream() 90 | self._time_start = time.time() 91 | 92 | def _episodic_reset(self): 93 | self._current_episode_reward = 0 94 | self._did_at_least_one_step = False 95 | self._step_counter = 1 96 | 97 | def reset_recorder(self): 98 | self._episode_counter = 0 99 | self._episodic_rewards = [-float('inf')] * self._keep_n_best 100 | self._episodic_reset() 101 | 102 | 103 | def increase_folder_counter(self): 104 | self._current_vid_path = os.path.join(self._vid_path, str(self._folder_counter)) 105 | self.reset_recorder() 106 | self._folder_counter += 1 107 | 108 | def step(self, env, state, reward, done): 109 | if self._episode_counter % self._record_evy_n_episodes == 0: 110 | self._current_episode_reward += reward 111 | env.camera_adjust() 112 | frame = env.render_camera_image((self._frame_width, self._frame_height)) 113 | frame = frame * 255 114 | frame = frame.astype(np.uint8) 115 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 116 | self._vid_writer.write(frame) 117 | self._did_at_least_one_step = True 118 | proc_time = (time.time() - self._time_start)*1000 119 | proc_time = 1000/proc_time 120 | self._time_start = time.time() 121 | self._fps_per_frame += proc_time 122 | self._step_counter += 1 123 | 124 | def _do_video_file_rotation(self): 125 | for idx, elem in enumerate(self._episodic_rewards): 126 | if idx > 1: 127 | try: 128 | move(os.path.join(self._current_vid_path, 'video_{}.avi'.format(idx-1)), os.path.join(self._current_vid_path, 'video_{}.avi'.format(idx-2))) 129 | except: 130 | pass 131 | if self._current_episode_reward < elem: 132 | self._episodic_rewards = self._episodic_rewards[1:idx] + [self._current_episode_reward] + self._episodic_rewards[idx:] 133 | copyfile(os.path.join(self._current_vid_path, 'current_video.avi'), os.path.join(self._current_vid_path, 'video_{}.avi'.format(idx-1))) 134 | break 135 | # Will only be true in last iteration and only be hit if last element is to be moved 136 | if idx == len(self._episodic_rewards)-1: 137 | try: 138 | move(os.path.join(self._current_vid_path, 'video_{}.avi'.format(idx)), os.path.join(self._current_vid_path, 'video_{}.avi'.format(idx-1))) 139 | except: 140 | pass 141 | self._episodic_rewards = self._episodic_rewards[1:] + [self._current_episode_reward] 142 | copyfile(os.path.join(self._current_vid_path, 'current_video.avi'), os.path.join(self._current_vid_path, 'video_{}.avi'.format(idx))) 143 | 144 | 145 | def reset(self, env, state, reward, done): 146 | # final processing of data from previous episode 147 | if self._episode_counter % self._record_evy_n_episodes == 0: 148 | env.camera_adjust() 149 | self._vid_writer.release() 150 | if not os.path.exists(self._current_vid_path): 151 | 
os.makedirs(self._current_vid_path) 152 | if self._did_at_least_one_step and min(self._episodic_rewards) < self._current_episode_reward: 153 | self._do_video_file_rotation() 154 | print('Average FPS of last episode: {}'.format(self._fps_per_frame/self._step_counter)) 155 | 156 | self._episode_counter += 1 157 | self._episodic_reset() 158 | # set up everything for this episode if we record 159 | if self._episode_counter % self._record_evy_n_episodes == 0: 160 | self._create_vid_stream() 161 | frame = env.render_camera_image((self._frame_width, self._frame_height)) 162 | frame = frame * 255 163 | frame = frame.astype(np.uint8) 164 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 165 | self._vid_writer.write(frame) 166 | 167 | self._time_start = time.time() 168 | self._fps_per_frame = 0 169 | self._step_counter = 1 170 | 171 | def _create_vid_stream(self): 172 | if not os.path.exists(self._current_vid_path): 173 | os.makedirs(self._current_vid_path) 174 | self._vid_writer = cv2.VideoWriter(os.path.join(self._current_vid_path, 'current_video.avi'), 175 | cv2.VideoWriter_fourcc('M','J','P','G'), 30, (self._frame_width, self._frame_height)) 176 | --------------------------------------------------------------------------------
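To complement the data-logging description in the README and the `save_logged_data` method in `coadapt.py`, the following sketch shows one way to load a single per-design log file. It is not part of the repository; the file path is a placeholder and `load_design_log` is a hypothetical helper.

```python
# Minimal sketch (not part of the repository) for reading one data_design_<n>.csv file
# written by Coadaptation.save_logged_data(). The path below is a placeholder.
import csv

import numpy as np


def load_design_log(path):
    """Return (design_type, design_vector, episode_rewards) for one design."""
    with open(path, 'r') as fd:
        rows = list(csv.reader(fd))
    design_type = rows[0][1]  # 'Initial', 'Optimized' or 'Random'
    design = np.array([float(v) for v in rows[1]])   # design parameters (eta)
    rewards = np.array([float(v) for v in rows[2]])  # cumulative reward per episode
    return design_type, design, rewards


design_type, design, rewards = load_design_log(
    'data_exp_sac_pso_batch/<experiment_folder>/data_design_1.csv')
print(design_type, design, rewards.max())
```

This is roughly what `read_file` in `data_notebook.ipynb` does before aggregating the per-design maxima over all designs and experiment folders for plotting.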