├── .gitignore ├── IO_dataset_torch.py ├── IO_evaluator.py ├── IO_rt1_demo.ipynb ├── IO_trainer_torch.py ├── LICENSE ├── README.md ├── __init__.py ├── img └── wechatgroup.jpg ├── maruya24_rt1 ├── .gitignore ├── LICENSE ├── README.md ├── film_efficientnet │ ├── __init__.py │ ├── efficientnet_checkpoints │ │ ├── efficientnetb3.pth │ │ ├── efficientnetb3_notop.pth │ │ └── imagenet_classes.json │ ├── film_conditioning_layer.py │ ├── film_efficientnet_encoder.py │ ├── film_efficientnet_encoder_test.py │ ├── preprocessors.py │ ├── pretrained_efficientnet_encoder.py │ └── pretrained_efficientnet_encoder_test.py ├── tokenizers │ ├── __init__.py │ ├── action_tokenizer.py │ ├── action_tokenizer_test.py │ ├── image_tokenizer.py │ ├── image_tokenizer_test.py │ ├── token_learner.py │ ├── token_learner_test.py │ └── utils.py ├── transformer.py ├── transformer_network.py ├── transformer_network_test.py ├── transformer_network_test_set_up.py └── transformer_test.py ├── multi.sh ├── requirements.txt ├── robots.py ├── sim_env.py ├── train_config.json ├── urdf └── cube │ ├── cube.obj │ ├── cube.png │ └── cube.urdf ├── util ├── __init__.py └── misc.py └── vis.mp4 /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__ -------------------------------------------------------------------------------- /IO_dataset_torch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | from PIL import Image 5 | import pandas as pd 6 | import numpy as np 7 | 8 | import torch 9 | from torch.utils.data import Dataset, DataLoader 10 | 11 | from tqdm import tqdm as tqdm 12 | 13 | 14 | def build_dataset( 15 | data_path, 16 | time_sequence_length=6, 17 | predicting_next_ts=True, 18 | num_train_episode=200, 19 | num_val_episode=100, 20 | cam_view=["front"], 21 | language_embedding_size=512, 22 | ): 23 | """ 24 | This function is for building the training and validation dataset 25 | 26 | Parameters: 27 | - data_path(str): locates the path where the dataset is stored 28 | the dataset path should have the following file structures: 29 | - [robotname]_[taskname] 30 | - [cam_view_0] 31 | - data_000 32 | - rgb # where those image stored 33 | - image_001.png 34 | - image_002.png 35 | - ... 36 | - results.csv # robot actions stored 37 | - results_raw.csv # joint and target object position stored 38 | - data_001 39 | - ... 40 | - [cam_view_1] 41 | - data_000 42 | - data_001 43 | - ... 44 | - ... 45 | - time_sequence_length(int) : number of history length input for RT-1 model, 46 | 6 means current frame image and past 5 frames of images will be packed and input to RT-1 47 | - predicting_next_ts(bool) : in our dataset's results.csv and results_raw.csv, we stored current frame's action and joint status. 48 | if we want to predict next frame's action, this option needs to be True and result in the 1 step offset reading on csv files 49 | this differs between the samplings method of different dataset. 50 | - num_train_episode(int) : specifies numbers of training episodes 51 | - num_train_episode(int) : specifies numbers of validation episodes 52 | - cam_view(list of strs) : camera views used for training. 
53 | 54 | Returns: 55 | - train_dataset(torch.utils.data.Dataset) 56 | - val_dataset(torch.utils.data.Dataset) 57 | """ 58 | 59 | with open(os.path.join(data_path, cam_view[0], "dataset_info.json"), "r") as f: 60 | info = json.load(f) 61 | episode_length = info["episode_length"] 62 | episode_dirs = sorted(glob.glob(data_path + "/" + cam_view[0] + "/*/")) 63 | assert len(episode_dirs) == len( 64 | episode_length 65 | ), "length of episode directories and episode length not equal, check dataset's dataset_info.json" 66 | perm_indice = torch.randperm(len(episode_dirs)).tolist() 67 | dirs_lengths = dict( 68 | episode_dirs=np.array(episode_dirs)[perm_indice], 69 | episode_length=np.array(episode_length)[perm_indice], 70 | ) 71 | train_episode_dirs = dirs_lengths["episode_dirs"][:num_train_episode] 72 | train_episode_length = dirs_lengths["episode_length"][:num_train_episode] 73 | val_episode_dirs = dirs_lengths["episode_dirs"][ 74 | num_train_episode : num_train_episode + num_val_episode 75 | ] 76 | val_episode_length = dirs_lengths["episode_length"][ 77 | num_train_episode : num_train_episode + num_val_episode 78 | ] 79 | 80 | train_dataset = IODataset( 81 | episode_dirs=train_episode_dirs, 82 | episode_length=train_episode_length, 83 | time_sequence_length=time_sequence_length, 84 | predicting_next_ts=predicting_next_ts, 85 | cam_view=cam_view, 86 | language_embedding_size=language_embedding_size, 87 | ) 88 | val_dataset = IODataset( 89 | episode_dirs=val_episode_dirs, 90 | episode_length=val_episode_length, 91 | time_sequence_length=time_sequence_length, 92 | predicting_next_ts=predicting_next_ts, 93 | cam_view=cam_view, 94 | language_embedding_size=language_embedding_size, 95 | ) 96 | return train_dataset, val_dataset 97 | 98 | 99 | class IODataset(Dataset): 100 | def __init__( 101 | self, 102 | episode_dirs, 103 | episode_length, 104 | time_sequence_length=6, 105 | predicting_next_ts=True, 106 | cam_view=["front"], 107 | robot_dof=9, 108 | language_embedding_size=512, 109 | ): 110 | self._cam_view = cam_view 111 | self.predicting_next_ts = predicting_next_ts 112 | self._time_sequence_length = time_sequence_length 113 | self._episode_length = episode_length 114 | self.querys = self.generate_history_steps(episode_length) 115 | self._episode_dirs = episode_dirs 116 | self.keys_image = self.generate_fn_lists(self._episode_dirs) 117 | self.values, self.num_zero_history_list = self.organize_file_names() 118 | self._robot_dof = robot_dof 119 | self._language_embedding_size = language_embedding_size 120 | 121 | def generate_fn_lists(self, episode_dirs): 122 | """ 123 | This function globs all the image path in the dataset 124 | Parameters: 125 | - episode_dirs(list of strs): directories where image is stored, etc: 126 | - [robotname]_[taskname] 127 | - [cam_view_0] 128 | - data_000 129 | - data_001 130 | - data_002 131 | - ... 
132 | Returns: 133 | - keys(list of strs): all globbed image filename in a list 134 | """ 135 | keys = [] 136 | for ed in episode_dirs: 137 | image_files = sorted(glob.glob(f"{ed}rgb/*.png")) 138 | keys.append(image_files) 139 | return keys 140 | 141 | def generate_history_steps(self, episode_length): 142 | """ 143 | This function generates the step for current frame and history frames 144 | Parameters: 145 | - episode_length(list of int): number of episode lengths for each episode 146 | Returns: 147 | - keys(list of tensors): history steps for each data 148 | """ 149 | querys = [] 150 | for el in episode_length: 151 | q = torch.cat( 152 | ( 153 | [ 154 | torch.arange(el)[:, None] - i 155 | for i in range(self._time_sequence_length) 156 | ] 157 | ), 158 | dim=1, 159 | ) 160 | q[q < 0] = -1 161 | querys.append(q.flip(1)) 162 | return querys 163 | 164 | def organize_file_names(self): 165 | """ 166 | This function generates the infor for each data, including how many zeros were padded 167 | data's episode directory, image filenames, and all the other parameters for data 168 | Parameters: 169 | - 170 | Returns: 171 | - values(list): each value including 172 | - num_zero_history: when we read at initial frames of a episode, it doesn't have history, 173 | then we need to pad zeros to make sure these aligns to data with history frames. 174 | this number specified how many frames of zero is padded 175 | - episode_dir: the episode directory where this data is stored 176 | - img_fns = img_fns: the images this data should read 177 | - query_index = index of this data in this episode 178 | - episode_length = total length of this episode 179 | """ 180 | values = [] 181 | num_zero_history_list = [] 182 | for i, (query, key_img, ed) in enumerate( 183 | zip(self.querys, self.keys_image, self._episode_dirs) 184 | ): 185 | for q in query: 186 | img_fns = [] 187 | for img_idx in q: 188 | img_fns.append(key_img[img_idx] if img_idx >= 0 else None) 189 | num_zero_history = (q < 0).sum() 190 | num_zero_history_list.append(int(num_zero_history)) 191 | values.append( 192 | dict( 193 | num_zero_history=num_zero_history, 194 | episode_dir=ed, 195 | img_fns=img_fns, 196 | query_index=q, 197 | episode_length=self._episode_length[i], 198 | ) 199 | ) 200 | return values, num_zero_history_list 201 | 202 | def __len__(self): 203 | return len(self.values) 204 | 205 | def __getitem__(self, idx): 206 | value = self.values[idx] 207 | img = self.get_image(value["img_fns"]) 208 | lang = self.get_language_instruction() 209 | ee_pos_cmd, ee_rot_cmd, gripper_cmd, joint, tar_obj_pose = self.get_ee_data( 210 | value["episode_dir"], value["query_index"], value["num_zero_history"] 211 | ) 212 | terminate_episode = self.get_episode_status( 213 | value["episode_length"], value["query_index"], value["num_zero_history"] 214 | ) 215 | sample_obs = { 216 | "image": img.float().permute(0, 3, 1, 2), 217 | # we permute the channel dimension to the second dimension to cope with rt1's convolution layers 218 | "natural_language_embedding": torch.tensor(lang).float(), 219 | "joint_position": torch.tensor(joint).float(), 220 | "tar_obj_pose": torch.tensor(tar_obj_pose).float(), 221 | } 222 | sample_action = { 223 | "world_vector": torch.tensor(ee_pos_cmd).float(), 224 | "rotation_delta": torch.tensor(ee_rot_cmd).float(), 225 | "gripper_closedness_action": torch.tensor(gripper_cmd).float(), 226 | "terminate_episode": torch.tensor(terminate_episode.argmax(-1)).long(), 227 | } 228 | 229 | return sample_obs, sample_action 230 | 231 | def 
get_image(self, img_fns): 232 | """ 233 | This function generates the step for current frame and history frames 234 | Parameters: 235 | - episode_length(list of int): number of episode lengths for each episode 236 | Returns: 237 | - keys(list of tensors): history steps for each data 238 | """ 239 | imgs = [] 240 | for img_fn in img_fns: 241 | img_multi_view = [] 242 | for c_v in self._cam_view: 243 | img_multi_view.append( 244 | np.array(Image.open(img_fn.replace(self._cam_view[0], c_v))) 245 | if img_fn != None 246 | else np.zeros_like(Image.open(img_fns[-1])) 247 | ) 248 | img = np.concatenate(img_multi_view, axis=0) 249 | imgs.append(torch.from_numpy(img[:, :, :3])) 250 | return torch.stack(imgs, dim=0) / 255.0 251 | 252 | def get_ee_data(self, episode_dir, query_index, pad_step_num): 253 | """ 254 | This function reads the csvs for ground truth robot actions, robot joint status and target object position and orientation: 255 | Parameters: 256 | - episode_dir(str): directory where the results.csv and results_raw.csv is stored 257 | - query_index(tensor): index where exact data is read, padded zeros has a special index of -1 258 | - pad_step_num(int): how many timestep of zeros is padded 259 | Returns: 260 | - ee_pos_cmd(np.array): stores the ground truth command for robot move in position(x, y, z) 261 | - ee_rot_cmd(np.array): stores the ground truth command for robot move in rotation(rx, ry, rz) 262 | - gripper_cmd(np.array): stores the ground truth command for robot's gripper open or close 263 | - joint(np.array): stores the robot's joint status, which can be used to calculate ee's position 264 | - tar_obj_pose: stores the target object's position and orientation (x, y, z, rx, ry, rz) 265 | """ 266 | start_idx = query_index[(query_index > -1).nonzero()[0, 0]] 267 | end_idx = query_index[-1] 268 | visual_data_filename = f"{episode_dir}result.csv" 269 | raw_data = pd.read_csv(visual_data_filename) 270 | visual_data_filename_raw = f"{episode_dir}result_raw.csv" 271 | raw_raw_data = pd.read_csv(visual_data_filename_raw) 272 | if self.predicting_next_ts: 273 | """ 274 | if predicting next timestep's results, then we shift first column to last column 275 | """ 276 | first_row = raw_data.iloc[0] 277 | raw_data = raw_data.iloc[1:] 278 | raw_data = pd.concat([raw_data, first_row.to_frame().T], ignore_index=True) 279 | first_row = raw_raw_data.iloc[0] 280 | raw_raw_data = raw_raw_data.iloc[1:] 281 | raw_raw_data = pd.concat( 282 | [raw_raw_data, first_row.to_frame().T], ignore_index=True 283 | ) 284 | # position has 3 dimensions [x, y, z] 285 | ee_pos_cmd = np.zeros([pad_step_num, 3]) 286 | # rotation has 3 dimensions [rx, ry, rz] 287 | ee_rot_cmd = np.zeros([pad_step_num, 3]) 288 | # gripper has 1 dimension which controls open/close of the gripper 289 | gripper_cmd = np.zeros([pad_step_num, 1]) 290 | # we are using Franka Panda robot, whose has 9 dofs of joint 291 | joint = np.zeros([pad_step_num, 9]) 292 | # tar_obj_pose is 7 dimension [x,y,z,rx,ry,rz,w] 293 | # however, in this version we are not using tar_obj_pose 294 | tar_obj_pose = np.zeros([pad_step_num, 7]) 295 | ee_pos_cmd = np.vstack( 296 | ( 297 | ee_pos_cmd, 298 | raw_data.loc[ 299 | start_idx:end_idx, 300 | [f"ee_command_position_{ax}" for ax in ["x", "y", "z"]], 301 | ].to_numpy(), 302 | ) 303 | ) 304 | ee_rot_cmd = np.vstack( 305 | ( 306 | ee_rot_cmd, 307 | raw_data.loc[ 308 | start_idx:end_idx, 309 | [f"ee_command_rotation_{ax}" for ax in ["x", "y", "z"]], 310 | ].to_numpy(), 311 | ) 312 | ) 313 | joint = np.vstack( 314 | ( 315 
| joint, 316 | raw_raw_data.loc[ 317 | start_idx:end_idx, 318 | [f"joint_{str(ax)}" for ax in range(self._robot_dof)], 319 | ].to_numpy(), 320 | ) 321 | ) 322 | tar_obj_pose = np.vstack( 323 | ( 324 | tar_obj_pose, 325 | raw_raw_data.loc[ 326 | start_idx:end_idx, 327 | [ 328 | f"tar_obj_pose_{ax}" 329 | for ax in ["x", "y", "z", "rx", "ry", "rz", "rw"] 330 | ], 331 | ].to_numpy(), 332 | ) 333 | ) 334 | gripper_data = ( 335 | raw_data.loc[start_idx:end_idx, "gripper_closedness_commanded"] 336 | .to_numpy() 337 | .reshape(-1, 1) 338 | ) 339 | gripper_cmd = np.vstack((gripper_cmd, gripper_data)) 340 | return ee_pos_cmd, ee_rot_cmd, gripper_cmd, joint, tar_obj_pose 341 | 342 | def get_language_instruction(self): 343 | """ 344 | since we are only training single-task model, this language embedding is set as constant. 345 | modify it to language instructions if multi-task model is training. 346 | it seems that google directly loads embedded language instruction from its language model 347 | this results in our loading a language embedding instead of language sentence 348 | """ 349 | return np.zeros([self._time_sequence_length, self._language_embedding_size]) 350 | 351 | def get_episode_status(self, episode_length, query_index, pad_step_num): 352 | """ 353 | This function is to find whether current frame and history frame is start or middle or end of the episode: 354 | Parameters: 355 | - episode_length(int): length of current episode 356 | - query_index(tensor): index where exact data is read, padded zeros has a special index of -1 357 | - pad_step_num(int): how many timestep of zeros is padded 358 | Returns: 359 | - episode_status(np.array): specifies status(start, middle or end) of each frame in history 360 | """ 361 | start_idx = query_index[(query_index > -1).nonzero()[0, 0]] 362 | end_idx = query_index[-1] 363 | episode_status = np.zeros([pad_step_num, 4], dtype=np.int32) 364 | episode_status[:, -1] = 1 365 | for i in range(start_idx, end_idx + 1): 366 | status = np.array( 367 | [i == 0, i not in [0, episode_length - 2], i == episode_length - 2, 0], 368 | dtype=np.int32, 369 | ) 370 | episode_status = np.vstack((episode_status, status)) 371 | if pad_step_num > 0: 372 | episode_status[pad_step_num] = np.array([1, 0, 0, 0]) 373 | return episode_status 374 | 375 | 376 | def load_config_from_json(json_path): 377 | with open(json_path, "r") as f: 378 | config = json.load(f) 379 | return config 380 | 381 | 382 | if __name__ == "__main__": 383 | args = load_config_from_json("train_config.json") 384 | dataset, _ = build_dataset( 385 | data_path=args["data_path"], 386 | time_sequence_length=args["time_sequence_length"], 387 | predicting_next_ts=args["predicting_next_ts"], 388 | num_train_episode=args["num_train_episode"], 389 | num_val_episode=args["num_val_episode"], 390 | cam_view=args["cam_view"], 391 | language_embedding_size=args["network_configs"]["language_embedding_size"], 392 | ) 393 | # dataset = dataset[:100] 394 | 395 | wv_x = [] 396 | wv_y = [] 397 | wv_z = [] 398 | rd_x = [] 399 | rd_y = [] 400 | rd_z = [] 401 | from maruya24_rt1.tokenizers import action_tokenizer 402 | from gym import spaces 403 | from collections import OrderedDict 404 | import matplotlib.pyplot as plt 405 | 406 | output_tensor_space = spaces.Dict( 407 | OrderedDict( 408 | [ 409 | ("terminate_episode", spaces.Discrete(4)), 410 | ( 411 | "world_vector", 412 | spaces.Box(low=-0.025, high=0.025, shape=(3,), dtype=np.float32), 413 | ), 414 | ( 415 | "rotation_delta", 416 | spaces.Box( 417 | low=-np.pi / 20, 418 | 
high=np.pi / 20, 419 | shape=(3,), 420 | dtype=np.float32, 421 | ), 422 | ), 423 | ( 424 | "gripper_closedness_action", 425 | spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32), 426 | ), 427 | ] 428 | ) 429 | ) 430 | at = action_tokenizer.RT1ActionTokenizer( 431 | output_tensor_space, vocab_size=256 # action space 432 | ) 433 | dataloader = DataLoader(dataset, batch_size=64, num_workers=64) 434 | for i, batch in tqdm(enumerate(dataloader), total=len(dataset) // 64): 435 | batch = at.tokenize(batch[1]) 436 | for i in range(batch.size(0)): 437 | wv_x.append(int(batch[i, -1, 1])) 438 | wv_y.append(int(batch[i, -1, 2])) 439 | wv_z.append(int(batch[i, -1, 3])) 440 | rd_x.append(int(batch[i, -1, 4])) 441 | rd_y.append(int(batch[i, -1, 5])) 442 | rd_z.append(int(batch[i, -1, 6])) 443 | # print(batch) 444 | plt.subplot(2, 3, 1) 445 | plt.title("world_vector_x") 446 | plt.hist(wv_x, bins=256, range=(0, 256)) 447 | plt.xlim(0, 256) 448 | plt.subplot(2, 3, 2) 449 | plt.title("world_vector_y") 450 | plt.hist(wv_y, bins=256, range=(0, 256)) 451 | plt.xlim(0, 256) 452 | plt.subplot(2, 3, 3) 453 | plt.title("world_vector_z") 454 | plt.hist(wv_z, bins=256, range=(0, 256)) 455 | plt.xlim(0, 256) 456 | plt.subplot(2, 3, 4) 457 | plt.title("rotation_delta_x") 458 | plt.hist(rd_x, bins=256, range=(0, 256)) 459 | plt.xlim(0, 256) 460 | plt.subplot(2, 3, 5) 461 | plt.title("rotation_delta_y") 462 | plt.hist(rd_y, bins=256, range=(0, 256)) 463 | plt.xlim(0, 256) 464 | plt.subplot(2, 3, 6) 465 | plt.title("rotation_delta_z") 466 | plt.hist(rd_z, bins=256, range=(0, 256)) 467 | plt.xlim(0, 256) 468 | plt.show() 469 | -------------------------------------------------------------------------------- /IO_evaluator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet as p 3 | import pybullet_data as pdata 4 | import numpy as np 5 | import argparse 6 | import random 7 | import time 8 | import os 9 | import cv2 10 | from sim_env import get_cam_view_img 11 | from skvideo.io import vwrite 12 | import torch 13 | from gym import spaces 14 | from collections import OrderedDict 15 | import copy 16 | from func_timeout import func_set_timeout 17 | import func_timeout 18 | from tqdm import trange 19 | import json 20 | 21 | from maruya24_rt1.transformer_network import TransformerNetwork 22 | from maruya24_rt1.transformer_network_test_set_up import state_space_list 23 | from maruya24_rt1.tokenizers.utils import batched_space_sampler 24 | from maruya24_rt1.tokenizers.utils import np_to_tensor 25 | from sim_env import * 26 | import util.misc as utils 27 | import torch.nn.functional as F 28 | from torchvision import transforms 29 | 30 | 31 | class SimTester: 32 | def __init__(self, task_name, max_step=200, device="cuda", sim_interval=0.005): 33 | p.connect(p.DIRECT) 34 | p.configureDebugVisualizer(p.COV_ENABLE_GUI, 0) 35 | p.setPhysicsEngineParameter(maxNumCmdPer1ms=1000) 36 | p.resetDebugVisualizerCamera( 37 | cameraDistance=1.5, 38 | cameraPitch=-20, 39 | cameraYaw=180, 40 | cameraTargetPosition=[0, 0, 0.6], 41 | ) 42 | p.setAdditionalSearchPath(pdata.getDataPath()) 43 | p.setGravity(0, 0, -9.8) 44 | self.task_env = globals()[TASKS[task_name]]() 45 | self.max_step = max_step 46 | self.device = torch.device(device) 47 | self.sim_interval = sim_interval 48 | 49 | def test_step(self, delta_ee_pos, delta_ee_rot, gripper_cmd, cam_views): 50 | try: 51 | self.execute_action(delta_ee_pos, delta_ee_rot, gripper_cmd, cam_views) 52 | except 
func_timeout.exceptions.FunctionTimedOut: 53 | # print("robot stuck") 54 | pass 55 | self.update_obs(cam_views) 56 | 57 | @func_set_timeout( 58 | 0.5 59 | ) # Timeout set to handle cases where the robot may get stuck, allowing the next action to execute 60 | def execute_action( 61 | self, delta_ee_pos, delta_ee_rot, gripper_cmd, cam_views, relative=True 62 | ): 63 | # Calculate the new end-effector pose based on relative or absolute movements 64 | if relative: 65 | # Obtain the current end-effector pose 66 | last_ee_pose = p.getLinkState( 67 | self.task_env.robot.robot_id, self.task_env.robot.ee_index 68 | )[0:2] 69 | 70 | # Calculate the target end-effector pose based on the relative movement 71 | target_ee_pose = p.multiplyTransforms( 72 | last_ee_pose[0], 73 | last_ee_pose[1], 74 | delta_ee_pos, 75 | p.getQuaternionFromEuler(delta_ee_rot), 76 | ) 77 | else: 78 | # Handling for absolute movements, if implemented 79 | raise NotImplementedError("Absolute movement handling not implemented yet") 80 | 81 | # Calculate inverse kinematics to obtain joint positions for the target end-effector pose 82 | tar_j_pos = self.task_env.robot.calc_ik([target_ee_pose[0], target_ee_pose[1]]) 83 | 84 | # Move the robot's arm to the calculated joint positions 85 | self.task_env.robot.move_arm(tar_j_pos) 86 | 87 | # Control the gripper based on the provided command 88 | self.task_env.robot.move_gripper(gripper_cmd[0]) 89 | 90 | # Ensure the robot reaches the target joint positions within a threshold 91 | while not (self.task_env.robot.is_j_arrived(tar_j_pos, threshold=1e-3)): 92 | p.stepSimulation() # Step the simulation 93 | time.sleep(self.sim_interval) # Wait for the simulation to progress 94 | 95 | def reset_tester(self, cam_views): 96 | self.task_env.reset_env() 97 | self.reset_obs(cam_views) 98 | self.gripper_triggered = False 99 | self.episode_succ = [False, False] 100 | 101 | def reset_obs(self, cam_views): 102 | self.imgs = [torch.zeros(1, 3, 256 * len(cam_views), 320)] * self.time_sequence_length 103 | self.joints = [torch.zeros(1, 9)] * self.time_sequence_length 104 | 105 | def update_obs(self, cam_views): 106 | # Obtain observation images from specified camera views 107 | img = self.get_obs_img(cam_views) 108 | 109 | # Remove the oldest image from the image collection and add the new observation image 110 | self.imgs.pop(0) 111 | self.imgs.append(torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0)) 112 | 113 | # Retrieve the current joint status 114 | joint = self.get_joint_status() 115 | 116 | # Remove the oldest joint status and add the current joint status to the collection 117 | self.joints.pop(0) 118 | self.joints.append(joint.unsqueeze(0)) 119 | 120 | def get_joint_status(self): 121 | """ 122 | Obtain the current joint status of the robot. 123 | 124 | Returns: 125 | - torch.tensor: Tensor containing the current joint states of the robot. 126 | """ 127 | return torch.tensor( 128 | [ 129 | s[0] 130 | for s in p.getJointStates( 131 | self.task_env.robot.robot_id, 132 | list(range(self.task_env.robot.arm_dof)) + [9, 10], 133 | ) 134 | ] 135 | ) 136 | 137 | def get_obs_img(self, cam_views): 138 | """ 139 | Generate observation images from specified camera views. 140 | 141 | Args: 142 | - cam_views (list): List of camera views. 143 | 144 | Returns: 145 | - np.ndarray: Observation image array. 
146 | """ 147 | imgs = [] 148 | for cview in cam_views: 149 | imgs.append( 150 | get_cam_view_img( 151 | cview, self.task_env.robot.robot_id, self.task_env.robot.ee_index 152 | ) 153 | ) 154 | cur_img = np.concatenate(imgs, axis=0) 155 | return cur_img / 255.0 156 | 157 | def check_episode_succ(self): 158 | """ 159 | Check if the episode is successful based on end-effector and target object positions. 160 | 161 | Returns: 162 | - float: Difference in z-axis positions between the end-effector and the target object. 163 | """ 164 | ee_z_pos = p.getLinkState( 165 | self.task_env.robot.robot_id, self.task_env.robot.ee_index 166 | )[0][2] 167 | tar_obj_z_pos = self.task_env.tar_obj_pose[0][2] 168 | contact_points = p.getContactPoints( 169 | self.task_env.robot.robot_id, self.task_env.tar_obj 170 | ) 171 | if abs(ee_z_pos - tar_obj_z_pos) < 0.035 and len(contact_points) > 0: 172 | self.episode_succ[0] = True 173 | if ee_z_pos - tar_obj_z_pos > 0.08 and len(contact_points) > 0: 174 | self.episode_succ[1] = True 175 | return ee_z_pos - tar_obj_z_pos 176 | 177 | 178 | def load_config_from_json(json_path): 179 | with open(json_path, "r") as f: 180 | config = json.load(f) 181 | return config 182 | 183 | 184 | class Evaluator: 185 | def __init__(self, task_name) -> None: 186 | self.sim_tester = SimTester(task_name) 187 | self.pandaEndEffectorIndex = 11 188 | args = load_config_from_json("train_config.json") 189 | self.args = args 190 | 191 | def write_results(self, fn, episode_succ): 192 | with open(fn, "a") as f: 193 | f.write(str(int(episode_succ[0])) + str(int(episode_succ[1])) + "\n") 194 | f.close() 195 | 196 | def calc_fk(self, obs): 197 | """ 198 | get end effector's position and orientation in world coordinate system 199 | Parameter: 200 | - obs(dict): observations with joints status 201 | Returns: 202 | - obs(dict): position and orientation will be stored in obs 203 | """ 204 | ee_position, ee_orientation = [], [] 205 | for joint in obs["joint_position"]: 206 | position, orientation = [], [] 207 | for i in range(len(joint)): 208 | p.resetJointStatesMultiDof( 209 | self.sim_tester.task_env.robot.robot_id, 210 | range(9), 211 | [[pos] for pos in joint[i]], 212 | ) 213 | pos, orn = p.getLinkState(self.sim_tester.task_env.robot.robot_id, 11)[ 214 | :2 215 | ] 216 | pos = list(pos) 217 | pos.append(0) 218 | position.append(torch.FloatTensor(pos)) 219 | orientation.append(torch.FloatTensor(orn)) 220 | ee_position.append(torch.stack(position)) 221 | ee_orientation.append(torch.stack(orientation)) 222 | obs["position"] = torch.stack(ee_position).to(self.device) 223 | obs["orientation"] = torch.stack(ee_orientation).to(self.device) 224 | del obs["joint_position"] 225 | return obs 226 | 227 | def inference(self, network, imgs, joints): 228 | """ 229 | Perform inference using the provided neural network on input observations. 230 | 231 | Args: 232 | - network (NeuralNetwork): Neural network model for inference. 233 | - imgs (list): List of observation images. 234 | - joints (list): List of joint positions. 235 | 236 | Returns: 237 | - list: List containing the inferred actions related to termination, world vector, rotation delta, and gripper closedness. 
238 | """ 239 | # Batched space sampling for the network state 240 | network_state = batched_space_sampler(network._state_space, batch_size=1) 241 | network_state = np_to_tensor(network_state) # Convert np.ndarray to tensor 242 | for k, v in network_state.items(): 243 | network_state[k] = torch.zeros_like(v).to(self.device) 244 | 245 | output_actions = [] 246 | obs = dict() 247 | obs["image"] = torch.stack(imgs, dim=1).to( 248 | self.device 249 | ) # Stack images on the device 250 | obs["natural_language_embedding"] = torch.zeros( 251 | 1, network._time_sequence_length, network._language_embedding_size 252 | ).to( 253 | self.device 254 | ) # Prepare language embedding tensor 255 | 256 | # Stack joint positions and adjust dimensions 257 | obs["joint_position"] = torch.stack(joints).permute(1, 0, 2) 258 | 259 | # If the network uses proprioception, calculate forward kinematics 260 | if network.using_proprioception: 261 | obs = self.calc_fk(obs) 262 | 263 | with torch.no_grad(): 264 | for i_ts in range(network._time_sequence_length): 265 | ob = utils.retrieve_single_timestep(obs, i_ts) 266 | output_action, network_state = network(ob, network_state) 267 | output_actions.append(output_action) 268 | 269 | # Retrieve the final inferred action 270 | action = output_actions[-1] 271 | 272 | # Move the action to the CPU and convert it to a list for return 273 | action = utils.dict_to_device(action, torch.device("cpu")) 274 | return [ 275 | action["terminate_episode"].flatten().tolist(), 276 | action["world_vector"].flatten().tolist(), 277 | action["rotation_delta"].flatten().tolist(), 278 | action["gripper_closedness_action"].flatten().tolist(), 279 | ] 280 | 281 | def val( 282 | self, 283 | resume_from_checkpoint, 284 | ckpt_dir, 285 | proc_name, 286 | cam_views, 287 | test_num, 288 | results_fn, 289 | epoch, 290 | gpu_name, 291 | using_pos_and_orn, 292 | tar_obj_pos_rot, 293 | ): 294 | self.resume_from_checkpoint = resume_from_checkpoint 295 | 296 | checkpoint = torch.load(resume_from_checkpoint, map_location="cpu") 297 | self._action_space = checkpoint["action_space"] 298 | network_configs = self.args["network_configs"] 299 | network_configs["time_sequence_length"] = self.args["time_sequence_length"] 300 | network_configs["num_encoders"] = len(self.args["cam_view"]) 301 | network_configs["using_proprioception"] = self.args["using_proprioception"] 302 | network_configs["token_embedding_size"] = network_configs[ 303 | "token_embedding_size_per_image" 304 | ] * len(self.args["cam_view"]) 305 | del network_configs["token_embedding_size_per_image"] 306 | network_configs["input_tensor_space"] = state_space_list()[0] 307 | network_configs["output_tensor_space"] = self._action_space 308 | self.network = TransformerNetwork(**network_configs) 309 | setattr(self.sim_tester, "time_sequence_length", self.network._time_sequence_length) 310 | try: 311 | local_rank = os.environ["LOCAL_RANK"] 312 | torch.cuda.set_device(int(local_rank)) 313 | except: 314 | pass 315 | self.device = torch.device("cuda") 316 | self.network.to(self.device) 317 | self.network.load_state_dict(checkpoint["model_state_dict"]) 318 | self.network.eval() 319 | try: 320 | os.mkdir(os.path.join(ckpt_dir, "vids")) 321 | except: 322 | pass 323 | lift_count = 0 324 | for idx in range(test_num): 325 | vid_fn = os.path.join( 326 | ckpt_dir, 327 | "vids", 328 | "e" 329 | + str(epoch) 330 | + "_i" 331 | + str(idx) 332 | + "_p" 333 | + str(proc_name) 334 | + "_g" 335 | + str(gpu_name) 336 | + ".mp4", 337 | ) 338 | 
self.sim_tester.reset_tester(cam_views) 339 | self.sim_tester.task_env.reset_tar_obj( 340 | tar_pos_rot=tar_obj_pos_rot, random_pos_rot=False 341 | ) 342 | time.sleep(1) 343 | self.sim_tester.update_obs(cam_views) 344 | vid = [] 345 | for _ in trange(self.sim_tester.max_step): 346 | imgs = self.sim_tester.imgs 347 | joints = self.sim_tester.joints 348 | ( 349 | terminate_episode, 350 | delta_ee_pos, 351 | delta_ee_rot, 352 | gripper_cmd, 353 | ) = self.inference(self.network, imgs, joints) 354 | self.sim_tester.test_step( 355 | delta_ee_pos, delta_ee_rot, gripper_cmd, cam_views 356 | ) 357 | distance_to_target = self.sim_tester.check_episode_succ() 358 | frame = copy.deepcopy(imgs[-1][0].permute(1, 2, 0).numpy() * 255) 359 | frame_rgb = cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_BGR2RGB) 360 | text = ( 361 | "epoch: " 362 | + str(epoch) 363 | + " dis: " 364 | + str(round(distance_to_target, 3)) 365 | ) 366 | cv2.putText( 367 | frame_rgb, text, (0, 20), cv2.FONT_ITALIC, 0.75, (0, 0, 0), 2 368 | ) 369 | if self.sim_tester.episode_succ[0] == True: 370 | cv2.putText( 371 | frame_rgb, 372 | "grabbed", 373 | (0, 40), 374 | cv2.FONT_ITALIC, 375 | 0.75, 376 | (0, 255, 0), 377 | 2, 378 | ) 379 | else: 380 | cv2.putText( 381 | frame_rgb, 382 | "not grabbed", 383 | (0, 40), 384 | cv2.FONT_ITALIC, 385 | 0.75, 386 | (0, 0, 255), 387 | 2, 388 | ) 389 | if self.sim_tester.episode_succ[1] == True: 390 | cv2.putText( 391 | frame_rgb, 392 | "lifted", 393 | (0, 60), 394 | cv2.FONT_ITALIC, 395 | 0.75, 396 | (0, 255, 0), 397 | 2, 398 | ) 399 | else: 400 | cv2.putText( 401 | frame_rgb, 402 | "not lifted", 403 | (0, 60), 404 | cv2.FONT_ITALIC, 405 | 0.75, 406 | (0, 0, 255), 407 | 2, 408 | ) 409 | frame_rgb = cv2.cvtColor(frame_rgb, cv2.COLOR_BGR2RGB) 410 | vid.append(frame_rgb) 411 | if self.sim_tester.episode_succ[1] == True: 412 | lift_count += 1 413 | if lift_count >= 5: 414 | print("lifted") 415 | break 416 | self.sim_tester.check_episode_succ() 417 | self.write_results(results_fn, self.sim_tester.episode_succ) 418 | vwrite(vid_fn, vid) 419 | 420 | 421 | def test_loss( 422 | ckpt_dir, 423 | evaluator, 424 | cam_views, 425 | model_name, 426 | proc_name, 427 | epoch, 428 | gpu_name, 429 | using_pos_and_orn, 430 | num_threads, 431 | ): 432 | cam_views = cam_views.split("_") 433 | fn = os.path.join(ckpt_dir, str(epoch) + ".txt") 434 | fn_episodes = os.path.join(ckpt_dir, str(epoch) + "_val_episodes.txt") 435 | if num_threads > 0: 436 | with open(fn_episodes, "r") as f: 437 | val_episode_dirs = f.readlines() 438 | 439 | val_episode_dir = val_episode_dirs[ 440 | int(gpu_name) * int(num_threads) + int(proc_name) 441 | ].strip() 442 | 443 | visual_data_filename_raw = f"{val_episode_dir}result_raw.csv" 444 | raw_raw_data = pd.read_csv(visual_data_filename_raw) 445 | # print(raw_raw_data.iloc[0]) 446 | tar_obj_pos_rot = [ 447 | raw_raw_data.iloc[0]["tar_obj_pose_x"], 448 | raw_raw_data.iloc[0]["tar_obj_pose_y"], 449 | raw_raw_data.iloc[0]["tar_obj_pose_rx"], 450 | raw_raw_data.iloc[0]["tar_obj_pose_ry"], 451 | raw_raw_data.iloc[0]["tar_obj_pose_rz"], 452 | raw_raw_data.iloc[0]["tar_obj_pose_rw"], 453 | ] 454 | else: 455 | tar_obj_pos_rot = [0, 0, 0, 0, 0, 0, 0] 456 | utils.set_seed() 457 | evaluator.val( 458 | os.path.join(ckpt_dir, model_name), 459 | ckpt_dir=ckpt_dir, 460 | proc_name=proc_name, 461 | cam_views=cam_views, 462 | test_num=1, 463 | results_fn=fn, 464 | epoch=epoch, 465 | gpu_name=gpu_name, 466 | using_pos_and_orn=using_pos_and_orn, 467 | tar_obj_pos_rot=tar_obj_pos_rot, 468 | ) 469 | 470 | 471 | if 
__name__ == "__main__": 472 | parser = argparse.ArgumentParser() 473 | parser.add_argument("-t", "--task", type=str, default="pick") 474 | parser.add_argument("-d", "--ckpt_dir", type=str, default="/mnt/logs_1/1700447417") 475 | parser.add_argument("-c", "--cam_views", type=str) 476 | parser.add_argument("-m", "--model_name", type=str, default="49-checkpoint.pth") 477 | parser.add_argument("-p", "--proc_name", type=int, default=200) 478 | parser.add_argument("-e", "--epoch_num", type=str, default="0") 479 | parser.add_argument("-g", "--gpu_name", type=str, default="3") 480 | parser.add_argument("-u", "--using_pos_and_orn", type=int, default=False) 481 | parser.add_argument("-n", "--num_threads", type=int, default=0) 482 | 483 | args = parser.parse_args() 484 | print(args.proc_name) 485 | # exit() 486 | task_name = args.task 487 | evaluator = Evaluator(task_name) 488 | test_loss( 489 | ckpt_dir=args.ckpt_dir, 490 | evaluator=evaluator, 491 | cam_views=args.cam_views, 492 | model_name=args.model_name, 493 | proc_name=args.proc_name, 494 | epoch=args.epoch_num, 495 | gpu_name=args.gpu_name, 496 | using_pos_and_orn=bool(args.using_pos_and_orn), 497 | num_threads=args.num_threads, 498 | ) 499 | # python IO_evaluator.py -d /mnt/logs_1/1698325798/999-checkpoint.pth -c front wrist -m 999-checkpoint.pth 500 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [2023] [IO intelligence] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reproducing RT-1 in PyTorch 2 | 3 | [![IO](https://img.shields.io/badge/io%20intelligence%20-000000)](https://io-ai.tech) 4 | 5 | This repository contains complete PyTorch implementation for **RT-1** based on the paper: [RT-1 Paper](https://arxiv.org/abs/2212.06817) and implementation of RT-1 model by **maruya24's RT-1**: [maruya24's RT-1 GitHub](https://github.com/maruya24/pytorch_robotics_transformer). 
Our implementation completes the training and validation pipelines, with an evaluation pipeline coming soon. 6 | 7 | **Try our implementation in [Colab](https://drive.google.com/file/d/18nWZ6pgy2_0fS8BjZsUiTjOFaE6WXMi3/view?usp=sharing)** 8 | 9 | ## Acknowledgements 10 | 11 | - **maruya24**: For their work on RT-1, which serves as the foundation for this implementation - [maruya24's RT-1 GitHub](https://github.com/maruya24/pytorch_robotics_transformer) 12 | - Changes to the model structure: similar to [diffusion policy](https://diffusion-policy.cs.columbia.edu/), the robot end effector's position and orientation are concatenated to the end of the token sequence before it is fed into the transformer 13 | - **detr (Facebook)**: Utilities for distributed training from DETR - [detr/util/misc.py](https://github.com/facebookresearch/detr/blob/main/util/misc.py). 14 | 15 | ## Training 16 | 17 | To train RT-1 in distributed mode, replace `[your_gpu_num]` with the number of GPUs you want to use: 18 | 19 | ```bash 20 | python -m torch.distributed.launch --nproc_per_node=[your_gpu_num] --use_env IO_trainer_torch.py 21 | ``` 22 | 23 | The training configuration is stored in `train_config.json`. 24 | 25 | ## Training Configuration 26 | 27 | The training configuration for RT-1 includes various parameters that influence the training process. Here's an overview of the key configurations: 28 | 29 | - **Mode:** Training mode, options ['train', 'eval'] 30 | - **Device:** Device for computation, options ['cpu', 'cuda'] 31 | - **Data Path:** Path to the dataset folder `[robotname]_[taskname]` 32 | - **Camera Views:** Views used in training (`front`, `wrist`, ...; see the dataset folder) 33 | - **Log Directory:** Directory to store logs 34 | - **Time Sequence Length:** Length of the time sequence (e.g., 6); RT-1 takes a history of image timesteps as part of its input, i.e., `1` frame for the current timestep and `time_sequence_length - 1` frames of history. 
35 | - **Learning Rate:** Initial learning rate 36 | - **Batch Size:** Size of each training batch 37 | - **Epochs:** Number of training epochs 38 | - **Resume:** Whether to resume training from a checkpoint 39 | - **Resume from checkpoint:** Path of the checkpoint to resume training from 40 | - **World size:** Number of GPUs you intend to use during training 41 | - **Dist url:** URL used to initialize distributed training (e.g., `"env://"`) 42 | - **Validation Interval:** Interval between validation steps 43 | - **Num train episode:** Number of training episodes used 44 | - **Num val episode:** Number of validation episodes used 45 | - **Network Configurations:** Parameters for the network architecture 46 | - **Scheduler Configurations:** Parameters for the learning rate scheduler 47 | 48 | ### Example Configuration 49 | 50 | 51 | ```json 52 | { 53 | "mode": "train", 54 | "device": "cuda", 55 | "data_path": "IO_pybullet_open_dataset/Panda_pick", 56 | "cam_view" : ["front", "wrist"], 57 | "log_dir": "/mnt/logs_1", 58 | "time_sequence_length": 6, 59 | "lr": 0.0001, 60 | "batch_size": 6, 61 | "epochs": 50, 62 | "resume": false, 63 | "resume_from_checkpoint": "", 64 | "predicting_next_ts": true, 65 | "world_size": 4, 66 | "dist_url": "env://", 67 | "val_interval" : 25, 68 | "num_val_threads": 25, 69 | "num_train_episode" : 200, 70 | "num_val_episode" : 10, 71 | "using_proprioception" : false, 72 | "network_configs": { 73 | "vocab_size" : 256, 74 | "token_embedding_size_per_image" : 512, 75 | "language_embedding_size" : 512, 76 | "num_layers" : 8, 77 | "layer_size" : 128, 78 | "num_heads" : 8, 79 | "feed_forward_size" : 512, 80 | "dropout_rate" : 0.1, 81 | "crop_size" : 236, 82 | "use_token_learner" : true 83 | }, 84 | "scheduler_configs" : { 85 | "T_0" : 10, 86 | "T_mult" : 2, 87 | "eta_min" : 1e-6, 88 | "verbose" : true 89 | } 90 | 91 | } 92 | 93 | ``` 94 | Pretrained weights trained with the settings above can be downloaded from [pretrained_weights](https://drive.google.com/uc?export=download&id=1USLqOfqYfqIrigx1hkY37SGyrLJuaPwc); set "resume_from_checkpoint" to the path of the pretrained weights and set "resume" to true to resume from that checkpoint. 95 | 96 | 97 | ## Limitations 98 | 99 | - We are currently validating the code in the PyBullet environment. Validation code will be added within a week. 100 | - Training is presently limited to the single-task setting. 101 | 102 | ## Dataset Structure 103 | 104 | Our dataset follows a specific file structure: 105 | 106 | - [robotname]_[taskname] 107 | - [cam_view_0] 108 | - data_000 109 | - rgb # Images 110 | - image_001.png 111 | - image_002.png 112 | - ... 113 | - results.csv # Robot actions 114 | - results_raw.csv # Joint and target object positions 115 | - data_001 116 | - ... 117 | - [cam_view_1] 118 | - data_000 119 | - data_001 120 | - ... 121 | - ... 122 | 123 | Similar to the lift task in [robomimic](https://robomimic.github.io/), we collected the dataset from both third-person and first-person camera perspectives. 124 | 125 | You can download our dataset, collected in PyBullet, from [IO_open_dataset](https://drive.google.com/uc?export=download&id=1RoTxnipQf2SIXqzvroDOXAleNvvNIIwZ). 
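The snippet below is a minimal sketch (not one of the repository's files) of how this dataset layout and `train_config.json` plug into `build_dataset` from `IO_dataset_torch.py`. It assumes the downloaded dataset archive has been extracted so that the config's `data_path` points at a `[robotname]_[taskname]` folder such as `Panda_pick`.

```python
# Minimal sketch: build the train/val datasets from train_config.json and inspect one batch.
# Assumes the dataset archive has been extracted to the "data_path" given in the config.
import json

from torch.utils.data import DataLoader

from IO_dataset_torch import build_dataset

with open("train_config.json", "r") as f:
    config = json.load(f)

train_dataset, val_dataset = build_dataset(
    data_path=config["data_path"],
    time_sequence_length=config["time_sequence_length"],
    predicting_next_ts=config["predicting_next_ts"],
    num_train_episode=config["num_train_episode"],
    num_val_episode=config["num_val_episode"],
    cam_view=config["cam_view"],
    language_embedding_size=config["network_configs"]["language_embedding_size"],
)

loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
obs, action = next(iter(loader))
# obs["image"]: (batch, time_sequence_length, 3, H, W); camera views are stacked along H.
# action keys: "world_vector", "rotation_delta", "gripper_closedness_action", "terminate_episode".
print(obs["image"].shape, action["world_vector"].shape)
```

Each sample is a pair of observation and action dictionaries of tensors, so the default PyTorch collation used by `DataLoader` batches them without a custom `collate_fn`.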
126 | 127 | ## Contacts 128 | 129 | Join wechat group for discussion 130 | 131 | 132 | ![wechat group](img/wechatgroup.jpg) 133 | 134 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """robotics_transformer API.""" 15 | 16 | # A new PyPI release will be pushed everytime `__version__` is increased 17 | # When changing this, also update the CHANGELOG.md 18 | __version__ = '0.1.0' 19 | from .maruya24_rt1 import * 20 | from .util import * 21 | 22 | -------------------------------------------------------------------------------- /img/wechatgroup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/img/wechatgroup.jpg -------------------------------------------------------------------------------- /maruya24_rt1/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | *.png 4 | unused_file/ -------------------------------------------------------------------------------- /maruya24_rt1/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /maruya24_rt1/README.md: -------------------------------------------------------------------------------- 1 | # RT-1: Robotics Transformer, in PyTorch 2 | PyTorch re-implementation of [RT-1](https://github.com/google-research/robotics_transformer). This model is based on 2022 paper ["RT-1: Robotics Transformer for Real-World Control at Scale"](https://arxiv.org/abs/2212.06817). 
3 | Unlike other PyTorch re-implementations such as the [lucidrains version](https://github.com/lucidrains/robotic-transformer-pytorch), this implementation closely follows the [original code](https://github.com/google-research/robotics_transformer); that said, I made some changes for PyTorch compatibility and simplification. 4 | 5 | ## Features 6 | * FiLM EfficientNet-based image tokenizer backbone 7 | * TokenLearner-based compression of input tokens 8 | * Transformer for end-to-end robotic control 9 | * Testing utilities 10 | 11 | 12 | ## Main Changes 13 | - Translated the TensorFlow implementation into PyTorch. 14 | - Used OpenAI Gym spaces to define observation and action variables instead of the tensor_spec of TF-Agents. 15 | - Abolished the 4-d constraint on some specs in transformer_network.py. 16 | - Added useful functions and put them in utils.py. 17 | - Added extra comments. 18 | - Did not implement the add_summaries function of transformer_network.py, which is for TensorBoard logging. 19 | - Omitted some variables, functions, classes, and tests, including those that are no longer necessary for this repository. 20 | 21 | ## Note 22 | I did not implement sequence_agent.py and its related code, because that code handles training and inference through the tf_agents library and is therefore unnecessary in a PyTorch implementation. 23 | This repository lets you use RT-1 just like other reinforcement learning code that relies on PyTorch and OpenAI Gym, without the tf_agents library. 24 | 25 | 26 | ## License 27 | This library is licensed under the terms of the Apache License. 28 | -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/maruya24_rt1/film_efficientnet/__init__.py -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/efficientnet_checkpoints/efficientnetb3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/maruya24_rt1/film_efficientnet/efficientnet_checkpoints/efficientnetb3.pth -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/efficientnet_checkpoints/efficientnetb3_notop.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/maruya24_rt1/film_efficientnet/efficientnet_checkpoints/efficientnetb3_notop.pth -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/film_conditioning_layer.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import torch 21 | import torch.nn as nn 22 | 23 | 24 | class FilmConditioning(nn.Module): 25 | def __init__( 26 | self, 27 | num_channels: int, 28 | text_embed_dim: int = 512, 29 | ): 30 | super().__init__() 31 | self._projection_add = nn.Linear(text_embed_dim, num_channels) 32 | self._projection_mult = nn.Linear(text_embed_dim, num_channels) 33 | 34 | # Note that we initialize with zeros because empirically we have found 35 | # this works better than initializing with glorot. 36 | nn.init.constant_(self._projection_add.weight, 0) 37 | nn.init.constant_(self._projection_add.bias, 0) 38 | nn.init.constant_(self._projection_mult.weight, 0) 39 | nn.init.constant_(self._projection_mult.bias, 0) 40 | 41 | # conv_filter: feature maps which corresponds to F in FiLM paper. (B, C, H, W) 42 | # conditioning: text which corresponds to x in FiLM paper. this is one vector that is created from a text, 43 | # note that this is not embedding vectors from a text. Please refer to Universal Sentence Encoder. (B, D). D = 512. 44 | def forward(self, conv_filters: torch.Tensor, conditioning: torch.Tensor): 45 | projected_cond_add = self._projection_add(conditioning) # (B, D) -> (B, C) 46 | projected_cond_mult = self._projection_mult(conditioning) 47 | 48 | projected_cond_add = projected_cond_add.unsqueeze(2).unsqueeze( 49 | 3 50 | ) # (B, C) -> (B, C, 1, 1) 51 | projected_cond_mult = projected_cond_mult.unsqueeze(2).unsqueeze(3) 52 | 53 | # Original FiLM paper argues that 1 + gamma centers the initialization at 54 | # identity transform. 55 | # see 7.2 section in FiLM paper 56 | result = (1 + projected_cond_mult) * conv_filters + projected_cond_add 57 | 58 | return result 59 | -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/film_efficientnet_encoder.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 
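# A minimal usage sketch of the FilmConditioning layer defined in film_conditioning_layer.py
# above (the tensor sizes here are illustrative, not taken from a config in this repository):
#
#     import torch
#     from maruya24_rt1.film_efficientnet.film_conditioning_layer import FilmConditioning
#
#     film = FilmConditioning(num_channels=512, text_embed_dim=512)
#     conv_filters = torch.randn(2, 512, 10, 10)  # (B, C, H, W) feature maps
#     conditioning = torch.randn(2, 512)          # (B, D) sentence embedding
#     out = film(conv_filters, conditioning)      # (B, C, H, W): (1 + gamma) * F + beta
#
# Because both projections are zero-initialized, the layer starts out as an identity
# mapping over conv_filters and only learns to modulate the features during training.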
19 | 20 | import copy 21 | import math 22 | import os 23 | import json 24 | 25 | import torch 26 | import torch.nn as nn 27 | import torch.nn.functional as F 28 | from torchvision.ops import StochasticDepth 29 | from torchvision.ops.misc import Conv2dNormActivation 30 | import numpy as np 31 | 32 | from maruya24_rt1.film_efficientnet.film_conditioning_layer import FilmConditioning 33 | 34 | 35 | # This is based on Table 1 in a EfficientNet paper. 36 | DEFAULT_BLOCKS_ARGS = [ 37 | { 38 | "kernel_size": 3, 39 | "repeats": 1, 40 | "in_size": 32, 41 | "out_size": 16, 42 | "expand_ratio": 1, 43 | "id_skip": True, 44 | "strides": 1, 45 | "se_ratio": 0.25, 46 | }, 47 | { 48 | "kernel_size": 3, 49 | "repeats": 2, 50 | "in_size": 16, 51 | "out_size": 24, 52 | "expand_ratio": 6, 53 | "id_skip": True, 54 | "strides": 2, 55 | "se_ratio": 0.25, 56 | }, 57 | { 58 | "kernel_size": 5, 59 | "repeats": 2, 60 | "in_size": 24, 61 | "out_size": 40, 62 | "expand_ratio": 6, 63 | "id_skip": True, 64 | "strides": 2, 65 | "se_ratio": 0.25, 66 | }, 67 | { 68 | "kernel_size": 3, 69 | "repeats": 3, 70 | "in_size": 40, 71 | "out_size": 80, 72 | "expand_ratio": 6, 73 | "id_skip": True, 74 | "strides": 2, 75 | "se_ratio": 0.25, 76 | }, 77 | { 78 | "kernel_size": 5, 79 | "repeats": 3, 80 | "in_size": 80, 81 | "out_size": 112, 82 | "expand_ratio": 6, 83 | "id_skip": True, 84 | "strides": 1, 85 | "se_ratio": 0.25, 86 | }, 87 | { 88 | "kernel_size": 5, 89 | "repeats": 4, 90 | "in_size": 112, 91 | "out_size": 192, 92 | "expand_ratio": 6, 93 | "id_skip": True, 94 | "strides": 2, 95 | "se_ratio": 0.25, 96 | }, 97 | { 98 | "kernel_size": 3, 99 | "repeats": 1, 100 | "in_size": 192, 101 | "out_size": 320, 102 | "expand_ratio": 6, 103 | "id_skip": True, 104 | "strides": 1, 105 | "se_ratio": 0.25, 106 | }, 107 | ] 108 | 109 | CONV_KERNEL_INITIALIZER = { 110 | "class_name": "VarianceScaling", 111 | "config": {"scale": 2.0, "mode": "fan_out", "distribution": "truncated_normal"}, 112 | } 113 | 114 | DENSE_KERNEL_INITIALIZER = { 115 | "class_name": "VarianceScaling", 116 | "config": {"scale": 1.0 / 3.0, "mode": "fan_out", "distribution": "uniform"}, 117 | } 118 | 119 | 120 | # Multiply the number of filters by width_coefficient. 121 | # Usually, divisor = 8. This means new_filters is a multiple of 8. 122 | # Filters means channels. 123 | # We round by the 8, not by the 10. 124 | def round_filters(filters, divisor, width_coefficient): 125 | """Round number of filters based on depth multiplier.""" 126 | filters *= width_coefficient 127 | 128 | # int(filters + divisor / 2) // divisor * divisor is an operation like rounding off. 129 | # Usual rounding off is operated at 5(=10 / 2) interval. But this is operated at 4(=8 / 2) interval. 130 | # If filters = 35, new_filter = 32. If filters = 36, new_filter = 40. 131 | new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor) 132 | 133 | # Make sure that round down does not go down by more than 10%. 134 | if new_filters < 0.9 * filters: 135 | new_filters += divisor 136 | return int(new_filters) 137 | 138 | 139 | # Multiply the number of repeats by depth_coefficient. 
140 | def round_repeats(repeats, depth_coefficient): 141 | """Round number of repeats based on depth multiplier.""" 142 | return int(math.ceil(depth_coefficient * repeats)) 143 | 144 | 145 | class SeModule(nn.Module): 146 | def __init__(self, expand_size, block_in_size, se_ratio=0.25): 147 | super(SeModule, self).__init__() 148 | 149 | se_size = max(1, int(block_in_size * se_ratio)) 150 | 151 | self.avgpool = nn.AdaptiveAvgPool2d(1) 152 | self.fc1 = nn.Conv2d( 153 | expand_size, se_size, kernel_size=1, stride=1, padding=0 154 | ) # Note that we use bias=True here. 155 | self.silu0 = nn.SiLU(inplace=True) 156 | self.fc2 = nn.Conv2d( 157 | se_size, expand_size, kernel_size=1, stride=1, padding=0 158 | ) # Note that we use bias=True here. 159 | self.act = nn.Sigmoid() 160 | 161 | def forward(self, inputs): 162 | x = self.avgpool(inputs) 163 | x = self.fc1(x) 164 | x = self.silu0(x) 165 | x = self.fc2(x) 166 | x = self.act(x) 167 | return inputs * x 168 | 169 | 170 | # Inverted Residual + Squeeze-and-Excitation 171 | class MBConvBlock(nn.Module): 172 | """expand + depthwise + pointwise""" 173 | 174 | def __init__( 175 | self, 176 | kernel_size, 177 | in_size, 178 | out_size, 179 | expand_ratio, 180 | id_skip, 181 | strides, 182 | se_ratio, 183 | drop_rate, 184 | ): 185 | super(MBConvBlock, self).__init__() 186 | self.in_size = in_size 187 | self.out_size = out_size 188 | self.expand_ratio = expand_ratio 189 | self.strides = strides 190 | self.se_ratio = se_ratio 191 | self.id_skip = id_skip 192 | self.drop_rate = drop_rate 193 | 194 | expand_size = in_size * expand_ratio 195 | 196 | layers = [] 197 | 198 | if not (1 <= strides <= 2): 199 | raise ValueError("illegal stride value") 200 | 201 | # expansion phase (1x1 conv) 202 | if expand_ratio != 1: 203 | layers.append( 204 | Conv2dNormActivation( 205 | in_size, 206 | expand_size, 207 | kernel_size=1, 208 | stride=1, 209 | norm_layer=nn.BatchNorm2d, 210 | activation_layer=nn.SiLU, 211 | ) 212 | ) 213 | 214 | # depthwise conv 215 | layers.append( 216 | Conv2dNormActivation( 217 | expand_size, 218 | expand_size, 219 | kernel_size=kernel_size, 220 | stride=strides, 221 | groups=expand_size, 222 | norm_layer=nn.BatchNorm2d, 223 | activation_layer=nn.SiLU, 224 | ) 225 | ) 226 | 227 | # Squeeze-and-Excitation module 228 | if 0 < se_ratio <= 1: 229 | layers.append(SeModule(expand_size, in_size, se_ratio)) 230 | 231 | # output phase (1x1 conv) 232 | layers.append( 233 | Conv2dNormActivation( 234 | expand_size, 235 | out_size, 236 | kernel_size=1, 237 | stride=1, 238 | norm_layer=nn.BatchNorm2d, 239 | activation_layer=None, # Note that there is no activation here 240 | ) 241 | ) 242 | # self.output_conv = nn.Conv2d(expand_size, out_size, kernel_size=1, stride=1, padding=0, bias=False) 243 | # self.bn2 = nn.BatchNorm2d(out_size) 244 | 245 | self.block = nn.Sequential(*layers) 246 | 247 | # Dropout 248 | if drop_rate > 0: 249 | self.dropout = StochasticDepth(drop_rate, "row") 250 | 251 | def forward(self, inputs): 252 | x = self.block(inputs) 253 | 254 | # Dropout and skip connection 255 | if self.id_skip and self.strides == 1 and self.in_size == self.out_size: 256 | if self.drop_rate > 0: 257 | x = self.dropout(x) 258 | x = inputs + x 259 | 260 | return x 261 | 262 | 263 | class EfficientNet(nn.Module): 264 | def __init__( 265 | self, 266 | width_coefficient, 267 | depth_coefficient, 268 | dropout_rate=0.2, 269 | drop_connect_rate=0.2, 270 | depth_divisor=8, 271 | blocks_args="default", 272 | include_top=True, 273 | classes=1000, 274 | include_film=False, 
275 | text_embed_dim=512, 276 | ): 277 | super().__init__() 278 | self.dropout_rate = dropout_rate 279 | self.include_top = include_top 280 | self.include_film = include_film 281 | 282 | # input dimensions 283 | in_channels = 3 284 | 285 | # get defualt setting of MBConv blocks 286 | if blocks_args == "default": 287 | blocks_args = DEFAULT_BLOCKS_ARGS 288 | 289 | # stem 290 | out_channels = round_filters(32, depth_divisor, width_coefficient) 291 | self.convNormAct0 = Conv2dNormActivation( 292 | in_channels, 293 | out_channels, 294 | kernel_size=3, 295 | stride=2, 296 | norm_layer=nn.BatchNorm2d, 297 | activation_layer=nn.SiLU, 298 | ) 299 | 300 | # Build blocks 301 | blocks_args = copy.deepcopy(blocks_args) 302 | # Detach blocks_args from original DEFAULT_BLOCKS_ARGS. This is necessary to keep DEFAULT_BLOCKS_ARGS as it is when you add change to blocks_args. 303 | # Therefore, you can make multiple models. 304 | blocks = [] 305 | films = [] 306 | b = 0 307 | total_repeats = float( 308 | sum( 309 | round_repeats(args["repeats"], depth_coefficient) 310 | for args in blocks_args 311 | ) 312 | ) # sum of all of repeat 313 | for args in blocks_args: # args is dictionary 314 | assert args["repeats"] > 0 315 | # Update block input and output filters based on depth multiplier. 316 | args["in_size"] = round_filters( 317 | args["in_size"], depth_divisor, width_coefficient 318 | ) 319 | args["out_size"] = round_filters( 320 | args["out_size"], depth_divisor, width_coefficient 321 | ) 322 | 323 | # We delete repeats in args so that we could write MBConv(**args). 324 | for j in range(round_repeats(args.pop("repeats"), depth_coefficient)): 325 | if j == 0: 326 | # The first block 327 | blocks.append( 328 | MBConvBlock( 329 | **args, 330 | drop_rate=drop_connect_rate 331 | * b 332 | / total_repeats, # increase drop_connect_rate linearlly 333 | ) 334 | ) 335 | args["strides"] = 1 336 | args["in_size"] = args["out_size"] 337 | else: 338 | blocks.append( 339 | MBConvBlock( 340 | **args, 341 | drop_rate=drop_connect_rate * b / total_repeats, 342 | ) 343 | ) 344 | 345 | if include_film: 346 | films.append( 347 | FilmConditioning( 348 | num_channels=args["out_size"], 349 | text_embed_dim=text_embed_dim, 350 | ) 351 | ) 352 | b += 1 353 | 354 | # Make modulelist. We have to use modulelist. If we don't use it,the blocks will not emerge when we visualize this model. 355 | self.blocks = nn.ModuleList(blocks) 356 | if include_film: 357 | self.films = nn.ModuleList(films) 358 | 359 | # Build top 360 | in_channels = args["out_size"] 361 | out_channels = 1280 362 | out_channels = round_filters(out_channels, depth_divisor, width_coefficient) 363 | self.convNormAct1 = Conv2dNormActivation( 364 | in_channels, 365 | out_channels, 366 | kernel_size=1, 367 | stride=1, 368 | norm_layer=nn.BatchNorm2d, 369 | activation_layer=nn.SiLU, 370 | ) 371 | 372 | # If we include top, we do gloval average pooling and dropout. Then, we add fully connected layer. 373 | if include_top: 374 | self.glovalAvePool = nn.AdaptiveAvgPool2d(1) 375 | if dropout_rate > 0: 376 | self.dropout = nn.Dropout(dropout_rate) 377 | # Fully connected layer 378 | self.fc = nn.Linear(out_channels, classes) 379 | 380 | # inputs: image (bs, c, h, w) 381 | # context: text (bs, embedding_dim). Each vector that is created from a text, not a word. 
382 | def forward(self, inputs, context=None): 383 | # stem 384 | outputs = self.convNormAct0(inputs) 385 | 386 | # Blocks(26 MBconv) 387 | if self.include_film: 388 | for block, film in zip(self.blocks, self.films): 389 | outputs = block(outputs) # MBConv 390 | outputs = film(outputs, context) # FiLM 391 | 392 | else: 393 | for block in self.blocks: 394 | outputs = block(outputs) 395 | 396 | # top 397 | outputs = self.convNormAct1(outputs) 398 | 399 | if self.include_top: 400 | outputs = self.glovalAvePool(outputs) 401 | if self.dropout_rate > 0: 402 | outputs = self.dropout(outputs) 403 | outputs = torch.flatten(outputs, 1) 404 | outputs = self.fc(outputs) 405 | return outputs 406 | else: 407 | return outputs 408 | 409 | 410 | # If you use FiLM, this function allow us to load pretrained weight from naive efficientnet. 411 | def maybe_restore_with_film( 412 | *args, weights="imagenet", include_top=True, include_film=False, **kwargs 413 | ): 414 | assert ( 415 | weights == None or weights == "imagenet" 416 | ), "Set weights to either None or 'imagenet'." 417 | # Create model without FiLM 418 | n1 = EfficientNet(*args, include_top=include_top, include_film=False, **kwargs) 419 | 420 | # Load weights. 421 | if weights == "imagenet": 422 | if include_top: 423 | weights_path = os.path.join( 424 | os.path.dirname(__file__), "efficientnet_checkpoints/efficientnetb3.pth" 425 | ) 426 | else: 427 | weights_path = os.path.join( 428 | os.path.dirname(__file__), 429 | "efficientnet_checkpoints/efficientnetb3_notop.pth", 430 | ) 431 | # This EfficientNet differs from the official pytorch implementation only in parameter names. 432 | # So we load the pytorch weights in using this function. 433 | n1 = load_official_pytorch_param(n1, weights_path) 434 | 435 | # If you don't use FiLM, that's all. 436 | if not include_film: 437 | return n1 438 | 439 | # Create model with FiLM if you use FiLM. And we load pretrained weights from n1. 440 | n2 = EfficientNet(*args, include_top=include_top, include_film=True, **kwargs) 441 | if weights == None: 442 | return n2 443 | 444 | n1_state_dict = n1.state_dict().copy() 445 | n2_state_dict = n2.state_dict().copy() 446 | 447 | for name, param in n2_state_dict.items(): 448 | if name in n1_state_dict: 449 | n2_state_dict[name] = n1_state_dict[name] 450 | 451 | n2.load_state_dict(n2_state_dict) 452 | return n2 453 | 454 | 455 | # This function helps load official pytorch efficientnet's weights. 456 | def load_official_pytorch_param(model: nn.Module, weights_path): 457 | # load weights 458 | official_state_dict = torch.load(weights_path) 459 | 460 | film_eff_state_dict = model.state_dict().copy() 461 | keys_official_list = list(official_state_dict) 462 | keys_film_eff_list = list(film_eff_state_dict) 463 | 464 | for key_official, key_film_eff in zip(keys_official_list, keys_film_eff_list): 465 | film_eff_state_dict[key_film_eff] = official_state_dict[key_official] 466 | # print(str(key_official) + "->" + str(key_film_eff)) 467 | 468 | # load new weights 469 | model.load_state_dict(film_eff_state_dict) 470 | return model 471 | 472 | 473 | # EfficientNetB3 is tranined on 300x300 image. 
474 | def EfficientNetB3( 475 | weights="imagenet", include_top=True, classes=1000, include_film=False, **kwargs 476 | ): 477 | return maybe_restore_with_film( 478 | 1.2, 479 | 1.4, 480 | 0.3, 481 | weights=weights, 482 | include_top=include_top, 483 | classes=classes, 484 | include_film=include_film, 485 | **kwargs, 486 | ) 487 | 488 | 489 | # Class for Postprocessing model's output 490 | class ILSVRCPredictor: 491 | def __init__(self, top=5): 492 | # Load label imformation of ILSVRC 493 | image_json_path = os.path.join( 494 | os.path.dirname(__file__), "efficientnet_checkpoints/imagenet_classes.json" 495 | ) 496 | with open(image_json_path, "r") as f: 497 | self.class_index = json.load(f) 498 | 499 | self.top = top 500 | 501 | # Obtain label name of top-k probability 502 | def predict_topk(self, out): 503 | assert ( 504 | out.shape[0] == 1 505 | ), "Only accept tonsor with batch size 1 to simplify implementation" 506 | top_indices = out[0].detach().numpy().argsort()[-self.top :][::-1] 507 | predicted_label_names = [self.class_index[str(idx)][1] for idx in top_indices] 508 | 509 | return predicted_label_names 510 | -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/film_efficientnet_encoder_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | 21 | import unittest 22 | import torch 23 | from skimage import data 24 | from absl.testing import parameterized 25 | from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights 26 | import numpy as np 27 | from torchvision import transforms 28 | import matplotlib.pyplot as plt 29 | 30 | from pytorch_robotics_transformer.film_efficientnet.film_efficientnet_encoder import EfficientNetB3, ILSVRCPredictor 31 | 32 | 33 | # If you want run this test, move to the directory above pytorch_robotics_transformer directory, type below command on terminal. 34 | # python -m pytorch_robotics_transformer.film_efficientnet.film_efficientnet_encoder_test 35 | # If you want to test all test files in film_efficientnet directory, type below command on terminal. 36 | # python -m unittest discover -s pytorch_robotics_transformer/film_efficientnet -p "*_test.py" 37 | 38 | resize = 300 39 | mean = (0.485, 0.456, 0.406) 40 | std = (0.229, 0.224, 0.225) 41 | 42 | # This transformation is for imagenet. 
Use another transformatioin for RT-1 43 | img_trasnform = transforms.Compose([ 44 | transforms.ToPILImage(), 45 | transforms.Resize(resize), 46 | transforms.CenterCrop(resize), 47 | transforms.ToTensor(), 48 | transforms.Normalize(mean, std) 49 | ]) 50 | 51 | class FilmEfficientnetTest(parameterized.TestCase, unittest.TestCase): 52 | # This test method will be implemented twice. One is implemented with include_film = True. Another is implemented with include_film = False. 53 | # To know further more, see a comment of https://github.com/abseil/abseil-py/blob/main/absl/testing/parameterized.py 54 | @parameterized.parameters([True, False]) 55 | def test_film_efficientnet(self, include_film): 56 | fe = EfficientNetB3(include_top=True, weights='imagenet', include_film=include_film) 57 | fe.eval() 58 | 59 | image = data.chelsea() # ndarray 60 | image_transformed = img_trasnform(image) # tensor 61 | 62 | # Show the image. 63 | # image_show = image_transformed.numpy().transpose((1, 2, 0)) 64 | # image_show = np.clip(image_show, 0, 1) 65 | # plt.imshow(image_show) 66 | # plt.show() 67 | 68 | image_transformed = image_transformed.unsqueeze(0) 69 | context = torch.rand(1, 512) 70 | 71 | if include_film: 72 | eff_output = fe(image_transformed, context) 73 | else: 74 | eff_output = fe(image_transformed) 75 | 76 | predictor = ILSVRCPredictor(top=10) 77 | 78 | film_preds = predictor.predict_topk(eff_output) 79 | # print(film_preds) 80 | self.assertIn('tabby', film_preds) 81 | 82 | if __name__ == '__main__': 83 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/preprocessors.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | 21 | 22 | # This prepocessor is a simplfied version of original code. 23 | # This will pad the image and crop it at a random location 24 | # The cropped image maintain original image size and almost full field of view. 25 | # Omitted preprocessor_test.py. But you can do simple test by running this code. 26 | 27 | import torch 28 | import torch.nn.functional as F 29 | from skimage import data 30 | import numpy as np 31 | import matplotlib.pyplot as plt 32 | 33 | # images: [B, 3 ,H, W] 34 | # receive images whose values is in the range [0,255] 35 | # -> change images values range into [0.0 ,1.0] 36 | # -> padding and crop image 37 | def convert_dtype_and_crop_images(images: torch.Tensor, ratio: float = 0.07): 38 | if images.dtype == torch.uint8: 39 | images = images / 255. 
40 | images = images.to(torch.float32) 41 | 42 | _, _, height, width = images.shape 43 | ud_pad = int(height * ratio) 44 | lr_pad = int(width * ratio) 45 | 46 | images= F.pad(images, pad=(lr_pad, lr_pad, ud_pad, ud_pad)) 47 | 48 | shif_h = torch.randint(0, 2*ud_pad+1, size=[]) 49 | shif_w = torch.randint(0, 2*lr_pad+1, size=[]) 50 | 51 | grid_h, grid_w = torch.meshgrid(torch.arange(shif_h, shif_h + height), 52 | torch.arange(shif_w, shif_w + width), 53 | indexing='ij') 54 | images = images[..., grid_h, grid_w] # fancy index 55 | 56 | return images 57 | 58 | 59 | 60 | if __name__ == '__main__': 61 | images = data.coffee() # ndarray 62 | images = np.tile(np.expand_dims(images,0), (10,1,1,1)) # batch size: 10 63 | images = torch.from_numpy(images).permute(0, 3, 1, 2) # (b, h, w, c) -> (b, c, h, w) 64 | images = convert_dtype_and_crop_images(images) 65 | print(torch.max(images)) 66 | print(torch.min(images)) 67 | print(images.shape) 68 | image_show = images.permute(0, 2, 3 , 1).numpy() 69 | plt.imshow(image_show[0]) 70 | plt.show() -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/pretrained_efficientnet_encoder.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | 21 | """Encoder based on Efficientnet.""" 22 | 23 | 24 | # film_efficientnet_encoder receive [bs, 3, 300, 300] and returns [bs, 1536, 10, 10] 25 | # Here we use 1x1 conv. [bs, 1536, 10, 10] -> [bs, 512, 10, 10] 26 | # then apply FiLM. 27 | 28 | import torch 29 | import torch.nn as nn 30 | from typing import Optional 31 | 32 | from maruya24_rt1.film_efficientnet.film_efficientnet_encoder import EfficientNetB3 33 | from maruya24_rt1.film_efficientnet.film_conditioning_layer import FilmConditioning 34 | 35 | 36 | class EfficientNetEncoder(nn.Module): 37 | def __init__( 38 | self, 39 | token_embedding_size: int = 512, 40 | language_embedding_size: int = 512, 41 | weights: Optional[str] = "imagenet", 42 | early_film: bool = True, 43 | include_top: bool = False, 44 | pooling: bool = True, 45 | ): 46 | super().__init__() 47 | 48 | self.conv1x1 = nn.Conv2d( 49 | in_channels=1536, # If we use EfficientNetB3 and input image has 3 channels, in_channels is 1536. 
50 | out_channels=token_embedding_size, 51 | kernel_size=1, 52 | stride=1, 53 | padding=0, 54 | bias=False, 55 | ) 56 | self.net = EfficientNetB3( 57 | weights=weights, 58 | include_top=include_top, 59 | include_film=early_film, 60 | text_embed_dim=language_embedding_size, 61 | ) 62 | self.film_layer = FilmConditioning( 63 | num_channels=token_embedding_size, text_embed_dim=language_embedding_size 64 | ) 65 | 66 | self.early_film = early_film 67 | self._pooling = pooling 68 | 69 | def _encode(self, image: torch.Tensor, context: torch.Tensor) -> torch.Tensor: 70 | """Run the image through the efficientnet encoder.""" 71 | if self.early_film: 72 | return self.net(image, context) 73 | return self.net(image) 74 | 75 | def forward(self, image: torch.Tensor, context: torch.Tensor) -> torch.Tensor: 76 | features = self._encode(image, context) 77 | features = self.conv1x1(features) 78 | features = self.film_layer(features, context) 79 | 80 | if not self._pooling: 81 | return features 82 | 83 | # Global average pool. 84 | return torch.mean(features, dim=(2, 3)) 85 | -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/pretrained_efficientnet_encoder_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | 21 | from pytorch_robotics_transformer.film_efficientnet import film_efficientnet_encoder 22 | from pytorch_robotics_transformer.film_efficientnet import pretrained_efficientnet_encoder as eff 23 | import unittest 24 | import numpy as np 25 | from skimage import data 26 | import torch 27 | # import film_efficientnet_encoder 28 | # import pretrained_efficientnet_encoder as eff 29 | from torchvision import transforms 30 | 31 | resize = 300 32 | mean = (0.485, 0.456, 0.406) 33 | std = (0.229, 0.224, 0.225) 34 | 35 | # This transformation is for imagenet. 
Use another transformatioin for RT-1 36 | img_trasnform = transforms.Compose([ 37 | transforms.ToPILImage(), 38 | transforms.Resize(resize), 39 | transforms.CenterCrop(resize), 40 | transforms.ToTensor(), 41 | transforms.Normalize(mean, std) 42 | ]) 43 | 44 | 45 | class PretrainedEfficientnetEncoderTest(unittest.TestCase): 46 | def test_encoding(self): 47 | """Test that we get a correctly shaped decoding.""" 48 | # prepare context 49 | state = np.random.RandomState(0) 50 | context = state.uniform(-1, 1, (10, 512)) 51 | context = torch.from_numpy(context).to(torch.float32) 52 | 53 | # prepare image 54 | image = data.chelsea() # ndarray 55 | image_transformed = img_trasnform(image) # tensor 56 | image_transformed = torch.tile(image_transformed.unsqueeze(0), (10,1,1,1)) 57 | 58 | token_embedding_size = 512 59 | model = eff.EfficientNetEncoder(token_embedding_size = token_embedding_size) 60 | model.eval() 61 | preds = model(image_transformed, context) 62 | self.assertEqual(preds.shape, (10, token_embedding_size)) 63 | 64 | def test_imagenet_classification(self): 65 | """Test that we can correctly classify an image of a cat.""" 66 | # prepare context 67 | state = np.random.RandomState(0) 68 | context = state.uniform(-1, 1, (10, 512)) 69 | context = torch.from_numpy(context).to(torch.float32) 70 | 71 | # prepare image 72 | image = data.chelsea() # ndarray 73 | image_transformed = img_trasnform(image) # tensor 74 | image_transformed = torch.tile(image_transformed.unsqueeze(0), (10,1,1,1)) 75 | 76 | # prediction 77 | model = eff.EfficientNetEncoder(include_top=True) 78 | model.eval() 79 | preds = model._encode(image_transformed, context) 80 | preds = preds[0:1] 81 | 82 | predictor = film_efficientnet_encoder.ILSVRCPredictor() 83 | predicted_names = predictor.predict_topk(preds) 84 | 85 | 86 | self.assertIn('tabby', predicted_names) 87 | 88 | 89 | if __name__ == '__main__': 90 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/maruya24_rt1/tokenizers/__init__.py -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/action_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 
19 | 20 | 21 | from gym import spaces 22 | import torch 23 | 24 | from typing import Dict, Union 25 | """ 26 | Please define action space using gym.spaces.Dict. 27 | 28 | As an example, if an action is: 29 | terminate = [0, 1] # this is one hot vector. 30 | world_vector = [0.9, 0.8, -0.3] 31 | rotation_delta = [-0.1, 0.2, .6] 32 | gripper_closedness = 0.9 33 | 34 | then action space will look like 35 | action_space = gym.spaces.Dict( 36 | { 37 | 'terminate': gym.spaces.Discrete(2), 38 | 'world_vector': gym.spaces.Box(low= -1.0, high= 1.0, shape=(3,), dtype=np.float32), 39 | 'rotation_delta': gym.spaces.Box(low= -np.pi / 2 , high= np.pi / 2, shape=(3,), dtype=np.float32), 40 | 'gripper_closedness_action': gym.spaces.Box(low= -1.0 , high= 1.0, shape=(1,), dtype=np.float32) 41 | } 42 | ) 43 | or 44 | action_space = gym.spaces.Dict( 45 | OrderedDict([ 46 | ('terminate', gym.spaces.Discrete(2)), 47 | ('world_vector', gym.spaces.Box(low= -1.0, high= 1.0, shape=(3,), dtype=np.float32)), 48 | ('rotation_delta', gym.spaces.Box(low= -np.pi / 2., high= np.pi / 2., shape=(3,), dtype=np.float32)), 49 | ('gripper_closedness_action', gym.spaces.Box(low= -1.0 , high= 1.0, shape=(1,), dtype=np.float32)) 50 | ]) 51 | ) 52 | Please use OrderedDict if you want gym.spaces.Dict to keep order of actions. 53 | 54 | This action_space is just information about each action. 55 | These information are very convenient when interpreting, examining, cliping, and processing the action 56 | because the action is dictionary with key names which are the same as the action_space. 57 | 58 | action value will be look like 59 | action = { 60 | 'terminate': 1, 61 | 'world_vector': [0.9, 0.8, -0.3], 62 | 'rotation_delta': [-0.1, 0.2, .6], 63 | 'gripper_closedness_action': [0.9] 64 | } 65 | Note that values are int and numpy 1-d arrays. 66 | """ 67 | 68 | class RT1ActionTokenizer(): 69 | def __init__(self, 70 | action_space: spaces.Dict, 71 | vocab_size: int): 72 | """Instantiates an RT1ActionTokenizer. 73 | 74 | Args: 75 | action_space: A dictionary of OpenAI gym spaces of the expected actions. 76 | vocab_size: Number of buckets to discretize action to. 77 | """ 78 | 79 | self._action_space = action_space 80 | self._vocab_size = vocab_size 81 | self._action_order = list(action_space.keys()) # Order of tokenizing 82 | 83 | self._tokens_per_action = 0 84 | # Calculate the number of tokens per action 85 | for action in self._action_order: 86 | # If this action is spaces.Discrete, this is one token. 87 | if isinstance(self._action_space[action], spaces.Discrete): 88 | self._tokens_per_action += 1 89 | 90 | # If a action is spaces.Box, get the number of tokens using shape. 
91 | elif isinstance(self._action_space[action], spaces.Box): 92 | action_shape = self._action_space[action].shape 93 | if len(action_shape) != 1: 94 | raise ValueError( 95 | f'Only action shapes with single dimension supported, got {action_shape}') 96 | self._tokens_per_action += action_shape[0] 97 | else: 98 | raise ValueError('We assume action_space is defined by either gym.spaces.Discrete or gym.spaces.Box') 99 | 100 | 101 | @property 102 | def tokens_per_action(self) -> int: 103 | return self._tokens_per_action 104 | 105 | def tokenize(self, action: Dict[str, torch.Tensor]) -> torch.Tensor: 106 | """Tokenizes an action.""" 107 | action_tokens = [] 108 | # Perform tokenizing in order of self._action_order 109 | for k in self._action_order: # If a is Discrete, the size of a is () or (batch,), which means the former is scalar of Tensor type and the later is 1-d Tensor. 110 | a = action[k] 111 | a_space = self._action_space[k] 112 | if isinstance(a_space, spaces.Discrete): 113 | assert torch.all(a < self._vocab_size), "Discrete action should be smaller than vocab size." 114 | token = a # Discrete action is already token. The size is () or (batch,) 115 | token = a.unsqueeze(-1) # The size is (1,) or (batch, 1). Discrete action will be one token. 116 | else: # if a is Box, size of a is (action_size) or (batch, action_size). 117 | low = torch.tensor(a_space.low).to(a.device) 118 | high = torch.tensor(a_space.high).to(a.device) 119 | a = torch.clamp(a, low, high) 120 | # Normalize the action. 121 | token = (a - low) / (high - low) 122 | # Bucket and discretize the action to vocab_size. 123 | token = token * (self._vocab_size - 1) 124 | token = token.to(torch.int32) # The size is (action_size) or (batch, action_size). 125 | action_tokens.append(token) # if this action has action_size, this action will be action_size tokens. 126 | # Contatenate all actions. The size will be (tokens_per_action) or (batch, tokens_per_action) 127 | action_tokens = torch.concat(action_tokens, dim=-1) 128 | return action_tokens 129 | 130 | # The size of action_tokens is (tokens_per_action) or (batch, tokens_per_action) 131 | def detokenize(self, action_tokens: torch.Tensor) -> Dict[str, torch.Tensor]: 132 | """Detokenizes an action.""" 133 | action = {} 134 | token_index = 0 135 | # action_tokens is in self._action_order order 136 | # So we will detokenize in self._action_order order 137 | for k in self._action_order: 138 | # Discrete actions are already assumed to be tokens. 139 | space = self._action_space[k] 140 | if isinstance(space, spaces.Discrete): 141 | # The size of action_tokens[k] : (1,) or (batch,), which means the former is scalar of Tensor type and the later is 1-d Tensor. 142 | action[k] = action_tokens[..., token_index] 143 | # A poor model may output tokens outside the allowed range, in that case 144 | # set them to a default value, the 0 token in this case. 
145 | action[k] = torch.where(action[k] > space.n, torch.zeros_like(action[k]), action[k]) 146 | token_index += 1 147 | else: 148 | actions = [] 149 | action_dim = space.shape[0] 150 | for j in range(action_dim): 151 | a = action_tokens[..., token_index:token_index + 1] # The size of a: (1,) or (batch, 1) 152 | a = a.to(torch.float32) # Change int32 to float32 153 | # de-normalize 154 | a = a / (self._vocab_size - 1) 155 | a = a * (space.high[j] - space.low[j]) + space.low[j] 156 | actions.append(a) 157 | token_index += 1 158 | action[k] = torch.concat(actions, dim=-1) # size: (action_dim) or (batch, action_dim) 159 | return action -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/action_tokenizer_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import unittest 21 | from gym import spaces 22 | from pytorch_robotics_transformer.tokenizers.action_tokenizer import RT1ActionTokenizer 23 | import numpy as np 24 | from collections import OrderedDict 25 | from typing import List, Dict 26 | from pytorch_robotics_transformer.tokenizers.utils import batched_space_sampler, np_to_tensor 27 | import torch 28 | import sys 29 | 30 | class ActionTokenizerTest(unittest.TestCase): 31 | 32 | # Use one Discrete action. 33 | # Check tokens_per_action and token. 34 | def testTokenize_Discrete(self): 35 | action_space = spaces.Dict( 36 | { 37 | 'terminate': spaces.Discrete(2) 38 | } 39 | ) 40 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=10) 41 | self.assertEqual(1, tokenizer.tokens_per_action) 42 | 43 | action = { 44 | 'terminate': 1 45 | } 46 | action = np_to_tensor(action) 47 | action_tokens = tokenizer.tokenize(action) 48 | self.assertEqual(torch.tensor([1]), action_tokens) 49 | 50 | 51 | def testDetokenize_Discrete(self): 52 | action_space = spaces.Dict( 53 | { 54 | 'terminate': spaces.Discrete(2) 55 | } 56 | ) 57 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=10) 58 | # 0 token should become 0 59 | action = tokenizer.detokenize(torch.tensor([0])) # tokenized action is ndarray. 60 | self.assertEqual(torch.tensor(0), action['terminate']) 61 | # 1 token should become 1 62 | action = tokenizer.detokenize(torch.tensor([1])) 63 | self.assertEqual(torch.tensor(1), action['terminate']) 64 | # OOV(Out of vocabulary) 3 token should become a default 0 65 | action = tokenizer.detokenize(torch.tensor([3])) 66 | self.assertEqual(torch.tensor(0), action['terminate']) 67 | 68 | 69 | # Use one Box action. 70 | # Check tokens_per_action and token. 
71 | def testTokenize_Box(self): 72 | action_space = spaces.Dict( 73 | { 74 | 'world_vector': spaces.Box(low= -1., high= 1., shape=(3,), dtype=np.float32) 75 | } 76 | ) 77 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=10) 78 | self.assertEqual(3, tokenizer.tokens_per_action) 79 | 80 | action = { 81 | 'world_vector': np.array([0.1, 0.5, -0.8]) 82 | } 83 | action = np_to_tensor(action) 84 | action_tokens = tokenizer.tokenize(action) 85 | self.assertSequenceEqual([4, 6, 0], list(action_tokens)) 86 | 87 | 88 | def testTokenize_Box_with_time_dimension(self): 89 | action_space = spaces.Dict( 90 | { 91 | 'world_vector': spaces.Box(low= -1., high= 1., shape=(3,), dtype=np.float32) 92 | } 93 | ) 94 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=10) 95 | self.assertEqual(3, tokenizer.tokens_per_action) 96 | 97 | # We check if tokenizer works correctly when action vector has dimensions of batch size and time. 98 | batch_size = 2 99 | time_dimension = 3 100 | world_vec = np.array([[0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8]]) 101 | world_vec = np.reshape(world_vec, (batch_size, time_dimension, tokenizer.tokens_per_action)) 102 | action = { 103 | 'world_vector': world_vec 104 | } 105 | action = np_to_tensor(action) 106 | action_tokens = tokenizer.tokenize(action) 107 | self.assertSequenceEqual( 108 | [batch_size, time_dimension, tokenizer.tokens_per_action], list(action_tokens.shape)) 109 | 110 | 111 | def testTokenize_Box_at_limits(self): 112 | minimum = -1. 113 | maximum = 1. 114 | vocab_size = 10 115 | action_space = spaces.Dict( 116 | { 117 | 'world_vector': spaces.Box(low=minimum, high= maximum, shape=(2,), dtype=np.float32) 118 | } 119 | ) 120 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=vocab_size) 121 | self.assertEqual(2, tokenizer.tokens_per_action) 122 | action = { 123 | 'world_vector': [minimum, maximum] 124 | } 125 | action = np_to_tensor(action) 126 | action_tokens = tokenizer.tokenize(action) 127 | # Minimum value will go to 0 128 | # Maximum value witll go to vocab_size-1 129 | self.assertSequenceEqual([0, vocab_size - 1], list(action_tokens)) 130 | 131 | 132 | def testTokenize_invalid_action_spec_shape(self): 133 | action_space = spaces.Dict( 134 | { 135 | 'world_vector': spaces.Box(low=-1., high=1., shape=(2,2), dtype=np.float32) 136 | } 137 | ) 138 | with self.assertRaises(ValueError): 139 | RT1ActionTokenizer(action_space, vocab_size=10) 140 | 141 | 142 | # This test is the closest to a real situation. 143 | def testTokenizeAndDetokenizeIsEqual(self): 144 | action_space_dict = OrderedDict([('terminate', spaces.Discrete(2)), 145 | ('world_vector', spaces.Box(low= -1.0, high= 1.0, shape=(3,), dtype=np.float32)), 146 | ('rotation_delta', spaces.Box(low= -np.pi / 2., high= np.pi / 2., shape=(3,), dtype=np.float32)), 147 | ('gripper_closedness_action', spaces.Box(low= -1.0 , high= 1.0, shape=(1,), dtype=np.float32)) 148 | ]) 149 | action_space = spaces.Dict(action_space_dict) 150 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=1024) 151 | self.assertEqual(8, tokenizer.tokens_per_action) 152 | 153 | # Repeat the following test N times with fuzzy inputs. 
154 | n_repeat = 10 155 | for _ in range(n_repeat): 156 | action = action_space.sample() 157 | action = np_to_tensor(action) 158 | action_tokens = tokenizer.tokenize(action) 159 | policy_action = tokenizer.detokenize(action_tokens) 160 | # print(action) 161 | # print(action_tokens) 162 | # print(policy_action) 163 | for k in action: 164 | np.testing.assert_allclose(action[k].numpy(), policy_action[k].numpy(), 2) 165 | 166 | # Repeat the test with batched actions 167 | batch_size = 2 168 | batched_action = batched_space_sampler(action_space, batch_size) 169 | batched_action = np_to_tensor(batched_action) 170 | action_tokens = tokenizer.tokenize(batched_action) 171 | policy_action = tokenizer.detokenize(action_tokens) 172 | 173 | # print(batched_action) 174 | # print(action_tokens) 175 | # print(policy_action) 176 | 177 | for k in batched_action: 178 | for a, policy_a in zip(batched_action[k], policy_action[k]): 179 | np.testing.assert_almost_equal(a.numpy(), policy_a.numpy(), decimal=2) 180 | 181 | 182 | 183 | if __name__ == '__main__': 184 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/image_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | """A FiLM Efficientnet contextual image tokenizer used in Robotics Transformer 1. 
21 | """ 22 | 23 | from typing import Optional 24 | import torch 25 | import torch.nn as nn 26 | import torch.nn.functional as F 27 | 28 | from maruya24_rt1.film_efficientnet.pretrained_efficientnet_encoder import ( 29 | EfficientNetEncoder, 30 | ) 31 | from maruya24_rt1.tokenizers.token_learner import TokenLearnerModule 32 | 33 | 34 | class RT1ImageTokenizer(nn.Module): 35 | def __init__( 36 | self, 37 | embedding_output_dim: int = 512, 38 | language_embedding_size: int = 512, 39 | use_token_learner: bool = False, 40 | num_tokens: int = 8, 41 | ): 42 | super().__init__() 43 | self._tokenizer = EfficientNetEncoder( 44 | token_embedding_size=embedding_output_dim, 45 | language_embedding_size=language_embedding_size, 46 | early_film=True, 47 | pooling=False, 48 | ) 49 | 50 | self._use_token_learner = use_token_learner 51 | if self._use_token_learner: 52 | self._num_tokens = num_tokens 53 | self._token_learner = TokenLearnerModule( 54 | inputs_channels=embedding_output_dim, num_tokens=self._num_tokens 55 | ) 56 | 57 | @property 58 | def tokens_per_context_image(self) -> int: 59 | if self._use_token_learner: 60 | num_tokens = self._num_tokens 61 | else: 62 | num_tokens = 100 63 | return num_tokens 64 | 65 | # Note that context is the same value along with time axis. 66 | # This means (b, 0, embedding_dim) == (b, 1, embedding_dim) == (b, 2, embedding_dim) ... 67 | def forward( 68 | self, image: torch.Tensor, context: Optional[torch.Tensor] = None 69 | ) -> torch.Tensor: 70 | """Gets image tokens. 71 | 72 | Args: 73 | image: Images of shape (b, t, 3, h, w) to tokenize. 74 | context: An optional context vector (e.g., a natural language embedding). 75 | Expected to have shape (b, t, embedding_dim). 76 | training: Whether or not we are in training mode. 77 | 78 | Returns: 79 | tokens: has shape (batch, t, num_tokens_per_timestep, embedding_dim) 80 | """ 81 | b, t, c, h, w = image.shape 82 | 83 | # Fold the time axis into the batch axis. 84 | image = image.view(b * t, c, h, w) 85 | if context is not None: 86 | context = context.view(b * t, -1) 87 | 88 | tokens = self._tokenizer(image, context=context) # [b * t, 512 , 10, 10] 89 | 90 | if self._use_token_learner: 91 | tokens = self._token_learner(tokens) # [b * t, num_token, 512] 92 | # Unflatten the time axis, which was previously flattened into the batch. 93 | tokens = tokens.view(b, t, tokens.shape[1], -1) 94 | return tokens # [b, t, num_token, 512] 95 | else: 96 | # Unflatten the time axis, which was previously flattened into the batch. 97 | tokens = tokens.view(b, t, 512, -1) # [b, t, 512 , 10 * 10] 98 | # If you don't use token learner, the number of token is 100. 99 | tokens = tokens.transpose(2, 3) # [b, t, 10 * 10, 512] 100 | return tokens 101 | -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/image_tokenizer_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import unittest 21 | import torch 22 | from pytorch_robotics_transformer.tokenizers import image_tokenizer 23 | from absl.testing import parameterized 24 | 25 | # Run this command above the robotics_transformer_pytorch directory 26 | # python -m pytorch.robotics_transformer.tokenizers.image_tokenizer_test 27 | 28 | class ImageTokenizerTest(parameterized.TestCase, unittest.TestCase): 29 | 30 | @parameterized.named_parameters( 31 | ('sample_image', 300, False, 8), 32 | ('sample_image_token_learner', 300, True, 8)) 33 | def testTokenize(self, image_resolution, use_token_learner, num_tokens): 34 | batch = 1 35 | seq = 2 36 | tokenizer = image_tokenizer.RT1ImageTokenizer( 37 | use_token_learner=use_token_learner, 38 | num_tokens=num_tokens) 39 | image = torch.randn(batch, seq, 3,image_resolution, image_resolution) 40 | image = torch.clamp(image, min=0, max=1) 41 | context_vector = torch.rand(batch, seq, 512) 42 | image_tokens = tokenizer(image, context_vector) # [b, t, num_token, 512] or [b, t, 10 * 10, 512] 43 | if use_token_learner: 44 | self.assertEqual(list(image_tokens.shape), [batch, seq, num_tokens, 512]) 45 | else: 46 | self.assertEqual(list(image_tokens.shape), [batch, seq, 100, 512]) 47 | 48 | if __name__ == '__main__': 49 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/token_learner.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 
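
Editor's aside (not part of token_learner.py below): the image tokenizer test above only asserts output shapes, so here is a minimal standalone sketch of the two shape contracts of RT1ImageTokenizer from maruya24_rt1/tokenizers/image_tokenizer.py. With the token learner enabled the output is (b, t, num_tokens, 512); without it the 10x10 EfficientNet feature map is flattened into 100 tokens. This assumes the package is importable and the EfficientNet checkpoints under film_efficientnet/efficientnet_checkpoints are available.

import torch
from maruya24_rt1.tokenizers.image_tokenizer import RT1ImageTokenizer

b, t = 1, 2
image = torch.rand(b, t, 3, 300, 300)    # pixel values in [0, 1]
context = torch.rand(b, t, 512)          # e.g. a language embedding per timestep

with_tl = RT1ImageTokenizer(use_token_learner=True, num_tokens=8)
print(with_tl(image, context).shape)     # torch.Size([1, 2, 8, 512])

without_tl = RT1ImageTokenizer(use_token_learner=False)
print(without_tl(image, context).shape)  # torch.Size([1, 2, 100, 512])
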
19 | 20 | """PyTorch implementation of Token Learner(Ryoo et al 2021).""" 21 | 22 | import torch 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | 26 | 27 | class TokenLearnerModule(nn.Module): 28 | def __init__( 29 | self, 30 | inputs_channels: int, 31 | num_tokens: int, 32 | bottleneck_dim: int = 64, 33 | dropout_rate: float = 0.0, 34 | ): 35 | super().__init__() 36 | 37 | self.layerNorm = nn.LayerNorm(inputs_channels) 38 | 39 | self.conv1 = nn.Conv2d( 40 | in_channels=inputs_channels, 41 | out_channels=bottleneck_dim, 42 | kernel_size=1, 43 | stride=1, 44 | padding=0, 45 | ) 46 | 47 | self.gelu1 = nn.GELU(approximate="tanh") 48 | 49 | if dropout_rate > 0: 50 | self.dropout1 = nn.Dropout2d(dropout_rate) 51 | else: 52 | self.dropout1 = nn.Identity() 53 | 54 | self.conv2 = nn.Conv2d( 55 | in_channels=bottleneck_dim, 56 | out_channels=num_tokens, 57 | kernel_size=1, 58 | stride=1, 59 | padding=0, 60 | ) 61 | 62 | if dropout_rate > 0: 63 | self.dropout2 = nn.Dropout2d(dropout_rate) 64 | else: 65 | self.dropout2 = nn.Identity() 66 | 67 | # inputs: [bs, c, h, w] or [bs * seq, c, h, w] 68 | # seq is time-series length such as frame 69 | def forward(self, inputs: torch.Tensor): 70 | # layer norm 71 | x = self.layerNorm(inputs.permute(0, 2, 3, 1)) 72 | x = x.permute(0, 3, 1, 2) 73 | 74 | # create weights map 75 | x = self.gelu1(self.conv1(x)) 76 | x = self.dropout1(x) 77 | x = self.conv2(x) 78 | x = self.dropout2(x) # (bs, num_tokens, h, w) 79 | 80 | x = x.view(x.shape[0], x.shape[1], -1) # (bs, num_tokens, h*w) 81 | weights_maps = F.softmax(x, dim=-1) 82 | 83 | # create tokens 84 | bs, c, h, w = inputs.shape 85 | inputs = inputs.permute(0, 2, 3, 1).view(bs, h * w, c) 86 | 87 | tokens = torch.bmm(weights_maps, inputs) 88 | # weighs_maps: [bs, n_token, h*w] 89 | # inputs: [bs, h * w, c] 90 | # tokens: [bs, n_token, c] 91 | 92 | # Above computation is equivalent to below explanation. 93 | # weights_maps has n_tokens channels. each channels is a weight map with the size of (h, w). 94 | # inputs has c channels, each channels size is (h, w). 95 | # compute the element-wise product of one weight map of weights_maps and one of channels of inputs 96 | # That result in (H, W). After sum this, we get a scalar. 97 | # Iterate this operation to all other channels of inputs using the same weight map, we get c scalar. 98 | # reshape (1, 1, c), then we get a learned token, as shown in Fig. 1 in tokenlearner paper. 99 | # We do the computation using all other weight map, then we get all tokens. 100 | return tokens 101 | -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/token_learner_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import unittest 21 | import torch 22 | from maruya24_rt1.tokenizers.token_learner import TokenLearnerModule 23 | 24 | # Type this command on your terminal above the maruya24_rt1 directory. 25 | # python -m maruya24_rt1.tokenizers.token_learner_test 26 | 27 | class TokenLearnerTest(unittest.TestCase): 28 | def testTokenLearner(self, embedding_dim=512, num_tokens=8): 29 | batch = 1 30 | seq = 2 31 | token_learner_layer = TokenLearnerModule( 32 | inputs_channels=embedding_dim, 33 | num_tokens=num_tokens) 34 | # seq is time-series length 35 | # embedding_dim = channels 36 | inputvec = torch.randn((batch * seq, embedding_dim, 10, 10)) 37 | 38 | learnedtokens = token_learner_layer(inputvec) 39 | self.assertEqual(list(learnedtokens.shape), [batch * seq, num_tokens, embedding_dim]) 40 | 41 | if __name__ == '__main__': 42 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union 2 | import numpy as np 3 | from gym import spaces 4 | import torch 5 | 6 | # This function will turn the space into a batched space and return a batched action sample. 7 | # The output format is compatible with OpenAI gym's Vectorized Environments. 8 | def batched_space_sampler(space: spaces.Dict, batch_size: int): 9 | batched_sample : Dict[str, np.ndarray] = {} 10 | samples = [space.sample() for _ in range(batch_size)] # list of dicts 11 | for key in samples[0].keys(): 12 | value_list = [] 13 | for i in range(batch_size): 14 | value_list.append(samples[i][key]) 15 | value_list = np.stack(value_list, axis=0) 16 | batched_sample[key] = value_list 17 | return batched_sample 18 | 19 | # This function turns all dict values into tensors. 20 | def np_to_tensor(sample_dict: Dict[str, Union[int,np.ndarray]]) -> Dict[str, torch.Tensor]: 21 | new_dict = {} 22 | for key, value in sample_dict.items(): 23 | value = torch.tensor(value) 24 | new_dict[key] = value 25 | 26 | return new_dict -------------------------------------------------------------------------------- /maruya24_rt1/transformer.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer].
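
Editor's aside (not part of transformer.py below): the long comment at the end of TokenLearnerModule.forward above describes the torch.bmm call as a spatially weighted sum over the input feature map. The small sketch below checks that reading against an explicit einsum on a toy tensor; the sizes are arbitrary.

import torch

bs, c, h, w, n_tokens = 2, 5, 4, 4, 3
inputs = torch.randn(bs, c, h, w)
# One softmax-normalized weight map of size h*w per learned token.
weights_maps = torch.softmax(torch.randn(bs, n_tokens, h * w), dim=-1)

flat = inputs.permute(0, 2, 3, 1).reshape(bs, h * w, c)   # (bs, h*w, c)
tokens_bmm = torch.bmm(weights_maps, flat)                # (bs, n_tokens, c)

# The same computation written as an explicit weighted sum over spatial positions.
tokens_explicit = torch.einsum("bns,bsc->bnc", weights_maps, flat)
print(torch.allclose(tokens_bmm, tokens_explicit, atol=1e-6))  # True
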
19 | 20 | import torch 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | import math 24 | from typing import Optional, Tuple, Union, List 25 | 26 | # This implementation is similar to tf.keras.layers.MultiHeadAttention, not torch.nn.MultiheadAttention. 27 | # This can be used in the situation where query = key = value. 28 | # In RT-1 we don't set value_dim. Therefore, values_dim = key_dim. 29 | class TF_MultiHeadAttention(nn.Module): 30 | def __init__(self, 31 | heads: int, 32 | d_model: int, 33 | key_dim: int, 34 | value_dim: Optional[int] = None, 35 | dropout: float = 0.1, 36 | return_attention_scores: bool = False): 37 | super().__init__() 38 | 39 | self.d_model = d_model 40 | self.h = heads 41 | self.key_dim = key_dim 42 | self.value_dim = value_dim if value_dim else key_dim # if value_dim is None, value_dim will be key_dim. 43 | self.return_attention_scores = return_attention_scores 44 | 45 | self.q_linear = nn.Linear(d_model, self.h * self.key_dim) 46 | self.k_linear = nn.Linear(d_model, self.h * self.key_dim) 47 | self.v_linear = nn.Linear(d_model, self.h * self.value_dim) 48 | self.dropout = nn.Dropout(dropout) 49 | self.out = nn.Linear(self.h * self.value_dim, d_model) 50 | 51 | def forward(self, q, k, v, mask=None): 52 | 53 | bs = q.size(0) 54 | 55 | # perform linear operation and split into h heads 56 | 57 | k = self.k_linear(k).view(bs, -1, self.h, self.key_dim) 58 | q = self.q_linear(q).view(bs, -1, self.h, self.key_dim) 59 | v = self.v_linear(v).view(bs, -1, self.h, self.value_dim) 60 | 61 | # transpose to get dimensions bs * h * sl * key_dim or bs * h * sl * value_dim 62 | 63 | k = k.transpose(1,2) 64 | q = q.transpose(1,2) 65 | v = v.transpose(1,2) # calculate attention using function we will define next 66 | 67 | if self.return_attention_scores: 68 | attention_output, score = self.attention(q, k, v, self.key_dim, mask, self.dropout, self.return_attention_scores) # attention_output: (bs, h, sl, value_dim), score: (bs, h, sl, sl) 69 | else: 70 | attention_output = self.attention(q, k, v, self.key_dim, mask, self.dropout, self.return_attention_scores) # (bs, h, sl, value_dim) 71 | 72 | # concatenate heads and put through final linear layer 73 | concat = attention_output.transpose(1,2).contiguous().view(bs, -1, self.h * self.value_dim) # (bs, sl, heads * value_dim) 74 | output = self.out(concat) # (bs, sl, d_model) 75 | 76 | if self.return_attention_scores: 77 | return output, score 78 | else: 79 | return output 80 | 81 | 82 | def attention(self, q, k, v, key_dim, mask=None, dropout=None, return_attention_scores=False): 83 | 84 | scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(key_dim) 85 | # q: (bs, h, sl, key_dim) 86 | # k.transpose(-2, -1) : (bs, h, key_dim, sl) 87 | # score: (bs, h, sl, sl) 88 | 89 | if mask is not None: 90 | # mask: (sl, sl) 91 | mask = mask.unsqueeze(0).unsqueeze(1).to(scores.device) 92 | scores = scores.masked_fill(mask == 0, -1e9) 93 | 94 | scores = F.softmax(scores, dim=-1) 95 | 96 | 97 | if dropout is not None: 98 | scores = dropout(scores) 99 | 100 | 101 | output = torch.matmul(scores, v) 102 | # score: (bs, h, sl, sl) 103 | # v : (bs, h, sl, value_dim) 104 | # output: (bs, h, sl, value_dim) 105 | 106 | if return_attention_scores: 107 | return output, scores 108 | else: 109 | return output 110 | 111 | # input_size and output_size: (bs, sl, feed_forward_size) 112 | class _TransformerLayer(nn.Module): 113 | """A single transformer block.""" 114 | def __init__(self, 115 | layer_size: int = 4096, # This corresponds to 
key_dim which is the size of each attention head for query, key and values. 116 | num_heads: int = 8, 117 | feed_forward_size: int = 512, # This corresponds to d_model which is the embedding dimension of each token in the transformer part. 118 | dropout_rate: float = 0.1, 119 | return_attention_scores: bool = False): 120 | 121 | super().__init__() 122 | self._return_attention_scores = return_attention_scores 123 | 124 | self.norm_1 = nn.LayerNorm(feed_forward_size) 125 | self.attn = TF_MultiHeadAttention(num_heads,feed_forward_size, layer_size, dropout=dropout_rate, return_attention_scores=return_attention_scores) 126 | self.ff = nn.Linear(feed_forward_size, feed_forward_size) 127 | self.norm_2 = nn.LayerNorm(feed_forward_size) 128 | self.dropout_1 = nn.Dropout(dropout_rate) 129 | 130 | def forward(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: 131 | x1 = self.norm_1(x) 132 | attn_results = self.attn(x1, x1, x1, mask=mask) 133 | if self._return_attention_scores: 134 | x1, score = attn_results 135 | else: 136 | x1, score = attn_results, None 137 | x = x + x1 138 | 139 | y = self.norm_2(x) 140 | ff_y = self.ff(y) 141 | ff_y = self.dropout_1(ff_y) 142 | x = x + ff_y 143 | 144 | return x, score 145 | 146 | class Transformer(nn.Module): 147 | def __init__(self, 148 | num_layers: int = 1, # Number of transformer layers. 149 | layer_size: int = 4096, # This corresponds to key_dim which is the size of each attention head for query, key and values. 150 | num_heads: int = 8, 151 | feed_forward_size: int = 512, # This corresponds to d_model which is the embedding dimension of each token in the transformer part. 152 | dropout_rate: float = 0.1, 153 | vocab_size: int = 256, # Dimensionality of tokens from the output layer. This is also dimensionality of tokens from the input layer. 154 | input_token_emb_dim: int = 512, # embedding dim of input tokens. 155 | return_attention_scores: bool = False, 156 | max_seq_len: int = 256, # Maximum sequence length. This Transformer can't receive tokens that are more than this number. 157 | ): 158 | super(Transformer, self).__init__() 159 | 160 | self._layers = nn.ModuleList([ 161 | _TransformerLayer( # pylint: disable=g-complex-comprehension 162 | layer_size=layer_size, 163 | num_heads=num_heads, 164 | feed_forward_size=feed_forward_size, 165 | dropout_rate=dropout_rate, 166 | return_attention_scores=return_attention_scores) 167 | for _ in range(num_layers) 168 | ]) 169 | 170 | self._token_emb = nn.Linear(input_token_emb_dim, feed_forward_size) 171 | self._position_emb = nn.Embedding(max_seq_len, feed_forward_size) # <--- Needs some rework; should only this part be made BERT-like? 172 | self._output_tokens = nn.Linear(feed_forward_size, vocab_size) 173 | 174 | # inputs: (bs, seq, emb_dim). emb_dim = vocab_size 175 | def forward(self, inputs: torch.Tensor, attention_mask: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]: 176 | batch_size = inputs.shape[0] 177 | seq_len = inputs.shape[1] 178 | 179 | # 1. Token Embeddings 180 | tokens_embeddings = self._token_emb(inputs) # (bs, seq_len, feed_forward_size) 181 | 182 | # 2.
Transformer Positional Embedding: 183 | position_ids = torch.arange(seq_len, dtype=torch.long, device=inputs.device) 184 | position_ids = torch.tile(position_ids.unsqueeze(0), dims=(batch_size, 1)) # (bs, seq_len) 185 | position_embeddings = self._position_emb(position_ids) # (bs, seq_len, feed_forward_size) 186 | 187 | # Add the two embedded tensors together 188 | x = tokens_embeddings + position_embeddings # (bs, seq_len, feed_forward_size) 189 | 190 | scores = [] 191 | 192 | for layer in self._layers: 193 | x, score = layer(x, mask=attention_mask) 194 | if score is not None: 195 | scores.append(score) 196 | x = self._output_tokens(x) # (bs, seq_len, vocab_size) 197 | return x, scores -------------------------------------------------------------------------------- /maruya24_rt1/transformer_network_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 
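
Editor's aside (not part of the test file below): transformer.py above applies the attention mask inside TF_MultiHeadAttention via masked_fill(mask == 0, -1e9), and the causality test further down relies on that behavior. Here is a minimal sketch of driving the Transformer directly with a lower-triangular (causal) mask; the hyperparameters are arbitrary, and scores is an empty list unless return_attention_scores=True.

import torch
from maruya24_rt1.transformer import Transformer

bs, seq_len, vocab_size = 2, 12, 256
net = Transformer(num_layers=2, layer_size=128, num_heads=4,
                  feed_forward_size=64, dropout_rate=0.1,
                  vocab_size=vocab_size, input_token_emb_dim=vocab_size,
                  max_seq_len=seq_len)

tokens = torch.rand(bs, seq_len, vocab_size)
# 1 on and below the diagonal; positions where the mask is 0 (the future) are masked out.
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long))

output_tokens, scores = net(tokens, attention_mask=causal_mask)
print(output_tokens.shape)  # torch.Size([2, 12, 256])
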
19 | 20 | 21 | """Tests for networks.""" 22 | 23 | import torch 24 | import torch.nn.functional as F 25 | from absl.testing import parameterized 26 | import unittest 27 | import numpy as np 28 | from typing import Dict 29 | import sys 30 | sys.path.append("/home/io011/workspace/robotics_transformer/") 31 | import transformer_network 32 | from transformer_network_test_set_up import BATCH_SIZE 33 | from transformer_network_test_set_up import NAME_TO_INF_OBSERVATIONS 34 | from transformer_network_test_set_up import NAME_TO_STATE_SPACES 35 | from transformer_network_test_set_up import observations_list 36 | from transformer_network_test_set_up import space_names_list 37 | from transformer_network_test_set_up import state_space_list 38 | from transformer_network_test_set_up import TIME_SEQUENCE_LENGTH 39 | from transformer_network_test_set_up import TransformerNetworkTestUtils 40 | from tokenizers.utils import batched_space_sampler 41 | from tokenizers.utils import np_to_tensor 42 | 43 | 44 | class TransformerNetworkTest(TransformerNetworkTestUtils): 45 | @parameterized.named_parameters([{ 46 | 'testcase_name': '_' + name, 47 | 'state_space': spec, 48 | 'train_observation': obs, 49 | } for (name, spec, obs) in zip(space_names_list(), state_space_list(), observations_list())]) 50 | def testTransformerTrainLossCall(self, state_space, train_observation): 51 | network = transformer_network.TransformerNetwork( 52 | vocab_size=256, 53 | token_embedding_size=512, 54 | num_layers=8, 55 | layer_size=128, 56 | num_heads=8, 57 | feed_forward_size=512, 58 | dropout_rate=0.1, 59 | crop_size=236, 60 | input_tensor_space=state_space, 61 | output_tensor_space=self._action_space, 62 | time_sequence_length=TIME_SEQUENCE_LENGTH, 63 | use_token_learner=True) 64 | 65 | 66 | network.set_actions(self._train_action) 67 | 68 | network_state = batched_space_sampler(network._state_space, batch_size=BATCH_SIZE) 69 | network_state = np_to_tensor(network_state) # change np.ndarray type of sample values into tensor type 70 | 71 | output_actions, network_state = network( 72 | train_observation, network_state=network_state) 73 | 74 | expected_shape = [2, 3] 75 | 76 | self.assertEqual(list(network.get_actor_loss().shape), expected_shape) 77 | 78 | self.assertCountEqual(self._train_action.keys(), output_actions.keys()) 79 | 80 | @parameterized.named_parameters([{ 81 | 'testcase_name': '_' + name, 82 | 'space_name': name, 83 | } for name in space_names_list()]) 84 | def testTransformerInferenceLossCall(self, space_name): 85 | state_space = NAME_TO_STATE_SPACES[space_name] 86 | observation = NAME_TO_INF_OBSERVATIONS[space_name] # observation has no time dimension unlike during training. 87 | 88 | network = transformer_network.TransformerNetwork( 89 | input_tensor_space=state_space, 90 | #Dict('image': Box(0.0, 1.0, (3, 256, 320), float32), 'natural_language_embedding': Box(-inf, inf, (512,), float32)) 91 | output_tensor_space=self._action_space, 92 | # Dict('terminate_episode': Discrete(2), 'world_vector': Box(-1.0, 1.0, (3,), float32), 'rotation_delta': Box(-1.5707964, 1.5707964, (3,), float32), 'gripper_closedness_action': Box(-1.0, 1.0, (1,), float32)) 93 | time_sequence_length=TIME_SEQUENCE_LENGTH) 94 | 95 | network.set_actions(self._inference_action) # self._inference_action has no time dimension unlike self._train_action. 
96 | # inference currently only support batch size of 1 97 | network_state = batched_space_sampler(network._state_space, batch_size=1) 98 | network_state = np_to_tensor(network_state) # change np.ndarray type of sample values into tensor type 99 | 100 | output_actions, network_state = network( 101 | observation, network_state=network_state) 102 | # torch.float32 torch.Size([1, 3, 256, 320]) 103 | # {'image': tensor([[[[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 104 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 105 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 106 | # ..., 107 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 108 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 109 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]], 110 | 111 | # [[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 112 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 113 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 114 | # ..., 115 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 116 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 117 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]], 118 | 119 | # [[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 120 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 121 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 122 | # ..., 123 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 124 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 125 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]]]]), 126 | # 127 | # torch.float32 torch.Size([1, 512]) 128 | # 'natural_language_embedding': tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 129 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 130 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 131 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 132 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 133 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 134 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 135 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 136 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 137 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 138 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 139 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 140 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 141 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 142 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 143 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 144 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 145 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 146 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 147 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 148 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 149 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 150 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 151 | # 1., 1., 
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 152 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 153 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 154 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 155 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 156 | # 1., 1., 1., 1., 1., 1., 1., 1.]])} 157 | 158 | # torch.int64 torch.Size([1, 3, 8]) 159 | # {'action_tokens': tensor([[[116, 61, 70, 126, 170, 243, 58, 127], 160 | # [167, 251, 204, 19, 222, 236, 61, 91], 161 | # [254, 249, 204, 14, 64, 162, 53, 248]]]), 162 | # torch.float32 torch.Size([1, 3, 8, 512]) 163 | # 'context_image_tokens': tensor([[[[-0.8647, -1.9947, -0.7335, ..., -1.0252, -0.1364, 0.6359], 164 | # [ 0.1899, 0.1604, -0.4497, ..., -0.3601, 0.0933, -0.6361], 165 | # [ 1.5404, 0.2238, -1.0574, ..., 0.0621, 1.2833, -2.0537], 166 | # ..., 167 | # [ 1.5256, -0.4647, -0.5676, ..., 1.4825, -0.5317, 0.7946], 168 | # [-1.3059, 0.8747, -2.9062, ..., -0.0762, 1.8340, -0.8561], 169 | # [-0.6595, -2.0878, 0.2709, ..., 0.1764, 1.2843, -0.4609]], 170 | 171 | # [[-0.0947, 0.0133, -0.2301, ..., 0.1116, -0.1111, -0.0391], 172 | # [-0.0998, 0.0090, -0.2151, ..., 0.1465, -0.1033, -0.0342], 173 | # [-0.0935, 0.0161, -0.2835, ..., 0.1232, -0.1025, -0.0332], 174 | # ..., 175 | # [-0.1249, -0.0076, -0.2253, ..., 0.1392, -0.1412, -0.0528], 176 | # [-0.1013, -0.0141, -0.2092, ..., 0.1403, -0.1279, -0.0494], 177 | # [-0.0929, 0.0040, -0.1969, ..., 0.1384, -0.1017, -0.0603]], 178 | 179 | # [[-1.8412, -1.0423, -0.4420, ..., 0.8732, -0.1140, -0.1008], 180 | # [ 0.6039, -0.6286, -0.1894, ..., -0.4552, 1.3157, -1.1879], 181 | # [ 1.1295, -0.1596, -0.0862, ..., -1.0804, 1.0614, -0.0249], 182 | # ..., 183 | # [-0.3196, -1.3627, 0.3153, ..., -0.7006, 1.0846, -0.2101], 184 | # [ 2.3243, 1.2046, 1.8313, ..., -1.1140, 0.4493, -0.8400], 185 | # [ 1.1736, -0.3540, -0.6332, ..., 0.0968, 1.5409, 0.7139]]]], 186 | # grad_fn=), 187 | # torch.int64 torch.Size([]) 188 | # 'seq_idx': tensor(2)} 189 | 190 | self.assertEqual(network.get_actor_loss().item(), 0.0) 191 | self.assertCountEqual(self._inference_action.keys(), output_actions.keys()) 192 | 193 | @parameterized.named_parameters([{ 194 | 'testcase_name': '_' + name, 195 | 'state_space': spec, 196 | } for name, spec in zip(space_names_list(), state_space_list())]) 197 | def testTransformerCausality(self, state_space): 198 | network = transformer_network.TransformerNetwork( 199 | input_tensor_space=state_space, 200 | output_tensor_space=self._action_space, 201 | time_sequence_length=TIME_SEQUENCE_LENGTH, 202 | dropout_rate=0.0) 203 | 204 | network.eval() 205 | 206 | time_sequence_length = network._time_sequence_length 207 | tokens_per_image = network._tokens_per_context_image 208 | tokens_per_action = network._tokens_per_action 209 | 210 | # size of all_tokens: (time_sequence_length * (tokens_per_image + tokens_per_action)) 211 | def _split_image_and_action_tokens(all_tokens): 212 | image_start_indices = [(tokens_per_image + tokens_per_action) * k 213 | for k in range(time_sequence_length)] 214 | 215 | image_tokens = torch.stack( 216 | [all_tokens[i:i + tokens_per_image] for i in image_start_indices], 217 | dim=0) 218 | action_start_indices = [i + tokens_per_image for i in image_start_indices] 219 | action_tokens = torch.stack([ 220 | all_tokens[i:i + tokens_per_action] for i in action_start_indices], 221 | dim=0) 222 | 223 | image_tokens = F.one_hot(image_tokens, 
network._token_embedding_size) 224 | # Add batch dimension. 225 | image_tokens = image_tokens.unsqueeze(0) # image_tokens: (1, time_sequence_length, tokens_per_image, emb_dim) 226 | action_tokens = action_tokens.unsqueeze(0) # action: (1, time_sequence_length, tokens_per_action) 227 | 228 | return image_tokens, action_tokens 229 | 230 | # Generate some random tokens for image and actions. 231 | all_tokens = torch.randint(low=0, high=10, size=(time_sequence_length * (tokens_per_image + tokens_per_action),)) 232 | context_image_tokens, action_tokens = _split_image_and_action_tokens(all_tokens) 233 | # Get the output tokens without any zeroed out input tokens. 234 | # output_tokens: (t*num_tokens, vocab_size) 235 | output_tokens = network._transformer_call( 236 | context_image_tokens=context_image_tokens, 237 | action_tokens=action_tokens, 238 | attention_mask=network._default_attention_mask, 239 | batch_size=1)[0] 240 | 241 | for t in range(time_sequence_length * 242 | (tokens_per_image + tokens_per_action)): 243 | # Zero out future input tokens. 244 | all_tokens_at_t = torch.concat([all_tokens[:t + 1], torch.zeros_like(all_tokens[t + 1:])], 0) 245 | context_image_tokens, action_tokens = _split_image_and_action_tokens(all_tokens_at_t) 246 | 247 | # Get the output tokens with zeroed out input tokens after t. 248 | output_tokens_at_t = network._transformer_call( 249 | context_image_tokens=context_image_tokens, 250 | action_tokens=action_tokens, 251 | attention_mask=network._default_attention_mask, 252 | batch_size=1)[0] 253 | 254 | # The output token is unchanged if future input tokens are zeroed out. 255 | np.testing.assert_array_equal(output_tokens[:t + 1].detach().numpy(), output_tokens_at_t[:t + 1].detach().numpy()) 256 | 257 | 258 | if __name__ == '__main__': 259 | unittest.main() 260 | -------------------------------------------------------------------------------- /maruya24_rt1/transformer_network_test_set_up.py: -------------------------------------------------------------------------------- 1 | 2 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 3 | # I have retained the following copyright notice written on it. 4 | 5 | # Copyright 2022 Google LLC 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 20 | 21 | 22 | import copy 23 | from typing import Optional, Tuple, Union, List, Dict 24 | from absl.testing import parameterized 25 | import numpy as np 26 | from gym import spaces 27 | import torch 28 | import unittest 29 | from collections import OrderedDict 30 | 31 | BATCH_SIZE = 3 32 | TIME_SEQUENCE_LENGTH = 3 33 | HEIGHT = 256 34 | WIDTH = 320 35 | NUM_IMAGE_TOKENS = 2 36 | 37 | # For now, we use one type of spaces. 
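
Editor's aside before the space definitions below: the tests above repeatedly call batched_space_sampler and np_to_tensor from maruya24_rt1/tokenizers/utils.py to fabricate a network state. A small sketch of what those helpers return for a toy gym space, assuming gym and torch are installed:

import numpy as np
from collections import OrderedDict
from gym import spaces
from maruya24_rt1.tokenizers.utils import batched_space_sampler, np_to_tensor

toy_space = spaces.Dict(OrderedDict([
    ('terminate_episode', spaces.Discrete(2)),
    ('world_vector', spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)),
]))

batched = batched_space_sampler(toy_space, batch_size=4)  # dict of stacked np.ndarrays
tensors = np_to_tensor(batched)                           # dict of torch.Tensors
print({k: tuple(v.shape) for k, v in tensors.items()})
# e.g. {'terminate_episode': (4,), 'world_vector': (4, 3)}
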
38 | def space_names_list() -> List[str]: 39 | """Lists the different types of spaces accepted by the transformer.""" 40 | return ['default'] 41 | 42 | 43 | def state_space_list() -> List[spaces.Dict]: 44 | """Lists the different types of state spec accepted by the transformer.""" 45 | # This will be input_tensor_spec 46 | state_space = spaces.Dict( 47 | { 48 | 'image': spaces.Box(low=0.0, high=1.0, 49 | shape=(3, HEIGHT, WIDTH), dtype=np.float32), 50 | 'natural_language_embedding': spaces.Box(low=-np.inf, high=np.inf, 51 | shape=[512], dtype=np.float32) 52 | } 53 | ) 54 | 55 | return [state_space] 56 | 57 | def observations_list(training: bool = True) -> List[Dict[str, torch.Tensor]]: 58 | """Lists the different types of observations accepted by the transformer.""" 59 | if training: 60 | image_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, 3, HEIGHT, WIDTH] 61 | emb_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, 512] 62 | else: 63 | # inference currently only support batch size of 1 64 | image_shape = [1, 3, HEIGHT, WIDTH] 65 | emb_shape = [1, 512] 66 | return [ 67 | { 68 | 'image': torch.full(image_shape, 0.5), 69 | 'natural_language_embedding': torch.full(emb_shape, 1.0) 70 | } 71 | ] 72 | 73 | NAME_TO_STATE_SPACES = dict(zip(space_names_list(), state_space_list())) 74 | NAME_TO_OBSERVATIONS = dict(zip(space_names_list(), observations_list())) 75 | NAME_TO_INF_OBSERVATIONS = dict( 76 | zip(space_names_list(), observations_list(False))) 77 | 78 | # This class will be inherited by TransformerNetworkTestUtils in transformer_network_test.py. 79 | class TransformerNetworkTestUtils(parameterized.TestCase, unittest.TestCase): 80 | """Defines spaces, SequenceAgent, and various other testing utilities.""" 81 | 82 | def _define_spaces(self, 83 | train_batch_size=BATCH_SIZE, 84 | inference_batch_size=1, 85 | time_sequence_length=TIME_SEQUENCE_LENGTH, 86 | inference_sequence_length=TIME_SEQUENCE_LENGTH, 87 | token_embedding_size=512, 88 | image_width=WIDTH, 89 | image_height=HEIGHT): 90 | """Defines spaces and observations (both training and inference).""" 91 | 92 | self.train_batch_size = train_batch_size 93 | self.inference_batch_size = inference_batch_size 94 | self.time_sequence_length = time_sequence_length 95 | self.inference_sequence_length = inference_sequence_length 96 | self.token_embedding_size = token_embedding_size 97 | 98 | # action space need to keep order of actions so that it can tokenize and detokenize actions correctly. 99 | # So we define action_space with OrderedDict. 
100 | action_space = spaces.Dict( 101 | OrderedDict([ 102 | ('terminate_episode', spaces.Discrete(2)), 103 | ('world_vector', spaces.Box(low= -1.0, high= 1.0, shape=(3,), dtype=np.float32)), 104 | ('rotation_delta', spaces.Box(low= -np.pi / 2, high= np.pi / 2, shape=(3,), dtype=np.float32)), 105 | ('gripper_closedness_action', spaces.Box(low= -1.0 , high= 1.0, shape=(1,), dtype=np.float32)) 106 | ]) 107 | ) 108 | 109 | state_space = spaces.Dict( 110 | { 111 | 'image': spaces.Box(low=0.0, high=1.0, 112 | shape=(3, image_height, image_width), dtype=np.float32), 113 | 'natural_language_embedding': spaces.Box(low=-np.inf, high=np.inf, 114 | shape=[self.token_embedding_size], dtype=np.float32) 115 | } 116 | ) 117 | 118 | self._policy_info_space = { 119 | 'return': spaces.Box(low=0.0, high=1.0, shape=(),dtype=np.float32), 120 | 'discounted_return': spaces.Box(low=0.0, high=1.0, shape=(),dtype=np.float32) 121 | } 122 | self._state_space = state_space 123 | self._action_space = action_space 124 | 125 | self._inference_observation = { 126 | 'image': 127 | torch.full([self.inference_batch_size, 3, image_height, image_width], 1.0), 128 | 'natural_language_embedding': 129 | torch.full([self.inference_batch_size, self.token_embedding_size], 1.0), 130 | } 131 | 132 | self._train_observation = { 133 | 'image': 134 | torch.full( 135 | [self.train_batch_size, self.time_sequence_length, 3, image_height, image_width], 0.5), 136 | 'natural_language_embedding': 137 | torch.full( 138 | [self.train_batch_size, self.time_sequence_length, self.token_embedding_size], 1.) 139 | } 140 | self._inference_action = { 141 | 'world_vector': 142 | torch.full([self.inference_batch_size, 3], 0.5), 143 | 'rotation_delta': 144 | torch.full([self.inference_batch_size, 3], 0.5), 145 | 'terminate_episode': 146 | torch.full([self.inference_batch_size], 1), 147 | 'gripper_closedness_action': 148 | torch.full([self.inference_batch_size, 1], 0.5), 149 | } 150 | 151 | self._train_action = { 152 | 'world_vector': 153 | torch.full([self.train_batch_size, self.time_sequence_length, 3], 0.5), 154 | 'rotation_delta': 155 | torch.full([self.train_batch_size, self.time_sequence_length, 3], 0.5), 156 | 'terminate_episode': 157 | torch.full([self.train_batch_size, self.time_sequence_length], 1), 158 | 'gripper_closedness_action': 159 | torch.full([self.train_batch_size, self.time_sequence_length, 1], 0.5), 160 | } 161 | 162 | def setUp(self): 163 | self._define_spaces() 164 | super().setUp() 165 | 166 | -------------------------------------------------------------------------------- /maruya24_rt1/transformer_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
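
Editor's aside: the OrderedDict action space defined just above is what RT1ActionTokenizer discretizes in the earlier action_tokenizer_test.py. The expected values there ([4, 6, 0] for [0.1, 0.5, -0.8] with vocab_size=10, and 0 / vocab_size-1 at the box limits) are consistent with plain uniform bucketing. The helper below is a hypothetical re-implementation for illustration only, not the tokenizer's actual API.

import numpy as np

def tokenize_uniform(a, low, high, vocab_size):
    """Uniform bucketing sketch: clip, rescale to [0, 1], truncate to an integer token."""
    a = np.clip(np.asarray(a, dtype=np.float32), low, high)
    scaled = (a - low) / (high - low)
    return (scaled * (vocab_size - 1)).astype(np.int32)

print(tokenize_uniform([0.1, 0.5, -0.8], -1.0, 1.0, 10))  # [4 6 0]
print(tokenize_uniform([-1.0, 1.0], -1.0, 1.0, 10))       # [0 9]
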
17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import unittest 21 | import torch 22 | 23 | from maruya24_rt1.transformer import Transformer 24 | from absl.testing import parameterized 25 | 26 | class TransformerTest(parameterized.TestCase, unittest.TestCase): 27 | 28 | def setUp(self): 29 | self._vocab_size = 10 30 | batch_size = 8 31 | sequence_len = 12 32 | self._tokens = torch.rand((batch_size, sequence_len, self._vocab_size)) 33 | 34 | @parameterized.parameters(True, False) 35 | def test_transformer_forwardpass(self, return_attention_scores): 36 | network = Transformer( 37 | num_layers=2, 38 | layer_size=512, 39 | num_heads=4, 40 | feed_forward_size=256, 41 | dropout_rate=0.1, 42 | vocab_size=self._vocab_size, 43 | input_token_emb_dim=self._vocab_size, 44 | return_attention_scores=return_attention_scores, 45 | max_seq_len=15) 46 | 47 | output_tokens, attention_scores = network(self._tokens, attention_mask=None) 48 | self.assertSequenceEqual(self._tokens.shape, output_tokens.shape) 49 | if return_attention_scores: 50 | self.assertNotEmpty(attention_scores) 51 | else: 52 | self.assertEmpty(attention_scores) 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | unittest.main() -------------------------------------------------------------------------------- /multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Assigning the Python script name to a variable 4 | python_script="IO_evaluator.py" 5 | 6 | # Getting input arguments from the command line 7 | epoch=$1 8 | task="pick" 9 | ckpt_dir=$2 10 | model_name=$3 11 | gpu_name=$4 12 | cam_view=$5 13 | using_pos_and_orn=$6 14 | num_threads=$7 15 | initial_c_value=0 16 | override_exist=1 17 | 18 | # Function to run the Python script with specified arguments 19 | run_python_script() { 20 | local c_value=$1 21 | python $python_script -t $task -d $ckpt_dir -m $model_name -c $cam_view -e $epoch -g $gpu_name -u $using_pos_and_orn -p $c_value -n $num_threads 22 | } 23 | 24 | # Function to handle cleanup when Ctrl+C is detected 25 | cleanup() { 26 | echo "Ctrl+C detected. Killing all threads..." 
27 | kill 0 28 | exit 1 29 | } 30 | 31 | # Function to run multiple threads of the Python script 32 | run_threads() { 33 | local num_threads=$1 34 | local c_value=$2 35 | local increment=1 36 | 37 | # Trap Ctrl+C to call the cleanup function 38 | trap cleanup INT 39 | 40 | # Run the Python script in multiple threads 41 | for ((i=0; i None: 10 | self.arm_dof = None 11 | self.ee_index = None 12 | self.finger_index = None 13 | self.home_j_pos = None 14 | self.gripper_range = None 15 | self.is_hand = False 16 | self.home_ee_pose = ((-0.0395, 0.415, 1.145), (1, 0, 0, 0)) 17 | self.grasp_force_threshold = None 18 | 19 | def load(self, urdf_path, base_pos, base_ori): 20 | self.robot_id = p.loadURDF( 21 | urdf_path, 22 | base_pos, 23 | base_ori, 24 | useFixedBase=True, 25 | flags=p.URDF_ENABLE_CACHED_GRAPHICS_SHAPES, 26 | ) 27 | 28 | def calc_ik(self, pose): 29 | return list( 30 | p.calculateInverseKinematics( 31 | self.robot_id, 32 | self.ee_index, 33 | pose[0], 34 | pose[1], 35 | [-7] * 7, 36 | [7] * 7, 37 | [7] * 7, 38 | self.home_j_pos, 39 | maxNumIterations=100, 40 | solver=p.IK_DLS, 41 | ) 42 | ) 43 | 44 | def reset_j_home(self, random_home=False): 45 | if random_home: 46 | for i in range(self.arm_dof): 47 | self.home_j_pos[i] += random.uniform(-np.pi / 10, np.pi / 10) 48 | # self.home_j_pos = [1.22, -0.458, 0.31, -3.0, 0.20, 4* math.pi / 5, 0, 0.04, 0.04] 49 | self.reset_j_pos(self.home_j_pos) 50 | 51 | def reset_j_pos(self, j_pos): 52 | index = 0 53 | for j in range(p.getNumJoints(self.robot_id)): 54 | p.changeDynamics(self.robot_id, j, linearDamping=0, angularDamping=0) 55 | joint_type = p.getJointInfo(self.robot_id, j)[2] 56 | if joint_type in [ 57 | p.JOINT_PRISMATIC, 58 | p.JOINT_REVOLUTE, 59 | ]: 60 | p.resetJointState(self.robot_id, j, j_pos[index]) 61 | index = index + 1 62 | 63 | def move_arm(self, arm_pos, max_vel, force): 64 | for i in range(self.arm_dof): 65 | p.setJointMotorControl2( 66 | self.robot_id, 67 | i, 68 | p.POSITION_CONTROL, 69 | arm_pos[i], 70 | maxVelocity=max_vel[i], 71 | force=force[i], 72 | ) 73 | 74 | def move_gripper(self, gripper_pos, max_vel, force): 75 | for i, j_idx in enumerate(self.finger_index): 76 | p.setJointMotorControl2( 77 | self.robot_id, 78 | j_idx, 79 | p.POSITION_CONTROL, 80 | gripper_pos[i], 81 | maxVelocity=max_vel[i], 82 | force=force[i], 83 | ) 84 | 85 | def move_j(self): 86 | raise NotImplementedError 87 | 88 | def is_j_arrived(self, j_pos, include_finger=True, threshold=1e-2): 89 | cur_joint_position = [ 90 | s[0] 91 | for s in p.getJointStates( 92 | self.robot_id, list(range(self.arm_dof)) + self.finger_index 93 | ) 94 | ] 95 | diff_arm = np.abs( 96 | np.array(cur_joint_position) 97 | - np.array(j_pos)[: self.arm_dof + len(self.finger_index)] 98 | ) 99 | is_arrive = np.all(diff_arm[: self.arm_dof - 1] <= threshold) 100 | # if include_finger: 101 | # is_arrive = is_arrive and np.all( 102 | # diff_arm[-len(self.finger_index) :] <= threshold 103 | # ) 104 | return is_arrive 105 | 106 | def is_ee_arrived(self, ee_pose, tar_obj_id=None, threshold=2 * 1e-2): 107 | robot_ee_pos = np.array(p.getLinkState(self.robot_id, self.ee_index)[0]) 108 | diff_pos = np.abs(robot_ee_pos - ee_pose[0]) 109 | is_arrive = np.all(diff_pos <= threshold) 110 | if tar_obj_id != None: 111 | is_arrive = ( 112 | is_arrive 113 | and p.getClosestPoints(self.robot_id, tar_obj_id, 1e-5) 114 | and p.getContactPoints(self.robot_id, tar_obj_id) 115 | and p.getContactPoints(self.robot_id, tar_obj_id)[0][9] 116 | > self.grasp_force_threshold 117 | ) 118 | # 
p.getContactPoints(self.robot_id, tar_obj_id) 119 | return is_arrive 120 | 121 | 122 | class UR5(RobotBase): 123 | def __init__(self): 124 | super().__init__() 125 | self.arm_dof = 6 126 | self.ee_index = 18 127 | self.finger_index = [8] 128 | self.home_j_pos = [1.40, -1.58, 0.94, -0.93, -1.57, -1.74] + [0.0] * 6 129 | self.gripper_range = [0, 0.085] 130 | self.grasp_force_threshold = 30 131 | 132 | def load(self): 133 | super().load( 134 | "urdf/ur5_robotiq/urdf/ur5_robotiq_85.urdf", [0, 0, 0.62], [0, 0, 0, 1] 135 | ) 136 | finger_child_multiplier = {10: -1, 12: 1, 13: 1, 15: -1, 17: 1} 137 | for joint_id, multiplier in finger_child_multiplier.items(): 138 | c = p.createConstraint( 139 | self.robot_id, 140 | self.finger_index[0], 141 | self.robot_id, 142 | joint_id, 143 | jointType=p.JOINT_GEAR, 144 | jointAxis=[0, 1, 0], 145 | parentFramePosition=[0, 0, 0], 146 | childFramePosition=[0, 0, 0], 147 | ) 148 | p.changeConstraint( 149 | c, gearRatio=-multiplier, maxForce=100, erp=1 150 | ) # Note: the mysterious `erp` is of EXTREME importance 151 | 152 | def move_gripper(self, gripper_state): 153 | # open_length = np.clip(open_length, *self.gripper_range) 154 | open_length = ( 155 | gripper_state * (self.gripper_range[1] - self.gripper_range[0]) 156 | + self.gripper_range[0] 157 | ) 158 | open_angle = 0.715 - math.asin((open_length - 0.010) / 0.1143) 159 | # angle calculation 160 | super().move_gripper([open_angle], max_vel=[4], force=[1000]) 161 | 162 | def move_arm(self, j_pos): 163 | super().move_arm( 164 | j_pos[: self.arm_dof], 165 | [1] * self.arm_dof, 166 | [5 * 240.0] * self.arm_dof, 167 | ) 168 | 169 | 170 | class Panda(RobotBase): 171 | def __init__(self): 172 | super().__init__() 173 | self.arm_dof = 7 174 | self.ee_index = 11 175 | self.finger_index = [9, 10] 176 | self.home_j_pos = [1.22, -0.458, 0.31, -2.0, 0.20, 1.56, 2.32, 0.04, 0.04] 177 | self.gripper_range = [0.01, 0.04] 178 | self.grasp_force_threshold = 2 179 | 180 | def load(self): 181 | super().load("franka_panda/panda.urdf", [0, 0, 0.62], [0, 0, 0, 1]) 182 | # create a constraint to keep the fingers centered, 9 and 10 for finger indices 183 | c = p.createConstraint( 184 | self.robot_id, 185 | self.finger_index[0], 186 | self.robot_id, 187 | self.finger_index[1], 188 | jointType=p.JOINT_GEAR, 189 | jointAxis=[1, 0, 0], 190 | parentFramePosition=[0, 0, 0], 191 | childFramePosition=[0, 0, 0], 192 | ) 193 | p.changeConstraint(c, gearRatio=-1, erp=0.1, maxForce=50) 194 | 195 | def move_arm(self, j_pos): 196 | super().move_arm( 197 | j_pos[: self.arm_dof], 198 | [1 if i == self.arm_dof - 1 else 0.5 for i in range(self.arm_dof)], 199 | [5 * 240.0] * self.arm_dof, 200 | ) 201 | 202 | def move_gripper(self, gripper_state): 203 | gripper_pos = ( 204 | gripper_state * (self.gripper_range[1] - self.gripper_range[0]) 205 | + self.gripper_range[0] 206 | ) 207 | super().move_gripper([gripper_pos] * 2, [0.1] * 2, [1000] * 2) 208 | 209 | 210 | class Iiwa(RobotBase): 211 | def __init__(self): 212 | super().__init__() 213 | self.arm_dof = 7 214 | self.ee_index = 14 215 | self.finger_index = [8, 10, 11, 13] 216 | self.home_j_pos = [ 217 | -0.695, 218 | -0.117, 219 | -0.692, 220 | 1.261, 221 | 0.140, 222 | -1.773, 223 | 0.125, 224 | 0.125, 225 | -0.5, 226 | 0.5, 227 | 0.5, 228 | -0.5, 229 | ] 230 | 231 | self.gripper_range = [0, 0.5] 232 | self.grasp_force_threshold = 20 233 | 234 | def load(self): 235 | self.robot_id = p.loadSDF("urdf/kuka_iiwa/kuka_with_gripper2.sdf")[0] 236 | p.resetBasePositionAndOrientation(self.robot_id, [-0.1, 0, 
0.7], [0, 0, 0, 1]) 237 | 238 | def move_gripper(self, gripper_state): 239 | val = ( 240 | gripper_state * (self.gripper_range[1] - self.gripper_range[0]) 241 | + self.gripper_range[0] 242 | ) 243 | gripper_pos = [-val, val, val, -val] 244 | super().move_gripper(gripper_pos, max_vel=[0.5] * 4, force=[1000] * 4) 245 | 246 | def move_arm(self, j_pos): 247 | super().move_arm( 248 | j_pos[: self.arm_dof], 249 | [1] * self.arm_dof, 250 | [5 * 240.0] * self.arm_dof, 251 | ) 252 | 253 | 254 | class Xarm(RobotBase): 255 | def __init__(self): 256 | super().__init__() 257 | self.arm_dof = 6 258 | self.ee_index = 13 259 | self.finger_index = [7, 8, 9, 10, 11, 12] 260 | # self.finger_index = [7] 261 | self.home_j_pos = [1.623, -0.0326, -1.939, 0.070, 1.960, 1.683] + [0] * 6 262 | self.gripper_range = [0, 0.85] 263 | self.grasp_force_threshold = 10 264 | 265 | def load(self): 266 | super().load("urdf/xarm/xarm6_with_gripper.urdf", [0, 0, 0.62], [0, 0, 0, 1]) 267 | 268 | def move_gripper(self, gripper_state): 269 | val = (1 - gripper_state) * ( 270 | self.gripper_range[1] - self.gripper_range[0] 271 | ) + self.gripper_range[0] 272 | gripper_pos = [val] * len(self.finger_index) 273 | super().move_gripper( 274 | gripper_pos, 275 | max_vel=[1] * len(self.finger_index), 276 | force=[100000] * len(self.finger_index), 277 | ) 278 | 279 | def move_arm(self, j_pos): 280 | super().move_arm( 281 | j_pos[: self.arm_dof], 282 | [1] * self.arm_dof, 283 | [5 * 240.0] * self.arm_dof, 284 | ) 285 | 286 | 287 | class HandBase(RobotBase): 288 | def __init__(self): 289 | super().__init__() 290 | 291 | def move_ee(self, ee_pose): 292 | cur_ee_pose = p.getLinkState(self.robot_id, self.ee_index)[0:2] 293 | inv_cur_ee_pose = p.invertTransform(cur_ee_pose[0], cur_ee_pose[1]) 294 | delta_pose = p.multiplyTransforms( 295 | inv_cur_ee_pose[0], inv_cur_ee_pose[1], ee_pose[0], ee_pose[1] 296 | ) 297 | delta_pos = delta_pose[0] 298 | delta_rot = p.getEulerFromQuaternion(delta_pose[1]) 299 | max_delta_pos = 0.01 300 | max_delta_rot = np.pi / 50 301 | 302 | delta_pos = np.clip(delta_pos, -max_delta_pos, max_delta_pos) 303 | delta_rot = np.clip(delta_rot, -max_delta_rot, max_delta_rot) 304 | ee_pose = p.multiplyTransforms( 305 | cur_ee_pose[0], 306 | cur_ee_pose[1], 307 | delta_pos, 308 | p.getQuaternionFromEuler(delta_rot), 309 | ) 310 | p.changeConstraint(self.hand_cid, ee_pose[0], ee_pose[1], maxForce=100, erp=10) 311 | 312 | def move_gripper(self, gripper_status): 313 | for index, range in self.gripper_range.items(): 314 | val = ((1 - gripper_status) * (range[1] - range[0]) + range[0]) * range[2] 315 | p.setJointMotorControl2( 316 | self.robot_id, 317 | index, 318 | p.POSITION_CONTROL, 319 | val, 320 | maxVelocity=8, 321 | force=100, 322 | ) 323 | 324 | def is_ee_arrived(self, ee_pose, tar_obj_id=None, threshold=2 * 1e-2): 325 | robot_ee_pos = np.array(p.getLinkState(self.robot_id, self.ee_index)[0]) 326 | diff_pos = np.abs(robot_ee_pos - ee_pose[0]) 327 | is_arrive = np.all(diff_pos <= threshold) 328 | if tar_obj_id != None: 329 | for i, (key, val) in enumerate(self.gripper_range.items()): 330 | is_arrive = is_arrive and np.all( 331 | abs(p.getJointState(self.robot_id, key)[0] - val[1] * val[2]) <= 0.2 332 | ) 333 | return is_arrive 334 | 335 | 336 | class MplHand(HandBase): 337 | def __init__(self): 338 | super().__init__() 339 | self.ee_index = 7 340 | self.arm_dof = 3 341 | self.home_j_pos = [0] * 22 342 | self.is_hand = True 343 | self.finger_index = [8, 10, 12, 18, 20] 344 | self.gripper_range = { 345 | 8: [0, 1.8, 1], 346 | 
10: [0, 1, -1], 347 | 12: [0, 2.0, 1], 348 | 18: [0, 1.57, 1], 349 | 20: [0, 0.5, 1], 350 | } 351 | self.grasp_force_threshold = 2 352 | 353 | def load(self): 354 | self.robot_id = p.loadMJCF("urdf/MPL/MPL.xml")[0] 355 | p.resetBasePositionAndOrientation( 356 | self.robot_id, self.home_ee_pose[0], (0, 0, 1, 0) 357 | ) 358 | for i in range(p.getNumJoints(self.robot_id)): 359 | p.setJointMotorControl2(self.robot_id, i, p.POSITION_CONTROL, 1.05, 0) 360 | for i in range(22): 361 | p.setJointMotorControl2(self.robot_id, i, p.POSITION_CONTROL, 0, 0) 362 | self.hand_cid = p.createConstraint( 363 | self.robot_id, 364 | self.ee_index, 365 | -1, 366 | -1, 367 | p.JOINT_FIXED, 368 | [0, 0, 0], 369 | [0, 0, 0], 370 | [0, 0, 0], 371 | ) 372 | state = random.getstate() 373 | random.seed(6) 374 | link_indeices = [-1, 1, 6] + list(range(9, 24, 2)) + list(range(24, 47, 2)) 375 | for i in link_indeices: 376 | p.changeVisualShape( 377 | self.robot_id, 378 | i, 379 | rgbaColor=[ 380 | random.uniform(0, 1), 381 | random.uniform(0, 1), 382 | random.uniform(0, 1), 383 | 1, 384 | ], 385 | ) 386 | random.setstate(state) 387 | p.changeConstraint(self.hand_cid, self.home_ee_pose[0], (1, 0, 0, 0)) 388 | 389 | 390 | class AllegroHand(HandBase): 391 | def __init__(self): 392 | super().__init__() 393 | self.ee_index = 21 394 | self.arm_dof = 3 395 | self.home_j_pos = [0] * 22 396 | self.is_hand = True 397 | self.finger_index = [2, 3, 16, 18] 398 | self.gripper_range = { 399 | 2: [0, 1.8, 1], 400 | 3: [0, 0.2, 1], 401 | 16: [0, 1.4, 1], 402 | 18: [0, 0.2, 1], 403 | } 404 | self.grasp_force_threshold = 2 405 | 406 | def load(self): 407 | self.robot_id = p.loadURDF( 408 | "urdf/allegro_hand_description/urdf/allegro_hand_description_right.urdf" 409 | ) 410 | p.resetBasePositionAndOrientation( 411 | self.robot_id, self.home_ee_pose[0], [0, 0.707, 0, 0.707] 412 | ) 413 | for i in range(p.getNumJoints(self.robot_id)): 414 | p.setJointMotorControl2(self.robot_id, i, p.POSITION_CONTROL, 0, 0) 415 | self.hand_cid = p.createConstraint( 416 | self.robot_id, 417 | self.ee_index, 418 | -1, 419 | -1, 420 | p.JOINT_FIXED, 421 | [0, 0, 0], 422 | [0, 0, 0], 423 | [0, 0, 0], 424 | ) 425 | p.changeConstraint(self.hand_cid, self.home_ee_pose[0], [1, 0, 0, 0]) 426 | 427 | 428 | if __name__ == "__main__": 429 | pass 430 | -------------------------------------------------------------------------------- /sim_env.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import math 4 | import random 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | import shutil 8 | import os 9 | import glob 10 | import pybullet as p 11 | import threading 12 | from robots import Panda 13 | from PIL import Image 14 | 15 | # Dictionary defining task names and their corresponding environment classes 16 | TASKS = { 17 | "touch": "TouchTaskEnv", # Task 'touch' is associated with 'TouchTaskEnv' class 18 | "pick": "PickTaskEnv", # Task 'pick' is associated with 'PickTaskEnv' class 19 | } 20 | 21 | # Dictionary defining various camera views along with their configuration parameters 22 | CAM_INFO = { 23 | "front": [ 24 | [0, 0, 0.7], 25 | 1.8, 26 | 180, 27 | -20, 28 | 0, 29 | 40, 30 | ], # Front view: [position], distance, angles, fov 31 | "fronttop": [ 32 | [0, 0.5, 0.7], 33 | 1.5, 34 | 180, 35 | -60, 36 | 0, 37 | 35, 38 | ], # Front-top view: [position], distance, angles, fov 39 | "topdown": [ 40 | [0, 0.35, 0], 41 | 2.0, 42 | 0, 43 | -90, 44 | 0, 45 | 45, 46 | ], # Top-down view: 
[position], distance, angles, fov 47 | "side": [ 48 | [0, 0.35, 0.9], 49 | 1.5, 50 | 90, 51 | 0, 52 | 0, 53 | 40, 54 | ], # Side view: [position], distance, angles, fov 55 | "root": [ 56 | [0, 0.6, 0.75], 57 | 1.3, 58 | -35, 59 | -5, 60 | 0, 61 | 40, 62 | ], # Root view: [position], distance, angles, fov 63 | "wrist": [], # Placeholder for the 'wrist' view: the wrist camera follows the end effector, so no predefined camera parameters are required 64 | } 65 | 66 | # Tuple defining the resolution of the camera (width x height) 67 | cam_resolution = (1080, 864) 68 | 69 | 70 | def get_cam_projection_matrix(cam_view): 71 | """ 72 | Calculates the camera projection matrix based on the given camera view. 73 | 74 | Parameters: 75 | - cam_view (str): Specifies the camera view. 76 | 77 | Returns: 78 | - cam_projection_matrix (list): Projection matrix for the specified camera view. 79 | """ 80 | 81 | # Calculate the aspect ratio based on camera resolution 82 | aspect = float(cam_resolution[0]) / cam_resolution[1] 83 | nearVal = 0.1 # Default near clipping plane value 84 | farVal = 100 # Default far clipping plane value 85 | 86 | if cam_view == "wrist": 87 | # Adjust parameters for wrist camera view 88 | fov = 100 # Field of view for wrist camera 89 | nearVal = 0.018 # Adjusted near clipping plane value for wrist camera 90 | else: 91 | # Use field of view based on the specified camera view 92 | fov = CAM_INFO[cam_view][-1] # Get field of view for the specified camera view 93 | 94 | # Compute the camera projection matrix using PyBullet's function 95 | cam_projection_matrix = p.computeProjectionMatrixFOV( 96 | fov=fov, 97 | aspect=aspect, 98 | nearVal=nearVal, 99 | farVal=farVal, 100 | ) 101 | 102 | # Return the calculated camera projection matrix 103 | return cam_projection_matrix 104 | 105 | 106 | def get_view_matrix(cam_view, robot_id, ee_index): 107 | """ 108 | Generates the view matrix for a specified camera view relative to a robot's end-effector. 109 | 110 | Parameters: 111 | - cam_view (str): Specifies the camera view. 112 | - robot_id (int): Identifier for the robot. 113 | - ee_index (int): Index of the end-effector on the robot. 114 | 115 | Returns: 116 | - cam_view_matrix (list): View matrix for the specified camera view.
117 | """ 118 | 119 | if cam_view == "wrist": 120 | # Calculate view matrix for wrist camera view 121 | eye_pos, eye_ori = p.getLinkState( 122 | robot_id, 123 | ee_index, 124 | computeForwardKinematics=True, 125 | )[0:2] 126 | eye_pos = list(eye_pos) 127 | eye_pos = p.multiplyTransforms(eye_pos, eye_ori, [0, 0, -0.05], [0, 0, 0, 1])[0] 128 | r_mat = p.getMatrixFromQuaternion(eye_ori) 129 | tx_vec = np.array([r_mat[0], r_mat[3], r_mat[6]]) 130 | ty_vec = np.array([r_mat[1], r_mat[4], r_mat[7]]) 131 | tz_vec = np.array([r_mat[2], r_mat[5], r_mat[8]]) 132 | camera_position = np.array(eye_pos) 133 | target_position = eye_pos + 0.001 * tz_vec 134 | 135 | # Compute view matrix for wrist camera using PyBullet's function 136 | cam_view_matrix = p.computeViewMatrix( 137 | cameraEyePosition=camera_position, 138 | cameraTargetPosition=target_position, 139 | cameraUpVector=ty_vec, 140 | ) 141 | else: 142 | # Calculate view matrix for non-wrist camera views using yaw, pitch, and roll 143 | cam_view_matrix = p.computeViewMatrixFromYawPitchRoll( 144 | CAM_INFO[cam_view][0], 145 | CAM_INFO[cam_view][1], 146 | CAM_INFO[cam_view][2], 147 | CAM_INFO[cam_view][3], 148 | CAM_INFO[cam_view][4], 149 | 2, 150 | ) 151 | 152 | # Return the computed camera view matrix 153 | return cam_view_matrix 154 | 155 | 156 | def get_cam_view_img(cam_view, robot_id=None, ee_index=None): 157 | """ 158 | Captures an image from a specified camera view using PyBullet. 159 | 160 | Parameters: 161 | - cam_view (str): Specifies the camera view. 162 | - robot_id (int, optional): Identifier for the robot. 163 | - ee_index (int, optional): Index of the end-effector on the robot. 164 | 165 | Returns: 166 | - img (numpy.ndarray): Captured image from the specified camera view. 167 | """ 168 | 169 | # Obtain the view matrix for the camera view 170 | cam_view_matrix = get_view_matrix(cam_view, robot_id, ee_index) 171 | 172 | # Obtain the projection matrix for the camera view 173 | cam_projection_matrix = get_cam_projection_matrix(cam_view) 174 | 175 | # Capture the camera image using PyBullet 176 | (width, height, rgb_pixels, _, _) = p.getCameraImage( 177 | cam_resolution[0], 178 | cam_resolution[1], 179 | viewMatrix=cam_view_matrix, 180 | projectionMatrix=cam_projection_matrix, 181 | ) 182 | 183 | # Reshape and process the image data 184 | rgb_array = np.array(rgb_pixels).reshape((height, width, 4)).astype(np.uint8) 185 | img = np.array(resize_and_crop(rgb_array[:, :, :3])) # Process the image 186 | 187 | # Return the captured and processed image 188 | return img 189 | 190 | 191 | def resize_and_crop(input_image): 192 | """ 193 | Crop the image to a 5:4 aspect ratio and resize it to 320x256 pixels. 194 | 195 | Parameters: 196 | - input_image (numpy.ndarray): Input image data in array format. 197 | 198 | Returns: 199 | - input_image (PIL.Image.Image): Cropped and resized image in PIL Image format. 
200 | """ 201 | 202 | # Convert the input image array to a PIL Image 203 | input_image = Image.fromarray(input_image) 204 | 205 | # Get the width and height of the input image 206 | width, height = input_image.size 207 | 208 | # Define target and current aspect ratios 209 | target_aspect = 5 / 4 210 | current_aspect = width / height 211 | 212 | if current_aspect > target_aspect: 213 | # If the image is too wide, crop its width 214 | new_width = int(target_aspect * height) 215 | left_margin = (width - new_width) / 2 216 | input_image = input_image.crop((left_margin, 0, width - left_margin, height)) 217 | elif current_aspect < target_aspect: 218 | # If the image is too tall, crop its height 219 | new_height = int(width / target_aspect) 220 | top_margin = (height - new_height) / 2 221 | input_image = input_image.crop((0, top_margin, width, height - top_margin)) 222 | 223 | # Resize the cropped image to 320x256 pixels 224 | input_image = input_image.resize((320, 256)) 225 | 226 | # Return the cropped and resized image as a PIL Image 227 | return input_image 228 | 229 | 230 | class SimEnv(object): 231 | def __init__(self): 232 | # Set solverResidualThreshold to 0 for physics engine parameter 233 | p.setPhysicsEngineParameter(solverResidualThreshold=0) 234 | 235 | # Control time step and reset environment wait time 236 | self.control_dt = 1.0 / 240.0 237 | self.reset_env_wait_time = 0.5 238 | 239 | # Initialize attributes related to the robot, target object, poses, state, waypoints, and data recording 240 | self.robot = None 241 | self.tar_obj = None 242 | self.tar_obj_pose = None 243 | self.state = None 244 | self.target_waypoints = None 245 | self.data_record_fq = None 246 | self.collected_traj = 700 247 | 248 | # Load environment, set up camera, and reset environment 249 | self.load_env() 250 | self.set_camera() 251 | self.reset_env() 252 | 253 | # Initialize a lock for thread safety 254 | self.lock = threading.Lock() 255 | 256 | def load_env(self): 257 | raise NotImplementedError 258 | 259 | def reset_env(self): 260 | raise NotImplementedError 261 | 262 | def set_camera(self): 263 | # Set camera resolution 264 | self.cam_resolution = (1080, 864) # Define camera resolution 265 | 266 | # Retrieve camera information from CAM_INFO dictionary 267 | self.cam_info = CAM_INFO 268 | 269 | # Initialize an empty list to store view matrices for each camera view 270 | self.cam_view_matrices = [] 271 | 272 | # Iterate through each camera view in CAM_INFO and compute view matrices 273 | for key, val in self.cam_info.items(): 274 | if key == "wrist": 275 | self.cam_view_matrices.append([]) # Placeholder for 'wrist' view 276 | else: 277 | # Compute view matrix using yaw, pitch, roll, and other parameters from CAM_INFO 278 | view_matrix = p.computeViewMatrixFromYawPitchRoll( 279 | val[0], val[1], val[2], val[3], val[4], 2 280 | ) 281 | self.cam_view_matrices.append( 282 | view_matrix 283 | ) # Store the computed view matrix 284 | 285 | # Compute projection matrix for the camera with specified FOV, aspect ratio, and depth range 286 | self.cam_projection_matrix = p.computeProjectionMatrixFOV( 287 | fov=60, 288 | aspect=float(self.cam_resolution[0]) / self.cam_resolution[1], 289 | nearVal=0.1, 290 | farVal=100, 291 | ) 292 | 293 | 294 | class TouchTaskEnv(SimEnv): 295 | def __init__(self): 296 | super().__init__() 297 | 298 | def load_env(self): 299 | p.loadURDF("table/table.urdf", [0, 0.35, 0], [0, 0, 0, 1]) 300 | self.tar_obj = p.loadURDF("urdf/cube/cube.urdf", [0, 0, 0], globalScaling=0.04) 301 | 
p.changeVisualShape(self.tar_obj, -1, rgbaColor=[1, 0, 0, 1]) 302 | self.robot = Panda() 303 | self.robot.load() 304 | 305 | def reset_tar_obj(self, tar_obj_range=None, tar_pos_rot=None, random_pos_rot=True): 306 | if random_pos_rot: 307 | # Generate random position and rotation within the specified range 308 | x = random.uniform(tar_obj_range[0], tar_obj_range[1]) 309 | y = random.uniform(tar_obj_range[2], tar_obj_range[3]) 310 | r = random.uniform(tar_obj_range[4], tar_obj_range[5]) 311 | pos = [x, y, 0.645] # Set the z-coordinate (height) of the object 312 | rot = p.getQuaternionFromEuler( 313 | [0, np.pi / 2, r] 314 | ) # Convert Euler angles to quaternion 315 | else: 316 | # Use provided target position and rotation 317 | x, y, r = tar_pos_rot[0], tar_pos_rot[1], tar_pos_rot[4] 318 | pos = [x, y, 0.645] 319 | rot = [tar_pos_rot[2], tar_pos_rot[3], tar_pos_rot[4], tar_pos_rot[5]] 320 | 321 | # Reset the target object's position and orientation 322 | p.resetBasePositionAndOrientation( 323 | self.tar_obj, 324 | pos, 325 | rot, 326 | ) 327 | 328 | # Update the stored target object's pose (position and orientation) 329 | self.tar_obj_pose = p.getBasePositionAndOrientation(self.tar_obj) 330 | 331 | def reset_env(self): 332 | # Reset the robot's joints to their home positions 333 | self.robot.reset_j_home() 334 | 335 | # Pause execution for 1 second to allow for resetting 336 | time.sleep(1) 337 | 338 | # Reset state variables related to the environment 339 | self.state = 0 # Reset the state to 0 340 | self.t = 0 # Reset time counter to 0 341 | self.state_stuck_t = 0 # Reset the state stuck time to 0 342 | 343 | 344 | class PickTaskEnv(TouchTaskEnv): 345 | def __init__(self): 346 | super().__init__() 347 | -------------------------------------------------------------------------------- /train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "mode": "train", 3 | "device": "cuda", 4 | "data_path": "/mnt/data/io_rt1_dataset/20231123_pick_r10_cube151212_drop_after_lift_straight_action/Panda_pick", 5 | "cam_view" : ["front", "side"], 6 | "log_dir": "/mnt/logs_1", 7 | "time_sequence_length": 12, 8 | "lr": 0.0001, 9 | "batch_size": 4, 10 | "epochs": 20, 11 | "resume": false, 12 | "resume_from_checkpoint": "/mnt/logs_1/1701051203/19-checkpoint.pth", 13 | "predicting_next_ts": true, 14 | "world_size": 4, 15 | "dist_url": "env://", 16 | "val_interval" : 10, 17 | "num_eval_threads": 25, 18 | "num_train_episode" : 200, 19 | "num_val_episode" : 1, 20 | "using_proprioception" : false, 21 | "network_configs": { 22 | "vocab_size" : 256, 23 | "token_embedding_size_per_image" : 512, 24 | "language_embedding_size" : 512, 25 | "num_layers" : 8, 26 | "layer_size" : 128, 27 | "num_heads" : 8, 28 | "feed_forward_size" : 512, 29 | "dropout_rate" : 0.1, 30 | "crop_size" : 236, 31 | "use_token_learner" : true 32 | }, 33 | "scheduler_configs" : { 34 | "T_0" : 50, 35 | "T_mult" : 2, 36 | "eta_min" : 1e-5, 37 | "verbose" : true 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /urdf/cube/cube.obj: -------------------------------------------------------------------------------- 1 | # cube.obj 2 | # 3 | 4 | o cube 5 | mtllib cube.mtl 6 | 7 | v -0.500000 -0.500000 0.500000 8 | v 0.500000 -0.500000 0.500000 9 | v -0.500000 0.500000 0.500000 10 | v 0.500000 0.500000 0.500000 11 | v -0.500000 0.500000 -0.500000 12 | v 0.500000 0.500000 -0.500000 13 | v -0.500000 -0.500000 -0.500000 14 | v 0.500000 -0.500000 -0.500000 
15 | 16 | vt 0.000000 0.000000 17 | vt 1.000000 0.000000 18 | vt 0.000000 1.000000 19 | vt 1.000000 1.000000 20 | 21 | vn 0.000000 0.000000 1.000000 22 | vn 0.000000 1.000000 0.000000 23 | vn 0.000000 0.000000 -1.000000 24 | vn 0.000000 -1.000000 0.000000 25 | vn 1.000000 0.000000 0.000000 26 | vn -1.000000 0.000000 0.000000 27 | 28 | g cube 29 | usemtl cube 30 | s 1 31 | f 1/1/1 2/2/1 3/3/1 32 | f 3/3/1 2/2/1 4/4/1 33 | s 2 34 | f 3/1/2 4/2/2 5/3/2 35 | f 5/3/2 4/2/2 6/4/2 36 | s 3 37 | f 5/4/3 6/3/3 7/2/3 38 | f 7/2/3 6/3/3 8/1/3 39 | s 4 40 | f 7/1/4 8/2/4 1/3/4 41 | f 1/3/4 8/2/4 2/4/4 42 | s 5 43 | f 2/1/5 8/2/5 4/3/5 44 | f 4/3/5 8/2/5 6/4/5 45 | s 6 46 | f 7/1/6 1/2/6 5/3/6 47 | f 5/3/6 1/2/6 3/4/6 48 | 49 | -------------------------------------------------------------------------------- /urdf/cube/cube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/urdf/cube/cube.png -------------------------------------------------------------------------------- /urdf/cube/cube.urdf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | -------------------------------------------------------------------------------- /vis.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/vis.mp4 --------------------------------------------------------------------------------