├── .gitignore ├── IO_dataset_torch.py ├── IO_evaluator.py ├── IO_rt1_demo.ipynb ├── IO_trainer_torch.py ├── LICENSE ├── README.md ├── __init__.py ├── img └── wechatgroup.jpg ├── maruya24_rt1 ├── .gitignore ├── LICENSE ├── README.md ├── film_efficientnet │ ├── __init__.py │ ├── efficientnet_checkpoints │ │ ├── efficientnetb3.pth │ │ ├── efficientnetb3_notop.pth │ │ └── imagenet_classes.json │ ├── film_conditioning_layer.py │ ├── film_efficientnet_encoder.py │ ├── film_efficientnet_encoder_test.py │ ├── preprocessors.py │ ├── pretrained_efficientnet_encoder.py │ └── pretrained_efficientnet_encoder_test.py ├── tokenizers │ ├── __init__.py │ ├── action_tokenizer.py │ ├── action_tokenizer_test.py │ ├── image_tokenizer.py │ ├── image_tokenizer_test.py │ ├── token_learner.py │ ├── token_learner_test.py │ └── utils.py ├── transformer.py ├── transformer_network.py ├── transformer_network_test.py ├── transformer_network_test_set_up.py └── transformer_test.py ├── multi.sh ├── requirements.txt ├── robots.py ├── sim_env.py ├── train_config.json ├── urdf └── cube │ ├── cube.obj │ ├── cube.png │ └── cube.urdf ├── util ├── __init__.py └── misc.py └── vis.mp4 /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__ -------------------------------------------------------------------------------- /IO_dataset_torch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | from PIL import Image 5 | import pandas as pd 6 | import numpy as np 7 | 8 | import torch 9 | from torch.utils.data import Dataset, DataLoader 10 | 11 | from tqdm import tqdm as tqdm 12 | 13 | 14 | def build_dataset( 15 | data_path, 16 | time_sequence_length=6, 17 | predicting_next_ts=True, 18 | num_train_episode=200, 19 | num_val_episode=100, 20 | cam_view=["front"], 21 | language_embedding_size=512, 22 | ): 23 | """ 24 | This function is for building the training and validation dataset 25 | 26 | Parameters: 27 | - data_path(str): locates the path where the dataset is stored 28 | the dataset path should have the following file structures: 29 | - [robotname]_[taskname] 30 | - [cam_view_0] 31 | - data_000 32 | - rgb # where those image stored 33 | - image_001.png 34 | - image_002.png 35 | - ... 36 | - results.csv # robot actions stored 37 | - results_raw.csv # joint and target object position stored 38 | - data_001 39 | - ... 40 | - [cam_view_1] 41 | - data_000 42 | - data_001 43 | - ... 44 | - ... 45 | - time_sequence_length(int) : number of history length input for RT-1 model, 46 | 6 means current frame image and past 5 frames of images will be packed and input to RT-1 47 | - predicting_next_ts(bool) : in our dataset's results.csv and results_raw.csv, we stored current frame's action and joint status. 48 | if we want to predict next frame's action, this option needs to be True and result in the 1 step offset reading on csv files 49 | this differs between the samplings method of different dataset. 50 | - num_train_episode(int) : specifies numbers of training episodes 51 | - num_train_episode(int) : specifies numbers of validation episodes 52 | - cam_view(list of strs) : camera views used for training. 
53 | 54 | Returns: 55 | - train_dataset(torch.utils.data.Dataset) 56 | - val_dataset(torch.utils.data.Dataset) 57 | """ 58 | 59 | with open(os.path.join(data_path, cam_view[0], "dataset_info.json"), "r") as f: 60 | info = json.load(f) 61 | episode_length = info["episode_length"] 62 | episode_dirs = sorted(glob.glob(data_path + "/" + cam_view[0] + "/*/")) 63 | assert len(episode_dirs) == len( 64 | episode_length 65 | ), "length of episode directories and episode length not equal, check dataset's dataset_info.json" 66 | perm_indice = torch.randperm(len(episode_dirs)).tolist() 67 | dirs_lengths = dict( 68 | episode_dirs=np.array(episode_dirs)[perm_indice], 69 | episode_length=np.array(episode_length)[perm_indice], 70 | ) 71 | train_episode_dirs = dirs_lengths["episode_dirs"][:num_train_episode] 72 | train_episode_length = dirs_lengths["episode_length"][:num_train_episode] 73 | val_episode_dirs = dirs_lengths["episode_dirs"][ 74 | num_train_episode : num_train_episode + num_val_episode 75 | ] 76 | val_episode_length = dirs_lengths["episode_length"][ 77 | num_train_episode : num_train_episode + num_val_episode 78 | ] 79 | 80 | train_dataset = IODataset( 81 | episode_dirs=train_episode_dirs, 82 | episode_length=train_episode_length, 83 | time_sequence_length=time_sequence_length, 84 | predicting_next_ts=predicting_next_ts, 85 | cam_view=cam_view, 86 | language_embedding_size=language_embedding_size, 87 | ) 88 | val_dataset = IODataset( 89 | episode_dirs=val_episode_dirs, 90 | episode_length=val_episode_length, 91 | time_sequence_length=time_sequence_length, 92 | predicting_next_ts=predicting_next_ts, 93 | cam_view=cam_view, 94 | language_embedding_size=language_embedding_size, 95 | ) 96 | return train_dataset, val_dataset 97 | 98 | 99 | class IODataset(Dataset): 100 | def __init__( 101 | self, 102 | episode_dirs, 103 | episode_length, 104 | time_sequence_length=6, 105 | predicting_next_ts=True, 106 | cam_view=["front"], 107 | robot_dof=9, 108 | language_embedding_size=512, 109 | ): 110 | self._cam_view = cam_view 111 | self.predicting_next_ts = predicting_next_ts 112 | self._time_sequence_length = time_sequence_length 113 | self._episode_length = episode_length 114 | self.querys = self.generate_history_steps(episode_length) 115 | self._episode_dirs = episode_dirs 116 | self.keys_image = self.generate_fn_lists(self._episode_dirs) 117 | self.values, self.num_zero_history_list = self.organize_file_names() 118 | self._robot_dof = robot_dof 119 | self._language_embedding_size = language_embedding_size 120 | 121 | def generate_fn_lists(self, episode_dirs): 122 | """ 123 | This function globs all the image path in the dataset 124 | Parameters: 125 | - episode_dirs(list of strs): directories where image is stored, etc: 126 | - [robotname]_[taskname] 127 | - [cam_view_0] 128 | - data_000 129 | - data_001 130 | - data_002 131 | - ... 
132 | Returns: 133 | - keys(list of strs): all globbed image filename in a list 134 | """ 135 | keys = [] 136 | for ed in episode_dirs: 137 | image_files = sorted(glob.glob(f"{ed}rgb/*.png")) 138 | keys.append(image_files) 139 | return keys 140 | 141 | def generate_history_steps(self, episode_length): 142 | """ 143 | This function generates the step for current frame and history frames 144 | Parameters: 145 | - episode_length(list of int): number of episode lengths for each episode 146 | Returns: 147 | - keys(list of tensors): history steps for each data 148 | """ 149 | querys = [] 150 | for el in episode_length: 151 | q = torch.cat( 152 | ( 153 | [ 154 | torch.arange(el)[:, None] - i 155 | for i in range(self._time_sequence_length) 156 | ] 157 | ), 158 | dim=1, 159 | ) 160 | q[q < 0] = -1 161 | querys.append(q.flip(1)) 162 | return querys 163 | 164 | def organize_file_names(self): 165 | """ 166 | This function generates the infor for each data, including how many zeros were padded 167 | data's episode directory, image filenames, and all the other parameters for data 168 | Parameters: 169 | - 170 | Returns: 171 | - values(list): each value including 172 | - num_zero_history: when we read at initial frames of a episode, it doesn't have history, 173 | then we need to pad zeros to make sure these aligns to data with history frames. 174 | this number specified how many frames of zero is padded 175 | - episode_dir: the episode directory where this data is stored 176 | - img_fns = img_fns: the images this data should read 177 | - query_index = index of this data in this episode 178 | - episode_length = total length of this episode 179 | """ 180 | values = [] 181 | num_zero_history_list = [] 182 | for i, (query, key_img, ed) in enumerate( 183 | zip(self.querys, self.keys_image, self._episode_dirs) 184 | ): 185 | for q in query: 186 | img_fns = [] 187 | for img_idx in q: 188 | img_fns.append(key_img[img_idx] if img_idx >= 0 else None) 189 | num_zero_history = (q < 0).sum() 190 | num_zero_history_list.append(int(num_zero_history)) 191 | values.append( 192 | dict( 193 | num_zero_history=num_zero_history, 194 | episode_dir=ed, 195 | img_fns=img_fns, 196 | query_index=q, 197 | episode_length=self._episode_length[i], 198 | ) 199 | ) 200 | return values, num_zero_history_list 201 | 202 | def __len__(self): 203 | return len(self.values) 204 | 205 | def __getitem__(self, idx): 206 | value = self.values[idx] 207 | img = self.get_image(value["img_fns"]) 208 | lang = self.get_language_instruction() 209 | ee_pos_cmd, ee_rot_cmd, gripper_cmd, joint, tar_obj_pose = self.get_ee_data( 210 | value["episode_dir"], value["query_index"], value["num_zero_history"] 211 | ) 212 | terminate_episode = self.get_episode_status( 213 | value["episode_length"], value["query_index"], value["num_zero_history"] 214 | ) 215 | sample_obs = { 216 | "image": img.float().permute(0, 3, 1, 2), 217 | # we permute the channel dimension to the second dimension to cope with rt1's convolution layers 218 | "natural_language_embedding": torch.tensor(lang).float(), 219 | "joint_position": torch.tensor(joint).float(), 220 | "tar_obj_pose": torch.tensor(tar_obj_pose).float(), 221 | } 222 | sample_action = { 223 | "world_vector": torch.tensor(ee_pos_cmd).float(), 224 | "rotation_delta": torch.tensor(ee_rot_cmd).float(), 225 | "gripper_closedness_action": torch.tensor(gripper_cmd).float(), 226 | "terminate_episode": torch.tensor(terminate_episode.argmax(-1)).long(), 227 | } 228 | 229 | return sample_obs, sample_action 230 | 231 | def 
get_image(self, img_fns): 232 | """ 233 | This function generates the step for current frame and history frames 234 | Parameters: 235 | - episode_length(list of int): number of episode lengths for each episode 236 | Returns: 237 | - keys(list of tensors): history steps for each data 238 | """ 239 | imgs = [] 240 | for img_fn in img_fns: 241 | img_multi_view = [] 242 | for c_v in self._cam_view: 243 | img_multi_view.append( 244 | np.array(Image.open(img_fn.replace(self._cam_view[0], c_v))) 245 | if img_fn != None 246 | else np.zeros_like(Image.open(img_fns[-1])) 247 | ) 248 | img = np.concatenate(img_multi_view, axis=0) 249 | imgs.append(torch.from_numpy(img[:, :, :3])) 250 | return torch.stack(imgs, dim=0) / 255.0 251 | 252 | def get_ee_data(self, episode_dir, query_index, pad_step_num): 253 | """ 254 | This function reads the csvs for ground truth robot actions, robot joint status and target object position and orientation: 255 | Parameters: 256 | - episode_dir(str): directory where the results.csv and results_raw.csv is stored 257 | - query_index(tensor): index where exact data is read, padded zeros has a special index of -1 258 | - pad_step_num(int): how many timestep of zeros is padded 259 | Returns: 260 | - ee_pos_cmd(np.array): stores the ground truth command for robot move in position(x, y, z) 261 | - ee_rot_cmd(np.array): stores the ground truth command for robot move in rotation(rx, ry, rz) 262 | - gripper_cmd(np.array): stores the ground truth command for robot's gripper open or close 263 | - joint(np.array): stores the robot's joint status, which can be used to calculate ee's position 264 | - tar_obj_pose: stores the target object's position and orientation (x, y, z, rx, ry, rz) 265 | """ 266 | start_idx = query_index[(query_index > -1).nonzero()[0, 0]] 267 | end_idx = query_index[-1] 268 | visual_data_filename = f"{episode_dir}result.csv" 269 | raw_data = pd.read_csv(visual_data_filename) 270 | visual_data_filename_raw = f"{episode_dir}result_raw.csv" 271 | raw_raw_data = pd.read_csv(visual_data_filename_raw) 272 | if self.predicting_next_ts: 273 | """ 274 | if predicting next timestep's results, then we shift first column to last column 275 | """ 276 | first_row = raw_data.iloc[0] 277 | raw_data = raw_data.iloc[1:] 278 | raw_data = pd.concat([raw_data, first_row.to_frame().T], ignore_index=True) 279 | first_row = raw_raw_data.iloc[0] 280 | raw_raw_data = raw_raw_data.iloc[1:] 281 | raw_raw_data = pd.concat( 282 | [raw_raw_data, first_row.to_frame().T], ignore_index=True 283 | ) 284 | # position has 3 dimensions [x, y, z] 285 | ee_pos_cmd = np.zeros([pad_step_num, 3]) 286 | # rotation has 3 dimensions [rx, ry, rz] 287 | ee_rot_cmd = np.zeros([pad_step_num, 3]) 288 | # gripper has 1 dimension which controls open/close of the gripper 289 | gripper_cmd = np.zeros([pad_step_num, 1]) 290 | # we are using Franka Panda robot, whose has 9 dofs of joint 291 | joint = np.zeros([pad_step_num, 9]) 292 | # tar_obj_pose is 7 dimension [x,y,z,rx,ry,rz,w] 293 | # however, in this version we are not using tar_obj_pose 294 | tar_obj_pose = np.zeros([pad_step_num, 7]) 295 | ee_pos_cmd = np.vstack( 296 | ( 297 | ee_pos_cmd, 298 | raw_data.loc[ 299 | start_idx:end_idx, 300 | [f"ee_command_position_{ax}" for ax in ["x", "y", "z"]], 301 | ].to_numpy(), 302 | ) 303 | ) 304 | ee_rot_cmd = np.vstack( 305 | ( 306 | ee_rot_cmd, 307 | raw_data.loc[ 308 | start_idx:end_idx, 309 | [f"ee_command_rotation_{ax}" for ax in ["x", "y", "z"]], 310 | ].to_numpy(), 311 | ) 312 | ) 313 | joint = np.vstack( 314 | ( 315 
| joint, 316 | raw_raw_data.loc[ 317 | start_idx:end_idx, 318 | [f"joint_{str(ax)}" for ax in range(self._robot_dof)], 319 | ].to_numpy(), 320 | ) 321 | ) 322 | tar_obj_pose = np.vstack( 323 | ( 324 | tar_obj_pose, 325 | raw_raw_data.loc[ 326 | start_idx:end_idx, 327 | [ 328 | f"tar_obj_pose_{ax}" 329 | for ax in ["x", "y", "z", "rx", "ry", "rz", "rw"] 330 | ], 331 | ].to_numpy(), 332 | ) 333 | ) 334 | gripper_data = ( 335 | raw_data.loc[start_idx:end_idx, "gripper_closedness_commanded"] 336 | .to_numpy() 337 | .reshape(-1, 1) 338 | ) 339 | gripper_cmd = np.vstack((gripper_cmd, gripper_data)) 340 | return ee_pos_cmd, ee_rot_cmd, gripper_cmd, joint, tar_obj_pose 341 | 342 | def get_language_instruction(self): 343 | """ 344 | since we are only training single-task model, this language embedding is set as constant. 345 | modify it to language instructions if multi-task model is training. 346 | it seems that google directly loads embedded language instruction from its language model 347 | this results in our loading a language embedding instead of language sentence 348 | """ 349 | return np.zeros([self._time_sequence_length, self._language_embedding_size]) 350 | 351 | def get_episode_status(self, episode_length, query_index, pad_step_num): 352 | """ 353 | This function is to find whether current frame and history frame is start or middle or end of the episode: 354 | Parameters: 355 | - episode_length(int): length of current episode 356 | - query_index(tensor): index where exact data is read, padded zeros has a special index of -1 357 | - pad_step_num(int): how many timestep of zeros is padded 358 | Returns: 359 | - episode_status(np.array): specifies status(start, middle or end) of each frame in history 360 | """ 361 | start_idx = query_index[(query_index > -1).nonzero()[0, 0]] 362 | end_idx = query_index[-1] 363 | episode_status = np.zeros([pad_step_num, 4], dtype=np.int32) 364 | episode_status[:, -1] = 1 365 | for i in range(start_idx, end_idx + 1): 366 | status = np.array( 367 | [i == 0, i not in [0, episode_length - 2], i == episode_length - 2, 0], 368 | dtype=np.int32, 369 | ) 370 | episode_status = np.vstack((episode_status, status)) 371 | if pad_step_num > 0: 372 | episode_status[pad_step_num] = np.array([1, 0, 0, 0]) 373 | return episode_status 374 | 375 | 376 | def load_config_from_json(json_path): 377 | with open(json_path, "r") as f: 378 | config = json.load(f) 379 | return config 380 | 381 | 382 | if __name__ == "__main__": 383 | args = load_config_from_json("train_config.json") 384 | dataset, _ = build_dataset( 385 | data_path=args["data_path"], 386 | time_sequence_length=args["time_sequence_length"], 387 | predicting_next_ts=args["predicting_next_ts"], 388 | num_train_episode=args["num_train_episode"], 389 | num_val_episode=args["num_val_episode"], 390 | cam_view=args["cam_view"], 391 | language_embedding_size=args["network_configs"]["language_embedding_size"], 392 | ) 393 | # dataset = dataset[:100] 394 | 395 | wv_x = [] 396 | wv_y = [] 397 | wv_z = [] 398 | rd_x = [] 399 | rd_y = [] 400 | rd_z = [] 401 | from maruya24_rt1.tokenizers import action_tokenizer 402 | from gym import spaces 403 | from collections import OrderedDict 404 | import matplotlib.pyplot as plt 405 | 406 | output_tensor_space = spaces.Dict( 407 | OrderedDict( 408 | [ 409 | ("terminate_episode", spaces.Discrete(4)), 410 | ( 411 | "world_vector", 412 | spaces.Box(low=-0.025, high=0.025, shape=(3,), dtype=np.float32), 413 | ), 414 | ( 415 | "rotation_delta", 416 | spaces.Box( 417 | low=-np.pi / 20, 418 | 
high=np.pi / 20, 419 | shape=(3,), 420 | dtype=np.float32, 421 | ), 422 | ), 423 | ( 424 | "gripper_closedness_action", 425 | spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32), 426 | ), 427 | ] 428 | ) 429 | ) 430 | at = action_tokenizer.RT1ActionTokenizer( 431 | output_tensor_space, vocab_size=256 # action space 432 | ) 433 | dataloader = DataLoader(dataset, batch_size=64, num_workers=64) 434 | for i, batch in tqdm(enumerate(dataloader), total=len(dataset) // 64): 435 | batch = at.tokenize(batch[1]) 436 | for i in range(batch.size(0)): 437 | wv_x.append(int(batch[i, -1, 1])) 438 | wv_y.append(int(batch[i, -1, 2])) 439 | wv_z.append(int(batch[i, -1, 3])) 440 | rd_x.append(int(batch[i, -1, 4])) 441 | rd_y.append(int(batch[i, -1, 5])) 442 | rd_z.append(int(batch[i, -1, 6])) 443 | # print(batch) 444 | plt.subplot(2, 3, 1) 445 | plt.title("world_vector_x") 446 | plt.hist(wv_x, bins=256, range=(0, 256)) 447 | plt.xlim(0, 256) 448 | plt.subplot(2, 3, 2) 449 | plt.title("world_vector_y") 450 | plt.hist(wv_y, bins=256, range=(0, 256)) 451 | plt.xlim(0, 256) 452 | plt.subplot(2, 3, 3) 453 | plt.title("world_vector_z") 454 | plt.hist(wv_z, bins=256, range=(0, 256)) 455 | plt.xlim(0, 256) 456 | plt.subplot(2, 3, 4) 457 | plt.title("rotation_delta_x") 458 | plt.hist(rd_x, bins=256, range=(0, 256)) 459 | plt.xlim(0, 256) 460 | plt.subplot(2, 3, 5) 461 | plt.title("rotation_delta_y") 462 | plt.hist(rd_y, bins=256, range=(0, 256)) 463 | plt.xlim(0, 256) 464 | plt.subplot(2, 3, 6) 465 | plt.title("rotation_delta_z") 466 | plt.hist(rd_z, bins=256, range=(0, 256)) 467 | plt.xlim(0, 256) 468 | plt.show() 469 | -------------------------------------------------------------------------------- /IO_evaluator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet as p 3 | import pybullet_data as pdata 4 | import numpy as np 5 | import argparse 6 | import random 7 | import time 8 | import os 9 | import cv2 10 | from sim_env import get_cam_view_img 11 | from skvideo.io import vwrite 12 | import torch 13 | from gym import spaces 14 | from collections import OrderedDict 15 | import copy 16 | from func_timeout import func_set_timeout 17 | import func_timeout 18 | from tqdm import trange 19 | import json 20 | 21 | from maruya24_rt1.transformer_network import TransformerNetwork 22 | from maruya24_rt1.transformer_network_test_set_up import state_space_list 23 | from maruya24_rt1.tokenizers.utils import batched_space_sampler 24 | from maruya24_rt1.tokenizers.utils import np_to_tensor 25 | from sim_env import * 26 | import util.misc as utils 27 | import torch.nn.functional as F 28 | from torchvision import transforms 29 | 30 | 31 | class SimTester: 32 | def __init__(self, task_name, max_step=200, device="cuda", sim_interval=0.005): 33 | p.connect(p.DIRECT) 34 | p.configureDebugVisualizer(p.COV_ENABLE_GUI, 0) 35 | p.setPhysicsEngineParameter(maxNumCmdPer1ms=1000) 36 | p.resetDebugVisualizerCamera( 37 | cameraDistance=1.5, 38 | cameraPitch=-20, 39 | cameraYaw=180, 40 | cameraTargetPosition=[0, 0, 0.6], 41 | ) 42 | p.setAdditionalSearchPath(pdata.getDataPath()) 43 | p.setGravity(0, 0, -9.8) 44 | self.task_env = globals()[TASKS[task_name]]() 45 | self.max_step = max_step 46 | self.device = torch.device(device) 47 | self.sim_interval = sim_interval 48 | 49 | def test_step(self, delta_ee_pos, delta_ee_rot, gripper_cmd, cam_views): 50 | try: 51 | self.execute_action(delta_ee_pos, delta_ee_rot, gripper_cmd, cam_views) 52 | except 
func_timeout.exceptions.FunctionTimedOut: 53 | # print("robot stuck") 54 | pass 55 | self.update_obs(cam_views) 56 | 57 | @func_set_timeout( 58 | 0.5 59 | ) # Timeout set to handle cases where the robot may get stuck, allowing the next action to execute 60 | def execute_action( 61 | self, delta_ee_pos, delta_ee_rot, gripper_cmd, cam_views, relative=True 62 | ): 63 | # Calculate the new end-effector pose based on relative or absolute movements 64 | if relative: 65 | # Obtain the current end-effector pose 66 | last_ee_pose = p.getLinkState( 67 | self.task_env.robot.robot_id, self.task_env.robot.ee_index 68 | )[0:2] 69 | 70 | # Calculate the target end-effector pose based on the relative movement 71 | target_ee_pose = p.multiplyTransforms( 72 | last_ee_pose[0], 73 | last_ee_pose[1], 74 | delta_ee_pos, 75 | p.getQuaternionFromEuler(delta_ee_rot), 76 | ) 77 | else: 78 | # Handling for absolute movements, if implemented 79 | raise NotImplementedError("Absolute movement handling not implemented yet") 80 | 81 | # Calculate inverse kinematics to obtain joint positions for the target end-effector pose 82 | tar_j_pos = self.task_env.robot.calc_ik([target_ee_pose[0], target_ee_pose[1]]) 83 | 84 | # Move the robot's arm to the calculated joint positions 85 | self.task_env.robot.move_arm(tar_j_pos) 86 | 87 | # Control the gripper based on the provided command 88 | self.task_env.robot.move_gripper(gripper_cmd[0]) 89 | 90 | # Ensure the robot reaches the target joint positions within a threshold 91 | while not (self.task_env.robot.is_j_arrived(tar_j_pos, threshold=1e-3)): 92 | p.stepSimulation() # Step the simulation 93 | time.sleep(self.sim_interval) # Wait for the simulation to progress 94 | 95 | def reset_tester(self, cam_views): 96 | self.task_env.reset_env() 97 | self.reset_obs(cam_views) 98 | self.gripper_triggered = False 99 | self.episode_succ = [False, False] 100 | 101 | def reset_obs(self, cam_views): 102 | self.imgs = [torch.zeros(1, 3, 256 * len(cam_views), 320)] * self.time_sequence_length 103 | self.joints = [torch.zeros(1, 9)] * self.time_sequence_length 104 | 105 | def update_obs(self, cam_views): 106 | # Obtain observation images from specified camera views 107 | img = self.get_obs_img(cam_views) 108 | 109 | # Remove the oldest image from the image collection and add the new observation image 110 | self.imgs.pop(0) 111 | self.imgs.append(torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0)) 112 | 113 | # Retrieve the current joint status 114 | joint = self.get_joint_status() 115 | 116 | # Remove the oldest joint status and add the current joint status to the collection 117 | self.joints.pop(0) 118 | self.joints.append(joint.unsqueeze(0)) 119 | 120 | def get_joint_status(self): 121 | """ 122 | Obtain the current joint status of the robot. 123 | 124 | Returns: 125 | - torch.tensor: Tensor containing the current joint states of the robot. 126 | """ 127 | return torch.tensor( 128 | [ 129 | s[0] 130 | for s in p.getJointStates( 131 | self.task_env.robot.robot_id, 132 | list(range(self.task_env.robot.arm_dof)) + [9, 10], 133 | ) 134 | ] 135 | ) 136 | 137 | def get_obs_img(self, cam_views): 138 | """ 139 | Generate observation images from specified camera views. 140 | 141 | Args: 142 | - cam_views (list): List of camera views. 143 | 144 | Returns: 145 | - np.ndarray: Observation image array. 
146 | """ 147 | imgs = [] 148 | for cview in cam_views: 149 | imgs.append( 150 | get_cam_view_img( 151 | cview, self.task_env.robot.robot_id, self.task_env.robot.ee_index 152 | ) 153 | ) 154 | cur_img = np.concatenate(imgs, axis=0) 155 | return cur_img / 255.0 156 | 157 | def check_episode_succ(self): 158 | """ 159 | Check if the episode is successful based on end-effector and target object positions. 160 | 161 | Returns: 162 | - float: Difference in z-axis positions between the end-effector and the target object. 163 | """ 164 | ee_z_pos = p.getLinkState( 165 | self.task_env.robot.robot_id, self.task_env.robot.ee_index 166 | )[0][2] 167 | tar_obj_z_pos = self.task_env.tar_obj_pose[0][2] 168 | contact_points = p.getContactPoints( 169 | self.task_env.robot.robot_id, self.task_env.tar_obj 170 | ) 171 | if abs(ee_z_pos - tar_obj_z_pos) < 0.035 and len(contact_points) > 0: 172 | self.episode_succ[0] = True 173 | if ee_z_pos - tar_obj_z_pos > 0.08 and len(contact_points) > 0: 174 | self.episode_succ[1] = True 175 | return ee_z_pos - tar_obj_z_pos 176 | 177 | 178 | def load_config_from_json(json_path): 179 | with open(json_path, "r") as f: 180 | config = json.load(f) 181 | return config 182 | 183 | 184 | class Evaluator: 185 | def __init__(self, task_name) -> None: 186 | self.sim_tester = SimTester(task_name) 187 | self.pandaEndEffectorIndex = 11 188 | args = load_config_from_json("train_config.json") 189 | self.args = args 190 | 191 | def write_results(self, fn, episode_succ): 192 | with open(fn, "a") as f: 193 | f.write(str(int(episode_succ[0])) + str(int(episode_succ[1])) + "\n") 194 | f.close() 195 | 196 | def calc_fk(self, obs): 197 | """ 198 | get end effector's position and orientation in world coordinate system 199 | Parameter: 200 | - obs(dict): observations with joints status 201 | Returns: 202 | - obs(dict): position and orientation will be stored in obs 203 | """ 204 | ee_position, ee_orientation = [], [] 205 | for joint in obs["joint_position"]: 206 | position, orientation = [], [] 207 | for i in range(len(joint)): 208 | p.resetJointStatesMultiDof( 209 | self.sim_tester.task_env.robot.robot_id, 210 | range(9), 211 | [[pos] for pos in joint[i]], 212 | ) 213 | pos, orn = p.getLinkState(self.sim_tester.task_env.robot.robot_id, 11)[ 214 | :2 215 | ] 216 | pos = list(pos) 217 | pos.append(0) 218 | position.append(torch.FloatTensor(pos)) 219 | orientation.append(torch.FloatTensor(orn)) 220 | ee_position.append(torch.stack(position)) 221 | ee_orientation.append(torch.stack(orientation)) 222 | obs["position"] = torch.stack(ee_position).to(self.device) 223 | obs["orientation"] = torch.stack(ee_orientation).to(self.device) 224 | del obs["joint_position"] 225 | return obs 226 | 227 | def inference(self, network, imgs, joints): 228 | """ 229 | Perform inference using the provided neural network on input observations. 230 | 231 | Args: 232 | - network (NeuralNetwork): Neural network model for inference. 233 | - imgs (list): List of observation images. 234 | - joints (list): List of joint positions. 235 | 236 | Returns: 237 | - list: List containing the inferred actions related to termination, world vector, rotation delta, and gripper closedness. 
238 | """ 239 | # Batched space sampling for the network state 240 | network_state = batched_space_sampler(network._state_space, batch_size=1) 241 | network_state = np_to_tensor(network_state) # Convert np.ndarray to tensor 242 | for k, v in network_state.items(): 243 | network_state[k] = torch.zeros_like(v).to(self.device) 244 | 245 | output_actions = [] 246 | obs = dict() 247 | obs["image"] = torch.stack(imgs, dim=1).to( 248 | self.device 249 | ) # Stack images on the device 250 | obs["natural_language_embedding"] = torch.zeros( 251 | 1, network._time_sequence_length, network._language_embedding_size 252 | ).to( 253 | self.device 254 | ) # Prepare language embedding tensor 255 | 256 | # Stack joint positions and adjust dimensions 257 | obs["joint_position"] = torch.stack(joints).permute(1, 0, 2) 258 | 259 | # If the network uses proprioception, calculate forward kinematics 260 | if network.using_proprioception: 261 | obs = self.calc_fk(obs) 262 | 263 | with torch.no_grad(): 264 | for i_ts in range(network._time_sequence_length): 265 | ob = utils.retrieve_single_timestep(obs, i_ts) 266 | output_action, network_state = network(ob, network_state) 267 | output_actions.append(output_action) 268 | 269 | # Retrieve the final inferred action 270 | action = output_actions[-1] 271 | 272 | # Move the action to the CPU and convert it to a list for return 273 | action = utils.dict_to_device(action, torch.device("cpu")) 274 | return [ 275 | action["terminate_episode"].flatten().tolist(), 276 | action["world_vector"].flatten().tolist(), 277 | action["rotation_delta"].flatten().tolist(), 278 | action["gripper_closedness_action"].flatten().tolist(), 279 | ] 280 | 281 | def val( 282 | self, 283 | resume_from_checkpoint, 284 | ckpt_dir, 285 | proc_name, 286 | cam_views, 287 | test_num, 288 | results_fn, 289 | epoch, 290 | gpu_name, 291 | using_pos_and_orn, 292 | tar_obj_pos_rot, 293 | ): 294 | self.resume_from_checkpoint = resume_from_checkpoint 295 | 296 | checkpoint = torch.load(resume_from_checkpoint, map_location="cpu") 297 | self._action_space = checkpoint["action_space"] 298 | network_configs = self.args["network_configs"] 299 | network_configs["time_sequence_length"] = self.args["time_sequence_length"] 300 | network_configs["num_encoders"] = len(self.args["cam_view"]) 301 | network_configs["using_proprioception"] = self.args["using_proprioception"] 302 | network_configs["token_embedding_size"] = network_configs[ 303 | "token_embedding_size_per_image" 304 | ] * len(self.args["cam_view"]) 305 | del network_configs["token_embedding_size_per_image"] 306 | network_configs["input_tensor_space"] = state_space_list()[0] 307 | network_configs["output_tensor_space"] = self._action_space 308 | self.network = TransformerNetwork(**network_configs) 309 | setattr(self.sim_tester, "time_sequence_length", self.network._time_sequence_length) 310 | try: 311 | local_rank = os.environ["LOCAL_RANK"] 312 | torch.cuda.set_device(int(local_rank)) 313 | except: 314 | pass 315 | self.device = torch.device("cuda") 316 | self.network.to(self.device) 317 | self.network.load_state_dict(checkpoint["model_state_dict"]) 318 | self.network.eval() 319 | try: 320 | os.mkdir(os.path.join(ckpt_dir, "vids")) 321 | except: 322 | pass 323 | lift_count = 0 324 | for idx in range(test_num): 325 | vid_fn = os.path.join( 326 | ckpt_dir, 327 | "vids", 328 | "e" 329 | + str(epoch) 330 | + "_i" 331 | + str(idx) 332 | + "_p" 333 | + str(proc_name) 334 | + "_g" 335 | + str(gpu_name) 336 | + ".mp4", 337 | ) 338 | 
self.sim_tester.reset_tester(cam_views) 339 | self.sim_tester.task_env.reset_tar_obj( 340 | tar_pos_rot=tar_obj_pos_rot, random_pos_rot=False 341 | ) 342 | time.sleep(1) 343 | self.sim_tester.update_obs(cam_views) 344 | vid = [] 345 | for _ in trange(self.sim_tester.max_step): 346 | imgs = self.sim_tester.imgs 347 | joints = self.sim_tester.joints 348 | ( 349 | terminate_episode, 350 | delta_ee_pos, 351 | delta_ee_rot, 352 | gripper_cmd, 353 | ) = self.inference(self.network, imgs, joints) 354 | self.sim_tester.test_step( 355 | delta_ee_pos, delta_ee_rot, gripper_cmd, cam_views 356 | ) 357 | distance_to_target = self.sim_tester.check_episode_succ() 358 | frame = copy.deepcopy(imgs[-1][0].permute(1, 2, 0).numpy() * 255) 359 | frame_rgb = cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_BGR2RGB) 360 | text = ( 361 | "epoch: " 362 | + str(epoch) 363 | + " dis: " 364 | + str(round(distance_to_target, 3)) 365 | ) 366 | cv2.putText( 367 | frame_rgb, text, (0, 20), cv2.FONT_ITALIC, 0.75, (0, 0, 0), 2 368 | ) 369 | if self.sim_tester.episode_succ[0] == True: 370 | cv2.putText( 371 | frame_rgb, 372 | "grabbed", 373 | (0, 40), 374 | cv2.FONT_ITALIC, 375 | 0.75, 376 | (0, 255, 0), 377 | 2, 378 | ) 379 | else: 380 | cv2.putText( 381 | frame_rgb, 382 | "not grabbed", 383 | (0, 40), 384 | cv2.FONT_ITALIC, 385 | 0.75, 386 | (0, 0, 255), 387 | 2, 388 | ) 389 | if self.sim_tester.episode_succ[1] == True: 390 | cv2.putText( 391 | frame_rgb, 392 | "lifted", 393 | (0, 60), 394 | cv2.FONT_ITALIC, 395 | 0.75, 396 | (0, 255, 0), 397 | 2, 398 | ) 399 | else: 400 | cv2.putText( 401 | frame_rgb, 402 | "not lifted", 403 | (0, 60), 404 | cv2.FONT_ITALIC, 405 | 0.75, 406 | (0, 0, 255), 407 | 2, 408 | ) 409 | frame_rgb = cv2.cvtColor(frame_rgb, cv2.COLOR_BGR2RGB) 410 | vid.append(frame_rgb) 411 | if self.sim_tester.episode_succ[1] == True: 412 | lift_count += 1 413 | if lift_count >= 5: 414 | print("lifted") 415 | break 416 | self.sim_tester.check_episode_succ() 417 | self.write_results(results_fn, self.sim_tester.episode_succ) 418 | vwrite(vid_fn, vid) 419 | 420 | 421 | def test_loss( 422 | ckpt_dir, 423 | evaluator, 424 | cam_views, 425 | model_name, 426 | proc_name, 427 | epoch, 428 | gpu_name, 429 | using_pos_and_orn, 430 | num_threads, 431 | ): 432 | cam_views = cam_views.split("_") 433 | fn = os.path.join(ckpt_dir, str(epoch) + ".txt") 434 | fn_episodes = os.path.join(ckpt_dir, str(epoch) + "_val_episodes.txt") 435 | if num_threads > 0: 436 | with open(fn_episodes, "r") as f: 437 | val_episode_dirs = f.readlines() 438 | 439 | val_episode_dir = val_episode_dirs[ 440 | int(gpu_name) * int(num_threads) + int(proc_name) 441 | ].strip() 442 | 443 | visual_data_filename_raw = f"{val_episode_dir}result_raw.csv" 444 | raw_raw_data = pd.read_csv(visual_data_filename_raw) 445 | # print(raw_raw_data.iloc[0]) 446 | tar_obj_pos_rot = [ 447 | raw_raw_data.iloc[0]["tar_obj_pose_x"], 448 | raw_raw_data.iloc[0]["tar_obj_pose_y"], 449 | raw_raw_data.iloc[0]["tar_obj_pose_rx"], 450 | raw_raw_data.iloc[0]["tar_obj_pose_ry"], 451 | raw_raw_data.iloc[0]["tar_obj_pose_rz"], 452 | raw_raw_data.iloc[0]["tar_obj_pose_rw"], 453 | ] 454 | else: 455 | tar_obj_pos_rot = [0, 0, 0, 0, 0, 0, 0] 456 | utils.set_seed() 457 | evaluator.val( 458 | os.path.join(ckpt_dir, model_name), 459 | ckpt_dir=ckpt_dir, 460 | proc_name=proc_name, 461 | cam_views=cam_views, 462 | test_num=1, 463 | results_fn=fn, 464 | epoch=epoch, 465 | gpu_name=gpu_name, 466 | using_pos_and_orn=using_pos_and_orn, 467 | tar_obj_pos_rot=tar_obj_pos_rot, 468 | ) 469 | 470 | 471 | if 
__name__ == "__main__": 472 | parser = argparse.ArgumentParser() 473 | parser.add_argument("-t", "--task", type=str, default="pick") 474 | parser.add_argument("-d", "--ckpt_dir", type=str, default="/mnt/logs_1/1700447417") 475 | parser.add_argument("-c", "--cam_views", type=str) 476 | parser.add_argument("-m", "--model_name", type=str, default="49-checkpoint.pth") 477 | parser.add_argument("-p", "--proc_name", type=int, default=200) 478 | parser.add_argument("-e", "--epoch_num", type=str, default="0") 479 | parser.add_argument("-g", "--gpu_name", type=str, default="3") 480 | parser.add_argument("-u", "--using_pos_and_orn", type=int, default=False) 481 | parser.add_argument("-n", "--num_threads", type=int, default=0) 482 | 483 | args = parser.parse_args() 484 | print(args.proc_name) 485 | # exit() 486 | task_name = args.task 487 | evaluator = Evaluator(task_name) 488 | test_loss( 489 | ckpt_dir=args.ckpt_dir, 490 | evaluator=evaluator, 491 | cam_views=args.cam_views, 492 | model_name=args.model_name, 493 | proc_name=args.proc_name, 494 | epoch=args.epoch_num, 495 | gpu_name=args.gpu_name, 496 | using_pos_and_orn=bool(args.using_pos_and_orn), 497 | num_threads=args.num_threads, 498 | ) 499 | # python IO_evaluator.py -d /mnt/logs_1/1698325798/999-checkpoint.pth -c front wrist -m 999-checkpoint.pth 500 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [2023] [IO intelligence] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reproducing RT-1 in PyTorch 2 | 3 | [![IO](https://img.shields.io/badge/io%20intelligence%20-000000)](https://io-ai.tech) 4 | 5 | This repository contains complete PyTorch implementation for **RT-1** based on the paper: [RT-1 Paper](https://arxiv.org/abs/2212.06817) and implementation of RT-1 model by **maruya24's RT-1**: [maruya24's RT-1 GitHub](https://github.com/maruya24/pytorch_robotics_transformer). 
Our implementation completes the training and validation pipelines, with an evaluation pipeline coming soon. 6 | 7 | **Try our implementation in [Colab](https://drive.google.com/file/d/18nWZ6pgy2_0fS8BjZsUiTjOFaE6WXMi3/view?usp=sharing)** 8 | 9 | ## Acknowledgements 10 | 11 | - **maruya24**: For their work on RT-1, which serves as the foundation for this implementation - [maruya24's RT-1 GitHub](https://github.com/maruya24/pytorch_robotics_transformer) 12 | - Changes to the model structure: similar to [diffusion policy](https://diffusion-policy.cs.columbia.edu/), the robot end effector's position and orientation are concatenated to the end of the token sequence before it is fed into the transformer 13 | - **detr (Facebook)**: Utilities for distributed training from DETR - [detr/util/misc.py](https://github.com/facebookresearch/detr/blob/main/util/misc.py). 14 | 15 | ## Training 16 | 17 | To train RT-1 in distributed mode, replace `[your_gpu_num]` with the number of GPUs you want to use: 18 | 19 | ```bash 20 | python -m torch.distributed.launch --nproc_per_node=[your_gpu_num] --use_env IO_trainer_torch.py 21 | ``` 22 | 23 | The training configuration is stored in `train_config.json`. 24 | 25 | ## Training Configuration 26 | 27 | The training configuration for RT-1 includes various parameters that influence the training process. Here's an overview of the key configurations: 28 | 29 | - **Mode:** Training mode, options ['train', 'eval'] 30 | - **Device:** Device for computation, options ['cpu', 'cuda'] 31 | - **Data Path:** Path to the dataset folder `[robotname]_[taskname]` 32 | - **Camera Views:** Views used in training (`front`, `wrist`, ...; see the dataset folder) 33 | - **Log Directory:** Directory to store logs 34 | - **Time Sequence Length:** Length of the time sequence (e.g., 6); RT-1 takes a history of image timesteps as part of its input, i.e., `1` frame for the current timestep and `time_sequence_length - 1` frames of history. 
35 | - **Learning Rate:** Initial learning rate 36 | - **Batch Size:** Size of each training batch 37 | - **Epochs:** Number of training epochs 38 | - **Resume:** Whether to resume training from a checkpoint 39 | - **Resume from checkpoint:** Path of the checkpoint to resume training from 40 | - **World size:** Number of GPUs you intend to use during training 41 | - **Dist url:** URL used to initialize distributed training (e.g., `"env://"`) 42 | - **Validation Interval:** Interval between validation steps 43 | - **Num train episode:** Number of training episodes used 44 | - **Num val episode:** Number of validation episodes used 45 | - **Network Configurations:** Parameters for the network architecture 46 | - **Scheduler Configurations:** Parameters for the learning rate scheduler 47 | 48 | ### Example Configuration 49 | 50 | 51 | ```json 52 | { 53 | "mode": "train", 54 | "device": "cuda", 55 | "data_path": "IO_pybullet_open_dataset/Panda_pick", 56 | "cam_view" : ["front", "wrist"], 57 | "log_dir": "/mnt/logs_1", 58 | "time_sequence_length": 6, 59 | "lr": 0.0001, 60 | "batch_size": 6, 61 | "epochs": 50, 62 | "resume": false, 63 | "resume_from_checkpoint": "", 64 | "predicting_next_ts": true, 65 | "world_size": 4, 66 | "dist_url": "env://", 67 | "val_interval" : 25, 68 | "num_val_threads": 25, 69 | "num_train_episode" : 200, 70 | "num_val_episode" : 10, 71 | "using_proprioception" : false, 72 | "network_configs": { 73 | "vocab_size" : 256, 74 | "token_embedding_size_per_image" : 512, 75 | "language_embedding_size" : 512, 76 | "num_layers" : 8, 77 | "layer_size" : 128, 78 | "num_heads" : 8, 79 | "feed_forward_size" : 512, 80 | "dropout_rate" : 0.1, 81 | "crop_size" : 236, 82 | "use_token_learner" : true 83 | }, 84 | "scheduler_configs" : { 85 | "T_0" : 10, 86 | "T_mult" : 2, 87 | "eta_min" : 1e-6, 88 | "verbose" : true 89 | } 90 | 91 | } 92 | 93 | ``` 94 | Pretrained weights trained with the settings above can be downloaded from [pretrained_weights](https://drive.google.com/uc?export=download&id=1USLqOfqYfqIrigx1hkY37SGyrLJuaPwc); set "resume_from_checkpoint" to the path of the pretrained weights and set "resume" to true to resume from that checkpoint. 95 | 96 | 97 | ## Limitations 98 | 99 | - We are currently validating the code in the PyBullet environment. Validation code will be added within a week. 100 | - Training is presently limited to the single-task setting. 101 | 102 | ## Dataset Structure 103 | 104 | Our dataset follows a specific file structure: 105 | 106 | - [robotname]_[taskname] 107 | - [cam_view_0] 108 | - data_000 109 | - rgb # Images 110 | - image_001.png 111 | - image_002.png 112 | - ... 113 | - results.csv # Robot actions 114 | - results_raw.csv # Joint and target object positions 115 | - data_001 116 | - ... 117 | - [cam_view_1] 118 | - data_000 119 | - data_001 120 | - ... 121 | - ... 122 | 123 | Similar to the lift task in [robomimic](https://robomimic.github.io/), we collected the dataset from both third-person and first-person camera perspectives. 124 | 125 | You can download our dataset, collected in PyBullet, from [IO_open_dataset](https://drive.google.com/uc?export=download&id=1RoTxnipQf2SIXqzvroDOXAleNvvNIIwZ). 
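The snippet below is a minimal sketch (not one of the repository's files) of how this dataset layout and `train_config.json` plug into `build_dataset` from `IO_dataset_torch.py`. It assumes the downloaded dataset archive has been extracted so that the config's `data_path` points at a `[robotname]_[taskname]` folder such as `Panda_pick`.

```python
# Minimal sketch: build the train/val datasets from train_config.json and inspect one batch.
# Assumes the dataset archive has been extracted to the "data_path" given in the config.
import json

from torch.utils.data import DataLoader

from IO_dataset_torch import build_dataset

with open("train_config.json", "r") as f:
    config = json.load(f)

train_dataset, val_dataset = build_dataset(
    data_path=config["data_path"],
    time_sequence_length=config["time_sequence_length"],
    predicting_next_ts=config["predicting_next_ts"],
    num_train_episode=config["num_train_episode"],
    num_val_episode=config["num_val_episode"],
    cam_view=config["cam_view"],
    language_embedding_size=config["network_configs"]["language_embedding_size"],
)

loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
obs, action = next(iter(loader))
# obs["image"]: (batch, time_sequence_length, 3, H, W); camera views are stacked along H.
# action keys: "world_vector", "rotation_delta", "gripper_closedness_action", "terminate_episode".
print(obs["image"].shape, action["world_vector"].shape)
```

Each sample is a pair of observation and action dictionaries of tensors, so the default PyTorch collation used by `DataLoader` batches them without a custom `collate_fn`.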
126 | 127 | ## Contacts 128 | 129 | Join wechat group for discussion 130 | 131 | 132 | ![wechat group](img/wechatgroup.jpg) 133 | 134 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """robotics_transformer API.""" 15 | 16 | # A new PyPI release will be pushed everytime `__version__` is increased 17 | # When changing this, also update the CHANGELOG.md 18 | __version__ = '0.1.0' 19 | from .maruya24_rt1 import * 20 | from .util import * 21 | 22 | -------------------------------------------------------------------------------- /img/wechatgroup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/img/wechatgroup.jpg -------------------------------------------------------------------------------- /maruya24_rt1/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | *.png 4 | unused_file/ -------------------------------------------------------------------------------- /maruya24_rt1/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /maruya24_rt1/README.md: -------------------------------------------------------------------------------- 1 | # RT-1: Robotics Transformer, in PyTorch 2 | PyTorch re-implementation of [RT-1](https://github.com/google-research/robotics_transformer). This model is based on 2022 paper ["RT-1: Robotics Transformer for Real-World Control at Scale"](https://arxiv.org/abs/2212.06817). 
3 | Unlike other PyTorch re-implementations such as the [lucidrains version](https://github.com/lucidrains/robotic-transformer-pytorch), this implementation closely follows the [original code](https://github.com/google-research/robotics_transformer); that said, I made some changes for PyTorch compatibility and simplification. 4 | 5 | ## Features 6 | * FiLM EfficientNet-based image tokenizer backbone 7 | * TokenLearner-based compression of input tokens 8 | * Transformer for end-to-end robotic control 9 | * Testing utilities 10 | 11 | 12 | ## Main Changes 13 | - Translated the TensorFlow implementation into PyTorch. 14 | - Used OpenAI Gym spaces to define observation and action variables instead of the tensor_spec of TF-Agents. 15 | - Abolished the 4-d constraint on some specs in transformer_network.py. 16 | - Added useful functions and put them in utils.py. 17 | - Added extra comments. 18 | - Did not implement the add_summaries function of transformer_network.py, which is for TensorBoard logging. 19 | - Omitted some variables, functions, classes, and tests, including those that are no longer necessary for this repository. 20 | 21 | ## Note 22 | I did not implement sequence_agent.py and its related code, because that code handles training and inference through the tf_agents library and is therefore unnecessary in a PyTorch implementation. 23 | This repository lets you use RT-1 just like other reinforcement learning code that relies on PyTorch and OpenAI Gym, without the tf_agents library. 24 | 25 | 26 | ## License 27 | This library is licensed under the terms of the Apache License. 28 | -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/maruya24_rt1/film_efficientnet/__init__.py -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/efficientnet_checkpoints/efficientnetb3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/maruya24_rt1/film_efficientnet/efficientnet_checkpoints/efficientnetb3.pth -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/efficientnet_checkpoints/efficientnetb3_notop.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/maruya24_rt1/film_efficientnet/efficientnet_checkpoints/efficientnetb3_notop.pth -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/film_conditioning_layer.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import torch 21 | import torch.nn as nn 22 | 23 | 24 | class FilmConditioning(nn.Module): 25 | def __init__( 26 | self, 27 | num_channels: int, 28 | text_embed_dim: int = 512, 29 | ): 30 | super().__init__() 31 | self._projection_add = nn.Linear(text_embed_dim, num_channels) 32 | self._projection_mult = nn.Linear(text_embed_dim, num_channels) 33 | 34 | # Note that we initialize with zeros because empirically we have found 35 | # this works better than initializing with glorot. 36 | nn.init.constant_(self._projection_add.weight, 0) 37 | nn.init.constant_(self._projection_add.bias, 0) 38 | nn.init.constant_(self._projection_mult.weight, 0) 39 | nn.init.constant_(self._projection_mult.bias, 0) 40 | 41 | # conv_filter: feature maps which corresponds to F in FiLM paper. (B, C, H, W) 42 | # conditioning: text which corresponds to x in FiLM paper. this is one vector that is created from a text, 43 | # note that this is not embedding vectors from a text. Please refer to Universal Sentence Encoder. (B, D). D = 512. 44 | def forward(self, conv_filters: torch.Tensor, conditioning: torch.Tensor): 45 | projected_cond_add = self._projection_add(conditioning) # (B, D) -> (B, C) 46 | projected_cond_mult = self._projection_mult(conditioning) 47 | 48 | projected_cond_add = projected_cond_add.unsqueeze(2).unsqueeze( 49 | 3 50 | ) # (B, C) -> (B, C, 1, 1) 51 | projected_cond_mult = projected_cond_mult.unsqueeze(2).unsqueeze(3) 52 | 53 | # Original FiLM paper argues that 1 + gamma centers the initialization at 54 | # identity transform. 55 | # see 7.2 section in FiLM paper 56 | result = (1 + projected_cond_mult) * conv_filters + projected_cond_add 57 | 58 | return result 59 | -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/film_efficientnet_encoder.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 
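# A minimal usage sketch of the FilmConditioning layer defined in film_conditioning_layer.py
# above (the tensor sizes here are illustrative, not taken from a config in this repository):
#
#     import torch
#     from maruya24_rt1.film_efficientnet.film_conditioning_layer import FilmConditioning
#
#     film = FilmConditioning(num_channels=512, text_embed_dim=512)
#     conv_filters = torch.randn(2, 512, 10, 10)  # (B, C, H, W) feature maps
#     conditioning = torch.randn(2, 512)          # (B, D) sentence embedding
#     out = film(conv_filters, conditioning)      # (B, C, H, W): (1 + gamma) * F + beta
#
# Because both projections are zero-initialized, the layer starts out as an identity
# mapping over conv_filters and only learns to modulate the features during training.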
19 | 20 | import copy 21 | import math 22 | import os 23 | import json 24 | 25 | import torch 26 | import torch.nn as nn 27 | import torch.nn.functional as F 28 | from torchvision.ops import StochasticDepth 29 | from torchvision.ops.misc import Conv2dNormActivation 30 | import numpy as np 31 | 32 | from maruya24_rt1.film_efficientnet.film_conditioning_layer import FilmConditioning 33 | 34 | 35 | # This is based on Table 1 in a EfficientNet paper. 36 | DEFAULT_BLOCKS_ARGS = [ 37 | { 38 | "kernel_size": 3, 39 | "repeats": 1, 40 | "in_size": 32, 41 | "out_size": 16, 42 | "expand_ratio": 1, 43 | "id_skip": True, 44 | "strides": 1, 45 | "se_ratio": 0.25, 46 | }, 47 | { 48 | "kernel_size": 3, 49 | "repeats": 2, 50 | "in_size": 16, 51 | "out_size": 24, 52 | "expand_ratio": 6, 53 | "id_skip": True, 54 | "strides": 2, 55 | "se_ratio": 0.25, 56 | }, 57 | { 58 | "kernel_size": 5, 59 | "repeats": 2, 60 | "in_size": 24, 61 | "out_size": 40, 62 | "expand_ratio": 6, 63 | "id_skip": True, 64 | "strides": 2, 65 | "se_ratio": 0.25, 66 | }, 67 | { 68 | "kernel_size": 3, 69 | "repeats": 3, 70 | "in_size": 40, 71 | "out_size": 80, 72 | "expand_ratio": 6, 73 | "id_skip": True, 74 | "strides": 2, 75 | "se_ratio": 0.25, 76 | }, 77 | { 78 | "kernel_size": 5, 79 | "repeats": 3, 80 | "in_size": 80, 81 | "out_size": 112, 82 | "expand_ratio": 6, 83 | "id_skip": True, 84 | "strides": 1, 85 | "se_ratio": 0.25, 86 | }, 87 | { 88 | "kernel_size": 5, 89 | "repeats": 4, 90 | "in_size": 112, 91 | "out_size": 192, 92 | "expand_ratio": 6, 93 | "id_skip": True, 94 | "strides": 2, 95 | "se_ratio": 0.25, 96 | }, 97 | { 98 | "kernel_size": 3, 99 | "repeats": 1, 100 | "in_size": 192, 101 | "out_size": 320, 102 | "expand_ratio": 6, 103 | "id_skip": True, 104 | "strides": 1, 105 | "se_ratio": 0.25, 106 | }, 107 | ] 108 | 109 | CONV_KERNEL_INITIALIZER = { 110 | "class_name": "VarianceScaling", 111 | "config": {"scale": 2.0, "mode": "fan_out", "distribution": "truncated_normal"}, 112 | } 113 | 114 | DENSE_KERNEL_INITIALIZER = { 115 | "class_name": "VarianceScaling", 116 | "config": {"scale": 1.0 / 3.0, "mode": "fan_out", "distribution": "uniform"}, 117 | } 118 | 119 | 120 | # Multiply the number of filters by width_coefficient. 121 | # Usually, divisor = 8. This means new_filters is a multiple of 8. 122 | # Filters means channels. 123 | # We round by the 8, not by the 10. 124 | def round_filters(filters, divisor, width_coefficient): 125 | """Round number of filters based on depth multiplier.""" 126 | filters *= width_coefficient 127 | 128 | # int(filters + divisor / 2) // divisor * divisor is an operation like rounding off. 129 | # Usual rounding off is operated at 5(=10 / 2) interval. But this is operated at 4(=8 / 2) interval. 130 | # If filters = 35, new_filter = 32. If filters = 36, new_filter = 40. 131 | new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor) 132 | 133 | # Make sure that round down does not go down by more than 10%. 134 | if new_filters < 0.9 * filters: 135 | new_filters += divisor 136 | return int(new_filters) 137 | 138 | 139 | # Multiply the number of repeats by depth_coefficient. 
140 | def round_repeats(repeats, depth_coefficient): 141 | """Round number of repeats based on depth multiplier.""" 142 | return int(math.ceil(depth_coefficient * repeats)) 143 | 144 | 145 | class SeModule(nn.Module): 146 | def __init__(self, expand_size, block_in_size, se_ratio=0.25): 147 | super(SeModule, self).__init__() 148 | 149 | se_size = max(1, int(block_in_size * se_ratio)) 150 | 151 | self.avgpool = nn.AdaptiveAvgPool2d(1) 152 | self.fc1 = nn.Conv2d( 153 | expand_size, se_size, kernel_size=1, stride=1, padding=0 154 | ) # Note that we use bias=True here. 155 | self.silu0 = nn.SiLU(inplace=True) 156 | self.fc2 = nn.Conv2d( 157 | se_size, expand_size, kernel_size=1, stride=1, padding=0 158 | ) # Note that we use bias=True here. 159 | self.act = nn.Sigmoid() 160 | 161 | def forward(self, inputs): 162 | x = self.avgpool(inputs) 163 | x = self.fc1(x) 164 | x = self.silu0(x) 165 | x = self.fc2(x) 166 | x = self.act(x) 167 | return inputs * x 168 | 169 | 170 | # Inverted Residual + Squeeze-and-Excitation 171 | class MBConvBlock(nn.Module): 172 | """expand + depthwise + pointwise""" 173 | 174 | def __init__( 175 | self, 176 | kernel_size, 177 | in_size, 178 | out_size, 179 | expand_ratio, 180 | id_skip, 181 | strides, 182 | se_ratio, 183 | drop_rate, 184 | ): 185 | super(MBConvBlock, self).__init__() 186 | self.in_size = in_size 187 | self.out_size = out_size 188 | self.expand_ratio = expand_ratio 189 | self.strides = strides 190 | self.se_ratio = se_ratio 191 | self.id_skip = id_skip 192 | self.drop_rate = drop_rate 193 | 194 | expand_size = in_size * expand_ratio 195 | 196 | layers = [] 197 | 198 | if not (1 <= strides <= 2): 199 | raise ValueError("illegal stride value") 200 | 201 | # expansion phase (1x1 conv) 202 | if expand_ratio != 1: 203 | layers.append( 204 | Conv2dNormActivation( 205 | in_size, 206 | expand_size, 207 | kernel_size=1, 208 | stride=1, 209 | norm_layer=nn.BatchNorm2d, 210 | activation_layer=nn.SiLU, 211 | ) 212 | ) 213 | 214 | # depthwise conv 215 | layers.append( 216 | Conv2dNormActivation( 217 | expand_size, 218 | expand_size, 219 | kernel_size=kernel_size, 220 | stride=strides, 221 | groups=expand_size, 222 | norm_layer=nn.BatchNorm2d, 223 | activation_layer=nn.SiLU, 224 | ) 225 | ) 226 | 227 | # Squeeze-and-Excitation module 228 | if 0 < se_ratio <= 1: 229 | layers.append(SeModule(expand_size, in_size, se_ratio)) 230 | 231 | # output phase (1x1 conv) 232 | layers.append( 233 | Conv2dNormActivation( 234 | expand_size, 235 | out_size, 236 | kernel_size=1, 237 | stride=1, 238 | norm_layer=nn.BatchNorm2d, 239 | activation_layer=None, # Note that there is no activation here 240 | ) 241 | ) 242 | # self.output_conv = nn.Conv2d(expand_size, out_size, kernel_size=1, stride=1, padding=0, bias=False) 243 | # self.bn2 = nn.BatchNorm2d(out_size) 244 | 245 | self.block = nn.Sequential(*layers) 246 | 247 | # Dropout 248 | if drop_rate > 0: 249 | self.dropout = StochasticDepth(drop_rate, "row") 250 | 251 | def forward(self, inputs): 252 | x = self.block(inputs) 253 | 254 | # Dropout and skip connection 255 | if self.id_skip and self.strides == 1 and self.in_size == self.out_size: 256 | if self.drop_rate > 0: 257 | x = self.dropout(x) 258 | x = inputs + x 259 | 260 | return x 261 | 262 | 263 | class EfficientNet(nn.Module): 264 | def __init__( 265 | self, 266 | width_coefficient, 267 | depth_coefficient, 268 | dropout_rate=0.2, 269 | drop_connect_rate=0.2, 270 | depth_divisor=8, 271 | blocks_args="default", 272 | include_top=True, 273 | classes=1000, 274 | include_film=False, 
275 | text_embed_dim=512, 276 | ): 277 | super().__init__() 278 | self.dropout_rate = dropout_rate 279 | self.include_top = include_top 280 | self.include_film = include_film 281 | 282 | # input dimensions 283 | in_channels = 3 284 | 285 | # get defualt setting of MBConv blocks 286 | if blocks_args == "default": 287 | blocks_args = DEFAULT_BLOCKS_ARGS 288 | 289 | # stem 290 | out_channels = round_filters(32, depth_divisor, width_coefficient) 291 | self.convNormAct0 = Conv2dNormActivation( 292 | in_channels, 293 | out_channels, 294 | kernel_size=3, 295 | stride=2, 296 | norm_layer=nn.BatchNorm2d, 297 | activation_layer=nn.SiLU, 298 | ) 299 | 300 | # Build blocks 301 | blocks_args = copy.deepcopy(blocks_args) 302 | # Detach blocks_args from original DEFAULT_BLOCKS_ARGS. This is necessary to keep DEFAULT_BLOCKS_ARGS as it is when you add change to blocks_args. 303 | # Therefore, you can make multiple models. 304 | blocks = [] 305 | films = [] 306 | b = 0 307 | total_repeats = float( 308 | sum( 309 | round_repeats(args["repeats"], depth_coefficient) 310 | for args in blocks_args 311 | ) 312 | ) # sum of all of repeat 313 | for args in blocks_args: # args is dictionary 314 | assert args["repeats"] > 0 315 | # Update block input and output filters based on depth multiplier. 316 | args["in_size"] = round_filters( 317 | args["in_size"], depth_divisor, width_coefficient 318 | ) 319 | args["out_size"] = round_filters( 320 | args["out_size"], depth_divisor, width_coefficient 321 | ) 322 | 323 | # We delete repeats in args so that we could write MBConv(**args). 324 | for j in range(round_repeats(args.pop("repeats"), depth_coefficient)): 325 | if j == 0: 326 | # The first block 327 | blocks.append( 328 | MBConvBlock( 329 | **args, 330 | drop_rate=drop_connect_rate 331 | * b 332 | / total_repeats, # increase drop_connect_rate linearlly 333 | ) 334 | ) 335 | args["strides"] = 1 336 | args["in_size"] = args["out_size"] 337 | else: 338 | blocks.append( 339 | MBConvBlock( 340 | **args, 341 | drop_rate=drop_connect_rate * b / total_repeats, 342 | ) 343 | ) 344 | 345 | if include_film: 346 | films.append( 347 | FilmConditioning( 348 | num_channels=args["out_size"], 349 | text_embed_dim=text_embed_dim, 350 | ) 351 | ) 352 | b += 1 353 | 354 | # Make modulelist. We have to use modulelist. If we don't use it,the blocks will not emerge when we visualize this model. 355 | self.blocks = nn.ModuleList(blocks) 356 | if include_film: 357 | self.films = nn.ModuleList(films) 358 | 359 | # Build top 360 | in_channels = args["out_size"] 361 | out_channels = 1280 362 | out_channels = round_filters(out_channels, depth_divisor, width_coefficient) 363 | self.convNormAct1 = Conv2dNormActivation( 364 | in_channels, 365 | out_channels, 366 | kernel_size=1, 367 | stride=1, 368 | norm_layer=nn.BatchNorm2d, 369 | activation_layer=nn.SiLU, 370 | ) 371 | 372 | # If we include top, we do gloval average pooling and dropout. Then, we add fully connected layer. 373 | if include_top: 374 | self.glovalAvePool = nn.AdaptiveAvgPool2d(1) 375 | if dropout_rate > 0: 376 | self.dropout = nn.Dropout(dropout_rate) 377 | # Fully connected layer 378 | self.fc = nn.Linear(out_channels, classes) 379 | 380 | # inputs: image (bs, c, h, w) 381 | # context: text (bs, embedding_dim). Each vector that is created from a text, not a word. 
382 | def forward(self, inputs, context=None): 383 | # stem 384 | outputs = self.convNormAct0(inputs) 385 | 386 | # Blocks(26 MBconv) 387 | if self.include_film: 388 | for block, film in zip(self.blocks, self.films): 389 | outputs = block(outputs) # MBConv 390 | outputs = film(outputs, context) # FiLM 391 | 392 | else: 393 | for block in self.blocks: 394 | outputs = block(outputs) 395 | 396 | # top 397 | outputs = self.convNormAct1(outputs) 398 | 399 | if self.include_top: 400 | outputs = self.glovalAvePool(outputs) 401 | if self.dropout_rate > 0: 402 | outputs = self.dropout(outputs) 403 | outputs = torch.flatten(outputs, 1) 404 | outputs = self.fc(outputs) 405 | return outputs 406 | else: 407 | return outputs 408 | 409 | 410 | # If you use FiLM, this function allow us to load pretrained weight from naive efficientnet. 411 | def maybe_restore_with_film( 412 | *args, weights="imagenet", include_top=True, include_film=False, **kwargs 413 | ): 414 | assert ( 415 | weights == None or weights == "imagenet" 416 | ), "Set weights to either None or 'imagenet'." 417 | # Create model without FiLM 418 | n1 = EfficientNet(*args, include_top=include_top, include_film=False, **kwargs) 419 | 420 | # Load weights. 421 | if weights == "imagenet": 422 | if include_top: 423 | weights_path = os.path.join( 424 | os.path.dirname(__file__), "efficientnet_checkpoints/efficientnetb3.pth" 425 | ) 426 | else: 427 | weights_path = os.path.join( 428 | os.path.dirname(__file__), 429 | "efficientnet_checkpoints/efficientnetb3_notop.pth", 430 | ) 431 | # This EfficientNet differs from the official pytorch implementation only in parameter names. 432 | # So we load the pytorch weights in using this function. 433 | n1 = load_official_pytorch_param(n1, weights_path) 434 | 435 | # If you don't use FiLM, that's all. 436 | if not include_film: 437 | return n1 438 | 439 | # Create model with FiLM if you use FiLM. And we load pretrained weights from n1. 440 | n2 = EfficientNet(*args, include_top=include_top, include_film=True, **kwargs) 441 | if weights == None: 442 | return n2 443 | 444 | n1_state_dict = n1.state_dict().copy() 445 | n2_state_dict = n2.state_dict().copy() 446 | 447 | for name, param in n2_state_dict.items(): 448 | if name in n1_state_dict: 449 | n2_state_dict[name] = n1_state_dict[name] 450 | 451 | n2.load_state_dict(n2_state_dict) 452 | return n2 453 | 454 | 455 | # This function helps load official pytorch efficientnet's weights. 456 | def load_official_pytorch_param(model: nn.Module, weights_path): 457 | # load weights 458 | official_state_dict = torch.load(weights_path) 459 | 460 | film_eff_state_dict = model.state_dict().copy() 461 | keys_official_list = list(official_state_dict) 462 | keys_film_eff_list = list(film_eff_state_dict) 463 | 464 | for key_official, key_film_eff in zip(keys_official_list, keys_film_eff_list): 465 | film_eff_state_dict[key_film_eff] = official_state_dict[key_official] 466 | # print(str(key_official) + "->" + str(key_film_eff)) 467 | 468 | # load new weights 469 | model.load_state_dict(film_eff_state_dict) 470 | return model 471 | 472 | 473 | # EfficientNetB3 is tranined on 300x300 image. 
474 | def EfficientNetB3( 475 | weights="imagenet", include_top=True, classes=1000, include_film=False, **kwargs 476 | ): 477 | return maybe_restore_with_film( 478 | 1.2, 479 | 1.4, 480 | 0.3, 481 | weights=weights, 482 | include_top=include_top, 483 | classes=classes, 484 | include_film=include_film, 485 | **kwargs, 486 | ) 487 | 488 | 489 | # Class for Postprocessing model's output 490 | class ILSVRCPredictor: 491 | def __init__(self, top=5): 492 | # Load label imformation of ILSVRC 493 | image_json_path = os.path.join( 494 | os.path.dirname(__file__), "efficientnet_checkpoints/imagenet_classes.json" 495 | ) 496 | with open(image_json_path, "r") as f: 497 | self.class_index = json.load(f) 498 | 499 | self.top = top 500 | 501 | # Obtain label name of top-k probability 502 | def predict_topk(self, out): 503 | assert ( 504 | out.shape[0] == 1 505 | ), "Only accept tonsor with batch size 1 to simplify implementation" 506 | top_indices = out[0].detach().numpy().argsort()[-self.top :][::-1] 507 | predicted_label_names = [self.class_index[str(idx)][1] for idx in top_indices] 508 | 509 | return predicted_label_names 510 | -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/film_efficientnet_encoder_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | 21 | import unittest 22 | import torch 23 | from skimage import data 24 | from absl.testing import parameterized 25 | from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights 26 | import numpy as np 27 | from torchvision import transforms 28 | import matplotlib.pyplot as plt 29 | 30 | from pytorch_robotics_transformer.film_efficientnet.film_efficientnet_encoder import EfficientNetB3, ILSVRCPredictor 31 | 32 | 33 | # If you want run this test, move to the directory above pytorch_robotics_transformer directory, type below command on terminal. 34 | # python -m pytorch_robotics_transformer.film_efficientnet.film_efficientnet_encoder_test 35 | # If you want to test all test files in film_efficientnet directory, type below command on terminal. 36 | # python -m unittest discover -s pytorch_robotics_transformer/film_efficientnet -p "*_test.py" 37 | 38 | resize = 300 39 | mean = (0.485, 0.456, 0.406) 40 | std = (0.229, 0.224, 0.225) 41 | 42 | # This transformation is for imagenet. 
Use another transformatioin for RT-1 43 | img_trasnform = transforms.Compose([ 44 | transforms.ToPILImage(), 45 | transforms.Resize(resize), 46 | transforms.CenterCrop(resize), 47 | transforms.ToTensor(), 48 | transforms.Normalize(mean, std) 49 | ]) 50 | 51 | class FilmEfficientnetTest(parameterized.TestCase, unittest.TestCase): 52 | # This test method will be implemented twice. One is implemented with include_film = True. Another is implemented with include_film = False. 53 | # To know further more, see a comment of https://github.com/abseil/abseil-py/blob/main/absl/testing/parameterized.py 54 | @parameterized.parameters([True, False]) 55 | def test_film_efficientnet(self, include_film): 56 | fe = EfficientNetB3(include_top=True, weights='imagenet', include_film=include_film) 57 | fe.eval() 58 | 59 | image = data.chelsea() # ndarray 60 | image_transformed = img_trasnform(image) # tensor 61 | 62 | # Show the image. 63 | # image_show = image_transformed.numpy().transpose((1, 2, 0)) 64 | # image_show = np.clip(image_show, 0, 1) 65 | # plt.imshow(image_show) 66 | # plt.show() 67 | 68 | image_transformed = image_transformed.unsqueeze(0) 69 | context = torch.rand(1, 512) 70 | 71 | if include_film: 72 | eff_output = fe(image_transformed, context) 73 | else: 74 | eff_output = fe(image_transformed) 75 | 76 | predictor = ILSVRCPredictor(top=10) 77 | 78 | film_preds = predictor.predict_topk(eff_output) 79 | # print(film_preds) 80 | self.assertIn('tabby', film_preds) 81 | 82 | if __name__ == '__main__': 83 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/preprocessors.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | 21 | 22 | # This prepocessor is a simplfied version of original code. 23 | # This will pad the image and crop it at a random location 24 | # The cropped image maintain original image size and almost full field of view. 25 | # Omitted preprocessor_test.py. But you can do simple test by running this code. 26 | 27 | import torch 28 | import torch.nn.functional as F 29 | from skimage import data 30 | import numpy as np 31 | import matplotlib.pyplot as plt 32 | 33 | # images: [B, 3 ,H, W] 34 | # receive images whose values is in the range [0,255] 35 | # -> change images values range into [0.0 ,1.0] 36 | # -> padding and crop image 37 | def convert_dtype_and_crop_images(images: torch.Tensor, ratio: float = 0.07): 38 | if images.dtype == torch.uint8: 39 | images = images / 255. 
40 | images = images.to(torch.float32) 41 | 42 | _, _, height, width = images.shape 43 | ud_pad = int(height * ratio) 44 | lr_pad = int(width * ratio) 45 | 46 | images= F.pad(images, pad=(lr_pad, lr_pad, ud_pad, ud_pad)) 47 | 48 | shif_h = torch.randint(0, 2*ud_pad+1, size=[]) 49 | shif_w = torch.randint(0, 2*lr_pad+1, size=[]) 50 | 51 | grid_h, grid_w = torch.meshgrid(torch.arange(shif_h, shif_h + height), 52 | torch.arange(shif_w, shif_w + width), 53 | indexing='ij') 54 | images = images[..., grid_h, grid_w] # fancy index 55 | 56 | return images 57 | 58 | 59 | 60 | if __name__ == '__main__': 61 | images = data.coffee() # ndarray 62 | images = np.tile(np.expand_dims(images,0), (10,1,1,1)) # batch size: 10 63 | images = torch.from_numpy(images).permute(0, 3, 1, 2) # (b, h, w, c) -> (b, c, h, w) 64 | images = convert_dtype_and_crop_images(images) 65 | print(torch.max(images)) 66 | print(torch.min(images)) 67 | print(images.shape) 68 | image_show = images.permute(0, 2, 3 , 1).numpy() 69 | plt.imshow(image_show[0]) 70 | plt.show() -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/pretrained_efficientnet_encoder.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | 21 | """Encoder based on Efficientnet.""" 22 | 23 | 24 | # film_efficientnet_encoder receive [bs, 3, 300, 300] and returns [bs, 1536, 10, 10] 25 | # Here we use 1x1 conv. [bs, 1536, 10, 10] -> [bs, 512, 10, 10] 26 | # then apply FiLM. 27 | 28 | import torch 29 | import torch.nn as nn 30 | from typing import Optional 31 | 32 | from maruya24_rt1.film_efficientnet.film_efficientnet_encoder import EfficientNetB3 33 | from maruya24_rt1.film_efficientnet.film_conditioning_layer import FilmConditioning 34 | 35 | 36 | class EfficientNetEncoder(nn.Module): 37 | def __init__( 38 | self, 39 | token_embedding_size: int = 512, 40 | language_embedding_size: int = 512, 41 | weights: Optional[str] = "imagenet", 42 | early_film: bool = True, 43 | include_top: bool = False, 44 | pooling: bool = True, 45 | ): 46 | super().__init__() 47 | 48 | self.conv1x1 = nn.Conv2d( 49 | in_channels=1536, # If we use EfficientNetB3 and input image has 3 channels, in_channels is 1536. 
50 | out_channels=token_embedding_size, 51 | kernel_size=1, 52 | stride=1, 53 | padding=0, 54 | bias=False, 55 | ) 56 | self.net = EfficientNetB3( 57 | weights=weights, 58 | include_top=include_top, 59 | include_film=early_film, 60 | text_embed_dim=language_embedding_size, 61 | ) 62 | self.film_layer = FilmConditioning( 63 | num_channels=token_embedding_size, text_embed_dim=language_embedding_size 64 | ) 65 | 66 | self.early_film = early_film 67 | self._pooling = pooling 68 | 69 | def _encode(self, image: torch.Tensor, context: torch.Tensor) -> torch.Tensor: 70 | """Run the image through the efficientnet encoder.""" 71 | if self.early_film: 72 | return self.net(image, context) 73 | return self.net(image) 74 | 75 | def forward(self, image: torch.Tensor, context: torch.Tensor) -> torch.Tensor: 76 | features = self._encode(image, context) 77 | features = self.conv1x1(features) 78 | features = self.film_layer(features, context) 79 | 80 | if not self._pooling: 81 | return features 82 | 83 | # Global average pool. 84 | return torch.mean(features, dim=(2, 3)) 85 | -------------------------------------------------------------------------------- /maruya24_rt1/film_efficientnet/pretrained_efficientnet_encoder_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | 21 | from pytorch_robotics_transformer.film_efficientnet import film_efficientnet_encoder 22 | from pytorch_robotics_transformer.film_efficientnet import pretrained_efficientnet_encoder as eff 23 | import unittest 24 | import numpy as np 25 | from skimage import data 26 | import torch 27 | # import film_efficientnet_encoder 28 | # import pretrained_efficientnet_encoder as eff 29 | from torchvision import transforms 30 | 31 | resize = 300 32 | mean = (0.485, 0.456, 0.406) 33 | std = (0.229, 0.224, 0.225) 34 | 35 | # This transformation is for imagenet. 
Use another transformatioin for RT-1 36 | img_trasnform = transforms.Compose([ 37 | transforms.ToPILImage(), 38 | transforms.Resize(resize), 39 | transforms.CenterCrop(resize), 40 | transforms.ToTensor(), 41 | transforms.Normalize(mean, std) 42 | ]) 43 | 44 | 45 | class PretrainedEfficientnetEncoderTest(unittest.TestCase): 46 | def test_encoding(self): 47 | """Test that we get a correctly shaped decoding.""" 48 | # prepare context 49 | state = np.random.RandomState(0) 50 | context = state.uniform(-1, 1, (10, 512)) 51 | context = torch.from_numpy(context).to(torch.float32) 52 | 53 | # prepare image 54 | image = data.chelsea() # ndarray 55 | image_transformed = img_trasnform(image) # tensor 56 | image_transformed = torch.tile(image_transformed.unsqueeze(0), (10,1,1,1)) 57 | 58 | token_embedding_size = 512 59 | model = eff.EfficientNetEncoder(token_embedding_size = token_embedding_size) 60 | model.eval() 61 | preds = model(image_transformed, context) 62 | self.assertEqual(preds.shape, (10, token_embedding_size)) 63 | 64 | def test_imagenet_classification(self): 65 | """Test that we can correctly classify an image of a cat.""" 66 | # prepare context 67 | state = np.random.RandomState(0) 68 | context = state.uniform(-1, 1, (10, 512)) 69 | context = torch.from_numpy(context).to(torch.float32) 70 | 71 | # prepare image 72 | image = data.chelsea() # ndarray 73 | image_transformed = img_trasnform(image) # tensor 74 | image_transformed = torch.tile(image_transformed.unsqueeze(0), (10,1,1,1)) 75 | 76 | # prediction 77 | model = eff.EfficientNetEncoder(include_top=True) 78 | model.eval() 79 | preds = model._encode(image_transformed, context) 80 | preds = preds[0:1] 81 | 82 | predictor = film_efficientnet_encoder.ILSVRCPredictor() 83 | predicted_names = predictor.predict_topk(preds) 84 | 85 | 86 | self.assertIn('tabby', predicted_names) 87 | 88 | 89 | if __name__ == '__main__': 90 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/maruya24_rt1/tokenizers/__init__.py -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/action_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 
19 | 20 | 21 | from gym import spaces 22 | import torch 23 | 24 | from typing import Dict, Union 25 | """ 26 | Please define action space using gym.spaces.Dict. 27 | 28 | As an example, if an action is: 29 | terminate = [0, 1] # this is one hot vector. 30 | world_vector = [0.9, 0.8, -0.3] 31 | rotation_delta = [-0.1, 0.2, .6] 32 | gripper_closedness = 0.9 33 | 34 | then action space will look like 35 | action_space = gym.spaces.Dict( 36 | { 37 | 'terminate': gym.spaces.Discrete(2), 38 | 'world_vector': gym.spaces.Box(low= -1.0, high= 1.0, shape=(3,), dtype=np.float32), 39 | 'rotation_delta': gym.spaces.Box(low= -np.pi / 2 , high= np.pi / 2, shape=(3,), dtype=np.float32), 40 | 'gripper_closedness_action': gym.spaces.Box(low= -1.0 , high= 1.0, shape=(1,), dtype=np.float32) 41 | } 42 | ) 43 | or 44 | action_space = gym.spaces.Dict( 45 | OrderedDict([ 46 | ('terminate', gym.spaces.Discrete(2)), 47 | ('world_vector', gym.spaces.Box(low= -1.0, high= 1.0, shape=(3,), dtype=np.float32)), 48 | ('rotation_delta', gym.spaces.Box(low= -np.pi / 2., high= np.pi / 2., shape=(3,), dtype=np.float32)), 49 | ('gripper_closedness_action', gym.spaces.Box(low= -1.0 , high= 1.0, shape=(1,), dtype=np.float32)) 50 | ]) 51 | ) 52 | Please use OrderedDict if you want gym.spaces.Dict to keep order of actions. 53 | 54 | This action_space is just information about each action. 55 | These information are very convenient when interpreting, examining, cliping, and processing the action 56 | because the action is dictionary with key names which are the same as the action_space. 57 | 58 | action value will be look like 59 | action = { 60 | 'terminate': 1, 61 | 'world_vector': [0.9, 0.8, -0.3], 62 | 'rotation_delta': [-0.1, 0.2, .6], 63 | 'gripper_closedness_action': [0.9] 64 | } 65 | Note that values are int and numpy 1-d arrays. 66 | """ 67 | 68 | class RT1ActionTokenizer(): 69 | def __init__(self, 70 | action_space: spaces.Dict, 71 | vocab_size: int): 72 | """Instantiates an RT1ActionTokenizer. 73 | 74 | Args: 75 | action_space: A dictionary of OpenAI gym spaces of the expected actions. 76 | vocab_size: Number of buckets to discretize action to. 77 | """ 78 | 79 | self._action_space = action_space 80 | self._vocab_size = vocab_size 81 | self._action_order = list(action_space.keys()) # Order of tokenizing 82 | 83 | self._tokens_per_action = 0 84 | # Calculate the number of tokens per action 85 | for action in self._action_order: 86 | # If this action is spaces.Discrete, this is one token. 87 | if isinstance(self._action_space[action], spaces.Discrete): 88 | self._tokens_per_action += 1 89 | 90 | # If a action is spaces.Box, get the number of tokens using shape. 
91 | elif isinstance(self._action_space[action], spaces.Box): 92 | action_shape = self._action_space[action].shape 93 | if len(action_shape) != 1: 94 | raise ValueError( 95 | f'Only action shapes with single dimension supported, got {action_shape}') 96 | self._tokens_per_action += action_shape[0] 97 | else: 98 | raise ValueError('We assume action_space is defined by either gym.spaces.Discrete or gym.spaces.Box') 99 | 100 | 101 | @property 102 | def tokens_per_action(self) -> int: 103 | return self._tokens_per_action 104 | 105 | def tokenize(self, action: Dict[str, torch.Tensor]) -> torch.Tensor: 106 | """Tokenizes an action.""" 107 | action_tokens = [] 108 | # Perform tokenizing in order of self._action_order 109 | for k in self._action_order: # If a is Discrete, the size of a is () or (batch,), which means the former is scalar of Tensor type and the later is 1-d Tensor. 110 | a = action[k] 111 | a_space = self._action_space[k] 112 | if isinstance(a_space, spaces.Discrete): 113 | assert torch.all(a < self._vocab_size), "Discrete action should be smaller than vocab size." 114 | token = a # Discrete action is already token. The size is () or (batch,) 115 | token = a.unsqueeze(-1) # The size is (1,) or (batch, 1). Discrete action will be one token. 116 | else: # if a is Box, size of a is (action_size) or (batch, action_size). 117 | low = torch.tensor(a_space.low).to(a.device) 118 | high = torch.tensor(a_space.high).to(a.device) 119 | a = torch.clamp(a, low, high) 120 | # Normalize the action. 121 | token = (a - low) / (high - low) 122 | # Bucket and discretize the action to vocab_size. 123 | token = token * (self._vocab_size - 1) 124 | token = token.to(torch.int32) # The size is (action_size) or (batch, action_size). 125 | action_tokens.append(token) # if this action has action_size, this action will be action_size tokens. 126 | # Contatenate all actions. The size will be (tokens_per_action) or (batch, tokens_per_action) 127 | action_tokens = torch.concat(action_tokens, dim=-1) 128 | return action_tokens 129 | 130 | # The size of action_tokens is (tokens_per_action) or (batch, tokens_per_action) 131 | def detokenize(self, action_tokens: torch.Tensor) -> Dict[str, torch.Tensor]: 132 | """Detokenizes an action.""" 133 | action = {} 134 | token_index = 0 135 | # action_tokens is in self._action_order order 136 | # So we will detokenize in self._action_order order 137 | for k in self._action_order: 138 | # Discrete actions are already assumed to be tokens. 139 | space = self._action_space[k] 140 | if isinstance(space, spaces.Discrete): 141 | # The size of action_tokens[k] : (1,) or (batch,), which means the former is scalar of Tensor type and the later is 1-d Tensor. 142 | action[k] = action_tokens[..., token_index] 143 | # A poor model may output tokens outside the allowed range, in that case 144 | # set them to a default value, the 0 token in this case. 
145 | action[k] = torch.where(action[k] > space.n, torch.zeros_like(action[k]), action[k]) 146 | token_index += 1 147 | else: 148 | actions = [] 149 | action_dim = space.shape[0] 150 | for j in range(action_dim): 151 | a = action_tokens[..., token_index:token_index + 1] # The size of a: (1,) or (batch, 1) 152 | a = a.to(torch.float32) # Change int32 to float32 153 | # de-normalize 154 | a = a / (self._vocab_size - 1) 155 | a = a * (space.high[j] - space.low[j]) + space.low[j] 156 | actions.append(a) 157 | token_index += 1 158 | action[k] = torch.concat(actions, dim=-1) # size: (action_dim) or (batch, action_dim) 159 | return action -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/action_tokenizer_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import unittest 21 | from gym import spaces 22 | from pytorch_robotics_transformer.tokenizers.action_tokenizer import RT1ActionTokenizer 23 | import numpy as np 24 | from collections import OrderedDict 25 | from typing import List, Dict 26 | from pytorch_robotics_transformer.tokenizers.utils import batched_space_sampler, np_to_tensor 27 | import torch 28 | import sys 29 | 30 | class ActionTokenizerTest(unittest.TestCase): 31 | 32 | # Use one Discrete action. 33 | # Check tokens_per_action and token. 34 | def testTokenize_Discrete(self): 35 | action_space = spaces.Dict( 36 | { 37 | 'terminate': spaces.Discrete(2) 38 | } 39 | ) 40 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=10) 41 | self.assertEqual(1, tokenizer.tokens_per_action) 42 | 43 | action = { 44 | 'terminate': 1 45 | } 46 | action = np_to_tensor(action) 47 | action_tokens = tokenizer.tokenize(action) 48 | self.assertEqual(torch.tensor([1]), action_tokens) 49 | 50 | 51 | def testDetokenize_Discrete(self): 52 | action_space = spaces.Dict( 53 | { 54 | 'terminate': spaces.Discrete(2) 55 | } 56 | ) 57 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=10) 58 | # 0 token should become 0 59 | action = tokenizer.detokenize(torch.tensor([0])) # tokenized action is ndarray. 60 | self.assertEqual(torch.tensor(0), action['terminate']) 61 | # 1 token should become 1 62 | action = tokenizer.detokenize(torch.tensor([1])) 63 | self.assertEqual(torch.tensor(1), action['terminate']) 64 | # OOV(Out of vocabulary) 3 token should become a default 0 65 | action = tokenizer.detokenize(torch.tensor([3])) 66 | self.assertEqual(torch.tensor(0), action['terminate']) 67 | 68 | 69 | # Use one Box action. 70 | # Check tokens_per_action and token. 
71 | def testTokenize_Box(self): 72 | action_space = spaces.Dict( 73 | { 74 | 'world_vector': spaces.Box(low= -1., high= 1., shape=(3,), dtype=np.float32) 75 | } 76 | ) 77 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=10) 78 | self.assertEqual(3, tokenizer.tokens_per_action) 79 | 80 | action = { 81 | 'world_vector': np.array([0.1, 0.5, -0.8]) 82 | } 83 | action = np_to_tensor(action) 84 | action_tokens = tokenizer.tokenize(action) 85 | self.assertSequenceEqual([4, 6, 0], list(action_tokens)) 86 | 87 | 88 | def testTokenize_Box_with_time_dimension(self): 89 | action_space = spaces.Dict( 90 | { 91 | 'world_vector': spaces.Box(low= -1., high= 1., shape=(3,), dtype=np.float32) 92 | } 93 | ) 94 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=10) 95 | self.assertEqual(3, tokenizer.tokens_per_action) 96 | 97 | # We check if tokenizer works correctly when action vector has dimensions of batch size and time. 98 | batch_size = 2 99 | time_dimension = 3 100 | world_vec = np.array([[0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8]]) 101 | world_vec = np.reshape(world_vec, (batch_size, time_dimension, tokenizer.tokens_per_action)) 102 | action = { 103 | 'world_vector': world_vec 104 | } 105 | action = np_to_tensor(action) 106 | action_tokens = tokenizer.tokenize(action) 107 | self.assertSequenceEqual( 108 | [batch_size, time_dimension, tokenizer.tokens_per_action], list(action_tokens.shape)) 109 | 110 | 111 | def testTokenize_Box_at_limits(self): 112 | minimum = -1. 113 | maximum = 1. 114 | vocab_size = 10 115 | action_space = spaces.Dict( 116 | { 117 | 'world_vector': spaces.Box(low=minimum, high= maximum, shape=(2,), dtype=np.float32) 118 | } 119 | ) 120 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=vocab_size) 121 | self.assertEqual(2, tokenizer.tokens_per_action) 122 | action = { 123 | 'world_vector': [minimum, maximum] 124 | } 125 | action = np_to_tensor(action) 126 | action_tokens = tokenizer.tokenize(action) 127 | # Minimum value will go to 0 128 | # Maximum value witll go to vocab_size-1 129 | self.assertSequenceEqual([0, vocab_size - 1], list(action_tokens)) 130 | 131 | 132 | def testTokenize_invalid_action_spec_shape(self): 133 | action_space = spaces.Dict( 134 | { 135 | 'world_vector': spaces.Box(low=-1., high=1., shape=(2,2), dtype=np.float32) 136 | } 137 | ) 138 | with self.assertRaises(ValueError): 139 | RT1ActionTokenizer(action_space, vocab_size=10) 140 | 141 | 142 | # This test is the closest to a real situation. 143 | def testTokenizeAndDetokenizeIsEqual(self): 144 | action_space_dict = OrderedDict([('terminate', spaces.Discrete(2)), 145 | ('world_vector', spaces.Box(low= -1.0, high= 1.0, shape=(3,), dtype=np.float32)), 146 | ('rotation_delta', spaces.Box(low= -np.pi / 2., high= np.pi / 2., shape=(3,), dtype=np.float32)), 147 | ('gripper_closedness_action', spaces.Box(low= -1.0 , high= 1.0, shape=(1,), dtype=np.float32)) 148 | ]) 149 | action_space = spaces.Dict(action_space_dict) 150 | tokenizer = RT1ActionTokenizer(action_space, vocab_size=1024) 151 | self.assertEqual(8, tokenizer.tokens_per_action) 152 | 153 | # Repeat the following test N times with fuzzy inputs. 
154 | n_repeat = 10 155 | for _ in range(n_repeat): 156 | action = action_space.sample() 157 | action = np_to_tensor(action) 158 | action_tokens = tokenizer.tokenize(action) 159 | policy_action = tokenizer.detokenize(action_tokens) 160 | # print(action) 161 | # print(action_tokens) 162 | # print(policy_action) 163 | for k in action: 164 | np.testing.assert_allclose(action[k].numpy(), policy_action[k].numpy(), 2) 165 | 166 | # Repeat the test with batched actions 167 | batch_size = 2 168 | batched_action = batched_space_sampler(action_space, batch_size) 169 | batched_action = np_to_tensor(batched_action) 170 | action_tokens = tokenizer.tokenize(batched_action) 171 | policy_action = tokenizer.detokenize(action_tokens) 172 | 173 | # print(batched_action) 174 | # print(action_tokens) 175 | # print(policy_action) 176 | 177 | for k in batched_action: 178 | for a, policy_a in zip(batched_action[k], policy_action[k]): 179 | np.testing.assert_almost_equal(a.numpy(), policy_a.numpy(), decimal=2) 180 | 181 | 182 | 183 | if __name__ == '__main__': 184 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/image_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | """A FiLM Efficientnet contextual image tokenizer used in Robotics Transformer 1. 
21 | """ 22 | 23 | from typing import Optional 24 | import torch 25 | import torch.nn as nn 26 | import torch.nn.functional as F 27 | 28 | from maruya24_rt1.film_efficientnet.pretrained_efficientnet_encoder import ( 29 | EfficientNetEncoder, 30 | ) 31 | from maruya24_rt1.tokenizers.token_learner import TokenLearnerModule 32 | 33 | 34 | class RT1ImageTokenizer(nn.Module): 35 | def __init__( 36 | self, 37 | embedding_output_dim: int = 512, 38 | language_embedding_size: int = 512, 39 | use_token_learner: bool = False, 40 | num_tokens: int = 8, 41 | ): 42 | super().__init__() 43 | self._tokenizer = EfficientNetEncoder( 44 | token_embedding_size=embedding_output_dim, 45 | language_embedding_size=language_embedding_size, 46 | early_film=True, 47 | pooling=False, 48 | ) 49 | 50 | self._use_token_learner = use_token_learner 51 | if self._use_token_learner: 52 | self._num_tokens = num_tokens 53 | self._token_learner = TokenLearnerModule( 54 | inputs_channels=embedding_output_dim, num_tokens=self._num_tokens 55 | ) 56 | 57 | @property 58 | def tokens_per_context_image(self) -> int: 59 | if self._use_token_learner: 60 | num_tokens = self._num_tokens 61 | else: 62 | num_tokens = 100 63 | return num_tokens 64 | 65 | # Note that context is the same value along with time axis. 66 | # This means (b, 0, embedding_dim) == (b, 1, embedding_dim) == (b, 2, embedding_dim) ... 67 | def forward( 68 | self, image: torch.Tensor, context: Optional[torch.Tensor] = None 69 | ) -> torch.Tensor: 70 | """Gets image tokens. 71 | 72 | Args: 73 | image: Images of shape (b, t, 3, h, w) to tokenize. 74 | context: An optional context vector (e.g., a natural language embedding). 75 | Expected to have shape (b, t, embedding_dim). 76 | training: Whether or not we are in training mode. 77 | 78 | Returns: 79 | tokens: has shape (batch, t, num_tokens_per_timestep, embedding_dim) 80 | """ 81 | b, t, c, h, w = image.shape 82 | 83 | # Fold the time axis into the batch axis. 84 | image = image.view(b * t, c, h, w) 85 | if context is not None: 86 | context = context.view(b * t, -1) 87 | 88 | tokens = self._tokenizer(image, context=context) # [b * t, 512 , 10, 10] 89 | 90 | if self._use_token_learner: 91 | tokens = self._token_learner(tokens) # [b * t, num_token, 512] 92 | # Unflatten the time axis, which was previously flattened into the batch. 93 | tokens = tokens.view(b, t, tokens.shape[1], -1) 94 | return tokens # [b, t, num_token, 512] 95 | else: 96 | # Unflatten the time axis, which was previously flattened into the batch. 97 | tokens = tokens.view(b, t, 512, -1) # [b, t, 512 , 10 * 10] 98 | # If you don't use token learner, the number of token is 100. 99 | tokens = tokens.transpose(2, 3) # [b, t, 10 * 10, 512] 100 | return tokens 101 | -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/image_tokenizer_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import unittest 21 | import torch 22 | from pytorch_robotics_transformer.tokenizers import image_tokenizer 23 | from absl.testing import parameterized 24 | 25 | # Run this command above the robotics_transformer_pytorch directory 26 | # python -m pytorch.robotics_transformer.tokenizers.image_tokenizer_test 27 | 28 | class ImageTokenizerTest(parameterized.TestCase, unittest.TestCase): 29 | 30 | @parameterized.named_parameters( 31 | ('sample_image', 300, False, 8), 32 | ('sample_image_token_learner', 300, True, 8)) 33 | def testTokenize(self, image_resolution, use_token_learner, num_tokens): 34 | batch = 1 35 | seq = 2 36 | tokenizer = image_tokenizer.RT1ImageTokenizer( 37 | use_token_learner=use_token_learner, 38 | num_tokens=num_tokens) 39 | image = torch.randn(batch, seq, 3,image_resolution, image_resolution) 40 | image = torch.clamp(image, min=0, max=1) 41 | context_vector = torch.rand(batch, seq, 512) 42 | image_tokens = tokenizer(image, context_vector) # [b, t, num_token, 512] or [b, t, 10 * 10, 512] 43 | if use_token_learner: 44 | self.assertEqual(list(image_tokens.shape), [batch, seq, num_tokens, 512]) 45 | else: 46 | self.assertEqual(list(image_tokens.shape), [batch, seq, 100, 512]) 47 | 48 | if __name__ == '__main__': 49 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/token_learner.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 
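
Editor's aside (not part of token_learner.py below): the image tokenizer test above only asserts output shapes, so here is a minimal standalone sketch of the two shape contracts of RT1ImageTokenizer from maruya24_rt1/tokenizers/image_tokenizer.py. With the token learner enabled the output is (b, t, num_tokens, 512); without it the 10x10 EfficientNet feature map is flattened into 100 tokens. This assumes the package is importable and the EfficientNet checkpoints under film_efficientnet/efficientnet_checkpoints are available.

import torch
from maruya24_rt1.tokenizers.image_tokenizer import RT1ImageTokenizer

b, t = 1, 2
image = torch.rand(b, t, 3, 300, 300)    # pixel values in [0, 1]
context = torch.rand(b, t, 512)          # e.g. a language embedding per timestep

with_tl = RT1ImageTokenizer(use_token_learner=True, num_tokens=8)
print(with_tl(image, context).shape)     # torch.Size([1, 2, 8, 512])

without_tl = RT1ImageTokenizer(use_token_learner=False)
print(without_tl(image, context).shape)  # torch.Size([1, 2, 100, 512])
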
19 | 20 | """PyTorch implementation of Token Learner(Ryoo et al 2021).""" 21 | 22 | import torch 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | 26 | 27 | class TokenLearnerModule(nn.Module): 28 | def __init__( 29 | self, 30 | inputs_channels: int, 31 | num_tokens: int, 32 | bottleneck_dim: int = 64, 33 | dropout_rate: float = 0.0, 34 | ): 35 | super().__init__() 36 | 37 | self.layerNorm = nn.LayerNorm(inputs_channels) 38 | 39 | self.conv1 = nn.Conv2d( 40 | in_channels=inputs_channels, 41 | out_channels=bottleneck_dim, 42 | kernel_size=1, 43 | stride=1, 44 | padding=0, 45 | ) 46 | 47 | self.gelu1 = nn.GELU(approximate="tanh") 48 | 49 | if dropout_rate > 0: 50 | self.dropout1 = nn.Dropout2d(dropout_rate) 51 | else: 52 | self.dropout1 = nn.Identity() 53 | 54 | self.conv2 = nn.Conv2d( 55 | in_channels=bottleneck_dim, 56 | out_channels=num_tokens, 57 | kernel_size=1, 58 | stride=1, 59 | padding=0, 60 | ) 61 | 62 | if dropout_rate > 0: 63 | self.dropout2 = nn.Dropout2d(dropout_rate) 64 | else: 65 | self.dropout2 = nn.Identity() 66 | 67 | # inputs: [bs, c, h, w] or [bs * seq, c, h, w] 68 | # seq is time-series length such as frame 69 | def forward(self, inputs: torch.Tensor): 70 | # layer norm 71 | x = self.layerNorm(inputs.permute(0, 2, 3, 1)) 72 | x = x.permute(0, 3, 1, 2) 73 | 74 | # create weights map 75 | x = self.gelu1(self.conv1(x)) 76 | x = self.dropout1(x) 77 | x = self.conv2(x) 78 | x = self.dropout2(x) # (bs, num_tokens, h, w) 79 | 80 | x = x.view(x.shape[0], x.shape[1], -1) # (bs, num_tokens, h*w) 81 | weights_maps = F.softmax(x, dim=-1) 82 | 83 | # create tokens 84 | bs, c, h, w = inputs.shape 85 | inputs = inputs.permute(0, 2, 3, 1).view(bs, h * w, c) 86 | 87 | tokens = torch.bmm(weights_maps, inputs) 88 | # weighs_maps: [bs, n_token, h*w] 89 | # inputs: [bs, h * w, c] 90 | # tokens: [bs, n_token, c] 91 | 92 | # Above computation is equivalent to below explanation. 93 | # weights_maps has n_tokens channels. each channels is a weight map with the size of (h, w). 94 | # inputs has c channels, each channels size is (h, w). 95 | # compute the element-wise product of one weight map of weights_maps and one of channels of inputs 96 | # That result in (H, W). After sum this, we get a scalar. 97 | # Iterate this operation to all other channels of inputs using the same weight map, we get c scalar. 98 | # reshape (1, 1, c), then we get a learned token, as shown in Fig. 1 in tokenlearner paper. 99 | # We do the computation using all other weight map, then we get all tokens. 100 | return tokens 101 | -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/token_learner_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import unittest 21 | import torch 22 | from maruya24_rt1.tokenizers.token_learner import TokenLearnerModule 23 | 24 | # Type this command on your terminal above the maruya24_rt1 directory. 25 | # python -m maruya24_rt1.tokenizers.token_learner_test 26 | 27 | class TokenLearnerTest(unittest.TestCase): 28 | def testTokenLearner(self, embedding_dim=512, num_tokens=8): 29 | batch = 1 30 | seq = 2 31 | token_learner_layer = TokenLearnerModule( 32 | inputs_channels=embedding_dim, 33 | num_tokens=num_tokens) 34 | # seq is time-series length 35 | # embedding_dim = channels 36 | inputvec = torch.randn((batch * seq, embedding_dim, 10, 10)) 37 | 38 | learnedtokens = token_learner_layer(inputvec) 39 | self.assertEqual(list(learnedtokens.shape), [batch * seq, num_tokens, embedding_dim]) 40 | 41 | if __name__ == '__main__': 42 | unittest.main() -------------------------------------------------------------------------------- /maruya24_rt1/tokenizers/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union 2 | import numpy as np 3 | from gym import spaces 4 | import torch 5 | 6 | # This function will turn the space into a batched space and return a batched action sample. 7 | # The output format is compatible with OpenAI gym's Vectorized Environments. 8 | def batched_space_sampler(space: spaces.Dict, batch_size: int): 9 | batched_sample : Dict[str, np.ndarray] = {} 10 | samples = [space.sample() for _ in range(batch_size)] # list of dicts 11 | for key in samples[0].keys(): 12 | value_list = [] 13 | for i in range(batch_size): 14 | value_list.append(samples[i][key]) 15 | value_list = np.stack(value_list, axis=0) 16 | batched_sample[key] = value_list 17 | return batched_sample 18 | 19 | # This function turns all dict values into tensors. 20 | def np_to_tensor(sample_dict: Dict[str, Union[int,np.ndarray]]) -> Dict[str, torch.Tensor]: 21 | new_dict = {} 22 | for key, value in sample_dict.items(): 23 | value = torch.tensor(value) 24 | new_dict[key] = value 25 | 26 | return new_dict -------------------------------------------------------------------------------- /maruya24_rt1/transformer.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer].
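
Editor's aside (not part of transformer.py below): the long comment at the end of TokenLearnerModule.forward above describes the torch.bmm call as a spatially weighted sum over the input feature map. The small sketch below checks that reading against an explicit einsum on a toy tensor; the sizes are arbitrary.

import torch

bs, c, h, w, n_tokens = 2, 5, 4, 4, 3
inputs = torch.randn(bs, c, h, w)
# One softmax-normalized weight map of size h*w per learned token.
weights_maps = torch.softmax(torch.randn(bs, n_tokens, h * w), dim=-1)

flat = inputs.permute(0, 2, 3, 1).reshape(bs, h * w, c)   # (bs, h*w, c)
tokens_bmm = torch.bmm(weights_maps, flat)                # (bs, n_tokens, c)

# The same computation written as an explicit weighted sum over spatial positions.
tokens_explicit = torch.einsum("bns,bsc->bnc", weights_maps, flat)
print(torch.allclose(tokens_bmm, tokens_explicit, atol=1e-6))  # True
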
19 | 20 | import torch 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | import math 24 | from typing import Optional, Tuple, Union, List 25 | 26 | # This implementation is similar to tf.keras.layers.MultiHeadAttention, not torch.nn.MultiheadAttention. 27 | # This can be used in the situation where query = key = value. 28 | # In RT-1 we don't set value_dim. Therefore, values_dim = key_dim. 29 | class TF_MultiHeadAttention(nn.Module): 30 | def __init__(self, 31 | heads: int, 32 | d_model: int, 33 | key_dim: int, 34 | value_dim: Optional[int] = None, 35 | dropout: float = 0.1, 36 | return_attention_scores: bool = False): 37 | super().__init__() 38 | 39 | self.d_model = d_model 40 | self.h = heads 41 | self.key_dim = key_dim 42 | self.value_dim = value_dim if value_dim else key_dim # if value_dim is None, value_dim will be key_dim. 43 | self.return_attention_scores = return_attention_scores 44 | 45 | self.q_linear = nn.Linear(d_model, self.h * self.key_dim) 46 | self.k_linear = nn.Linear(d_model, self.h * self.key_dim) 47 | self.v_linear = nn.Linear(d_model, self.h * self.value_dim) 48 | self.dropout = nn.Dropout(dropout) 49 | self.out = nn.Linear(self.h * self.value_dim, d_model) 50 | 51 | def forward(self, q, k, v, mask=None): 52 | 53 | bs = q.size(0) 54 | 55 | # perform linear operation and split into h heads 56 | 57 | k = self.k_linear(k).view(bs, -1, self.h, self.key_dim) 58 | q = self.q_linear(q).view(bs, -1, self.h, self.key_dim) 59 | v = self.v_linear(v).view(bs, -1, self.h, self.value_dim) 60 | 61 | # transpose to get dimensions bs * h * sl * key_dim or bs * h * sl * value_dim 62 | 63 | k = k.transpose(1,2) 64 | q = q.transpose(1,2) 65 | v = v.transpose(1,2) # calculate attention using function we will define next 66 | 67 | if self.return_attention_scores: 68 | attention_output, score = self.attention(q, k, v, self.key_dim, mask, self.dropout, self.return_attention_scores) # attention_output: (bs, h, sl, value_dim), score: (bs, h, sl, sl) 69 | else: 70 | attention_output = self.attention(q, k, v, self.key_dim, mask, self.dropout, self.return_attention_scores) # (bs, h, sl, value_dim) 71 | 72 | # concatenate heads and put through final linear layer 73 | concat = attention_output.transpose(1,2).contiguous().view(bs, -1, self.h * self.value_dim) # (bs, sl, heads * value_dim) 74 | output = self.out(concat) # (bs, sl, d_model) 75 | 76 | if self.return_attention_scores: 77 | return output, score 78 | else: 79 | return output 80 | 81 | 82 | def attention(self, q, k, v, key_dim, mask=None, dropout=None, return_attention_scores=False): 83 | 84 | scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(key_dim) 85 | # q: (bs, h, sl, key_dim) 86 | # k.transpose(-2, -1) : (bs, h, key_dim, sl) 87 | # score: (bs, h, sl, sl) 88 | 89 | if mask is not None: 90 | # mask: (sl, sl) 91 | mask = mask.unsqueeze(0).unsqueeze(1).to(scores.device) 92 | scores = scores.masked_fill(mask == 0, -1e9) 93 | 94 | scores = F.softmax(scores, dim=-1) 95 | 96 | 97 | if dropout is not None: 98 | scores = dropout(scores) 99 | 100 | 101 | output = torch.matmul(scores, v) 102 | # score: (bs, h, sl, sl) 103 | # v : (bs, h, sl, value_dim) 104 | # output: (bs, h, sl, value_dim) 105 | 106 | if return_attention_scores: 107 | return output, scores 108 | else: 109 | return output 110 | 111 | # input_size and output_size: (bs, sl, feed_forward_size) 112 | class _TransformerLayer(nn.Module): 113 | """A single transformer block.""" 114 | def __init__(self, 115 | layer_size: int = 4096, # This corresponds to 
key_dim which is the size of each attention head for query, key and values. 116 | num_heads: int = 8, 117 | feed_forward_size: int = 512, # This corresponds to d_model which is the embedding dimension of each token in the transformer part. 118 | dropout_rate: float = 0.1, 119 | return_attention_scores: bool = False): 120 | 121 | super().__init__() 122 | self._return_attention_scores = return_attention_scores 123 | 124 | self.norm_1 = nn.LayerNorm(feed_forward_size) 125 | self.attn = TF_MultiHeadAttention(num_heads,feed_forward_size, layer_size, dropout=dropout_rate, return_attention_scores=return_attention_scores) 126 | self.ff = nn.Linear(feed_forward_size, feed_forward_size) 127 | self.norm_2 = nn.LayerNorm(feed_forward_size) 128 | self.dropout_1 = nn.Dropout(dropout_rate) 129 | 130 | def forward(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: 131 | x1 = self.norm_1(x) 132 | attn_results = self.attn(x1, x1, x1, mask=mask) 133 | if self._return_attention_scores: 134 | x1, score = attn_results 135 | else: 136 | x1, score = attn_results, None 137 | x = x + x1 138 | 139 | y = self.norm_2(x) 140 | ff_y = self.ff(y) 141 | ff_y = self.dropout_1(ff_y) 142 | x = x + ff_y 143 | 144 | return x, score 145 | 146 | class Transformer(nn.Module): 147 | def __init__(self, 148 | num_layers: int = 1, # Number of transformer layers. 149 | layer_size: int = 4096, # This corresponds to key_dim which is the size of each attention head for query, key and values. 150 | num_heads: int = 8, 151 | feed_forward_size: int = 512, # This corresponds to d_model which is the embedding dimension of each token in the transformer part. 152 | dropout_rate: float = 0.1, 153 | vocab_size: int = 256, # Dimensionality of tokens from the output layer. This is also dimensionality of tokens from the input layer. 154 | input_token_emb_dim: int = 512, # embedding dim of input tokens. 155 | return_attention_scores: bool = False, 156 | max_seq_len: int = 256, # Maximum sequence length. This Transformer can't receive tokens that are more than this number. 157 | ): 158 | super(Transformer, self).__init__() 159 | 160 | self._layers = nn.ModuleList([ 161 | _TransformerLayer( # pylint: disable=g-complex-comprehension 162 | layer_size=layer_size, 163 | num_heads=num_heads, 164 | feed_forward_size=feed_forward_size, 165 | dropout_rate=dropout_rate, 166 | return_attention_scores=return_attention_scores) 167 | for _ in range(num_layers) 168 | ]) 169 | 170 | self._token_emb = nn.Linear(input_token_emb_dim, feed_forward_size) 171 | self._position_emb = nn.Embedding(max_seq_len, feed_forward_size) # <--- Needs some rework; should only this part be made BERT-like? 172 | self._output_tokens = nn.Linear(feed_forward_size, vocab_size) 173 | 174 | # inputs: (bs, seq, emb_dim). emb_dim = vocab_size 175 | def forward(self, inputs: torch.Tensor, attention_mask: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]: 176 | batch_size = inputs.shape[0] 177 | seq_len = inputs.shape[1] 178 | 179 | # 1. Token Embeddings 180 | tokens_embeddings = self._token_emb(inputs) # (bs, seq_len, feed_forward_size) 181 | 182 | # 2.
Transformer Positional Embedding: 183 | position_ids = torch.arange(seq_len, dtype=torch.long, device=inputs.device) 184 | position_ids = torch.tile(position_ids.unsqueeze(0), dims=(batch_size, 1)) # (bs, seq_len) 185 | position_embeddings = self._position_emb(position_ids) # (bs, seq_len, feed_forward_size) 186 | 187 | # Add the two embedded tensors together 188 | x = tokens_embeddings + position_embeddings # (bs, seq_len, feed_forward_size) 189 | 190 | scores = [] 191 | 192 | for layer in self._layers: 193 | x, score = layer(x, mask=attention_mask) 194 | if score is not None: 195 | scores.append(score) 196 | x = self._output_tokens(x) # (bs, seq_len, vocab_size) 197 | return x, scores -------------------------------------------------------------------------------- /maruya24_rt1/transformer_network_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 
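
Editor's aside (not part of the test file below): transformer.py above applies the attention mask inside TF_MultiHeadAttention via masked_fill(mask == 0, -1e9), and the causality test further down relies on that behavior. Here is a minimal sketch of driving the Transformer directly with a lower-triangular (causal) mask; the hyperparameters are arbitrary, and scores is an empty list unless return_attention_scores=True.

import torch
from maruya24_rt1.transformer import Transformer

bs, seq_len, vocab_size = 2, 12, 256
net = Transformer(num_layers=2, layer_size=128, num_heads=4,
                  feed_forward_size=64, dropout_rate=0.1,
                  vocab_size=vocab_size, input_token_emb_dim=vocab_size,
                  max_seq_len=seq_len)

tokens = torch.rand(bs, seq_len, vocab_size)
# 1 on and below the diagonal; positions where the mask is 0 (the future) are masked out.
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long))

output_tokens, scores = net(tokens, attention_mask=causal_mask)
print(output_tokens.shape)  # torch.Size([2, 12, 256])
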
19 | 20 | 21 | """Tests for networks.""" 22 | 23 | import torch 24 | import torch.nn.functional as F 25 | from absl.testing import parameterized 26 | import unittest 27 | import numpy as np 28 | from typing import Dict 29 | import sys 30 | sys.path.append("/home/io011/workspace/robotics_transformer/") 31 | import transformer_network 32 | from transformer_network_test_set_up import BATCH_SIZE 33 | from transformer_network_test_set_up import NAME_TO_INF_OBSERVATIONS 34 | from transformer_network_test_set_up import NAME_TO_STATE_SPACES 35 | from transformer_network_test_set_up import observations_list 36 | from transformer_network_test_set_up import space_names_list 37 | from transformer_network_test_set_up import state_space_list 38 | from transformer_network_test_set_up import TIME_SEQUENCE_LENGTH 39 | from transformer_network_test_set_up import TransformerNetworkTestUtils 40 | from tokenizers.utils import batched_space_sampler 41 | from tokenizers.utils import np_to_tensor 42 | 43 | 44 | class TransformerNetworkTest(TransformerNetworkTestUtils): 45 | @parameterized.named_parameters([{ 46 | 'testcase_name': '_' + name, 47 | 'state_space': spec, 48 | 'train_observation': obs, 49 | } for (name, spec, obs) in zip(space_names_list(), state_space_list(), observations_list())]) 50 | def testTransformerTrainLossCall(self, state_space, train_observation): 51 | network = transformer_network.TransformerNetwork( 52 | vocab_size=256, 53 | token_embedding_size=512, 54 | num_layers=8, 55 | layer_size=128, 56 | num_heads=8, 57 | feed_forward_size=512, 58 | dropout_rate=0.1, 59 | crop_size=236, 60 | input_tensor_space=state_space, 61 | output_tensor_space=self._action_space, 62 | time_sequence_length=TIME_SEQUENCE_LENGTH, 63 | use_token_learner=True) 64 | 65 | 66 | network.set_actions(self._train_action) 67 | 68 | network_state = batched_space_sampler(network._state_space, batch_size=BATCH_SIZE) 69 | network_state = np_to_tensor(network_state) # change np.ndarray type of sample values into tensor type 70 | 71 | output_actions, network_state = network( 72 | train_observation, network_state=network_state) 73 | 74 | expected_shape = [2, 3] 75 | 76 | self.assertEqual(list(network.get_actor_loss().shape), expected_shape) 77 | 78 | self.assertCountEqual(self._train_action.keys(), output_actions.keys()) 79 | 80 | @parameterized.named_parameters([{ 81 | 'testcase_name': '_' + name, 82 | 'space_name': name, 83 | } for name in space_names_list()]) 84 | def testTransformerInferenceLossCall(self, space_name): 85 | state_space = NAME_TO_STATE_SPACES[space_name] 86 | observation = NAME_TO_INF_OBSERVATIONS[space_name] # observation has no time dimension unlike during training. 87 | 88 | network = transformer_network.TransformerNetwork( 89 | input_tensor_space=state_space, 90 | #Dict('image': Box(0.0, 1.0, (3, 256, 320), float32), 'natural_language_embedding': Box(-inf, inf, (512,), float32)) 91 | output_tensor_space=self._action_space, 92 | # Dict('terminate_episode': Discrete(2), 'world_vector': Box(-1.0, 1.0, (3,), float32), 'rotation_delta': Box(-1.5707964, 1.5707964, (3,), float32), 'gripper_closedness_action': Box(-1.0, 1.0, (1,), float32)) 93 | time_sequence_length=TIME_SEQUENCE_LENGTH) 94 | 95 | network.set_actions(self._inference_action) # self._inference_action has no time dimension unlike self._train_action. 
96 | # inference currently only support batch size of 1 97 | network_state = batched_space_sampler(network._state_space, batch_size=1) 98 | network_state = np_to_tensor(network_state) # change np.ndarray type of sample values into tensor type 99 | 100 | output_actions, network_state = network( 101 | observation, network_state=network_state) 102 | # torch.float32 torch.Size([1, 3, 256, 320]) 103 | # {'image': tensor([[[[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 104 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 105 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 106 | # ..., 107 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 108 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 109 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]], 110 | 111 | # [[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 112 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 113 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 114 | # ..., 115 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 116 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 117 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]], 118 | 119 | # [[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 120 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 121 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 122 | # ..., 123 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 124 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000], 125 | # [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]]]]), 126 | # 127 | # torch.float32 torch.Size([1, 512]) 128 | # 'natural_language_embedding': tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 129 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 130 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 131 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 132 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 133 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 134 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 135 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 136 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 137 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 138 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 139 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 140 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 141 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 142 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 143 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 144 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 145 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 146 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 147 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 148 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 149 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 150 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 151 | # 1., 1., 
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 152 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 153 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 154 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 155 | # 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 156 | # 1., 1., 1., 1., 1., 1., 1., 1.]])} 157 | 158 | # torch.int64 torch.Size([1, 3, 8]) 159 | # {'action_tokens': tensor([[[116, 61, 70, 126, 170, 243, 58, 127], 160 | # [167, 251, 204, 19, 222, 236, 61, 91], 161 | # [254, 249, 204, 14, 64, 162, 53, 248]]]), 162 | # torch.float32 torch.Size([1, 3, 8, 512]) 163 | # 'context_image_tokens': tensor([[[[-0.8647, -1.9947, -0.7335, ..., -1.0252, -0.1364, 0.6359], 164 | # [ 0.1899, 0.1604, -0.4497, ..., -0.3601, 0.0933, -0.6361], 165 | # [ 1.5404, 0.2238, -1.0574, ..., 0.0621, 1.2833, -2.0537], 166 | # ..., 167 | # [ 1.5256, -0.4647, -0.5676, ..., 1.4825, -0.5317, 0.7946], 168 | # [-1.3059, 0.8747, -2.9062, ..., -0.0762, 1.8340, -0.8561], 169 | # [-0.6595, -2.0878, 0.2709, ..., 0.1764, 1.2843, -0.4609]], 170 | 171 | # [[-0.0947, 0.0133, -0.2301, ..., 0.1116, -0.1111, -0.0391], 172 | # [-0.0998, 0.0090, -0.2151, ..., 0.1465, -0.1033, -0.0342], 173 | # [-0.0935, 0.0161, -0.2835, ..., 0.1232, -0.1025, -0.0332], 174 | # ..., 175 | # [-0.1249, -0.0076, -0.2253, ..., 0.1392, -0.1412, -0.0528], 176 | # [-0.1013, -0.0141, -0.2092, ..., 0.1403, -0.1279, -0.0494], 177 | # [-0.0929, 0.0040, -0.1969, ..., 0.1384, -0.1017, -0.0603]], 178 | 179 | # [[-1.8412, -1.0423, -0.4420, ..., 0.8732, -0.1140, -0.1008], 180 | # [ 0.6039, -0.6286, -0.1894, ..., -0.4552, 1.3157, -1.1879], 181 | # [ 1.1295, -0.1596, -0.0862, ..., -1.0804, 1.0614, -0.0249], 182 | # ..., 183 | # [-0.3196, -1.3627, 0.3153, ..., -0.7006, 1.0846, -0.2101], 184 | # [ 2.3243, 1.2046, 1.8313, ..., -1.1140, 0.4493, -0.8400], 185 | # [ 1.1736, -0.3540, -0.6332, ..., 0.0968, 1.5409, 0.7139]]]], 186 | # grad_fn=), 187 | # torch.int64 torch.Size([]) 188 | # 'seq_idx': tensor(2)} 189 | 190 | self.assertEqual(network.get_actor_loss().item(), 0.0) 191 | self.assertCountEqual(self._inference_action.keys(), output_actions.keys()) 192 | 193 | @parameterized.named_parameters([{ 194 | 'testcase_name': '_' + name, 195 | 'state_space': spec, 196 | } for name, spec in zip(space_names_list(), state_space_list())]) 197 | def testTransformerCausality(self, state_space): 198 | network = transformer_network.TransformerNetwork( 199 | input_tensor_space=state_space, 200 | output_tensor_space=self._action_space, 201 | time_sequence_length=TIME_SEQUENCE_LENGTH, 202 | dropout_rate=0.0) 203 | 204 | network.eval() 205 | 206 | time_sequence_length = network._time_sequence_length 207 | tokens_per_image = network._tokens_per_context_image 208 | tokens_per_action = network._tokens_per_action 209 | 210 | # size of all_tokens: (time_sequence_length * (tokens_per_image + tokens_per_action)) 211 | def _split_image_and_action_tokens(all_tokens): 212 | image_start_indices = [(tokens_per_image + tokens_per_action) * k 213 | for k in range(time_sequence_length)] 214 | 215 | image_tokens = torch.stack( 216 | [all_tokens[i:i + tokens_per_image] for i in image_start_indices], 217 | dim=0) 218 | action_start_indices = [i + tokens_per_image for i in image_start_indices] 219 | action_tokens = torch.stack([ 220 | all_tokens[i:i + tokens_per_action] for i in action_start_indices], 221 | dim=0) 222 | 223 | image_tokens = F.one_hot(image_tokens, 
network._token_embedding_size) 224 | # Add batch dimension. 225 | image_tokens = image_tokens.unsqueeze(0) # image_tokens: (1, time_sequence_length, tokens_per_image, emb_dim) 226 | action_tokens = action_tokens.unsqueeze(0) # action: (1, time_sequence_length, tokens_per_action) 227 | 228 | return image_tokens, action_tokens 229 | 230 | # Generate some random tokens for image and actions. 231 | all_tokens = torch.randint(low=0, high=10, size=(time_sequence_length * (tokens_per_image + tokens_per_action),)) 232 | context_image_tokens, action_tokens = _split_image_and_action_tokens(all_tokens) 233 | # Get the output tokens without any zeroed out input tokens. 234 | # output_tokens: (t*num_tokens, vocab_size) 235 | output_tokens = network._transformer_call( 236 | context_image_tokens=context_image_tokens, 237 | action_tokens=action_tokens, 238 | attention_mask=network._default_attention_mask, 239 | batch_size=1)[0] 240 | 241 | for t in range(time_sequence_length * 242 | (tokens_per_image + tokens_per_action)): 243 | # Zero out future input tokens. 244 | all_tokens_at_t = torch.concat([all_tokens[:t + 1], torch.zeros_like(all_tokens[t + 1:])], 0) 245 | context_image_tokens, action_tokens = _split_image_and_action_tokens(all_tokens_at_t) 246 | 247 | # Get the output tokens with zeroed out input tokens after t. 248 | output_tokens_at_t = network._transformer_call( 249 | context_image_tokens=context_image_tokens, 250 | action_tokens=action_tokens, 251 | attention_mask=network._default_attention_mask, 252 | batch_size=1)[0] 253 | 254 | # The output token is unchanged if future input tokens are zeroed out. 255 | np.testing.assert_array_equal(output_tokens[:t + 1].detach().numpy(), output_tokens_at_t[:t + 1].detach().numpy()) 256 | 257 | 258 | if __name__ == '__main__': 259 | unittest.main() 260 | -------------------------------------------------------------------------------- /maruya24_rt1/transformer_network_test_set_up.py: -------------------------------------------------------------------------------- 1 | 2 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 3 | # I have retained the following copyright notice written on it. 4 | 5 | # Copyright 2022 Google LLC 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 20 | 21 | 22 | import copy 23 | from typing import Optional, Tuple, Union, List, Dict 24 | from absl.testing import parameterized 25 | import numpy as np 26 | from gym import spaces 27 | import torch 28 | import unittest 29 | from collections import OrderedDict 30 | 31 | BATCH_SIZE = 3 32 | TIME_SEQUENCE_LENGTH = 3 33 | HEIGHT = 256 34 | WIDTH = 320 35 | NUM_IMAGE_TOKENS = 2 36 | 37 | # For now, we use one type of spaces. 
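
Editor's aside before the space definitions below: the tests above repeatedly call batched_space_sampler and np_to_tensor from maruya24_rt1/tokenizers/utils.py to fabricate a network state. A small sketch of what those helpers return for a toy gym space, assuming gym and torch are installed:

import numpy as np
from collections import OrderedDict
from gym import spaces
from maruya24_rt1.tokenizers.utils import batched_space_sampler, np_to_tensor

toy_space = spaces.Dict(OrderedDict([
    ('terminate_episode', spaces.Discrete(2)),
    ('world_vector', spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)),
]))

batched = batched_space_sampler(toy_space, batch_size=4)  # dict of stacked np.ndarrays
tensors = np_to_tensor(batched)                           # dict of torch.Tensors
print({k: tuple(v.shape) for k, v in tensors.items()})
# e.g. {'terminate_episode': (4,), 'world_vector': (4, 3)}
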
38 | def space_names_list() -> List[str]: 39 | """Lists the different types of spaces accepted by the transformer.""" 40 | return ['default'] 41 | 42 | 43 | def state_space_list() -> List[spaces.Dict]: 44 | """Lists the different types of state spec accepted by the transformer.""" 45 | # This will be input_tensor_spec 46 | state_space = spaces.Dict( 47 | { 48 | 'image': spaces.Box(low=0.0, high=1.0, 49 | shape=(3, HEIGHT, WIDTH), dtype=np.float32), 50 | 'natural_language_embedding': spaces.Box(low=-np.inf, high=np.inf, 51 | shape=[512], dtype=np.float32) 52 | } 53 | ) 54 | 55 | return [state_space] 56 | 57 | def observations_list(training: bool = True) -> List[Dict[str, torch.Tensor]]: 58 | """Lists the different types of observations accepted by the transformer.""" 59 | if training: 60 | image_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, 3, HEIGHT, WIDTH] 61 | emb_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, 512] 62 | else: 63 | # inference currently only support batch size of 1 64 | image_shape = [1, 3, HEIGHT, WIDTH] 65 | emb_shape = [1, 512] 66 | return [ 67 | { 68 | 'image': torch.full(image_shape, 0.5), 69 | 'natural_language_embedding': torch.full(emb_shape, 1.0) 70 | } 71 | ] 72 | 73 | NAME_TO_STATE_SPACES = dict(zip(space_names_list(), state_space_list())) 74 | NAME_TO_OBSERVATIONS = dict(zip(space_names_list(), observations_list())) 75 | NAME_TO_INF_OBSERVATIONS = dict( 76 | zip(space_names_list(), observations_list(False))) 77 | 78 | # This class will be inherited by TransformerNetworkTestUtils in transformer_network_test.py. 79 | class TransformerNetworkTestUtils(parameterized.TestCase, unittest.TestCase): 80 | """Defines spaces, SequenceAgent, and various other testing utilities.""" 81 | 82 | def _define_spaces(self, 83 | train_batch_size=BATCH_SIZE, 84 | inference_batch_size=1, 85 | time_sequence_length=TIME_SEQUENCE_LENGTH, 86 | inference_sequence_length=TIME_SEQUENCE_LENGTH, 87 | token_embedding_size=512, 88 | image_width=WIDTH, 89 | image_height=HEIGHT): 90 | """Defines spaces and observations (both training and inference).""" 91 | 92 | self.train_batch_size = train_batch_size 93 | self.inference_batch_size = inference_batch_size 94 | self.time_sequence_length = time_sequence_length 95 | self.inference_sequence_length = inference_sequence_length 96 | self.token_embedding_size = token_embedding_size 97 | 98 | # action space need to keep order of actions so that it can tokenize and detokenize actions correctly. 99 | # So we define action_space with OrderedDict. 
100 | action_space = spaces.Dict( 101 | OrderedDict([ 102 | ('terminate_episode', spaces.Discrete(2)), 103 | ('world_vector', spaces.Box(low= -1.0, high= 1.0, shape=(3,), dtype=np.float32)), 104 | ('rotation_delta', spaces.Box(low= -np.pi / 2, high= np.pi / 2, shape=(3,), dtype=np.float32)), 105 | ('gripper_closedness_action', spaces.Box(low= -1.0 , high= 1.0, shape=(1,), dtype=np.float32)) 106 | ]) 107 | ) 108 | 109 | state_space = spaces.Dict( 110 | { 111 | 'image': spaces.Box(low=0.0, high=1.0, 112 | shape=(3, image_height, image_width), dtype=np.float32), 113 | 'natural_language_embedding': spaces.Box(low=-np.inf, high=np.inf, 114 | shape=[self.token_embedding_size], dtype=np.float32) 115 | } 116 | ) 117 | 118 | self._policy_info_space = { 119 | 'return': spaces.Box(low=0.0, high=1.0, shape=(),dtype=np.float32), 120 | 'discounted_return': spaces.Box(low=0.0, high=1.0, shape=(),dtype=np.float32) 121 | } 122 | self._state_space = state_space 123 | self._action_space = action_space 124 | 125 | self._inference_observation = { 126 | 'image': 127 | torch.full([self.inference_batch_size, 3, image_height, image_width], 1.0), 128 | 'natural_language_embedding': 129 | torch.full([self.inference_batch_size, self.token_embedding_size], 1.0), 130 | } 131 | 132 | self._train_observation = { 133 | 'image': 134 | torch.full( 135 | [self.train_batch_size, self.time_sequence_length, 3, image_height, image_width], 0.5), 136 | 'natural_language_embedding': 137 | torch.full( 138 | [self.train_batch_size, self.time_sequence_length, self.token_embedding_size], 1.) 139 | } 140 | self._inference_action = { 141 | 'world_vector': 142 | torch.full([self.inference_batch_size, 3], 0.5), 143 | 'rotation_delta': 144 | torch.full([self.inference_batch_size, 3], 0.5), 145 | 'terminate_episode': 146 | torch.full([self.inference_batch_size], 1), 147 | 'gripper_closedness_action': 148 | torch.full([self.inference_batch_size, 1], 0.5), 149 | } 150 | 151 | self._train_action = { 152 | 'world_vector': 153 | torch.full([self.train_batch_size, self.time_sequence_length, 3], 0.5), 154 | 'rotation_delta': 155 | torch.full([self.train_batch_size, self.time_sequence_length, 3], 0.5), 156 | 'terminate_episode': 157 | torch.full([self.train_batch_size, self.time_sequence_length], 1), 158 | 'gripper_closedness_action': 159 | torch.full([self.train_batch_size, self.time_sequence_length, 1], 0.5), 160 | } 161 | 162 | def setUp(self): 163 | self._define_spaces() 164 | super().setUp() 165 | 166 | -------------------------------------------------------------------------------- /maruya24_rt1/transformer_test.py: -------------------------------------------------------------------------------- 1 | # Subject to the terms and conditions of the Apache License, Version 2.0 that the original code follows, 2 | # I have retained the following copyright notice written on it. 3 | 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
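
Editor's aside: the OrderedDict action space defined just above is what RT1ActionTokenizer discretizes in the earlier action_tokenizer_test.py. The expected values there ([4, 6, 0] for [0.1, 0.5, -0.8] with vocab_size=10, and 0 / vocab_size-1 at the box limits) are consistent with plain uniform bucketing. The helper below is a hypothetical re-implementation for illustration only, not the tokenizer's actual API.

import numpy as np

def tokenize_uniform(a, low, high, vocab_size):
    """Uniform bucketing sketch: clip, rescale to [0, 1], truncate to an integer token."""
    a = np.clip(np.asarray(a, dtype=np.float32), low, high)
    scaled = (a - low) / (high - low)
    return (scaled * (vocab_size - 1)).astype(np.int32)

print(tokenize_uniform([0.1, 0.5, -0.8], -1.0, 1.0, 10))  # [4 6 0]
print(tokenize_uniform([-1.0, 1.0], -1.0, 1.0, 10))       # [0 9]
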
17 | 18 | # You can find the original code from here[https://github.com/google-research/robotics_transformer]. 19 | 20 | import unittest 21 | import torch 22 | 23 | from maruya24_rt1.transformer import Transformer 24 | from absl.testing import parameterized 25 | 26 | class TransformerTest(parameterized.TestCase, unittest.TestCase): 27 | 28 | def setUp(self): 29 | self._vocab_size = 10 30 | batch_size = 8 31 | sequence_len = 12 32 | self._tokens = torch.rand((batch_size, sequence_len, self._vocab_size)) 33 | 34 | @parameterized.parameters(True, False) 35 | def test_transformer_forwardpass(self, return_attention_scores): 36 | network = Transformer( 37 | num_layers=2, 38 | layer_size=512, 39 | num_heads=4, 40 | feed_forward_size=256, 41 | dropout_rate=0.1, 42 | vocab_size=self._vocab_size, 43 | input_token_emb_dim=self._vocab_size, 44 | return_attention_scores=return_attention_scores, 45 | max_seq_len=15) 46 | 47 | output_tokens, attention_scores = network(self._tokens, attention_mask=None) 48 | self.assertSequenceEqual(self._tokens.shape, output_tokens.shape) 49 | if return_attention_scores: 50 | self.assertNotEmpty(attention_scores) 51 | else: 52 | self.assertEmpty(attention_scores) 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | unittest.main() -------------------------------------------------------------------------------- /multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Assigning the Python script name to a variable 4 | python_script="IO_evaluator.py" 5 | 6 | # Getting input arguments from the command line 7 | epoch=$1 8 | task="pick" 9 | ckpt_dir=$2 10 | model_name=$3 11 | gpu_name=$4 12 | cam_view=$5 13 | using_pos_and_orn=$6 14 | num_threads=$7 15 | initial_c_value=0 16 | override_exist=1 17 | 18 | # Function to run the Python script with specified arguments 19 | run_python_script() { 20 | local c_value=$1 21 | python $python_script -t $task -d $ckpt_dir -m $model_name -c $cam_view -e $epoch -g $gpu_name -u $using_pos_and_orn -p $c_value -n $num_threads 22 | } 23 | 24 | # Function to handle cleanup when Ctrl+C is detected 25 | cleanup() { 26 | echo "Ctrl+C detected. Killing all threads..." 
27 | kill 0 28 | exit 1 29 | } 30 | 31 | # Function to run multiple threads of the Python script 32 | run_threads() { 33 | local num_threads=$1 34 | local c_value=$2 35 | local increment=1 36 | 37 | # Trap Ctrl+C to call the cleanup function 38 | trap cleanup INT 39 | 40 | # Run the Python script in multiple threads 41 | for ((i=0; i None: 10 | self.arm_dof = None 11 | self.ee_index = None 12 | self.finger_index = None 13 | self.home_j_pos = None 14 | self.gripper_range = None 15 | self.is_hand = False 16 | self.home_ee_pose = ((-0.0395, 0.415, 1.145), (1, 0, 0, 0)) 17 | self.grasp_force_threshold = None 18 | 19 | def load(self, urdf_path, base_pos, base_ori): 20 | self.robot_id = p.loadURDF( 21 | urdf_path, 22 | base_pos, 23 | base_ori, 24 | useFixedBase=True, 25 | flags=p.URDF_ENABLE_CACHED_GRAPHICS_SHAPES, 26 | ) 27 | 28 | def calc_ik(self, pose): 29 | return list( 30 | p.calculateInverseKinematics( 31 | self.robot_id, 32 | self.ee_index, 33 | pose[0], 34 | pose[1], 35 | [-7] * 7, 36 | [7] * 7, 37 | [7] * 7, 38 | self.home_j_pos, 39 | maxNumIterations=100, 40 | solver=p.IK_DLS, 41 | ) 42 | ) 43 | 44 | def reset_j_home(self, random_home=False): 45 | if random_home: 46 | for i in range(self.arm_dof): 47 | self.home_j_pos[i] += random.uniform(-np.pi / 10, np.pi / 10) 48 | # self.home_j_pos = [1.22, -0.458, 0.31, -3.0, 0.20, 4* math.pi / 5, 0, 0.04, 0.04] 49 | self.reset_j_pos(self.home_j_pos) 50 | 51 | def reset_j_pos(self, j_pos): 52 | index = 0 53 | for j in range(p.getNumJoints(self.robot_id)): 54 | p.changeDynamics(self.robot_id, j, linearDamping=0, angularDamping=0) 55 | joint_type = p.getJointInfo(self.robot_id, j)[2] 56 | if joint_type in [ 57 | p.JOINT_PRISMATIC, 58 | p.JOINT_REVOLUTE, 59 | ]: 60 | p.resetJointState(self.robot_id, j, j_pos[index]) 61 | index = index + 1 62 | 63 | def move_arm(self, arm_pos, max_vel, force): 64 | for i in range(self.arm_dof): 65 | p.setJointMotorControl2( 66 | self.robot_id, 67 | i, 68 | p.POSITION_CONTROL, 69 | arm_pos[i], 70 | maxVelocity=max_vel[i], 71 | force=force[i], 72 | ) 73 | 74 | def move_gripper(self, gripper_pos, max_vel, force): 75 | for i, j_idx in enumerate(self.finger_index): 76 | p.setJointMotorControl2( 77 | self.robot_id, 78 | j_idx, 79 | p.POSITION_CONTROL, 80 | gripper_pos[i], 81 | maxVelocity=max_vel[i], 82 | force=force[i], 83 | ) 84 | 85 | def move_j(self): 86 | raise NotImplementedError 87 | 88 | def is_j_arrived(self, j_pos, include_finger=True, threshold=1e-2): 89 | cur_joint_position = [ 90 | s[0] 91 | for s in p.getJointStates( 92 | self.robot_id, list(range(self.arm_dof)) + self.finger_index 93 | ) 94 | ] 95 | diff_arm = np.abs( 96 | np.array(cur_joint_position) 97 | - np.array(j_pos)[: self.arm_dof + len(self.finger_index)] 98 | ) 99 | is_arrive = np.all(diff_arm[: self.arm_dof - 1] <= threshold) 100 | # if include_finger: 101 | # is_arrive = is_arrive and np.all( 102 | # diff_arm[-len(self.finger_index) :] <= threshold 103 | # ) 104 | return is_arrive 105 | 106 | def is_ee_arrived(self, ee_pose, tar_obj_id=None, threshold=2 * 1e-2): 107 | robot_ee_pos = np.array(p.getLinkState(self.robot_id, self.ee_index)[0]) 108 | diff_pos = np.abs(robot_ee_pos - ee_pose[0]) 109 | is_arrive = np.all(diff_pos <= threshold) 110 | if tar_obj_id != None: 111 | is_arrive = ( 112 | is_arrive 113 | and p.getClosestPoints(self.robot_id, tar_obj_id, 1e-5) 114 | and p.getContactPoints(self.robot_id, tar_obj_id) 115 | and p.getContactPoints(self.robot_id, tar_obj_id)[0][9] 116 | > self.grasp_force_threshold 117 | ) 118 | # 
p.getContactPoints(self.robot_id, tar_obj_id) 119 | return is_arrive 120 | 121 | 122 | class UR5(RobotBase): 123 | def __init__(self): 124 | super().__init__() 125 | self.arm_dof = 6 126 | self.ee_index = 18 127 | self.finger_index = [8] 128 | self.home_j_pos = [1.40, -1.58, 0.94, -0.93, -1.57, -1.74] + [0.0] * 6 129 | self.gripper_range = [0, 0.085] 130 | self.grasp_force_threshold = 30 131 | 132 | def load(self): 133 | super().load( 134 | "urdf/ur5_robotiq/urdf/ur5_robotiq_85.urdf", [0, 0, 0.62], [0, 0, 0, 1] 135 | ) 136 | finger_child_multiplier = {10: -1, 12: 1, 13: 1, 15: -1, 17: 1} 137 | for joint_id, multiplier in finger_child_multiplier.items(): 138 | c = p.createConstraint( 139 | self.robot_id, 140 | self.finger_index[0], 141 | self.robot_id, 142 | joint_id, 143 | jointType=p.JOINT_GEAR, 144 | jointAxis=[0, 1, 0], 145 | parentFramePosition=[0, 0, 0], 146 | childFramePosition=[0, 0, 0], 147 | ) 148 | p.changeConstraint( 149 | c, gearRatio=-multiplier, maxForce=100, erp=1 150 | ) # Note: the mysterious `erp` is of EXTREME importance 151 | 152 | def move_gripper(self, gripper_state): 153 | # open_length = np.clip(open_length, *self.gripper_range) 154 | open_length = ( 155 | gripper_state * (self.gripper_range[1] - self.gripper_range[0]) 156 | + self.gripper_range[0] 157 | ) 158 | open_angle = 0.715 - math.asin((open_length - 0.010) / 0.1143) 159 | # angle calculation 160 | super().move_gripper([open_angle], max_vel=[4], force=[1000]) 161 | 162 | def move_arm(self, j_pos): 163 | super().move_arm( 164 | j_pos[: self.arm_dof], 165 | [1] * self.arm_dof, 166 | [5 * 240.0] * self.arm_dof, 167 | ) 168 | 169 | 170 | class Panda(RobotBase): 171 | def __init__(self): 172 | super().__init__() 173 | self.arm_dof = 7 174 | self.ee_index = 11 175 | self.finger_index = [9, 10] 176 | self.home_j_pos = [1.22, -0.458, 0.31, -2.0, 0.20, 1.56, 2.32, 0.04, 0.04] 177 | self.gripper_range = [0.01, 0.04] 178 | self.grasp_force_threshold = 2 179 | 180 | def load(self): 181 | super().load("franka_panda/panda.urdf", [0, 0, 0.62], [0, 0, 0, 1]) 182 | # create a constraint to keep the fingers centered, 9 and 10 for finger indices 183 | c = p.createConstraint( 184 | self.robot_id, 185 | self.finger_index[0], 186 | self.robot_id, 187 | self.finger_index[1], 188 | jointType=p.JOINT_GEAR, 189 | jointAxis=[1, 0, 0], 190 | parentFramePosition=[0, 0, 0], 191 | childFramePosition=[0, 0, 0], 192 | ) 193 | p.changeConstraint(c, gearRatio=-1, erp=0.1, maxForce=50) 194 | 195 | def move_arm(self, j_pos): 196 | super().move_arm( 197 | j_pos[: self.arm_dof], 198 | [1 if i == self.arm_dof - 1 else 0.5 for i in range(self.arm_dof)], 199 | [5 * 240.0] * self.arm_dof, 200 | ) 201 | 202 | def move_gripper(self, gripper_state): 203 | gripper_pos = ( 204 | gripper_state * (self.gripper_range[1] - self.gripper_range[0]) 205 | + self.gripper_range[0] 206 | ) 207 | super().move_gripper([gripper_pos] * 2, [0.1] * 2, [1000] * 2) 208 | 209 | 210 | class Iiwa(RobotBase): 211 | def __init__(self): 212 | super().__init__() 213 | self.arm_dof = 7 214 | self.ee_index = 14 215 | self.finger_index = [8, 10, 11, 13] 216 | self.home_j_pos = [ 217 | -0.695, 218 | -0.117, 219 | -0.692, 220 | 1.261, 221 | 0.140, 222 | -1.773, 223 | 0.125, 224 | 0.125, 225 | -0.5, 226 | 0.5, 227 | 0.5, 228 | -0.5, 229 | ] 230 | 231 | self.gripper_range = [0, 0.5] 232 | self.grasp_force_threshold = 20 233 | 234 | def load(self): 235 | self.robot_id = p.loadSDF("urdf/kuka_iiwa/kuka_with_gripper2.sdf")[0] 236 | p.resetBasePositionAndOrientation(self.robot_id, [-0.1, 0, 
0.7], [0, 0, 0, 1]) 237 | 238 | def move_gripper(self, gripper_state): 239 | val = ( 240 | gripper_state * (self.gripper_range[1] - self.gripper_range[0]) 241 | + self.gripper_range[0] 242 | ) 243 | gripper_pos = [-val, val, val, -val] 244 | super().move_gripper(gripper_pos, max_vel=[0.5] * 4, force=[1000] * 4) 245 | 246 | def move_arm(self, j_pos): 247 | super().move_arm( 248 | j_pos[: self.arm_dof], 249 | [1] * self.arm_dof, 250 | [5 * 240.0] * self.arm_dof, 251 | ) 252 | 253 | 254 | class Xarm(RobotBase): 255 | def __init__(self): 256 | super().__init__() 257 | self.arm_dof = 6 258 | self.ee_index = 13 259 | self.finger_index = [7, 8, 9, 10, 11, 12] 260 | # self.finger_index = [7] 261 | self.home_j_pos = [1.623, -0.0326, -1.939, 0.070, 1.960, 1.683] + [0] * 6 262 | self.gripper_range = [0, 0.85] 263 | self.grasp_force_threshold = 10 264 | 265 | def load(self): 266 | super().load("urdf/xarm/xarm6_with_gripper.urdf", [0, 0, 0.62], [0, 0, 0, 1]) 267 | 268 | def move_gripper(self, gripper_state): 269 | val = (1 - gripper_state) * ( 270 | self.gripper_range[1] - self.gripper_range[0] 271 | ) + self.gripper_range[0] 272 | gripper_pos = [val] * len(self.finger_index) 273 | super().move_gripper( 274 | gripper_pos, 275 | max_vel=[1] * len(self.finger_index), 276 | force=[100000] * len(self.finger_index), 277 | ) 278 | 279 | def move_arm(self, j_pos): 280 | super().move_arm( 281 | j_pos[: self.arm_dof], 282 | [1] * self.arm_dof, 283 | [5 * 240.0] * self.arm_dof, 284 | ) 285 | 286 | 287 | class HandBase(RobotBase): 288 | def __init__(self): 289 | super().__init__() 290 | 291 | def move_ee(self, ee_pose): 292 | cur_ee_pose = p.getLinkState(self.robot_id, self.ee_index)[0:2] 293 | inv_cur_ee_pose = p.invertTransform(cur_ee_pose[0], cur_ee_pose[1]) 294 | delta_pose = p.multiplyTransforms( 295 | inv_cur_ee_pose[0], inv_cur_ee_pose[1], ee_pose[0], ee_pose[1] 296 | ) 297 | delta_pos = delta_pose[0] 298 | delta_rot = p.getEulerFromQuaternion(delta_pose[1]) 299 | max_delta_pos = 0.01 300 | max_delta_rot = np.pi / 50 301 | 302 | delta_pos = np.clip(delta_pos, -max_delta_pos, max_delta_pos) 303 | delta_rot = np.clip(delta_rot, -max_delta_rot, max_delta_rot) 304 | ee_pose = p.multiplyTransforms( 305 | cur_ee_pose[0], 306 | cur_ee_pose[1], 307 | delta_pos, 308 | p.getQuaternionFromEuler(delta_rot), 309 | ) 310 | p.changeConstraint(self.hand_cid, ee_pose[0], ee_pose[1], maxForce=100, erp=10) 311 | 312 | def move_gripper(self, gripper_status): 313 | for index, range in self.gripper_range.items(): 314 | val = ((1 - gripper_status) * (range[1] - range[0]) + range[0]) * range[2] 315 | p.setJointMotorControl2( 316 | self.robot_id, 317 | index, 318 | p.POSITION_CONTROL, 319 | val, 320 | maxVelocity=8, 321 | force=100, 322 | ) 323 | 324 | def is_ee_arrived(self, ee_pose, tar_obj_id=None, threshold=2 * 1e-2): 325 | robot_ee_pos = np.array(p.getLinkState(self.robot_id, self.ee_index)[0]) 326 | diff_pos = np.abs(robot_ee_pos - ee_pose[0]) 327 | is_arrive = np.all(diff_pos <= threshold) 328 | if tar_obj_id != None: 329 | for i, (key, val) in enumerate(self.gripper_range.items()): 330 | is_arrive = is_arrive and np.all( 331 | abs(p.getJointState(self.robot_id, key)[0] - val[1] * val[2]) <= 0.2 332 | ) 333 | return is_arrive 334 | 335 | 336 | class MplHand(HandBase): 337 | def __init__(self): 338 | super().__init__() 339 | self.ee_index = 7 340 | self.arm_dof = 3 341 | self.home_j_pos = [0] * 22 342 | self.is_hand = True 343 | self.finger_index = [8, 10, 12, 18, 20] 344 | self.gripper_range = { 345 | 8: [0, 1.8, 1], 346 | 
10: [0, 1, -1], 347 | 12: [0, 2.0, 1], 348 | 18: [0, 1.57, 1], 349 | 20: [0, 0.5, 1], 350 | } 351 | self.grasp_force_threshold = 2 352 | 353 | def load(self): 354 | self.robot_id = p.loadMJCF("urdf/MPL/MPL.xml")[0] 355 | p.resetBasePositionAndOrientation( 356 | self.robot_id, self.home_ee_pose[0], (0, 0, 1, 0) 357 | ) 358 | for i in range(p.getNumJoints(self.robot_id)): 359 | p.setJointMotorControl2(self.robot_id, i, p.POSITION_CONTROL, 1.05, 0) 360 | for i in range(22): 361 | p.setJointMotorControl2(self.robot_id, i, p.POSITION_CONTROL, 0, 0) 362 | self.hand_cid = p.createConstraint( 363 | self.robot_id, 364 | self.ee_index, 365 | -1, 366 | -1, 367 | p.JOINT_FIXED, 368 | [0, 0, 0], 369 | [0, 0, 0], 370 | [0, 0, 0], 371 | ) 372 | state = random.getstate() 373 | random.seed(6) 374 | link_indeices = [-1, 1, 6] + list(range(9, 24, 2)) + list(range(24, 47, 2)) 375 | for i in link_indeices: 376 | p.changeVisualShape( 377 | self.robot_id, 378 | i, 379 | rgbaColor=[ 380 | random.uniform(0, 1), 381 | random.uniform(0, 1), 382 | random.uniform(0, 1), 383 | 1, 384 | ], 385 | ) 386 | random.setstate(state) 387 | p.changeConstraint(self.hand_cid, self.home_ee_pose[0], (1, 0, 0, 0)) 388 | 389 | 390 | class AllegroHand(HandBase): 391 | def __init__(self): 392 | super().__init__() 393 | self.ee_index = 21 394 | self.arm_dof = 3 395 | self.home_j_pos = [0] * 22 396 | self.is_hand = True 397 | self.finger_index = [2, 3, 16, 18] 398 | self.gripper_range = { 399 | 2: [0, 1.8, 1], 400 | 3: [0, 0.2, 1], 401 | 16: [0, 1.4, 1], 402 | 18: [0, 0.2, 1], 403 | } 404 | self.grasp_force_threshold = 2 405 | 406 | def load(self): 407 | self.robot_id = p.loadURDF( 408 | "urdf/allegro_hand_description/urdf/allegro_hand_description_right.urdf" 409 | ) 410 | p.resetBasePositionAndOrientation( 411 | self.robot_id, self.home_ee_pose[0], [0, 0.707, 0, 0.707] 412 | ) 413 | for i in range(p.getNumJoints(self.robot_id)): 414 | p.setJointMotorControl2(self.robot_id, i, p.POSITION_CONTROL, 0, 0) 415 | self.hand_cid = p.createConstraint( 416 | self.robot_id, 417 | self.ee_index, 418 | -1, 419 | -1, 420 | p.JOINT_FIXED, 421 | [0, 0, 0], 422 | [0, 0, 0], 423 | [0, 0, 0], 424 | ) 425 | p.changeConstraint(self.hand_cid, self.home_ee_pose[0], [1, 0, 0, 0]) 426 | 427 | 428 | if __name__ == "__main__": 429 | pass 430 | -------------------------------------------------------------------------------- /sim_env.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import math 4 | import random 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | import shutil 8 | import os 9 | import glob 10 | import pybullet as p 11 | import threading 12 | from robots import Panda 13 | from PIL import Image 14 | 15 | # Dictionary defining task names and their corresponding environment classes 16 | TASKS = { 17 | "touch": "TouchTaskEnv", # Task 'touch' is associated with 'TouchTaskEnv' class 18 | "pick": "PickTaskEnv", # Task 'pick' is associated with 'PickTaskEnv' class 19 | } 20 | 21 | # Dictionary defining various camera views along with their configuration parameters 22 | CAM_INFO = { 23 | "front": [ 24 | [0, 0, 0.7], 25 | 1.8, 26 | 180, 27 | -20, 28 | 0, 29 | 40, 30 | ], # Front view: [position], distance, angles, fov 31 | "fronttop": [ 32 | [0, 0.5, 0.7], 33 | 1.5, 34 | 180, 35 | -60, 36 | 0, 37 | 35, 38 | ], # Front-top view: [position], distance, angles, fov 39 | "topdown": [ 40 | [0, 0.35, 0], 41 | 2.0, 42 | 0, 43 | -90, 44 | 0, 45 | 45, 46 | ], # Top-down view: 
[position], distance, angles, fov 47 | "side": [ 48 | [0, 0.35, 0.9], 49 | 1.5, 50 | 90, 51 | 0, 52 | 0, 53 | 40, 54 | ], # Side view: [position], distance, angles, fov 55 | "root": [ 56 | [0, 0.6, 0.75], 57 | 1.3, 58 | -35, 59 | -5, 60 | 0, 61 | 40, 62 | ], # Root view: [position], distance, angles, fov 63 | "wrist": [], # Placeholder for the 'wrist' view: the wrist camera follows the end effector, so no predefined camera parameters are required 64 | } 65 | 66 | # Tuple defining the resolution of the camera (width x height) 67 | cam_resolution = (1080, 864) 68 | 69 | 70 | def get_cam_projection_matrix(cam_view): 71 | """ 72 | Calculates the camera projection matrix based on the given camera view. 73 | 74 | Parameters: 75 | - cam_view (str): Specifies the camera view. 76 | 77 | Returns: 78 | - cam_projection_matrix (list): Projection matrix for the specified camera view. 79 | """ 80 | 81 | # Calculate the aspect ratio based on camera resolution 82 | aspect = float(cam_resolution[0]) / cam_resolution[1] 83 | nearVal = 0.1 # Default near clipping plane value 84 | farVal = 100 # Default far clipping plane value 85 | 86 | if cam_view == "wrist": 87 | # Adjust parameters for wrist camera view 88 | fov = 100 # Field of view for wrist camera 89 | nearVal = 0.018 # Adjusted near clipping plane value for wrist camera 90 | else: 91 | # Use field of view based on the specified camera view 92 | fov = CAM_INFO[cam_view][-1] # Get field of view for the specified camera view 93 | 94 | # Compute the camera projection matrix using PyBullet's function 95 | cam_projection_matrix = p.computeProjectionMatrixFOV( 96 | fov=fov, 97 | aspect=aspect, 98 | nearVal=nearVal, 99 | farVal=farVal, 100 | ) 101 | 102 | # Return the calculated camera projection matrix 103 | return cam_projection_matrix 104 | 105 | 106 | def get_view_matrix(cam_view, robot_id, ee_index): 107 | """ 108 | Generates the view matrix for a specified camera view relative to a robot's end-effector. 109 | 110 | Parameters: 111 | - cam_view (str): Specifies the camera view. 112 | - robot_id (int): Identifier for the robot. 113 | - ee_index (int): Index of the end-effector on the robot. 114 | 115 | Returns: 116 | - cam_view_matrix (list): View matrix for the specified camera view.
117 | """ 118 | 119 | if cam_view == "wrist": 120 | # Calculate view matrix for wrist camera view 121 | eye_pos, eye_ori = p.getLinkState( 122 | robot_id, 123 | ee_index, 124 | computeForwardKinematics=True, 125 | )[0:2] 126 | eye_pos = list(eye_pos) 127 | eye_pos = p.multiplyTransforms(eye_pos, eye_ori, [0, 0, -0.05], [0, 0, 0, 1])[0] 128 | r_mat = p.getMatrixFromQuaternion(eye_ori) 129 | tx_vec = np.array([r_mat[0], r_mat[3], r_mat[6]]) 130 | ty_vec = np.array([r_mat[1], r_mat[4], r_mat[7]]) 131 | tz_vec = np.array([r_mat[2], r_mat[5], r_mat[8]]) 132 | camera_position = np.array(eye_pos) 133 | target_position = eye_pos + 0.001 * tz_vec 134 | 135 | # Compute view matrix for wrist camera using PyBullet's function 136 | cam_view_matrix = p.computeViewMatrix( 137 | cameraEyePosition=camera_position, 138 | cameraTargetPosition=target_position, 139 | cameraUpVector=ty_vec, 140 | ) 141 | else: 142 | # Calculate view matrix for non-wrist camera views using yaw, pitch, and roll 143 | cam_view_matrix = p.computeViewMatrixFromYawPitchRoll( 144 | CAM_INFO[cam_view][0], 145 | CAM_INFO[cam_view][1], 146 | CAM_INFO[cam_view][2], 147 | CAM_INFO[cam_view][3], 148 | CAM_INFO[cam_view][4], 149 | 2, 150 | ) 151 | 152 | # Return the computed camera view matrix 153 | return cam_view_matrix 154 | 155 | 156 | def get_cam_view_img(cam_view, robot_id=None, ee_index=None): 157 | """ 158 | Captures an image from a specified camera view using PyBullet. 159 | 160 | Parameters: 161 | - cam_view (str): Specifies the camera view. 162 | - robot_id (int, optional): Identifier for the robot. 163 | - ee_index (int, optional): Index of the end-effector on the robot. 164 | 165 | Returns: 166 | - img (numpy.ndarray): Captured image from the specified camera view. 167 | """ 168 | 169 | # Obtain the view matrix for the camera view 170 | cam_view_matrix = get_view_matrix(cam_view, robot_id, ee_index) 171 | 172 | # Obtain the projection matrix for the camera view 173 | cam_projection_matrix = get_cam_projection_matrix(cam_view) 174 | 175 | # Capture the camera image using PyBullet 176 | (width, height, rgb_pixels, _, _) = p.getCameraImage( 177 | cam_resolution[0], 178 | cam_resolution[1], 179 | viewMatrix=cam_view_matrix, 180 | projectionMatrix=cam_projection_matrix, 181 | ) 182 | 183 | # Reshape and process the image data 184 | rgb_array = np.array(rgb_pixels).reshape((height, width, 4)).astype(np.uint8) 185 | img = np.array(resize_and_crop(rgb_array[:, :, :3])) # Process the image 186 | 187 | # Return the captured and processed image 188 | return img 189 | 190 | 191 | def resize_and_crop(input_image): 192 | """ 193 | Crop the image to a 5:4 aspect ratio and resize it to 320x256 pixels. 194 | 195 | Parameters: 196 | - input_image (numpy.ndarray): Input image data in array format. 197 | 198 | Returns: 199 | - input_image (PIL.Image.Image): Cropped and resized image in PIL Image format. 
200 | """ 201 | 202 | # Convert the input image array to a PIL Image 203 | input_image = Image.fromarray(input_image) 204 | 205 | # Get the width and height of the input image 206 | width, height = input_image.size 207 | 208 | # Define target and current aspect ratios 209 | target_aspect = 5 / 4 210 | current_aspect = width / height 211 | 212 | if current_aspect > target_aspect: 213 | # If the image is too wide, crop its width 214 | new_width = int(target_aspect * height) 215 | left_margin = (width - new_width) / 2 216 | input_image = input_image.crop((left_margin, 0, width - left_margin, height)) 217 | elif current_aspect < target_aspect: 218 | # If the image is too tall, crop its height 219 | new_height = int(width / target_aspect) 220 | top_margin = (height - new_height) / 2 221 | input_image = input_image.crop((0, top_margin, width, height - top_margin)) 222 | 223 | # Resize the cropped image to 320x256 pixels 224 | input_image = input_image.resize((320, 256)) 225 | 226 | # Return the cropped and resized image as a PIL Image 227 | return input_image 228 | 229 | 230 | class SimEnv(object): 231 | def __init__(self): 232 | # Set solverResidualThreshold to 0 for physics engine parameter 233 | p.setPhysicsEngineParameter(solverResidualThreshold=0) 234 | 235 | # Control time step and reset environment wait time 236 | self.control_dt = 1.0 / 240.0 237 | self.reset_env_wait_time = 0.5 238 | 239 | # Initialize attributes related to the robot, target object, poses, state, waypoints, and data recording 240 | self.robot = None 241 | self.tar_obj = None 242 | self.tar_obj_pose = None 243 | self.state = None 244 | self.target_waypoints = None 245 | self.data_record_fq = None 246 | self.collected_traj = 700 247 | 248 | # Load environment, set up camera, and reset environment 249 | self.load_env() 250 | self.set_camera() 251 | self.reset_env() 252 | 253 | # Initialize a lock for thread safety 254 | self.lock = threading.Lock() 255 | 256 | def load_env(self): 257 | raise NotImplementedError 258 | 259 | def reset_env(self): 260 | raise NotImplementedError 261 | 262 | def set_camera(self): 263 | # Set camera resolution 264 | self.cam_resolution = (1080, 864) # Define camera resolution 265 | 266 | # Retrieve camera information from CAM_INFO dictionary 267 | self.cam_info = CAM_INFO 268 | 269 | # Initialize an empty list to store view matrices for each camera view 270 | self.cam_view_matrices = [] 271 | 272 | # Iterate through each camera view in CAM_INFO and compute view matrices 273 | for key, val in self.cam_info.items(): 274 | if key == "wrist": 275 | self.cam_view_matrices.append([]) # Placeholder for 'wrist' view 276 | else: 277 | # Compute view matrix using yaw, pitch, roll, and other parameters from CAM_INFO 278 | view_matrix = p.computeViewMatrixFromYawPitchRoll( 279 | val[0], val[1], val[2], val[3], val[4], 2 280 | ) 281 | self.cam_view_matrices.append( 282 | view_matrix 283 | ) # Store the computed view matrix 284 | 285 | # Compute projection matrix for the camera with specified FOV, aspect ratio, and depth range 286 | self.cam_projection_matrix = p.computeProjectionMatrixFOV( 287 | fov=60, 288 | aspect=float(self.cam_resolution[0]) / self.cam_resolution[1], 289 | nearVal=0.1, 290 | farVal=100, 291 | ) 292 | 293 | 294 | class TouchTaskEnv(SimEnv): 295 | def __init__(self): 296 | super().__init__() 297 | 298 | def load_env(self): 299 | p.loadURDF("table/table.urdf", [0, 0.35, 0], [0, 0, 0, 1]) 300 | self.tar_obj = p.loadURDF("urdf/cube/cube.urdf", [0, 0, 0], globalScaling=0.04) 301 | 
p.changeVisualShape(self.tar_obj, -1, rgbaColor=[1, 0, 0, 1]) 302 | self.robot = Panda() 303 | self.robot.load() 304 | 305 | def reset_tar_obj(self, tar_obj_range=None, tar_pos_rot=None, random_pos_rot=True): 306 | if random_pos_rot: 307 | # Generate random position and rotation within the specified range 308 | x = random.uniform(tar_obj_range[0], tar_obj_range[1]) 309 | y = random.uniform(tar_obj_range[2], tar_obj_range[3]) 310 | r = random.uniform(tar_obj_range[4], tar_obj_range[5]) 311 | pos = [x, y, 0.645] # Set the z-coordinate (height) of the object 312 | rot = p.getQuaternionFromEuler( 313 | [0, np.pi / 2, r] 314 | ) # Convert Euler angles to quaternion 315 | else: 316 | # Use provided target position and rotation 317 | x, y, r = tar_pos_rot[0], tar_pos_rot[1], tar_pos_rot[4] 318 | pos = [x, y, 0.645] 319 | rot = [tar_pos_rot[2], tar_pos_rot[3], tar_pos_rot[4], tar_pos_rot[5]] 320 | 321 | # Reset the target object's position and orientation 322 | p.resetBasePositionAndOrientation( 323 | self.tar_obj, 324 | pos, 325 | rot, 326 | ) 327 | 328 | # Update the stored target object's pose (position and orientation) 329 | self.tar_obj_pose = p.getBasePositionAndOrientation(self.tar_obj) 330 | 331 | def reset_env(self): 332 | # Reset the robot's joints to their home positions 333 | self.robot.reset_j_home() 334 | 335 | # Pause execution for 1 second to allow for resetting 336 | time.sleep(1) 337 | 338 | # Reset state variables related to the environment 339 | self.state = 0 # Reset the state to 0 340 | self.t = 0 # Reset time counter to 0 341 | self.state_stuck_t = 0 # Reset the state stuck time to 0 342 | 343 | 344 | class PickTaskEnv(TouchTaskEnv): 345 | def __init__(self): 346 | super().__init__() 347 | -------------------------------------------------------------------------------- /train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "mode": "train", 3 | "device": "cuda", 4 | "data_path": "/mnt/data/io_rt1_dataset/20231123_pick_r10_cube151212_drop_after_lift_straight_action/Panda_pick", 5 | "cam_view" : ["front", "side"], 6 | "log_dir": "/mnt/logs_1", 7 | "time_sequence_length": 12, 8 | "lr": 0.0001, 9 | "batch_size": 4, 10 | "epochs": 20, 11 | "resume": false, 12 | "resume_from_checkpoint": "/mnt/logs_1/1701051203/19-checkpoint.pth", 13 | "predicting_next_ts": true, 14 | "world_size": 4, 15 | "dist_url": "env://", 16 | "val_interval" : 10, 17 | "num_eval_threads": 25, 18 | "num_train_episode" : 200, 19 | "num_val_episode" : 1, 20 | "using_proprioception" : false, 21 | "network_configs": { 22 | "vocab_size" : 256, 23 | "token_embedding_size_per_image" : 512, 24 | "language_embedding_size" : 512, 25 | "num_layers" : 8, 26 | "layer_size" : 128, 27 | "num_heads" : 8, 28 | "feed_forward_size" : 512, 29 | "dropout_rate" : 0.1, 30 | "crop_size" : 236, 31 | "use_token_learner" : true 32 | }, 33 | "scheduler_configs" : { 34 | "T_0" : 50, 35 | "T_mult" : 2, 36 | "eta_min" : 1e-5, 37 | "verbose" : true 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /urdf/cube/cube.obj: -------------------------------------------------------------------------------- 1 | # cube.obj 2 | # 3 | 4 | o cube 5 | mtllib cube.mtl 6 | 7 | v -0.500000 -0.500000 0.500000 8 | v 0.500000 -0.500000 0.500000 9 | v -0.500000 0.500000 0.500000 10 | v 0.500000 0.500000 0.500000 11 | v -0.500000 0.500000 -0.500000 12 | v 0.500000 0.500000 -0.500000 13 | v -0.500000 -0.500000 -0.500000 14 | v 0.500000 -0.500000 -0.500000 
15 | 16 | vt 0.000000 0.000000 17 | vt 1.000000 0.000000 18 | vt 0.000000 1.000000 19 | vt 1.000000 1.000000 20 | 21 | vn 0.000000 0.000000 1.000000 22 | vn 0.000000 1.000000 0.000000 23 | vn 0.000000 0.000000 -1.000000 24 | vn 0.000000 -1.000000 0.000000 25 | vn 1.000000 0.000000 0.000000 26 | vn -1.000000 0.000000 0.000000 27 | 28 | g cube 29 | usemtl cube 30 | s 1 31 | f 1/1/1 2/2/1 3/3/1 32 | f 3/3/1 2/2/1 4/4/1 33 | s 2 34 | f 3/1/2 4/2/2 5/3/2 35 | f 5/3/2 4/2/2 6/4/2 36 | s 3 37 | f 5/4/3 6/3/3 7/2/3 38 | f 7/2/3 6/3/3 8/1/3 39 | s 4 40 | f 7/1/4 8/2/4 1/3/4 41 | f 1/3/4 8/2/4 2/4/4 42 | s 5 43 | f 2/1/5 8/2/5 4/3/5 44 | f 4/3/5 8/2/5 6/4/5 45 | s 6 46 | f 7/1/6 1/2/6 5/3/6 47 | f 5/3/6 1/2/6 3/4/6 48 | 49 | -------------------------------------------------------------------------------- /urdf/cube/cube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/urdf/cube/cube.png -------------------------------------------------------------------------------- /urdf/cube/cube.urdf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | -------------------------------------------------------------------------------- /vis.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ioai-tech/pytorch_rt1_with_trainer_and_tester/bd86991521dda9cfd0664a0d4ad602ab34ad01a1/vis.mp4 --------------------------------------------------------------------------------