├── !App
│   ├── !figs
│   │   ├── airdrop.png
│   │   ├── app.png
│   │   ├── interface.png
│   │   └── signing.png
│   ├── ConvertBinaries.py
│   ├── ImageBundleApp
│   │   ├── Configuration
│   │   │   └── SampleCode.xcconfig
│   │   ├── ImageBundleRecorder.xcodeproj
│   │   │   ├── .xcodesamplecode.plist
│   │   │   └── project.pbxproj
│   │   ├── ImageBundleRecorder
│   │   │   ├── Assets.xcassets
│   │   │   │   ├── AccentColor.colorset
│   │   │   │   │   └── Contents.json
│   │   │   │   └── Contents.json
│   │   │   ├── CameraController.swift
│   │   │   ├── CameraManager.swift
│   │   │   ├── ImageBundleRecorder.entitlements
│   │   │   ├── ImageBundleRecorder.swift
│   │   │   ├── Info.plist
│   │   │   ├── MetalTextureView.swift
│   │   │   ├── MetalTextureViewDepth.swift
│   │   │   ├── MetalViewSample.swift
│   │   │   └── shaders.metal
│   │   ├── LICENSE
│   │   │   └── LICENSE.txt
│   │   └── Launch Screen.storyboard
│   └── README.md
├── !figs
│   ├── experiments-thumb.png
│   ├── extra-thumb.png
│   ├── scenes-thumb.png
│   └── synth-thumb.png
├── LICENSE
├── README.md
├── checkpoints
│   └── __init__.py
├── config
│   ├── config_depth.json
│   └── config_rgb.json
├── data
│   └── __init__.py
├── requirements.txt
├── train.py
├── tutorial.ipynb
└── utils
    └── utils.py
/!App/!figs/airdrop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/!App/!figs/airdrop.png
--------------------------------------------------------------------------------
/!App/!figs/app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/!App/!figs/app.png
--------------------------------------------------------------------------------
/!App/!figs/interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/!App/!figs/interface.png
--------------------------------------------------------------------------------
/!App/!figs/signing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/!App/!figs/signing.png
--------------------------------------------------------------------------------
/!App/ConvertBinaries.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import struct
4 | from matplotlib import gridspec
5 | import matplotlib.pyplot as plt
6 | from glob import glob
7 | import os
8 | from os.path import join
9 | from os.path import split
10 | from natsort import natsorted
11 | from skimage.transform import resize
12 | import re
13 | from tqdm import tqdm
14 | 
15 | """ Code to process the depth/image/pose binaries from the iOS DepthBundleRecorder app into more usable .npz files.
16 | Tested for iPhone 12, 13, and 14 Pro.
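Third-party dependencies (see imports above): numpy, matplotlib, natsort, scikit-image, tqdm.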
17 | Usage: python ConvertBinaries.py -d folder_with_bundles 18 | Output: a folder processed_folder_with_bundles containing the processed depth bundles 19 | """ 20 | 21 | def cut(x): # return value after the ":" 22 | return x.split(":")[1] 23 | 24 | def process_raw(npz_file, raw_name): 25 | global is_iphone12 26 | with open(raw_name, mode='rb') as file: 27 | raw = file.read() 28 | 29 | raw_split = raw.split(b"") 30 | num_raw_frames = 0 31 | 32 | for i, raw_frame in tqdm(enumerate(raw_split[1:])): 33 | if len(raw_frame) < 100: # skip weird outliers 34 | continue 35 | 36 | raw_header, raw_image = raw_frame.split(b"") 37 | raw_header = re.sub("\[|\]|\(|\)|\s|\'", "", str(raw_header)) # Strip all delims but <> and commas 38 | raw_header = re.sub(r"\s+", "", raw_header) # Strip spaces 39 | rw = raw_header.split(",") 40 | 41 | frame_count = int(cut(rw[1])) # skip description 42 | timestamp = float(cut(rw[2])) 43 | height = int(cut(rw[3])) 44 | width = int(cut(rw[4])) 45 | ISO = int(cut(rw[6])) 46 | exposure_time = float(cut(rw[7])) 47 | aperture = float(cut(rw[8])) 48 | brightness = float(cut(rw[9])) 49 | shutter_speed = float(cut(rw[10])) 50 | black_level = int(cut(rw[13])) 51 | white_level = int(cut(rw[14])) 52 | 53 | 54 | raw_image = struct.unpack('H'* ((len(raw_image)//2)), raw_image) 55 | raw_image = np.reshape(raw_image, (height, width)) 56 | raw_image = np.flip(raw_image.swapaxes(0,1), 1) # make vertical 57 | 58 | if is_iphone12: 59 | # Make everything same bayer format: 60 | # G B 61 | # R G 62 | A1 = raw_image[::2,1::2].copy() 63 | A2 = raw_image[1::2,::2].copy() 64 | raw_image[::2,1::2] = A2 65 | raw_image[1::2,::2] = A1 66 | 67 | raw_image = raw_image.astype(np.uint16) 68 | raw_frame = {'frame_count' : frame_count, 69 | 'timestamp' : timestamp, 70 | 'height' : height, 71 | 'width' : width, 72 | 'ISO' : ISO, 73 | 'exposure_time' : exposure_time, 74 | 'aperture' : aperture, 75 | 'brightness' : brightness, 76 | 'shutter_speed' : shutter_speed, 77 | 'black_level' : black_level, 78 | 'white_level' : white_level, 79 | 'raw' : raw_image 80 | } 81 | 82 | num_raw_frames += 1 83 | npz_file[f'raw_{i}'] = raw_frame 84 | npz_file['num_raw_frames'] = num_raw_frames 85 | 86 | def process_rgb(npz_file, rgb_name): 87 | global is_iphone12 88 | with open(rgb_name, mode='rb') as file: 89 | rgb = file.read() 90 | 91 | rgb_split = rgb.split(b"") 92 | num_rgb_frames = 0 93 | 94 | for i, rgb_frame in tqdm(enumerate(rgb_split[1:])): 95 | if len(rgb_frame) < 100: # skip weird outliers 96 | continue 97 | 98 | rgb_header, rgb_image = rgb_frame.split(b"") 99 | rgb_header = re.sub("\[|\]|\(|\)|\s|\'", "", str(rgb_header)) # Strip all delims but <> and commas 100 | rgb_header = re.sub(r"\s+", "", rgb_header) # Strip spaces 101 | r = rgb_header.split(",") 102 | 103 | frame_count = int(cut(r[1])) # skip description 104 | timestamp = float(cut(r[2])) 105 | height = int(cut(r[3])) 106 | width = int(cut(r[4])) 107 | intrinsics = np.array([[cut(r[6]), r[7], r[8]], 108 | [r[9], r[10], r[11]], 109 | [r[12], r[13], r[14]]], dtype=np.float32) 110 | 111 | rgb_image = struct.unpack('B'* ((len(rgb_image))), rgb_image) 112 | 113 | try: 114 | rgb_image = np.reshape(rgb_image, (1440, 1920, 4)) # 12 extra bytes at the end of each row, mystery 115 | rgb_image = rgb_image[:,:,[2,1,0,3]] # cut extra bytes, go from BGRA to RGBA 116 | rgb_image = np.flip(rgb_image.swapaxes(0,1), 1) # make vertical 117 | except: 118 | raise Exception("RGB format not understood.") 119 | 120 | rgb_image = rgb_image.astype(np.uint8) 121 | rgb_frame = 
{'frame_count' : frame_count, 122 | 'timestamp' : timestamp, 123 | 'height' : height, 124 | 'width' : width, 125 | 'intrinsics' : intrinsics, 126 | 'rgb' : rgb_image 127 | } 128 | 129 | num_rgb_frames += 1 130 | npz_file[f'rgb_{i}'] = rgb_frame 131 | npz_file['num_rgb_frames'] = num_rgb_frames 132 | 133 | def process_depth(npz_file, depth_name): 134 | with open(depth_name, mode='rb') as file: 135 | depth = file.read() 136 | 137 | depth_split = depth.split(b"") 138 | num_depth_frames = 0 139 | 140 | for i, depth_frame in tqdm(enumerate(depth_split[1:])): 141 | if len(depth_frame) < 100: # skip weird outliers 142 | continue 143 | 144 | depth_header, depth_image = depth_frame.split(b"") 145 | depth_header = re.sub("\[|\]|\(|\)|\s|\'", "", str(depth_header)) # Strip all delims but <> and commas 146 | depth_header = re.sub(r"\s+", "", depth_header) # Strip spaces 147 | d = depth_header.split(",") 148 | 149 | frame_count = int(cut(d[1])) # skip description 150 | timestamp = float(cut(d[2])) 151 | height = int(cut(d[3])) 152 | width = int(cut(d[4])) 153 | intrinsic_width = int(float(cut(d[6]))) 154 | intrinsic_height = int(float(cut(d[7]))) # it has a decimal for some reason 155 | intrinsics = np.array([[cut(d[8]), d[9], d[10]], 156 | [d[11], d[12], d[13]], 157 | [d[14], d[15], d[16]]], dtype=np.float32) 158 | lens_distortion = np.array([cut(d[17]), *d[18:59]], dtype=np.float32) 159 | lens_undistortion = np.array([cut(d[59]), *d[60:101]], dtype=np.float32) # 42 numbers, heh 160 | depth_accuracy = int(cut(d[101])) 161 | 162 | depth_image = struct.unpack('e'* ((len(depth_image)) // 2), depth_image) 163 | depth_image = np.reshape(depth_image, (height, width)) 164 | depth_image = np.flip(depth_image.swapaxes(0,1), 1) # make vertical 165 | depth_image = depth_image.astype(np.float16) 166 | 167 | depth_frame = {'frame_count' : frame_count, 168 | 'timestamp' : timestamp, 169 | 'height' : height, 170 | 'width' : width, 171 | 'intrinsic_height' : intrinsic_height, 172 | 'intrinsic_width' : intrinsic_width, 173 | 'intrinsics' : intrinsics, 174 | 'lens_distortion' : lens_distortion, 175 | 'lens_undistortion' : lens_undistortion, 176 | 'depth_accuracy' : depth_accuracy, 177 | 'depth' : depth_image 178 | } 179 | 180 | num_depth_frames += 1 181 | npz_file[f'depth_{i}'] = depth_frame 182 | npz_file['num_depth_frames'] = num_depth_frames 183 | 184 | def process_motion(npz_file, motion_name): 185 | with open(motion_name, mode='rb') as file: 186 | motion = str(file.read()) 187 | 188 | motion_split = motion.split("") 189 | num_motion_frames = 0 190 | 191 | frame_count = [] 192 | timestamp = [] 193 | quaternion = [] 194 | roll_pitch_yaw = [] 195 | rotation_rate = [] 196 | acceleration = [] 197 | gravity = [] 198 | 199 | for i, motion_frame in tqdm(enumerate(motion_split)): 200 | if len(motion_frame) < 100: # skip weird outliers 201 | continue 202 | 203 | motion_frame = motion_frame.strip().replace("", "") 204 | motion_frame = re.sub("\[|\]|\(|\)|\s|\'", "", motion_frame) # Strip all delims but <> and commas 205 | motion_frame = re.sub(r"\s+", "", motion_frame) # Strip spaces 206 | m = motion_frame.split(",") 207 | 208 | frame_count.append(int(cut(m[0]))) 209 | timestamp.append(float(cut(m[1]))) 210 | # quaternion x,y,z,w 211 | quaternion.append(np.array([cut(m[2]), cut(m[3]), cut(m[4]), cut(m[5])], dtype=np.float32)) 212 | rotation_rate.append(np.array([cut(m[6]), cut(m[7]), cut(m[8])], dtype=np.float32)) 213 | roll_pitch_yaw.append(np.array([cut(m[9]), cut(m[10]), cut(m[11])], dtype=np.float32)) 214 | 
acceleration.append(np.array([cut(m[12]), cut(m[13]), cut(m[14])], dtype=np.float32)) 215 | gravity.append(np.array([cut(m[15]), cut(m[16]), cut(m[17])], dtype=np.float32)) 216 | 217 | num_motion_frames += 1 218 | 219 | motion_frame = {'frame_count' : np.array(frame_count), 220 | 'timestamp' : np.array(timestamp), 221 | 'quaternion' : np.array(quaternion), 222 | 'rotation_rate' : np.array(rotation_rate), 223 | 'roll_pitch_yaw' : np.array(roll_pitch_yaw), 224 | 'acceleration' : np.array(acceleration), 225 | 'gravity' : np.array(gravity), 226 | 'num_motion_frames': np.array(num_motion_frames)} 227 | 228 | 229 | npz_file["motion"] = motion_frame 230 | 231 | def match_timestamps(npz_file): 232 | raw_timestamps = np.array([npz_file[f'raw_{i}']['timestamp'] for i in range(npz_file['num_raw_frames'])]) 233 | raw_timestamps = np.around(raw_timestamps, 3) 234 | rgb_timestamps = np.array([npz_file[f'rgb_{i}']['timestamp'] for i in range(npz_file['num_rgb_frames'])]) 235 | rgb_timestamps = np.around(rgb_timestamps, 3) 236 | depth_timestamps = np.array([npz_file[f'depth_{i}']['timestamp'] for i in range(npz_file['num_depth_frames'])]) 237 | depth_timestamps = np.around(depth_timestamps, 3) 238 | assert (rgb_timestamps == depth_timestamps).all() 239 | 240 | matches = np.array([np.where(rgb_timestamps == raw_timestamps[i])[0][0] for i in range(npz_file['num_raw_frames'])]) 241 | assert len(matches) == npz_file['num_raw_frames'] # all frames have a match 242 | 243 | for i in range(npz_file['num_raw_frames']): 244 | match_idx = matches[i] 245 | npz_file[f'rgb_{i}'] = npz_file[f'rgb_{match_idx}'] 246 | npz_file[f'depth_{i}'] = npz_file[f'depth_{match_idx}'] 247 | 248 | for i in range(npz_file['num_raw_frames'], npz_file['num_rgb_frames']): 249 | del npz_file[f'rgb_{i}'] 250 | del npz_file[f'depth_{i}'] 251 | 252 | npz_file['num_rgb_frames'] = npz_file['num_depth_frames'] = npz_file['num_raw_frames'] 253 | 254 | raw_timestamps = np.array([npz_file[f'raw_{i}']['timestamp'] for i in range(npz_file['num_raw_frames'])]) 255 | raw_timestamps = np.around(raw_timestamps, 3) 256 | rgb_timestamps = np.array([npz_file[f'rgb_{i}']['timestamp'] for i in range(npz_file['num_rgb_frames'])]) 257 | rgb_timestamps = np.around(rgb_timestamps, 3) 258 | depth_timestamps = np.array([npz_file[f'depth_{i}']['timestamp'] for i in range(npz_file['num_depth_frames'])]) 259 | depth_timestamps = np.around(depth_timestamps, 3) 260 | assert (rgb_timestamps == raw_timestamps).all() 261 | 262 | def main(): 263 | global is_iphone12 264 | parser = argparse.ArgumentParser() 265 | parser.add_argument('-d', default=None, type=str, required=True, help='Data directory') 266 | parser.add_argument('-iphone12', action='store_true', help='Flag that this is an iPhone 12 to rotate bayer array.') 267 | args = parser.parse_args() 268 | is_iphone12 = args.iphone12 269 | 270 | if "bundle-" not in args.d: 271 | bundle_names = natsorted(glob(join(args.d, "bundle*"))) 272 | else: 273 | bundle_names = [args.d] 274 | 275 | for bundle_name in bundle_names: 276 | print(f"Processing {split(bundle_name)[-1]}.") 277 | 278 | if "processed-" in bundle_name: 279 | continue # already processed, skip 280 | 281 | if "-motion" not in bundle_name: 282 | # Process image + depth bundle 283 | motion_name = join(bundle_name, "motion.bin") 284 | rgb_name = join(bundle_name, "imageRGB.bin") 285 | raw_name = join(bundle_name, "imageRAW.bin") 286 | depth_name = join(bundle_name, "depth.bin") 287 | 288 | npz_file = {} 289 | 290 | process_depth(npz_file, depth_name) 291 | 
process_motion(npz_file, motion_name) 292 | process_rgb(npz_file, rgb_name) 293 | process_raw(npz_file, raw_name) 294 | try: 295 | match_timestamps(npz_file) 296 | except Exception as e: 297 | print(f"Skipping {bundle_name} due to:\n{e}.") 298 | continue 299 | 300 | save_path = join(split(bundle_name)[0], "processed-" + split(bundle_name)[1]) 301 | os.makedirs(save_path, exist_ok=True) 302 | 303 | # Save first frame preview 304 | fig = plt.figure(figsize=(14, 30)) 305 | gs = gridspec.GridSpec(1, 3, wspace=0.0, hspace=0.0, width_ratios=[1,1,1.12]) 306 | ax1 = plt.subplot(gs[0,0]) 307 | ax1.imshow(npz_file['rgb_0']['rgb']) 308 | ax1.axis('off') 309 | ax1.set_title("RGB") 310 | ax2 = plt.subplot(gs[0,1]) 311 | ax2.imshow(npz_file['raw_0']['raw'], cmap="gray") 312 | ax2.axis('off') 313 | ax2.set_title("RAW") 314 | ax3 = plt.subplot(gs[0,2]) 315 | d = ax3.imshow(npz_file['depth_0']['depth'], cmap="Spectral", vmin=0, vmax=5) 316 | ax3.axis('off') 317 | ax3.set_title("Depth") 318 | fig.colorbar(d, fraction=0.055, label="Depth [m]") 319 | plt.savefig(join(save_path, "frame_first.png"), bbox_inches='tight', pad_inches=0.05, facecolor='white') 320 | plt.close() 321 | 322 | # Save last frame preview 323 | fig = plt.figure(figsize=(14, 30)) 324 | gs = gridspec.GridSpec(1, 3, wspace=0.0, hspace=0.0, width_ratios=[1,1,1.12]) 325 | ax1 = plt.subplot(gs[0,0]) 326 | ax1.imshow(npz_file[f'rgb_{npz_file["num_raw_frames"] - 1}']['rgb']) 327 | ax1.axis('off') 328 | ax1.set_title("RGB") 329 | ax2 = plt.subplot(gs[0,1]) 330 | ax2.imshow(npz_file[f'raw_{npz_file["num_raw_frames"] - 1}']['raw'], cmap="gray") 331 | ax2.axis('off') 332 | ax2.set_title("RAW") 333 | ax3 = plt.subplot(gs[0,2]) 334 | d = ax3.imshow(npz_file[f'depth_{npz_file["num_raw_frames"] - 1}']['depth'], cmap="Spectral", vmin=0, vmax=5) 335 | ax3.axis('off') 336 | ax3.set_title("Depth") 337 | fig.colorbar(d, fraction=0.055, label="Depth [m]") 338 | plt.savefig(join(save_path, "frame_last.png"), bbox_inches='tight', pad_inches=0.05, facecolor='white') 339 | plt.close() 340 | 341 | # Save bundle 342 | np.savez_compressed(join(save_path, "frame_bundle"), **npz_file) 343 | 344 | else: 345 | # Process only motion bundle 346 | 347 | motion_name = join(bundle_name, "motion.bin") 348 | 349 | save_path = bundle_name.replace("bundle-", "bundle_processed-") 350 | os.makedirs(save_path, exist_ok=True) 351 | 352 | npz_file = {} 353 | 354 | process_motion(npz_file, motion_name) 355 | 356 | # Save bundle 357 | np.savez(join(save_path, "motion_bundle"), **npz_file) 358 | 359 | if __name__ == '__main__': 360 | is_iphone12 = False 361 | main() 362 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/Configuration/SampleCode.xcconfig: -------------------------------------------------------------------------------- 1 | // 2 | // See LICENSE folder for this sample’s licensing information. 3 | // 4 | // SampleCode.xcconfig 5 | // 6 | 7 | // The `SAMPLE_CODE_DISAMBIGUATOR` configuration is to make it easier to build 8 | // and run a sample code project. Once you set your project's development team, 9 | // you'll have a unique bundle identifier. This is because the bundle identifier 10 | // is derived based on the 'SAMPLE_CODE_DISAMBIGUATOR' value. Do not use this 11 | // approach in your own projects—it's only useful for sample code projects because 12 | // they are frequently downloaded and don't have a development team set. 
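// Note: to build on a device you still need to select your own team under Signing & Capabilities
// in Xcode; SAMPLE_CODE_DISAMBIGUATOR then resolves to that team ID.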
13 | SAMPLE_CODE_DISAMBIGUATOR=${DEVELOPMENT_TEAM} 14 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder.xcodeproj/.xcodesamplecode.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 55; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | AE9AD78D270ECD01001218B2 /* CameraController.swift in Sources */ = {isa = PBXBuildFile; fileRef = AE9AD78C270ECD01001218B2 /* CameraController.swift */; }; 11 | AED300A5271D6D18008F6007 /* CameraManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = AED300A4271D6D18008F6007 /* CameraManager.swift */; }; 12 | C12A11492857D820009A991A /* MetalTextureViewDepth.swift in Sources */ = {isa = PBXBuildFile; fileRef = C12A11442857D820009A991A /* MetalTextureViewDepth.swift */; }; 13 | C12A114A2857D820009A991A /* MetalViewSample.swift in Sources */ = {isa = PBXBuildFile; fileRef = C12A11452857D820009A991A /* MetalViewSample.swift */; }; 14 | C12A114B2857D820009A991A /* MetalTextureView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C12A11462857D820009A991A /* MetalTextureView.swift */; }; 15 | C12A114C2857D820009A991A /* ImageBundleRecorder.swift in Sources */ = {isa = PBXBuildFile; fileRef = C12A11472857D820009A991A /* ImageBundleRecorder.swift */; }; 16 | C12A114D2857D820009A991A /* shaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = C12A11482857D820009A991A /* shaders.metal */; }; 17 | /* End PBXBuildFile section */ 18 | 19 | /* Begin PBXFileReference section */ 20 | 0C34ECABC9005D93C0DF4297 /* LICENSE.txt */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text; path = LICENSE.txt; sourceTree = ""; }; 21 | 9ED80CBFEB14F0F0F5121BC6 /* SampleCode.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; name = SampleCode.xcconfig; path = Configuration/SampleCode.xcconfig; sourceTree = ""; }; 22 | AE921D26270D7B02000E95C6 /* ImageBundleRecorder.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = ImageBundleRecorder.app; sourceTree = BUILT_PRODUCTS_DIR; }; 23 | AE921D2D270D7B04000E95C6 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; 24 | AE9AD78C270ECD01001218B2 /* CameraController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CameraController.swift; sourceTree = ""; }; 25 | AED300A4271D6D18008F6007 /* CameraManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CameraManager.swift; sourceTree = ""; }; 26 | C12A11442857D820009A991A /* MetalTextureViewDepth.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MetalTextureViewDepth.swift; sourceTree = ""; }; 27 | C12A11452857D820009A991A /* MetalViewSample.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MetalViewSample.swift; sourceTree = ""; }; 28 | C12A11462857D820009A991A /* MetalTextureView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 
MetalTextureView.swift; sourceTree = ""; }; 29 | C12A11472857D820009A991A /* ImageBundleRecorder.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ImageBundleRecorder.swift; sourceTree = ""; }; 30 | C12A11482857D820009A991A /* shaders.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = shaders.metal; sourceTree = ""; }; 31 | C1400FB3287F9BF400CB4E63 /* ImageBundleRecorder.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = ImageBundleRecorder.entitlements; sourceTree = ""; }; 32 | C1400FB42880E45300CB4E63 /* Launch Screen.storyboard */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; path = "Launch Screen.storyboard"; sourceTree = ""; }; 33 | C1970A7928592FC900B08ECB /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = Info.plist; path = /Users/chugunov/source/HandshakeHDR/ImageBundleApp/ImageBundleRecorder/Info.plist; sourceTree = ""; }; 34 | D0C5477C199C5F5C91FCD321 /* README.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = ""; }; 35 | /* End PBXFileReference section */ 36 | 37 | /* Begin PBXFrameworksBuildPhase section */ 38 | AE921D23270D7B02000E95C6 /* Frameworks */ = { 39 | isa = PBXFrameworksBuildPhase; 40 | buildActionMask = 2147483647; 41 | files = ( 42 | ); 43 | runOnlyForDeploymentPostprocessing = 0; 44 | }; 45 | /* End PBXFrameworksBuildPhase section */ 46 | 47 | /* Begin PBXGroup section */ 48 | AE921D1D270D7B02000E95C6 = { 49 | isa = PBXGroup; 50 | children = ( 51 | C1400FB42880E45300CB4E63 /* Launch Screen.storyboard */, 52 | D0C5477C199C5F5C91FCD321 /* README.md */, 53 | AE921D28270D7B02000E95C6 /* ImageBundleRecorder */, 54 | AE921D27270D7B02000E95C6 /* Products */, 55 | C42CD530FDC5FDCBE24A185E /* Configuration */, 56 | EA03DA1400B7EED6A2A7C7E7 /* LICENSE */, 57 | ); 58 | sourceTree = ""; 59 | }; 60 | AE921D27270D7B02000E95C6 /* Products */ = { 61 | isa = PBXGroup; 62 | children = ( 63 | AE921D26270D7B02000E95C6 /* ImageBundleRecorder.app */, 64 | ); 65 | name = Products; 66 | sourceTree = ""; 67 | }; 68 | AE921D28270D7B02000E95C6 /* ImageBundleRecorder */ = { 69 | isa = PBXGroup; 70 | children = ( 71 | C1400FB3287F9BF400CB4E63 /* ImageBundleRecorder.entitlements */, 72 | AE9AD78C270ECD01001218B2 /* CameraController.swift */, 73 | AED300A4271D6D18008F6007 /* CameraManager.swift */, 74 | C12A11462857D820009A991A /* MetalTextureView.swift */, 75 | C12A11442857D820009A991A /* MetalTextureViewDepth.swift */, 76 | C12A11452857D820009A991A /* MetalViewSample.swift */, 77 | C12A11472857D820009A991A /* ImageBundleRecorder.swift */, 78 | C12A11482857D820009A991A /* shaders.metal */, 79 | AE921D2D270D7B04000E95C6 /* Assets.xcassets */, 80 | C1970A7928592FC900B08ECB /* Info.plist */, 81 | ); 82 | path = ImageBundleRecorder; 83 | sourceTree = ""; 84 | }; 85 | C42CD530FDC5FDCBE24A185E /* Configuration */ = { 86 | isa = PBXGroup; 87 | children = ( 88 | 9ED80CBFEB14F0F0F5121BC6 /* SampleCode.xcconfig */, 89 | ); 90 | name = Configuration; 91 | sourceTree = ""; 92 | }; 93 | EA03DA1400B7EED6A2A7C7E7 /* LICENSE */ = { 94 | isa = PBXGroup; 95 | children = ( 96 | 0C34ECABC9005D93C0DF4297 /* LICENSE.txt */, 97 | ); 98 | path = LICENSE; 99 | sourceTree = ""; 100 | }; 101 | /* End PBXGroup section */ 102 | 103 | /* Begin PBXNativeTarget section */ 104 | AE921D25270D7B02000E95C6 /* ImageBundleRecorder */ = { 105 | isa = 
PBXNativeTarget; 106 | buildConfigurationList = AE921D34270D7B04000E95C6 /* Build configuration list for PBXNativeTarget "ImageBundleRecorder" */; 107 | buildPhases = ( 108 | AE921D22270D7B02000E95C6 /* Sources */, 109 | AE921D23270D7B02000E95C6 /* Frameworks */, 110 | ); 111 | buildRules = ( 112 | ); 113 | dependencies = ( 114 | ); 115 | name = ImageBundleRecorder; 116 | productName = DepthAPISample; 117 | productReference = AE921D26270D7B02000E95C6 /* ImageBundleRecorder.app */; 118 | productType = "com.apple.product-type.application"; 119 | }; 120 | /* End PBXNativeTarget section */ 121 | 122 | /* Begin PBXProject section */ 123 | AE921D1E270D7B02000E95C6 /* Project object */ = { 124 | isa = PBXProject; 125 | attributes = { 126 | BuildIndependentTargetsInParallel = 1; 127 | KnownAssetTags = ( 128 | New, 129 | ); 130 | LastSwiftUpdateCheck = 1320; 131 | LastUpgradeCheck = 1320; 132 | ORGANIZATIONNAME = Apple; 133 | TargetAttributes = { 134 | AE921D25270D7B02000E95C6 = { 135 | CreatedOnToolsVersion = 13.2; 136 | LastSwiftMigration = 1320; 137 | }; 138 | }; 139 | }; 140 | buildConfigurationList = AE921D21270D7B02000E95C6 /* Build configuration list for PBXProject "ImageBundleRecorder" */; 141 | compatibilityVersion = "Xcode 13.0"; 142 | developmentRegion = en; 143 | hasScannedForEncodings = 0; 144 | knownRegions = ( 145 | en, 146 | Base, 147 | ); 148 | mainGroup = AE921D1D270D7B02000E95C6; 149 | productRefGroup = AE921D27270D7B02000E95C6 /* Products */; 150 | projectDirPath = ""; 151 | projectRoot = ""; 152 | targets = ( 153 | AE921D25270D7B02000E95C6 /* ImageBundleRecorder */, 154 | ); 155 | }; 156 | /* End PBXProject section */ 157 | 158 | /* Begin PBXSourcesBuildPhase section */ 159 | AE921D22270D7B02000E95C6 /* Sources */ = { 160 | isa = PBXSourcesBuildPhase; 161 | buildActionMask = 2147483647; 162 | files = ( 163 | AE9AD78D270ECD01001218B2 /* CameraController.swift in Sources */, 164 | C12A114A2857D820009A991A /* MetalViewSample.swift in Sources */, 165 | AED300A5271D6D18008F6007 /* CameraManager.swift in Sources */, 166 | C12A114B2857D820009A991A /* MetalTextureView.swift in Sources */, 167 | C12A114C2857D820009A991A /* ImageBundleRecorder.swift in Sources */, 168 | C12A11492857D820009A991A /* MetalTextureViewDepth.swift in Sources */, 169 | C12A114D2857D820009A991A /* shaders.metal in Sources */, 170 | ); 171 | runOnlyForDeploymentPostprocessing = 0; 172 | }; 173 | /* End PBXSourcesBuildPhase section */ 174 | 175 | /* Begin XCBuildConfiguration section */ 176 | AE921D32270D7B04000E95C6 /* Debug */ = { 177 | isa = XCBuildConfiguration; 178 | baseConfigurationReference = 9ED80CBFEB14F0F0F5121BC6 /* SampleCode.xcconfig */; 179 | buildSettings = { 180 | ALWAYS_SEARCH_USER_PATHS = NO; 181 | CLANG_ANALYZER_NONNULL = YES; 182 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 183 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++17"; 184 | CLANG_CXX_LIBRARY = "libc++"; 185 | CLANG_ENABLE_MODULES = YES; 186 | CLANG_ENABLE_OBJC_ARC = YES; 187 | CLANG_ENABLE_OBJC_WEAK = YES; 188 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 189 | CLANG_WARN_BOOL_CONVERSION = YES; 190 | CLANG_WARN_COMMA = YES; 191 | CLANG_WARN_CONSTANT_CONVERSION = YES; 192 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 193 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 194 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 195 | CLANG_WARN_EMPTY_BODY = YES; 196 | CLANG_WARN_ENUM_CONVERSION = YES; 197 | CLANG_WARN_INFINITE_RECURSION = YES; 198 | CLANG_WARN_INT_CONVERSION = YES; 199 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 
200 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 201 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 202 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 203 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 204 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 205 | CLANG_WARN_STRICT_PROTOTYPES = YES; 206 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 207 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 208 | CLANG_WARN_UNREACHABLE_CODE = YES; 209 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 210 | COPY_PHASE_STRIP = NO; 211 | DEBUG_INFORMATION_FORMAT = dwarf; 212 | ENABLE_STRICT_OBJC_MSGSEND = YES; 213 | ENABLE_TESTABILITY = YES; 214 | GCC_C_LANGUAGE_STANDARD = gnu11; 215 | GCC_DYNAMIC_NO_PIC = NO; 216 | GCC_NO_COMMON_BLOCKS = YES; 217 | GCC_OPTIMIZATION_LEVEL = 0; 218 | GCC_PREPROCESSOR_DEFINITIONS = ( 219 | "DEBUG=1", 220 | "$(inherited)", 221 | ); 222 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 223 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 224 | GCC_WARN_UNDECLARED_SELECTOR = YES; 225 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 226 | GCC_WARN_UNUSED_FUNCTION = YES; 227 | GCC_WARN_UNUSED_VARIABLE = YES; 228 | IPHONEOS_DEPLOYMENT_TARGET = 16.1; 229 | MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; 230 | MTL_FAST_MATH = YES; 231 | ONLY_ACTIVE_ARCH = YES; 232 | SDKROOT = iphoneos; 233 | SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; 234 | SWIFT_OPTIMIZATION_LEVEL = "-Onone"; 235 | }; 236 | name = Debug; 237 | }; 238 | AE921D33270D7B04000E95C6 /* Release */ = { 239 | isa = XCBuildConfiguration; 240 | baseConfigurationReference = 9ED80CBFEB14F0F0F5121BC6 /* SampleCode.xcconfig */; 241 | buildSettings = { 242 | ALWAYS_SEARCH_USER_PATHS = NO; 243 | CLANG_ANALYZER_NONNULL = YES; 244 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 245 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++17"; 246 | CLANG_CXX_LIBRARY = "libc++"; 247 | CLANG_ENABLE_MODULES = YES; 248 | CLANG_ENABLE_OBJC_ARC = YES; 249 | CLANG_ENABLE_OBJC_WEAK = YES; 250 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 251 | CLANG_WARN_BOOL_CONVERSION = YES; 252 | CLANG_WARN_COMMA = YES; 253 | CLANG_WARN_CONSTANT_CONVERSION = YES; 254 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 255 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 256 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 257 | CLANG_WARN_EMPTY_BODY = YES; 258 | CLANG_WARN_ENUM_CONVERSION = YES; 259 | CLANG_WARN_INFINITE_RECURSION = YES; 260 | CLANG_WARN_INT_CONVERSION = YES; 261 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 262 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 263 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 264 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 265 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 266 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 267 | CLANG_WARN_STRICT_PROTOTYPES = YES; 268 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 269 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 270 | CLANG_WARN_UNREACHABLE_CODE = YES; 271 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 272 | COPY_PHASE_STRIP = NO; 273 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 274 | ENABLE_NS_ASSERTIONS = NO; 275 | ENABLE_STRICT_OBJC_MSGSEND = YES; 276 | GCC_C_LANGUAGE_STANDARD = gnu11; 277 | GCC_NO_COMMON_BLOCKS = YES; 278 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 279 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 280 | GCC_WARN_UNDECLARED_SELECTOR = YES; 281 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 282 | GCC_WARN_UNUSED_FUNCTION = YES; 283 | GCC_WARN_UNUSED_VARIABLE = YES; 284 | IPHONEOS_DEPLOYMENT_TARGET = 16.1; 285 | MTL_ENABLE_DEBUG_INFO = NO; 286 | MTL_FAST_MATH = YES; 287 | SDKROOT = iphoneos; 288 
| SWIFT_COMPILATION_MODE = wholemodule; 289 | SWIFT_OPTIMIZATION_LEVEL = "-O"; 290 | VALIDATE_PRODUCT = YES; 291 | }; 292 | name = Release; 293 | }; 294 | AE921D35270D7B04000E95C6 /* Debug */ = { 295 | isa = XCBuildConfiguration; 296 | baseConfigurationReference = 9ED80CBFEB14F0F0F5121BC6 /* SampleCode.xcconfig */; 297 | buildSettings = { 298 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 299 | ASSETCATALOG_COMPILER_INCLUDE_ALL_APPICON_ASSETS = NO; 300 | CODE_SIGN_ENTITLEMENTS = ImageBundleRecorder/ImageBundleRecorder.entitlements; 301 | CODE_SIGN_IDENTITY = "Apple Development"; 302 | CODE_SIGN_STYLE = Automatic; 303 | CURRENT_PROJECT_VERSION = 2; 304 | DEVELOPMENT_ASSET_PATHS = ""; 305 | DEVELOPMENT_TEAM = ""; 306 | ENABLE_PREVIEWS = YES; 307 | GENERATE_INFOPLIST_FILE = YES; 308 | INFOPLIST_FILE = ImageBundleRecorder/Info.plist; 309 | INFOPLIST_KEY_CFBundleDisplayName = "Image Bundle Recorder"; 310 | INFOPLIST_KEY_LSSupportsOpeningDocumentsInPlace = YES; 311 | INFOPLIST_KEY_NSCameraUsageDescription = "This app requires the camera for augmented reality."; 312 | INFOPLIST_KEY_NSLocationWhenInUseUsageDescription = ""; 313 | INFOPLIST_KEY_NSMicrophoneUsageDescription = ""; 314 | INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "This app saves captures to phone."; 315 | INFOPLIST_KEY_NSPhotoLibraryUsageDescription = ""; 316 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 317 | INFOPLIST_KEY_UILaunchStoryboardName = "Launch Screen"; 318 | INFOPLIST_KEY_UIRequiredDeviceCapabilities = "armv7 arkit"; 319 | INFOPLIST_KEY_UIRequiresFullScreen = YES; 320 | INFOPLIST_KEY_UIStatusBarHidden = YES; 321 | INFOPLIST_KEY_UIStatusBarStyle = UIStatusBarStyleDarkContent; 322 | INFOPLIST_KEY_UISupportedInterfaceOrientations = UIInterfaceOrientationPortrait; 323 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = UIInterfaceOrientationPortrait; 324 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = UIInterfaceOrientationPortrait; 325 | INFOPLIST_KEY_UISupportsDocumentBrowser = YES; 326 | IPHONEOS_DEPLOYMENT_TARGET = 16.0; 327 | LD_RUNPATH_SEARCH_PATHS = ( 328 | "$(inherited)", 329 | "@executable_path/Frameworks", 330 | ); 331 | MARKETING_VERSION = 1.0; 332 | PRODUCT_BUNDLE_IDENTIFIER = com.imageBundleRecorder; 333 | PRODUCT_NAME = "$(TARGET_NAME)"; 334 | PROVISIONING_PROFILE_SPECIFIER = ""; 335 | SWIFT_EMIT_LOC_STRINGS = YES; 336 | SWIFT_INSTALL_OBJC_HEADER = NO; 337 | SWIFT_OPTIMIZATION_LEVEL = "-Onone"; 338 | SWIFT_VERSION = 5.0; 339 | TARGETED_DEVICE_FAMILY = 1; 340 | }; 341 | name = Debug; 342 | }; 343 | AE921D36270D7B04000E95C6 /* Release */ = { 344 | isa = XCBuildConfiguration; 345 | baseConfigurationReference = 9ED80CBFEB14F0F0F5121BC6 /* SampleCode.xcconfig */; 346 | buildSettings = { 347 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 348 | ASSETCATALOG_COMPILER_INCLUDE_ALL_APPICON_ASSETS = NO; 349 | CODE_SIGN_ENTITLEMENTS = ImageBundleRecorder/ImageBundleRecorder.entitlements; 350 | CODE_SIGN_IDENTITY = "Apple Development"; 351 | CODE_SIGN_STYLE = Automatic; 352 | CURRENT_PROJECT_VERSION = 2; 353 | DEVELOPMENT_ASSET_PATHS = ""; 354 | DEVELOPMENT_TEAM = ""; 355 | ENABLE_PREVIEWS = YES; 356 | GENERATE_INFOPLIST_FILE = YES; 357 | INFOPLIST_FILE = ImageBundleRecorder/Info.plist; 358 | INFOPLIST_KEY_CFBundleDisplayName = "Image Bundle Recorder"; 359 | INFOPLIST_KEY_LSSupportsOpeningDocumentsInPlace = YES; 360 | INFOPLIST_KEY_NSCameraUsageDescription = "This app requires the camera for augmented reality."; 361 | 
INFOPLIST_KEY_NSLocationWhenInUseUsageDescription = ""; 362 | INFOPLIST_KEY_NSMicrophoneUsageDescription = ""; 363 | INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "This app saves captures to phone."; 364 | INFOPLIST_KEY_NSPhotoLibraryUsageDescription = ""; 365 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 366 | INFOPLIST_KEY_UILaunchStoryboardName = "Launch Screen"; 367 | INFOPLIST_KEY_UIRequiredDeviceCapabilities = "armv7 arkit"; 368 | INFOPLIST_KEY_UIRequiresFullScreen = YES; 369 | INFOPLIST_KEY_UIStatusBarHidden = YES; 370 | INFOPLIST_KEY_UIStatusBarStyle = UIStatusBarStyleDarkContent; 371 | INFOPLIST_KEY_UISupportedInterfaceOrientations = UIInterfaceOrientationPortrait; 372 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = UIInterfaceOrientationPortrait; 373 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = UIInterfaceOrientationPortrait; 374 | INFOPLIST_KEY_UISupportsDocumentBrowser = YES; 375 | IPHONEOS_DEPLOYMENT_TARGET = 16.0; 376 | LD_RUNPATH_SEARCH_PATHS = ( 377 | "$(inherited)", 378 | "@executable_path/Frameworks", 379 | ); 380 | MARKETING_VERSION = 1.0; 381 | PRODUCT_BUNDLE_IDENTIFIER = com.imageBundleRecorder; 382 | PRODUCT_NAME = "$(TARGET_NAME)"; 383 | PROVISIONING_PROFILE_SPECIFIER = ""; 384 | SWIFT_EMIT_LOC_STRINGS = YES; 385 | SWIFT_INSTALL_OBJC_HEADER = NO; 386 | SWIFT_VERSION = 5.0; 387 | TARGETED_DEVICE_FAMILY = 1; 388 | }; 389 | name = Release; 390 | }; 391 | /* End XCBuildConfiguration section */ 392 | 393 | /* Begin XCConfigurationList section */ 394 | AE921D21270D7B02000E95C6 /* Build configuration list for PBXProject "ImageBundleRecorder" */ = { 395 | isa = XCConfigurationList; 396 | buildConfigurations = ( 397 | AE921D32270D7B04000E95C6 /* Debug */, 398 | AE921D33270D7B04000E95C6 /* Release */, 399 | ); 400 | defaultConfigurationIsVisible = 0; 401 | defaultConfigurationName = Release; 402 | }; 403 | AE921D34270D7B04000E95C6 /* Build configuration list for PBXNativeTarget "ImageBundleRecorder" */ = { 404 | isa = XCConfigurationList; 405 | buildConfigurations = ( 406 | AE921D35270D7B04000E95C6 /* Debug */, 407 | AE921D36270D7B04000E95C6 /* Release */, 408 | ); 409 | defaultConfigurationIsVisible = 0; 410 | defaultConfigurationName = Release; 411 | }; 412 | /* End XCConfigurationList section */ 413 | }; 414 | rootObject = AE921D1E270D7B02000E95C6 /* Project object */; 415 | } 416 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/Assets.xcassets/AccentColor.colorset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "colors" : [ 3 | { 4 | "idiom" : "universal" 5 | } 6 | ], 7 | "info" : { 8 | "author" : "xcode", 9 | "version" : 1 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/CameraController.swift: -------------------------------------------------------------------------------- 1 | /* 2 | See LICENSE folder for this sample’s licensing information. 3 | 4 | Abstract: 5 | An object that configures and manages the capture pipeline to stream video and LiDAR depth data. 
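It also records synchronized RAW, RGB, depth, and motion data as binary bundles into the app's Documents directory, for later conversion with ConvertBinaries.py.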
6 | */ 7 | 8 | import Foundation 9 | import AVFoundation 10 | import CoreImage 11 | import CoreMotion 12 | 13 | protocol CaptureDataReceiver: AnyObject { 14 | func onNewData(capturedData: CameraCapturedData) 15 | func onNewPhotoData(capturedData: CameraCapturedData) 16 | } 17 | 18 | class CameraController: NSObject, ObservableObject, AVCaptureVideoDataOutputSampleBufferDelegate { 19 | 20 | enum ConfigurationError: Error { 21 | case lidarDeviceUnavailable 22 | case requiredFormatUnavailable 23 | } 24 | 25 | private let preferredWidthResolution = 4032 26 | 27 | private(set) var captureSession: AVCaptureSession! 28 | 29 | private let videoQueue = DispatchQueue(label: "com.example.apple-samplecode.VideoQueue", qos: .userInteractive) 30 | 31 | private var photoOutput: AVCapturePhotoOutput! 32 | private var depthDataOutput: AVCaptureDepthDataOutput! 33 | private var videoDataOutput: AVCaptureVideoDataOutput! 34 | private var outputVideoSync: AVCaptureDataOutputSynchronizer! 35 | private let metalDevice: MTLDevice? 36 | private var timer: Timer? 37 | private var motion: CMMotionManager! 38 | public var device: AVCaptureDevice! 39 | public var savingState = 0 // 0 - not saving, 1 - saving, 2 - error 40 | public var frameCount = 99999 41 | public var bundleSize = 42 42 | public var convertedDepth: AVDepthData! 43 | public var recordScene = false 44 | 45 | public var saveSuffix: String! 46 | public var rawFrameTimes: [Double] = [] 47 | public var rgbFrameTimes: [Double] = [] 48 | public var motionURL: URL! 49 | public var motionData: Data! 50 | public var imageRGBData: Data! 51 | public var imageRGBURL: URL! 52 | public var depthData: Data! 53 | public var depthURL: URL! 54 | public var imageRAWData: Data! 55 | public var imageRAWURL: URL! 56 | 57 | @Published var bundleFolder : URL? 58 | 59 | 60 | private var textureCache: CVMetalTextureCache! 61 | 62 | weak var delegate: CaptureDataReceiver? 
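// The delegate receives every synchronized RGB/depth frame and photo capture through the
// CaptureDataReceiver protocol above. Illustrative wiring, assuming a conforming view model:
//   cameraController.delegate = viewModel   // viewModel implements onNewData / onNewPhotoData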
63 | 64 | var isFilteringEnabled = true 65 | 66 | override init() { 67 | 68 | // create a texture cache to hold sample buffer textures 69 | metalDevice = MTLCreateSystemDefaultDevice() 70 | CVMetalTextureCacheCreate(nil, 71 | nil, 72 | metalDevice!, 73 | nil, 74 | &textureCache) 75 | 76 | super.init() 77 | 78 | do { 79 | try setupSession() 80 | } catch { 81 | fatalError("Unable to configure the capture session.") 82 | } 83 | 84 | 85 | } 86 | 87 | private func setupSession() throws { 88 | captureSession = AVCaptureSession() 89 | 90 | // configure the capture session 91 | captureSession.beginConfiguration() 92 | captureSession.sessionPreset = .photo 93 | 94 | try setupCaptureInput() 95 | setupCaptureOutputs() 96 | 97 | // finalize capture session configuration 98 | captureSession.commitConfiguration() 99 | } 100 | 101 | // MARK: Init Bundle 102 | private func initBundleFolder(suffix: String = "") { 103 | let currDate = Date() 104 | let dateFormatter = DateFormatter() 105 | dateFormatter.dateFormat = "yyyy-MM-dd_HH-mm-ss" 106 | let currDateString = dateFormatter.string(from : currDate) 107 | 108 | let DocumentDirectory = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0] 109 | let DirPath = DocumentDirectory.appendingPathComponent("bundle-" + currDateString + suffix + "/") 110 | 111 | do { 112 | try FileManager.default.createDirectory(atPath: DirPath.path, withIntermediateDirectories: true, attributes: nil) 113 | } catch let error as NSError { 114 | print("Unable to create directory \(error.debugDescription)") 115 | } 116 | 117 | bundleFolder = URL(fileURLWithPath: DirPath.path) 118 | } 119 | 120 | public func recordMotionBundle(saveSuffix: String = ""){ 121 | self.saveSuffix = saveSuffix 122 | 123 | recordScene = false 124 | 125 | motionData = Data.init() 126 | rawFrameTimes = [] 127 | rgbFrameTimes = [] 128 | frameCount = 0 129 | 130 | capturePhoto() 131 | } 132 | 133 | public func recordBundle(saveSuffix: String = ""){ 134 | self.saveSuffix = saveSuffix 135 | 136 | recordScene = true 137 | 138 | motionData = Data.init() 139 | imageRGBData = Data.init() 140 | imageRAWData = Data.init() 141 | depthData = Data.init() 142 | rawFrameTimes = [] 143 | rgbFrameTimes = [] 144 | frameCount = 0 145 | 146 | capturePhoto() 147 | } 148 | 149 | 150 | // MARK: Start Motion Capture 151 | private func startMotionCapture() { 152 | self.motion = CMMotionManager() 153 | 154 | if self.motion.isDeviceMotionAvailable { self.motion!.deviceMotionUpdateInterval = 1.0 / 200.0 // ask for 200Hz but max frequency is 100Hz for 14pro 155 | self.motion.showsDeviceMovementDisplay = true 156 | // get the attitude relative to the magnetic north reference frame 157 | self.motion.startDeviceMotionUpdates(using: .xArbitraryZVertical, 158 | to: OperationQueue(), withHandler: { (data, error) in 159 | // make sure the data is valid before accessing it 160 | if let validData = data { 161 | 162 | let timestamp = validData.timestamp 163 | 164 | let attitude = validData.attitude 165 | let quaternion = validData.attitude.quaternion 166 | let rotationRate = validData.rotationRate 167 | let userAcceleration = validData.userAcceleration 168 | let gravity = validData.gravity 169 | 170 | // generate header information to parse later in python 171 | var header = """ 172 | 173 | frameCount:\(String(describing: self.frameCount)),timestamp:\(String(describing: timestamp)), 174 | quaternionX:\(String(describing: quaternion.x)),quaternionY:\(String(describing: quaternion.y)), 175 | quaternionZ:\(String(describing: 
quaternion.z)),quaternionW:\(String(describing: quaternion.w)), 176 | rotationRateX:\(String(describing: rotationRate.x)),rotationRateY:\(String(describing: rotationRate.y)), 177 | rotationRateZ:\(String(describing: rotationRate.z)),roll:\(String(describing: attitude.roll)), 178 | pitch:\(String(describing: attitude.pitch)),yaw:\(String(describing: attitude.yaw)), 179 | userAccelerationX:\(String(describing: userAcceleration.x)),userAccelerationY:\(String(describing: userAcceleration.y)), 180 | userAccelerationZ:\(String(describing: userAcceleration.z)),gravityX:\(String(describing: gravity.x)), 181 | gravityY:\(String(describing: gravity.y)),gravityZ:\(String(describing: gravity.z)) 182 | 183 | """ 184 | header = header.components(separatedBy: .whitespacesAndNewlines).joined() // remove newlines 185 | let encodedHeader = [UInt8](header.utf8) 186 | 187 | if self.motionData != nil && self.frameCount != 99999 { 188 | self.motionData.append(encodedHeader, count: header.utf8.count) 189 | } 190 | } 191 | }) 192 | } 193 | } 194 | 195 | // MARK: Set Up Capture 196 | private func setupCaptureInput() throws { 197 | 198 | self.startMotionCapture() 199 | 200 | // LiDAR + main wide lens 201 | self.device = AVCaptureDevice.default(.builtInLiDARDepthCamera, for: .video, position: .back) 202 | 203 | guard let format = (self.device.formats.last { format in 204 | format.formatDescription.dimensions.width == preferredWidthResolution && 205 | format.formatDescription.mediaSubType.rawValue == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange && 206 | !format.isVideoBinned && 207 | !format.supportedDepthDataFormats.isEmpty 208 | }) else { 209 | print("No such image format.") 210 | throw ConfigurationError.requiredFormatUnavailable 211 | } 212 | 213 | guard let depthFormat = (format.supportedDepthDataFormats.last { depthFormat in 214 | depthFormat.formatDescription.mediaSubType.rawValue == kCVPixelFormatType_DepthFloat16 215 | }) else { 216 | print("No such depth format.") 217 | throw ConfigurationError.requiredFormatUnavailable 218 | } 219 | 220 | // begin the device configuration 221 | try self.device.lockForConfiguration() 222 | 223 | // configure the device and depth formats 224 | self.device.activeFormat = format 225 | self.device.activeDepthDataFormat = depthFormat 226 | self.device.focusMode = .continuousAutoFocus 227 | self.device.activeVideoMaxFrameDuration = CMTimeMake(value: 1, timescale: 30) // 30 fps 228 | self.device.activeVideoMinFrameDuration = CMTimeMake(value: 1, timescale: 30) // 30 fps 229 | self.device.activeDepthDataMinFrameDuration = CMTimeMake(value: 1, timescale: 30) // 30 fps 230 | 231 | // finish the device configuration 232 | self.device.unlockForConfiguration() 233 | 234 | print("Selected video format: \(self.device.activeFormat)") 235 | print("Selected depth format: \(String(describing: self.device.activeDepthDataFormat))") 236 | 237 | // add a device input to the capture session 238 | let deviceInput = try AVCaptureDeviceInput(device: self.device) 239 | captureSession.addInput(deviceInput) 240 | } 241 | 242 | private func setupCaptureOutputs() { 243 | // create an object to output video sample buffers 244 | videoDataOutput = AVCaptureVideoDataOutput() 245 | videoDataOutput.videoSettings = [(kCVPixelBufferPixelFormatTypeKey as String): NSNumber(value: 1111970369), // BGRA stream 246 | (kCVPixelBufferWidthKey as String): NSNumber(value: 1920), 247 | (kCVPixelBufferHeightKey as String): NSNumber(value: 1440)] 248 | captureSession.addOutput(videoDataOutput) 249 | 250 | // create an 
object to output depth data. 251 | depthDataOutput = AVCaptureDepthDataOutput() 252 | depthDataOutput.isFilteringEnabled = true 253 | captureSession.addOutput(depthDataOutput) 254 | 255 | 256 | // create an object to synchronize the delivery of depth and video data 257 | outputVideoSync = AVCaptureDataOutputSynchronizer(dataOutputs: [depthDataOutput, videoDataOutput]) 258 | outputVideoSync.setDelegate(self, queue: videoQueue) 259 | 260 | // enable camera intrinsics matrix delivery 261 | guard let outputConnection = videoDataOutput.connection(with: .video) else { return } 262 | if outputConnection.isCameraIntrinsicMatrixDeliverySupported { 263 | outputConnection.isCameraIntrinsicMatrixDeliveryEnabled = true 264 | } 265 | 266 | // create an object to output photos 267 | photoOutput = AVCapturePhotoOutput() 268 | captureSession.addOutput(photoOutput) 269 | photoOutput.maxPhotoQualityPrioritization = .speed 270 | photoOutput.isAppleProRAWEnabled = false // if true, captures are extremely slow as they stitch/process images 271 | photoOutput.maxPhotoDimensions = .init(width: 8064, height: 6048) // only gives 4k even if you ask for 8k unless you set proraw true 272 | 273 | // enable delivery of depth data after adding the output to the capture session 274 | photoOutput.isDepthDataDeliveryEnabled = true 275 | } 276 | 277 | func startStream() { 278 | captureSession.startRunning() 279 | } 280 | 281 | func stopStream() { 282 | captureSession.stopRunning() 283 | } 284 | } 285 | 286 | // MARK: Synchronized RGB and Depth 287 | extension CameraController: AVCaptureDataOutputSynchronizerDelegate { 288 | 289 | func dataOutputSynchronizer(_ synchronizer: AVCaptureDataOutputSynchronizer, 290 | didOutput synchronizedDataCollection: AVCaptureSynchronizedDataCollection) { 291 | 292 | // retrieve the synchronized depth and sample buffer container objects 293 | guard let syncedDepthData = synchronizedDataCollection.synchronizedData(for: depthDataOutput) as? AVCaptureSynchronizedDepthData, 294 | let syncedVideoData = synchronizedDataCollection.synchronizedData(for: videoDataOutput) as? AVCaptureSynchronizedSampleBufferData else { return } 295 | 296 | guard let pixelBuffer = syncedVideoData.sampleBuffer.imageBuffer else { return } 297 | 298 | let timestamp = syncedDepthData.timestamp.seconds 299 | self.convertedDepth = syncedDepthData.depthData.converting(toDepthDataType: kCVPixelFormatType_DepthFloat16) 300 | var data: CameraCapturedData! 
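// While a long-burst is being recorded (recordScene == true), append this synchronized BGRA
// frame and its depth map to the in-memory buffers and log the rounded timestamp; a preview
// texture is handed to the delegate either way.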
301 | 302 | if (self.frameCount != 99999 && self.recordScene) || (self.recordScene && self.rawFrameTimes.contains(timestamp)){ 303 | // if long-burst being recorded, write data 304 | self.writeImageBGRA(sampleBuffer: syncedVideoData.sampleBuffer, timestamp: timestamp, frameCount: self.frameCount) 305 | self.writeDepth(depthData: syncedDepthData.depthData, timestamp: timestamp, frameCount: self.frameCount) 306 | self.rgbFrameTimes.append(round(timestamp * 1000) / 1000.0) 307 | } 308 | data = CameraCapturedData(depth: self.convertedDepth.depthDataMap.texture(withFormat: .r16Float, planeIndex: 0, addToCache: textureCache), 309 | color: pixelBuffer.texture(withFormat: .bgra8Unorm, planeIndex: 0, addToCache: textureCache), 310 | timestamp: timestamp) 311 | 312 | 313 | delegate?.onNewPhotoData(capturedData: data) 314 | } 315 | } 316 | 317 | 318 | extension CameraController: AVCapturePhotoCaptureDelegate { 319 | 320 | // MARK: Capture Photo 321 | func capturePhoto() { 322 | var photoSettings: AVCapturePhotoSettings 323 | 324 | 325 | // MARK: Terminate Recording 326 | if self.frameCount == self.bundleSize { 327 | 328 | // delay so we catch last RGB/depth pair if it's delayed 329 | DispatchQueue.main.asyncAfter(deadline: .now() + 0.25) { 330 | self.frameCount = 99999 331 | print("Resetting camera back to autoexposure.") 332 | 333 | do{ 334 | try self.device.lockForConfiguration() 335 | } catch { 336 | fatalError("Device could not be locked.") 337 | } 338 | 339 | self.device.exposureMode = .continuousAutoExposure 340 | self.device.focusMode = .continuousAutoFocus 341 | self.device.unlockForConfiguration() 342 | 343 | print("Writing to disk.") 344 | self.savingState = 1 345 | } 346 | 347 | // delay more so UI catches the 'savingData' change 348 | DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) { [self] in 349 | 350 | var missingTimes : [Double] = [] 351 | for elem in self.rawFrameTimes { 352 | if !self.rgbFrameTimes.contains(elem){ 353 | missingTimes.append(elem) 354 | } 355 | } 356 | 357 | if missingTimes.count > 0 || self.rgbFrameTimes.count < self.bundleSize { 358 | // something broke, missing synced frames 359 | print("Missing times: ", missingTimes) 360 | 361 | self.motionData = nil 362 | self.imageRGBData = nil 363 | self.imageRAWData = nil 364 | self.depthData = nil 365 | 366 | self.savingState = 2 // error 367 | DispatchQueue.main.asyncAfter(deadline: .now() + 5) { 368 | self.savingState = 0 // clear error in 5 seconds 369 | } 370 | 371 | return 372 | } 373 | 374 | // make folders to save files to 375 | if self.recordScene { 376 | if self.saveSuffix != "" { 377 | self.initBundleFolder(suffix: "-" + self.saveSuffix) 378 | } else { 379 | self.initBundleFolder() 380 | } 381 | 382 | print("Recording bundle into \(String(describing: self.bundleFolder!.path))") 383 | 384 | self.motionURL = URL(fileURLWithPath: "motion", relativeTo: self.bundleFolder).appendingPathExtension("bin") 385 | self.imageRGBURL = URL(fileURLWithPath: "imageRGB", relativeTo: self.bundleFolder).appendingPathExtension("bin") 386 | self.imageRAWURL = URL(fileURLWithPath: "imageRAW", relativeTo: self.bundleFolder).appendingPathExtension("bin") 387 | self.depthURL = URL(fileURLWithPath: "depth", relativeTo: self.bundleFolder).appendingPathExtension("bin") 388 | 389 | } else { // motion bundle 390 | if self.saveSuffix != "" { 391 | self.initBundleFolder(suffix: "-" + self.saveSuffix + "-motion") 392 | } else { 393 | self.initBundleFolder(suffix: "-motion") 394 | } 395 | 396 | print("Recording motion into \(String(describing: 
self.bundleFolder!.path))") 397 | 398 | self.motionURL = URL(fileURLWithPath: "motion", relativeTo: self.bundleFolder).appendingPathExtension("bin") 399 | } 400 | 401 | 402 | try? self.motionData.write(to: self.motionURL) 403 | 404 | // record to disk 405 | if self.recordScene { 406 | try? self.imageRGBData.write(to: self.imageRGBURL) 407 | try? self.imageRAWData.write(to: self.imageRAWURL) 408 | try? self.depthData.write(to: self.depthURL) 409 | } 410 | 411 | self.recordScene = false 412 | self.savingState = 0 413 | 414 | // clear memory 415 | self.motionData = nil 416 | self.imageRGBData = nil 417 | self.imageRAWData = nil 418 | self.depthData = nil 419 | 420 | print("Done recording bundle.") 421 | } 422 | return 423 | 424 | } else if self.frameCount >= self.bundleSize { 425 | self.frameCount = 99999 426 | return // don't record past bundle size 427 | } 428 | 429 | if photoOutput.availableRawPhotoPixelFormatTypes.count > 0 { 430 | 431 | for format in photoOutput.availableRawPhotoPixelFormatTypes { 432 | print(format) 433 | } 434 | 435 | let rawType = photoOutput.availableRawPhotoPixelFormatTypes.first! 436 | 437 | // set ISO and Exposure Time 438 | do{ 439 | try self.device.lockForConfiguration() 440 | } catch { 441 | fatalError("Device could not be locked.") 442 | } 443 | 444 | let deviceISO = device.iso 445 | let deviceExposureDuration = device.exposureDuration.seconds 446 | 447 | let iso = deviceISO 448 | let maxExposureDuration: CMTime = CMTime(seconds: 0.041, preferredTimescale: CMTimeScale(1000000)) 449 | let exposureDuration: CMTime = min(device.exposureDuration, maxExposureDuration) // Don't drop under 21fps 450 | 451 | self.device.setExposureModeCustom(duration: exposureDuration, iso: iso) 452 | self.device.focusMode = .locked 453 | 454 | self.device.unlockForConfiguration() 455 | 456 | if frameCount == 0 { // sleep for 200 milliseconds to let exposure catch up for first frame 457 | usleep(200000) 458 | } 459 | 460 | photoSettings = AVCapturePhotoSettings(rawPixelFormatType: rawType, processedFormat: nil) 461 | photoSettings.isDepthDataDeliveryEnabled = false 462 | } else { 463 | fatalError("No RAW format found.") 464 | } 465 | 466 | photoOutput.capturePhoto(with: photoSettings, delegate: self) 467 | } 468 | 469 | // MARK: Photo Output 470 | func photoOutput(_ output: AVCapturePhotoOutput, didFinishProcessingPhoto photo: AVCapturePhoto, error: Error?) { 471 | 472 | // Retrieve the image and depth data. 
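// photo.pixelBuffer holds the Bayer RAW capture requested in capturePhoto(). While a scene is
// being recorded it is appended to imageRAWData, and each callback immediately triggers the
// next capture, which forms the long-burst loop.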
473 | guard let pixelBuffer = photo.pixelBuffer else {return} 474 | 475 | if self.frameCount >= self.bundleSize { 476 | self.frameCount = 99999 477 | return // don't record past bundle size 478 | } 479 | 480 | 481 | if self.recordScene { 482 | self.writeImageRAW(photo: photo, timestamp: photo.timestamp.seconds, frameCount: self.frameCount) 483 | self.rawFrameTimes.append(round(photo.timestamp.seconds * 1000) / 1000.0) 484 | } 485 | 486 | self.frameCount += 1 487 | self.capturePhoto() 488 | 489 | } 490 | 491 | // MARK: Write Depth 492 | func convertLensDistortionLookupTable(lookupTable: Data) -> [Float] { 493 | let tableLength = lookupTable.count / MemoryLayout.size 494 | var floatArray: [Float] = Array(repeating: 0, count: tableLength) 495 | _ = floatArray.withUnsafeMutableBytes{lookupTable.copyBytes(to: $0)} 496 | return floatArray 497 | } 498 | 499 | func convertIntrinsicMatrix(intrinsicMatrix: simd_float3x3) -> [[Float]]{ 500 | return (0 ..< 3).map{ x in 501 | (0 ..< 3).map{ y in intrinsicMatrix[x][y]} 502 | } 503 | } 504 | 505 | func writeDepth(depthData: AVDepthData, timestamp: Double, frameCount: Int) { 506 | let pixelBuffer = depthData.depthDataMap 507 | guard CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly) == noErr else { return } 508 | defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) } 509 | 510 | guard let srcPtr = CVPixelBufferGetBaseAddress(pixelBuffer) else { 511 | print("Failed to retrieve depth pointer.") 512 | return 513 | } 514 | 515 | let rowBytes : Int = CVPixelBufferGetBytesPerRow(pixelBuffer) 516 | let width = Int(CVPixelBufferGetWidth(pixelBuffer)) 517 | let height = Int(CVPixelBufferGetHeight(pixelBuffer)) 518 | let capacity = CVPixelBufferGetDataSize(pixelBuffer) 519 | let uint8Pointer = srcPtr.bindMemory(to: UInt8.self, capacity: capacity) 520 | 521 | let intrinsicWidth = depthData.cameraCalibrationData!.intrinsicMatrixReferenceDimensions.width 522 | let intrinsicHeight = depthData.cameraCalibrationData!.intrinsicMatrixReferenceDimensions.height 523 | let intrinsicMatrix = depthData.cameraCalibrationData!.intrinsicMatrix 524 | let lensDistortion = depthData.cameraCalibrationData!.lensDistortionLookupTable! 525 | let lensInverseDistortion = depthData.cameraCalibrationData!.inverseLensDistortionLookupTable! 
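// The calibration metadata (reference dimensions, intrinsic matrix, distortion lookup tables)
// is serialized into the plain-text header below so ConvertBinaries.py can recover it alongside
// the float16 depth map.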
526 | let depthAccuracy = depthData.depthDataAccuracy.rawValue 527 | 528 | var header = """ 529 | 530 | description:depthmap, 531 | frameCount:\(String(describing: frameCount)), 532 | timestamp:\(String(describing: timestamp)), 533 | height:\(String(describing: height)), 534 | width:\(String(describing: width)), 535 | rowBytes:\(String(describing: rowBytes)), 536 | intrinsicWidth:\(String(describing: intrinsicWidth)), 537 | intrinsicHeight:\(String(describing: intrinsicHeight)), 538 | intrinsicMatrix:\(String(describing: convertIntrinsicMatrix(intrinsicMatrix: intrinsicMatrix))), 539 | lensDistortion:\(String(describing: convertLensDistortionLookupTable(lookupTable: lensDistortion))), 540 | lensInverseDistortion:\(String(describing: convertLensDistortionLookupTable(lookupTable: lensInverseDistortion))), 541 | depthAccuracy:\(String(describing: depthAccuracy)) 542 | 543 | """ 544 | 545 | header = header.components(separatedBy: .whitespacesAndNewlines).joined() // remove newlines 546 | let encodedHeader = [UInt8](header.utf8) 547 | self.depthData.append(encodedHeader, count: header.utf8.count) 548 | self.depthData.append(uint8Pointer, count: Int(rowBytes * height)) 549 | } 550 | 551 | // MARK: Write RAW 552 | func writeImageRAW(photo: AVCapturePhoto, timestamp: Double, frameCount: Int) { 553 | guard let pixelBuffer = photo.pixelBuffer else { return } 554 | 555 | guard CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly) == noErr else { 556 | print("Failed to retrieve readonly base address for RAW.") 557 | return 558 | } 559 | defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) } 560 | 561 | 562 | guard let srcPtr = CVPixelBufferGetBaseAddress(pixelBuffer) else { 563 | print("Failed to retrieve RAW pointer.") 564 | return 565 | } 566 | 567 | let rowBytes : Int = CVPixelBufferGetBytesPerRow(pixelBuffer) 568 | let width = Int(CVPixelBufferGetWidth(pixelBuffer)) 569 | let height = Int(CVPixelBufferGetHeight(pixelBuffer)) 570 | let capacity = CVPixelBufferGetDataSize(pixelBuffer) 571 | let uint8Pointer = srcPtr.bindMemory(to: UInt8.self, capacity: capacity) 572 | 573 | let exifdata = photo.metadata["{Exif}"] as! NSDictionary 574 | let DNGdata = photo.metadata["{DNG}"] as! NSDictionary 575 | let brightnessValue = exifdata["BrightnessValue"] != nil ? exifdata["BrightnessValue"]! : -1.0 576 | 577 | var header = """ 578 | 579 | description:imageRAW, 580 | frameCount:\(String(describing: frameCount)), 581 | timestamp:\(String(describing: timestamp)), 582 | height:\(String(describing: height)), 583 | width:\(String(describing: width)), 584 | rowBytes:\(String(describing: rowBytes)), 585 | ISO:\(String(describing: (exifdata["ISOSpeedRatings"] as! 
NSArray)[0])), 586 | exposureTime:\(String(describing: exifdata["ExposureTime"]!)), 587 | apertureValue:\(String(describing: exifdata["ApertureValue"]!)), 588 | brightnessValue:\(String(describing: brightnessValue)), 589 | shutterSpeedValue:\(String(describing: exifdata["ShutterSpeedValue"]!)), 590 | pixelXDimension:\(String(describing: exifdata["PixelXDimension"]!)), 591 | pixelYDimension:\(String(describing: exifdata["PixelYDimension"]!)), 592 | blackLevel:\(String(describing: DNGdata["BlackLevel"]!)), 593 | whiteLevel:\(String(describing: DNGdata["WhiteLevel"]!)) 594 | 595 | """ 596 | 597 | header = header.components(separatedBy: .whitespacesAndNewlines).joined() // remove newlines 598 | let encodedHeader = [UInt8](header.utf8) 599 | self.imageRAWData.append(encodedHeader, count: header.utf8.count) 600 | self.imageRAWData.append(uint8Pointer, count: Int(rowBytes * height)) 601 | } 602 | 603 | 604 | // MARK: Write BGRA 605 | func writeImageBGRA(sampleBuffer: CMSampleBuffer, timestamp: Double, frameCount: Int) { 606 | 607 | var intrinsicMatrix: simd_float3x3? 608 | 609 | if let camData = CMGetAttachment(sampleBuffer, key: kCMSampleBufferAttachmentKey_CameraIntrinsicMatrix, attachmentModeOut: nil) as? Data { 610 | intrinsicMatrix = camData.withUnsafeBytes { $0.pointee } 611 | } 612 | 613 | guard let pixelBuffer = sampleBuffer.imageBuffer else { return } 614 | 615 | guard CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly) == noErr else { return } 616 | defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) } 617 | 618 | 619 | guard let srcPtr = CVPixelBufferGetBaseAddress(pixelBuffer) else { 620 | print("Failed to retrieve BGRA pointer.") 621 | return 622 | } 623 | 624 | let rowBytes : Int = CVPixelBufferGetBytesPerRow(pixelBuffer) 625 | let width = Int(CVPixelBufferGetWidth(pixelBuffer)) 626 | let height = Int(CVPixelBufferGetHeight(pixelBuffer)) 627 | let capacity = CVPixelBufferGetDataSize(pixelBuffer) 628 | let uint8Pointer = srcPtr.bindMemory(to: UInt8.self, capacity: capacity) 629 | 630 | 631 | var header = """ 632 | 633 | description:imageBGRA, 634 | frameCount:\(String(describing: frameCount)), 635 | timestamp:\(String(describing: timestamp)), 636 | height:\(String(describing: height)), 637 | width:\(String(describing: width)), 638 | rowBytes:\(String(describing: rowBytes)), 639 | intrinsicMatrix:\(String(describing: convertIntrinsicMatrix(intrinsicMatrix: intrinsicMatrix!))) 640 | 641 | """ 642 | 643 | header = header.components(separatedBy: .whitespacesAndNewlines).joined() // remove newlines 644 | let encodedHeader = [UInt8](header.utf8) 645 | self.imageRGBData.append(encodedHeader, count: header.utf8.count) 646 | self.imageRGBData.append(uint8Pointer, count: Int(rowBytes * height)) 647 | 648 | } 649 | } 650 | 651 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/CameraManager.swift: -------------------------------------------------------------------------------- 1 | /* 2 | See LICENSE folder for this sample’s licensing information. 3 | 4 | Abstract: 5 | An object that connects the CameraController and the views. 6 | */ 7 | 8 | import Foundation 9 | import SwiftUI 10 | import Combine 11 | import simd 12 | import AVFoundation 13 | 14 | final class MetalTextureContent { 15 | var texture: MTLTexture? 16 | } 17 | 18 | extension CVPixelBuffer { 19 | 20 | func texture(withFormat pixelFormat: MTLPixelFormat, planeIndex: Int, addToCache cache: CVMetalTextureCache) -> MTLTexture? 
{ 21 | 22 | let width = CVPixelBufferGetWidthOfPlane(self, planeIndex) 23 | let height = CVPixelBufferGetHeightOfPlane(self, planeIndex) 24 | 25 | var cvtexture: CVMetalTexture? 26 | CVMetalTextureCacheCreateTextureFromImage(nil, cache, self, nil, pixelFormat, width, height, planeIndex, &cvtexture) 27 | guard let texture = cvtexture else { return nil } 28 | return CVMetalTextureGetTexture(texture) 29 | } 30 | 31 | } 32 | 33 | 34 | class CameraManager: ObservableObject, CaptureDataReceiver { 35 | 36 | var capturedData: CameraCapturedData 37 | @Published var isFilteringDepth: Bool { 38 | didSet { 39 | controller.isFilteringEnabled = isFilteringDepth 40 | } 41 | } 42 | @Published var orientation = UIDevice.current.orientation 43 | 44 | var fpsArray = Array(repeating: 0.0, count: 30) 45 | var fpsCount = 0 46 | var timePrev = 0.0 47 | 48 | var controller: CameraController 49 | var cancellables = Set() 50 | var session: AVCaptureSession { controller.captureSession } 51 | 52 | @Published var iso : Float = 0 53 | @Published var exposureTime : Double = 0 54 | @Published var frameCount = 99999 55 | @Published var savingState = 0 56 | 57 | init() { 58 | // Create an object to store the captured data for the views to present. 59 | capturedData = CameraCapturedData(depth: nil, color: nil, timestamp: 0) 60 | controller = CameraController() 61 | controller.isFilteringEnabled = true 62 | controller.startStream() 63 | isFilteringDepth = controller.isFilteringEnabled 64 | 65 | NotificationCenter.default.publisher(for: UIDevice.orientationDidChangeNotification).sink { _ in 66 | self.orientation = UIDevice.current.orientation 67 | }.store(in: &cancellables) 68 | controller.delegate = self 69 | } 70 | 71 | func resumeStream() { 72 | controller.startStream() 73 | } 74 | 75 | func onNewPhotoData(capturedData: CameraCapturedData) { 76 | // Because the views hold a reference to `capturedData`, the app updates each texture separately. 77 | self.capturedData.depthContent.texture = capturedData.depth 78 | self.capturedData.colorRGBContent.texture = capturedData.color 79 | self.capturedData.timestamp = capturedData.timestamp 80 | 81 | if capturedData.timestamp != nil && 1.0/(capturedData.timestamp! - self.timePrev) < 10000 { // skip double-frames 82 | self.fpsCount += 1 83 | self.fpsArray[self.fpsCount % self.fpsArray.count] = 1.0/(capturedData.timestamp! - self.timePrev) 84 | // print("Current FPS: ", self.fpsArray.reduce(0.0, +)/(Double(self.fpsArray.count))) 85 | self.timePrev = capturedData.timestamp! 86 | } 87 | 88 | DispatchQueue.main.async { // Hacky, for printing to UI 89 | self.iso = self.controller.device.iso 90 | self.exposureTime = self.controller.device.exposureDuration.seconds 91 | self.frameCount = self.controller.frameCount 92 | self.savingState = self.controller.savingState 93 | 94 | } 95 | } 96 | 97 | func onNewData(capturedData: CameraCapturedData) { 98 | // do nothing 99 | } 100 | 101 | } 102 | 103 | class CameraCapturedData { 104 | 105 | var depth: MTLTexture? 106 | var depthContent: MetalTextureContent 107 | var color: MTLTexture? 108 | var colorRGBContent: MetalTextureContent 109 | var timestamp: Double? 110 | 111 | init(depth: MTLTexture?, 112 | color: MTLTexture?, 113 | timestamp: Double?) 
{ 114 | 115 | self.depth = depth 116 | self.depthContent = MetalTextureContent() 117 | self.depthContent.texture = depth 118 | self.color = color 119 | self.colorRGBContent = MetalTextureContent() 120 | self.colorRGBContent.texture = color 121 | self.timestamp = timestamp 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/ImageBundleRecorder.entitlements: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | com.apple.developer.kernel.increased-memory-limit 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/ImageBundleRecorder.swift: -------------------------------------------------------------------------------- 1 | /* 2 | See LICENSE folder for this sample’s licensing information. 3 | 4 | Abstract: 5 | The single entry point for DepthBundleRecorder. 6 | */ 7 | 8 | import SwiftUI 9 | @main 10 | struct ImageBundleRecorder: App { 11 | var body: some Scene { 12 | WindowGroup { 13 | MetalDepthView() 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | LSSupportsOpeningDocumentsInPlace 6 | 7 | 8 | 9 | UIApplicationSceneManifest 10 | 11 | UIFileSharingEnabled 12 | 13 | 14 | 15 | UIRequiresFullScreen - 2 16 | 17 | UISupportedInterfaceOrientations~ipad - 2 18 | 19 | UIInterfaceOrientationPortrait 20 | UIInterfaceOrientationPortraitUpsideDown 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/MetalTextureView.swift: -------------------------------------------------------------------------------- 1 | /* 2 | See LICENSE folder for this sample’s licensing information. 3 | 4 | Abstract: 5 | A view that displays a Metal-rendered depth visualization. 6 | */ 7 | 8 | import Foundation 9 | import SwiftUI 10 | import MetalKit 11 | import Metal 12 | 13 | // Display `MTLTextures` in an `MTKView` using SwiftUI. 14 | //- Tag: MTKCoordinator` 15 | class MTKCoordinator: NSObject, MTKViewDelegate { 16 | var content: MetalTextureContent 17 | let view: MTKView 18 | var pipelineState: MTLRenderPipelineState! 19 | var metalCommandQueue: MTLCommandQueue! 20 | 21 | init(content: MetalTextureContent, view: MTKView) { 22 | self.content = content 23 | self.view = view 24 | if let metalDevice = MTLCreateSystemDefaultDevice() { 25 | view.device = metalDevice 26 | self.metalCommandQueue = metalDevice.makeCommandQueue()! 
27 | } 28 | super.init() 29 | 30 | prepareFunctions() 31 | } 32 | func prepareFunctions() { 33 | guard let metalDevice = view.device else { fatalError("Expected a Metal device.") } 34 | do { 35 | let library = metalDevice.makeDefaultLibrary() 36 | let pipelineDescriptor = MTLRenderPipelineDescriptor() 37 | pipelineDescriptor.colorAttachments[0].pixelFormat = .bgra8Unorm 38 | pipelineDescriptor.vertexFunction = library!.makeFunction(name: "planeVertexShader") 39 | pipelineDescriptor.fragmentFunction = library!.makeFunction(name: "planeFragmentShader") 40 | pipelineDescriptor.vertexDescriptor = createPlaneMetalVertexDescriptor() 41 | pipelineState = try metalDevice.makeRenderPipelineState(descriptor: pipelineDescriptor) 42 | } catch { 43 | print("Unexpected error: \(error).") 44 | } 45 | } 46 | func createPlaneMetalVertexDescriptor() -> MTLVertexDescriptor { 47 | let mtlVertexDescriptor: MTLVertexDescriptor = MTLVertexDescriptor() 48 | // Store position in `attribute[[0]]`. 49 | mtlVertexDescriptor.attributes[0].format = .float2 50 | mtlVertexDescriptor.attributes[0].offset = 0 51 | mtlVertexDescriptor.attributes[0].bufferIndex = 0 52 | 53 | // Store texture coordinates in `attribute[[1]]`. 54 | mtlVertexDescriptor.attributes[1].format = .float2 55 | mtlVertexDescriptor.attributes[1].offset = 8 56 | mtlVertexDescriptor.attributes[1].bufferIndex = 0 57 | 58 | // Set stride to twice the `float2` bytes per vertex. 59 | mtlVertexDescriptor.layouts[0].stride = 2 * MemoryLayout>.stride 60 | mtlVertexDescriptor.layouts[0].stepRate = 1 61 | mtlVertexDescriptor.layouts[0].stepFunction = .perVertex 62 | 63 | return mtlVertexDescriptor 64 | } 65 | 66 | func mtkView(_ view: MTKView, drawableSizeWillChange size: CGSize) { 67 | 68 | } 69 | 70 | // Draw a textured quad. 71 | func draw(in view: MTKView) { 72 | guard content.texture != nil else { 73 | // print("There's no content to display.") 74 | return 75 | } 76 | guard let commandBuffer = metalCommandQueue.makeCommandBuffer() else { return } 77 | guard let passDescriptor = view.currentRenderPassDescriptor else { return } 78 | guard let encoder = commandBuffer.makeRenderCommandEncoder(descriptor: passDescriptor) else { return } 79 | let vertexData: [Float] = [ -1, -1, 1, 1, 80 | 1, -1, 1, 0, 81 | -1, 1, 0, 1, 82 | 1, 1, 0, 0] 83 | encoder.setVertexBytes(vertexData, length: vertexData.count * MemoryLayout.stride, index: 0) 84 | encoder.setFragmentTexture(content.texture, index: 0) 85 | encoder.setRenderPipelineState(pipelineState) 86 | encoder.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4) 87 | encoder.endEncoding() 88 | commandBuffer.present(view.currentDrawable!) 89 | commandBuffer.commit() 90 | } 91 | 92 | } 93 | //- Tag: MetalTextureView 94 | struct MetalTextureView: UIViewRepresentable { 95 | var mtkView: MTKView 96 | var content: MetalTextureContent 97 | func makeCoordinator() -> MTKCoordinator { 98 | MTKCoordinator(content: content, view: mtkView) 99 | } 100 | func makeUIView(context: UIViewRepresentableContext) -> MTKView { 101 | mtkView.delegate = context.coordinator 102 | mtkView.preferredFramesPerSecond = 120 103 | mtkView.backgroundColor = context.environment.colorScheme == .dark ? 
.black : .white 104 | mtkView.isOpaque = true 105 | mtkView.framebufferOnly = false 106 | mtkView.clearColor = MTLClearColor(red: 0, green: 0, blue: 0, alpha: 0) 107 | mtkView.drawableSize = mtkView.frame.size 108 | mtkView.enableSetNeedsDisplay = false 109 | mtkView.colorPixelFormat = .bgra8Unorm 110 | return mtkView 111 | } 112 | 113 | // `UIViewRepresentable` requires this implementation; however, the sample 114 | // app doesn't use it. Instead, `MTKView.delegate` handles display updates. 115 | func updateUIView(_ uiView: MTKView, context: UIViewRepresentableContext) { 116 | 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/MetalTextureViewDepth.swift: -------------------------------------------------------------------------------- 1 | /* 2 | See LICENSE folder for this sample’s licensing information. 3 | 4 | Abstract: 5 | A view that displays scene depth information. 6 | */ 7 | 8 | import Foundation 9 | import SwiftUI 10 | import MetalKit 11 | import Metal 12 | 13 | //- Tag: CoordinatorDepth 14 | final class CoordinatorDepth: MTKCoordinator { 15 | @Binding var confSelection: Int 16 | init(mtkView: MTKView, depthContent: MetalTextureContent, confSelection: Binding) { 17 | self._confSelection = confSelection 18 | super.init(content: depthContent, view: mtkView) 19 | } 20 | override func prepareFunctions() { 21 | guard let metalDevice = view.device else { fatalError("Expected a Metal device.") } 22 | do { 23 | let library = metalDevice.makeDefaultLibrary() 24 | let pipelineDescriptor = MTLRenderPipelineDescriptor() 25 | pipelineDescriptor.colorAttachments[0].pixelFormat = .bgra8Unorm 26 | pipelineDescriptor.vertexFunction = library!.makeFunction(name: "planeVertexShader") 27 | pipelineDescriptor.fragmentFunction = library!.makeFunction(name: "planeFragmentShaderDepth") 28 | pipelineDescriptor.vertexDescriptor = createPlaneMetalVertexDescriptor() 29 | pipelineState = try metalDevice.makeRenderPipelineState(descriptor: pipelineDescriptor) 30 | } catch { 31 | print("Unexpected error: \(error).") 32 | } 33 | } 34 | 35 | } 36 | 37 | struct MetalTextureViewDepth: UIViewRepresentable { 38 | var mtkView: MTKView 39 | var content: MetalTextureContent 40 | 41 | @Binding var confSelection: Int 42 | func makeCoordinator() -> CoordinatorDepth { 43 | CoordinatorDepth(mtkView: mtkView, depthContent: content, confSelection: $confSelection) 44 | } 45 | 46 | func makeUIView(context: UIViewRepresentableContext) -> MTKView { 47 | mtkView.delegate = context.coordinator 48 | mtkView.preferredFramesPerSecond = 120 49 | mtkView.backgroundColor = context.environment.colorScheme == .dark ? .black : .white 50 | mtkView.isOpaque = true 51 | mtkView.framebufferOnly = false 52 | mtkView.clearColor = MTLClearColor(red: 0, green: 0, blue: 0, alpha: 0) 53 | mtkView.drawableSize = mtkView.frame.size 54 | mtkView.enableSetNeedsDisplay = false 55 | mtkView.colorPixelFormat = .bgra8Unorm 56 | return mtkView 57 | } 58 | 59 | // `UIViewRepresentable` requires this implementation; however, the sample 60 | // app doesn't use it. Instead, `MTKView.delegate` handles display updates. 
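// Typical usage from the main UI (see MetalViewSample.swift):
//     MetalTextureViewDepth(mtkView: MTKView(),
//                           content: manager.capturedData.depthContent,
//                           confSelection: $selectedConfidence)
// The coordinator's draw loop (preferredFramesPerSecond = 120) samples whatever
// texture is currently set on `content`, so nothing needs to happen in
// `updateUIView` below.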
61 | func updateUIView(_ uiView: MTKView, context: UIViewRepresentableContext) { 62 | 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/MetalViewSample.swift: -------------------------------------------------------------------------------- 1 | /* 2 | See LICENSE folder for this sample’s licensing information. 3 | 4 | Abstract: 5 | The app's main user interface. 6 | */ 7 | 8 | import Foundation 9 | import SwiftUI 10 | import MetalKit 11 | 12 | // Add a title to a view that enlarges the view to full screen on tap. 13 | struct Texture: ViewModifier { 14 | let height: CGFloat 15 | let width: CGFloat 16 | let title: String 17 | let view: T 18 | func body(content: Content) -> some View { 19 | VStack { 20 | Text(title).foregroundColor(Color.red) 21 | // To display the same view in the navigation, reference the view 22 | // directly versus using the view's `content` property. 23 | NavigationLink(destination: view.aspectRatio(CGSize(width: width, height: height), contentMode: .fill)) { 24 | view.frame(maxWidth: width, maxHeight: height, alignment: .center) 25 | .aspectRatio(CGSize(width: width, height: height), contentMode: .fill) 26 | } 27 | } 28 | } 29 | } 30 | 31 | extension View { 32 | // Apply `zoomOnTapModifier` with a `self` reference to show the same view 33 | // on tap. 34 | func zoomOnTapModifier(height: CGFloat, width: CGFloat, title: String) -> some View { 35 | modifier(Texture(height: height, width: width, title: title, view: self)) 36 | } 37 | } 38 | extension Image { 39 | init(_ texture: MTLTexture, ciContext: CIContext, scale: CGFloat, orientation: Image.Orientation, label: Text) { 40 | let ciimage = CIImage(mtlTexture: texture)! 41 | let cgimage = ciContext.createCGImage(ciimage, from: ciimage.extent) 42 | self.init(cgimage!, scale: 1.0, orientation: .leftMirrored, label: label) 43 | } 44 | } 45 | 46 | 47 | struct MetalDepthView: View { 48 | @ObservedObject var manager = CameraManager() 49 | 50 | // Set the default sizes for the texture views. 51 | let sizeH: CGFloat = 320 52 | let sizeW: CGFloat = 240 53 | 54 | // Manage the AR session and AR data processing. 55 | //- Tag: ARProvider 56 | let ciContext: CIContext = CIContext() 57 | 58 | // Save the user's confidence selection. 
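// View state for the controls below. Note that a `frameCount` of 99999 (published by
// CameraManager) acts as the "idle, not currently recording" sentinel: CameraController
// resets it to 99999 once a bundle is written, and the record buttons only start a new
// capture while it holds that value.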
59 | @State var isPaused = false 60 | @State var selectedConfidence = 0 61 | @State private var scaleMovement: Float = 1.5 62 | @State var saveSuffix: String = "" 63 | @State var numRecordedSceneBundles = 0 64 | @State var numRecordedPoseBundles = 0 65 | let screenWidth = UIScreen.main.bounds.size.width 66 | let fontSize : CGFloat = 22 67 | 68 | var body: some View { 69 | VStack(alignment: .leading, spacing: 0) { 70 | 71 | 72 | // depth and image display 73 | HStack(alignment: .top) { 74 | if manager.savingState == 1 { 75 | Spacer(minLength: 10) 76 | Image(systemName: "square.and.arrow.down.fill").font(.system(size: fontSize + 2)); Text("SAVING DATA TO DISK").font(.system(size: fontSize + 2)) 77 | Spacer(minLength: 10) 78 | } else if manager.savingState == 2 { 79 | Spacer(minLength: 10) 80 | Image(systemName: "exclamationmark.triangle.fill").font(.system(size: fontSize + 2)); Text("SOMETHING WENT WRONG,\nWAIT A MOMENT AND TRY AGAIN").font(.system(size: fontSize + 2)) 81 | Spacer(minLength: 10) 82 | } else { 83 | MetalTextureViewDepth(mtkView: MTKView(), content: manager.capturedData.depthContent, confSelection: $selectedConfidence) 84 | MetalTextureView(mtkView: MTKView(), content: manager.capturedData.colorRGBContent) 85 | } 86 | }.frame(width: screenWidth, height:400) 87 | 88 | HStack() { 89 | if manager.savingState == 0 { 90 | Text("Exposure: \(manager.exposureTime) ISO: \(manager.iso)").font(.system(size: fontSize)) 91 | } 92 | }.frame(width: 400, height: 30) 93 | 94 | // input field 95 | HStack() { 96 | Spacer(minLength: 10) 97 | TextField("Save File Suffix", text: $saveSuffix) 98 | .disableAutocorrection(true) 99 | .border(Color(UIColor.separator)) 100 | .autocapitalization(.none) 101 | .font(.system(size: fontSize)) 102 | Spacer(minLength: 10) 103 | }.frame(width: screenWidth, height: 50) 104 | 105 | // input field 106 | HStack() { 107 | Spacer(minLength: 10) 108 | Text("Recorded \(numRecordedPoseBundles) Motion, \(numRecordedSceneBundles) Scene Bundles").font(.system(size: fontSize)) 109 | Spacer(minLength: 10) 110 | }.frame(width: screenWidth, height: 30) 111 | 112 | // bundle size selector 113 | HStack { 114 | Spacer(minLength: 10) 115 | Text("Bundle Size:").font(.system(size: fontSize)) 116 | Picker(selection: $manager.controller.bundleSize, label: Text("Bundle Size:")) { 117 | Text("1").tag(1) 118 | Text("15").tag(15) 119 | Text("30").tag(30) 120 | Text("42").tag(42) 121 | }.pickerStyle(SegmentedPickerStyle()) 122 | Spacer(minLength: 10) 123 | }.frame(width: screenWidth, height:50) 124 | 125 | // buttons for stream interaction 126 | HStack() { 127 | Spacer(minLength: 20) 128 | Button(action: { 129 | manager.controller.frameCount = 99999 130 | manager.controller.stopStream() 131 | usleep(100000) 132 | manager.controller.startStream() 133 | }) { 134 | Image(systemName: "exclamationmark.arrow.circlepath").font(.system(size: 40)) 135 | } 136 | Spacer(minLength: 80) 137 | Button(action: { 138 | if manager.controller.frameCount == 99999 { 139 | manager.controller.recordBundle(saveSuffix: saveSuffix) 140 | numRecordedSceneBundles += 1 141 | } 142 | }) { 143 | Image(systemName: (manager.frameCount == 99999) ? "record.circle.fill" : "" ).font(.system(size: 40)) 144 | } 145 | Spacer(minLength: 80) 146 | Button(action: { 147 | if manager.controller.frameCount == 99999 { 148 | manager.controller.recordMotionBundle(saveSuffix: saveSuffix) 149 | numRecordedPoseBundles += 1 150 | } 151 | }) { 152 | Image(systemName: (manager.frameCount == 99999) ? 
"move.3d" : "" ).font(.system(size: 40)) 153 | } 154 | Spacer(minLength: 20) 155 | }.frame(width: screenWidth, height: 90) 156 | }.frame(maxWidth: .infinity, maxHeight: .infinity) 157 | .background(Color(CGColor(red: 0, green: 0, blue: 0, alpha: 0.3))) 158 | .ignoresSafeArea() 159 | } 160 | 161 | } 162 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/ImageBundleRecorder/shaders.metal: -------------------------------------------------------------------------------- 1 | /* 2 | See LICENSE folder for this sample’s licensing information. 3 | 4 | Abstract: 5 | The sample app's Metal shaders. 6 | */ 7 | 8 | #include 9 | 10 | using namespace metal; 11 | 12 | 13 | typedef struct 14 | { 15 | float2 position [[attribute(0)]]; 16 | float2 texCoord [[attribute(1)]]; 17 | } Vertex; 18 | 19 | typedef struct 20 | { 21 | float4 position [[position]]; 22 | float2 texCoord; 23 | } ColorInOut; 24 | 25 | 26 | 27 | // Display a 2D texture. 28 | vertex ColorInOut planeVertexShader(Vertex in [[stage_in]]) 29 | { 30 | ColorInOut out; 31 | out.position = float4(in.position, 0.0f, 1.0f); 32 | out.texCoord = in.texCoord; 33 | return out; 34 | } 35 | 36 | // Shade a 2D plane by passing through the texture inputs. 37 | fragment float4 planeFragmentShader(ColorInOut in [[stage_in]], texture2d textureIn [[ texture(0) ]]) 38 | { 39 | constexpr sampler colorSampler(address::clamp_to_edge, filter::linear); 40 | float4 sample = textureIn.sample(colorSampler, in.texCoord); 41 | return sample; 42 | } 43 | 44 | // Convert a color value to RGB using a Jet color scheme. 45 | static half4 getJetColorsFromNormalizedVal(half val) { 46 | half4 res ; 47 | if(val <= 0.01h) 48 | return half4(); 49 | res.r = 1.5h - fabs(4.0h * val - 3.0h); 50 | res.g = 1.5h - fabs(4.0h * val - 2.0h); 51 | res.b = 1.5h - fabs(4.0h * val - 1.0h); 52 | res.a = 1.0h; 53 | res = clamp(res,0.0h,1.0h); 54 | return res; 55 | } 56 | 57 | // Shade a texture with depth values using a Jet color scheme. 58 | //- Tag: planeFragmentShaderDepth 59 | fragment half4 planeFragmentShaderDepth(ColorInOut in [[stage_in]], texture2d textureDepth [[ texture(0) ]]) 60 | { 61 | constexpr sampler colorSampler(address::clamp_to_edge, filter::nearest); 62 | float4 s = textureDepth.sample(colorSampler, in.texCoord); 63 | 64 | // Size the color gradient to a maximum distance of 2.5 meters. 65 | // The LiDAR Scanner supports a value no larger than 5.0; the 66 | // sample app uses a value of 2.5 to better distinguish depth 67 | // in smaller environments. 68 | half val = s.r / 2.5h; 69 | half4 res = getJetColorsFromNormalizedVal(val); 70 | return res; 71 | } 72 | 73 | // Shade a texture with confidence levels low, medium, and high to red, green, and blue, respectively. 74 | fragment half4 planeFragmentShaderConfidence(ColorInOut in [[stage_in]], texture2d textureIn [[ texture(0) ]]) 75 | { 76 | constexpr sampler colorSampler(address::clamp_to_edge, filter::nearest); 77 | float4 s = textureIn.sample(colorSampler, in.texCoord); 78 | float res = round( 255.0f*(s.r) ) ; 79 | int resI = int(res); 80 | half4 color = half4(0.0h, 0.0h, 0.0h, 0.0h); 81 | if (resI == 0) 82 | color = half4(1.0h, 0.0h, 0.0h, 1.0h); 83 | else if (resI == 1) 84 | color = half4(0.0h, 1.0h, 0.0h, 1.0h); 85 | else if (resI == 2) 86 | color = half4(0.0h, 0.0h, 1.0h, 1.0h); 87 | return color; 88 | } 89 | 90 | 91 | // Declare a particle class that the `pointCloudVertexShader` inputs 92 | // to `pointCloudFragmentShader`. 
93 | typedef struct 94 | { 95 | float4 clipSpacePosition [[position]]; 96 | float2 coor; 97 | float pSize [[point_size]]; 98 | float depth; 99 | half4 color; 100 | } ParticleVertexInOut; 101 | 102 | 103 | // Position vertices for the point cloud view. Filters out points with 104 | // confidence below the selected confidence value and calculates the color of a 105 | // particle using the color Y and CbCr per vertex. Use `viewMatrix` and 106 | // `cameraIntrinsics` to calculate the world point location of each vertex in 107 | // the depth map. 108 | //- Tag: pointCloudVertexShader 109 | vertex ParticleVertexInOut pointCloudVertexShader( 110 | uint vertexID [[ vertex_id ]], 111 | texture2d depthTexture [[ texture(0) ]], 112 | texture2d confTexture [[ texture(1) ]], 113 | constant float4x4& viewMatrix [[ buffer(0) ]], 114 | constant float3x3& cameraIntrinsics [[ buffer(1) ]], 115 | constant int &confFilterMode [[ buffer(2) ]], 116 | texture2d colorYtexture [[ texture(2) ]], 117 | texture2d colorCbCrtexture [[ texture(3) ]] 118 | ) 119 | { // ... 120 | ParticleVertexInOut out; 121 | uint2 pos; 122 | // Count the rows that are depth-texture-width wide to determine the y-value. 123 | pos.y = vertexID / depthTexture.get_width(); 124 | 125 | // The x-position is the remainder of the y-value division. 126 | pos.x = vertexID % depthTexture.get_width(); 127 | //get depth in [mm] 128 | float depth = depthTexture.read(pos).x * 1000.0f; 129 | 130 | // Convert confidence from normalized `float` to `int`. 131 | float4 conf = confTexture.read(pos); 132 | int confInt = int(round( 255.0f*(conf.r) )) ; 133 | 134 | // Filter points by confidence level. 135 | const auto visibility = confInt >= confFilterMode; 136 | if(visibility == false) 137 | depth = 0.0f; 138 | 139 | // Calculate the vertex's world coordinates. 140 | float xrw = ((int)pos.x - cameraIntrinsics[2][0]) * depth / cameraIntrinsics[0][0]; 141 | float yrw = ((int)pos.y - cameraIntrinsics[2][1]) * depth / cameraIntrinsics[1][1]; 142 | float4 xyzw = { xrw, yrw, depth, 1.f }; 143 | 144 | // Project the coordinates to the view. 145 | float4 vecout = viewMatrix * xyzw; 146 | 147 | // Color the vertex. 148 | constexpr sampler textureSampler (mag_filter::linear, 149 | min_filter::linear); 150 | out.coor = { pos.x / (depthTexture.get_width() - 1.0f), pos.y / (depthTexture.get_height() - 1.0f) }; 151 | half y = colorYtexture.sample(textureSampler, out.coor).r; 152 | half2 uv = colorCbCrtexture.sample(textureSampler, out.coor).rg - half2(0.5h, 0.5h); 153 | // Convert YUV to RGB inline. 154 | half4 rgbaResult = half4(y + 1.402h * uv.y, y - 0.7141h * uv.y - 0.3441h * uv.x, y + 1.772h * uv.x, 1.0h); 155 | 156 | out.color = rgbaResult; 157 | out.clipSpacePosition = vecout; 158 | out.depth = depth; 159 | // Set the particle display size. 160 | out.pSize = 5.0f; 161 | 162 | return out; 163 | } 164 | 165 | // Shade the point cloud points by using quad particles. 166 | fragment half4 pointCloudFragmentShader( 167 | ParticleVertexInOut in [[stage_in]]) 168 | { 169 | // Avoid drawing particles that are too close, or filtered particles that 170 | // have zero depth. 171 | if (in.depth < 1.0f) 172 | discard_fragment(); 173 | else 174 | { 175 | return in.color; 176 | } 177 | return half4(); 178 | } 179 | 180 | 181 | // Convert the Y and CbCr textures into a single RGBA texture. 
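// Back-projection used by pointCloudVertexShader above: each depth sample at pixel
// (u, v) is lifted to a camera-space point with the pinhole model,
//     x = (u - cx) * depth / fx,   y = (v - cy) * depth / fy,   z = depth,
// with cx, cy, fx, fy taken from cameraIntrinsics, and is then multiplied by
// viewMatrix to land in clip space.
//
// The compute kernel below applies a full-range BT.601-style YCbCr-to-RGB conversion;
// the 4x4 matrix folds the -0.5 chroma offsets into its last column, and chroma is
// read at half resolution (gid / 2) to match the biplanar 4:2:0 camera format. Per
// channel, with Cb' = Cb - 0.5 and Cr' = Cr - 0.5, this is:
//     R = Y + 1.402  * Cr'
//     G = Y - 0.3441 * Cb' - 0.7141 * Cr'
//     B = Y + 1.772  * Cb'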
182 | kernel void convertYCbCrToRGBA(texture2d colorYtexture [[texture(0)]], 183 | texture2d colorCbCrtexture [[texture(1)]], 184 | texture2d colorRGBTexture [[texture(2)]], 185 | uint2 gid [[thread_position_in_grid]]) 186 | { 187 | float y = colorYtexture.read(gid).r; 188 | float2 uv = colorCbCrtexture.read(gid / 2).rg; 189 | 190 | const float4x4 ycbcrToRGBTransform = float4x4( 191 | float4(+1.0000f, +1.0000f, +1.0000f, +0.0000f), 192 | float4(+0.0000f, -0.3441f, +1.7720f, +0.0000f), 193 | float4(+1.4020f, -0.7141f, +0.0000f, +0.0000f), 194 | float4(-0.7010f, +0.5291f, -0.8860f, +1.0000f) 195 | ); 196 | 197 | // Sample Y and CbCr textures to get the YCbCr color at the given texture 198 | // coordinate. 199 | float4 ycbcr = float4(y, uv.x, uv.y, 1.0f); 200 | 201 | // Return the converted RGB color. 202 | float4 colorSample = ycbcrToRGBTransform * ycbcr; 203 | colorRGBTexture.write(colorSample, uint2(gid.xy)); 204 | 205 | } 206 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/LICENSE/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright © 2022 Apple Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | 9 | -------------------------------------------------------------------------------- /!App/ImageBundleApp/Launch Screen.storyboard: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /!App/README.md: -------------------------------------------------------------------------------- 1 | # iOS App for RAW Long-Burst Recording 2 | 3 | This app is part of the official code repository for the work: [Shakes on a Plane: Unsupervised Depth Estimation from Unstabilized Photography](https://light.princeton.edu/publication/soap/) 4 | 5 | If you use parts of this work, or otherwise take inspiration from it, please consider citing our paper: 6 | ``` 7 | @InProceedings{Chugunov_2023_CVPR, 8 | author = {Chugunov, Ilya and Zhang, Yuxuan and Heide, Felix}, 9 | title = {Shakes on a Plane: Unsupervised Depth Estimation From Unstabilized Photography}, 10 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 11 | month = {June}, 12 | year = {2023}, 13 | pages = {13240-13251} 14 | } 15 | ``` 16 | 17 | The code is partially derived from [this sample code for recording depth with AVFoundation](https://developer.apple.com/documentation/avfoundation/additional_data_capture/capturing_depth_using_the_lidar_camera). We highly recommend you read through that example to familiarize yourself with the structure and function of portions of this app. 18 | 19 | ## Getting the App Running 20 | 1. Open the `.xcodeproj` project file with Xcode (tested for Xcode 14). 21 | 2. Click on the project and select `Signing & Capabilities` to provision the app. You'll have to make an [Apple developer account](https://developer.apple.com/) if you don't have one already. Check the `Automatically manage signing` box, select your team (likely your personal Apple developer account), and you should be done. 22 | 23 | ![xcode](!figs/signing.png) 24 | 25 | 3. Plug in your device (in our case an iPhone 14 Pro), and trust it. It should appear in the list of devices at the top of Xcode. Select it as the device to build/deploy to. 26 | 4. Press the play button at the top of Xcode to build the app; its icon should now appear on your phone. 27 | 5. As this is an app not from the App Store, you will have to trust the developer in your settings under `Settings / General / VPN & Device Management` (this location may depend on your iOS version). 28 | 29 | ![app](!figs/app.png) 30 | 31 | 6. You should now be able to run the app. 32 | 33 | ## Using the App 34 | 35 | ![interface](!figs/interface.png) 36 | 37 | 1. Enter a suffix (or leave blank). Captured long-burst data will be saved into folders named `bundle-{capture_time}-{suffix}`. 38 | 2. Use the sliding bar to select how many frames to record per capture: 1, 15, 30, or 42. The app records at ~21 fps, so the longest recording length is approximately 2 seconds. 39 | 3. Left button resets the video stream. Middle button captures a long-burst with RAW, RGB, Depth, and motion (gyro/accelerometer) data. Right button records only motion data. 40 | 4. If the recording was successful, a message will pop up stating the device is "Saving Data to Disk". Otherwise an error message will appear and no data will be written to the device. 41 | 5.
If the video stream stops or appears to have high latency, try restarting the app and closing other apps that may be taking up phone memory. 42 | 6. ! **careful** ! : This app records completely uncompressed 14-bit, 12-megapixel RAWs; so a 42-frame recording is like 2 gigabytes of data. It's really easy to quickly fill up your phone's entire storage, so remember to delete unneeded captures and empty the `Recently Deleted` folder. 43 | 44 | ## Processing the Recorded Long-Bursts 45 | 1. Airdrop is the easiest way to move recorded long-burst bundles from the phone to your computer. Navigate in the `Files` app to `On My iPhone` and you should see an `Image Bundle Recorder` folder. Then select and airdrop the desired data to your device: 46 | 47 | ![airdrop](!figs/airdrop.png) 48 | 49 | 2. Place all these bundles into a folder, and convert them to `.npz` dictionaries with: 50 | ```python ConvertBinaries.py -d {folder_containing_bundles}```. This will populate the parent directory with folders containing the processed `.npz` data and preview first/last images of the recorded data. 51 | 52 | 3. See [0_data_format.ipynb](https://github.com/princeton-computational-imaging/SoaP/blob/main/0_data_format.ipynb) in the main repo to understand what's inside this `.npz` data and [1_reconstruction.ipynb](https://github.com/princeton-computational-imaging/SoaP/blob/main/1_reconstruction.ipynb) to learn how to train a SoaP model with it. 53 | 54 | Best fishes, 55 | Ilya 56 | 57 | -------------------------------------------------------------------------------- /!figs/experiments-thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/!figs/experiments-thumb.png -------------------------------------------------------------------------------- /!figs/extra-thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/!figs/extra-thumb.png -------------------------------------------------------------------------------- /!figs/scenes-thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/!figs/scenes-thumb.png -------------------------------------------------------------------------------- /!figs/synth-thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/!figs/synth-thumb.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Ilya Chugunov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 
in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Shakes on a Plane: Unsupervised Depth Estimation from Unstabilized Photography 3 | 4 | Open In Colab 5 | 6 | 7 | 8 | This is the official code repository for the work: [Shakes on a Plane: Unsupervised Depth Estimation from Unstabilized Photography](https://light.princeton.edu/publication/soap/), presented at CVPR 2023. 9 | 10 | If you use parts of this work, or otherwise take inspiration from it, please consider citing our paper: 11 | ``` 12 | @InProceedings{Chugunov_2023_CVPR, 13 | author = {Chugunov, Ilya and Zhang, Yuxuan and Heide, Felix}, 14 | title = {Shakes on a Plane: Unsupervised Depth Estimation From Unstabilized Photography}, 15 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 16 | month = {June}, 17 | year = {2023}, 18 | pages = {13240-13251} 19 | } 20 | ``` 21 | 22 | ## Requirements: 23 | - Developed using PyTorch 1.13.0 and PyTorch Lightning 1.8.3 on a Linux x64 machine 24 | - Condensed package requirements are in `\requirements.txt`. Note that this contains the exact package versions at the time of publishing. Code will likely work with newer versions of the libraries, but you will need to watch out for changes in class/function calls. 25 | 26 | This code also requires tiny-cuda-nn; see [NVlabs/tiny-cuda-nn](https://github.com/NVlabs/tiny-cuda-nn) for installation instructions (we used Version 1.6 at the time of publishing). 27 | 28 | ## Project Structure: 29 | ```cpp 30 | SoaP 31 | ├── checkpoints 32 | │ └── // folder for network checkpoints 33 | ├── config 34 | │ ├── config_depth.json // depth MLP configuration 35 | │ └── config_rgb.json // image MLP configuration 36 | ├── data 37 | │ └── // folder for long-burst data 38 | ├── utils 39 | │ └── utils.py // network helper functions (e.g. camera projection, spline interpolation) 40 | ├── tutorial.ipynb // interactive tutorial for training and depth reconstruction 41 | ├── README.md // <- You Are Here 42 | ├── requirements.txt // frozen package requirements 43 | └── train.py // dataloader, network, visualization, and trainer code 44 | ``` 45 | ## Getting Started: 46 | We recommend you start by going through `tutorial.ipynb` to download a sample long-burst, familiarize yourself with the data it contains, and learn how to train a model with it. This tutorial is also available as a [Colab notebook](https://colab.research.google.com/github/princeton-computational-imaging/SoaP/blob/main/tutorial.ipynb). 47 | 48 | For other training arguments, see the argument parser section of `\train.py`. 49 | 50 | ## Data: 51 | You can download the long-burst data used in the paper (and extra bonus scenes) via the following links: 52 | 53 | 1.
Shade map used for lens shading compensation (**important**, see paper supplemental for more information): [shade_map.npy](https://soap.cs.princeton.edu/shade_map.npy) 54 | 55 | 2. Main scenes: [scenes.zip](https://soap.cs.princeton.edu/scenes.zip) 56 | ![xcode](!figs/scenes-thumb.png) 57 | Model checkpoints: [scenes-checkpoints.zip](https://soap.cs.princeton.edu/scenes-checkpoints.zip) 58 | These checkpoints may require you to download the full scene data in order to properly load them. 59 | 60 | 3. Supplemental experiment scenes: [experiments.zip](https://soap.cs.princeton.edu/experiments.zip) 61 | ![xcode](!figs/experiments-thumb.png) 62 | 63 | 4. Extra un-used (but neat) scenes: [extra.zip](https://soap.cs.princeton.edu/extra.zip) 64 | ![xcode](!figs/extra-thumb.png) 65 | 66 | 5. Synthetic rendered data (with scanned object meshes): [synthetic.zip](https://soap.cs.princeton.edu/synthetic.zip) 67 | ![xcode](!figs/synth-thumb.png) 68 | 69 | 70 | We recommend you unzip these folders and place them into `\data` 71 | 72 | ## App: 73 | Want to record your own long-burst data? See [!App](https://github.com/princeton-computational-imaging/SoaP/tree/main/!App) for details! 74 | 75 | 76 | Good luck have fun, 77 | Ilya 78 | -------------------------------------------------------------------------------- /checkpoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/checkpoints/__init__.py -------------------------------------------------------------------------------- /config/config_depth.json: -------------------------------------------------------------------------------- 1 | { 2 | "encoding": { 3 | "otype": "HashGrid", 4 | "n_levels": 8, 5 | "n_features_per_level": 4, 6 | "log2_hashmap_size": 14, 7 | "base_resolution": 8, 8 | "per_level_scale": 1.4 9 | }, 10 | "network": { 11 | "otype": "FullyFusedMLP", 12 | "activation": "ReLU", 13 | "output_activation": "None", 14 | "n_neurons": 128, 15 | "n_hidden_layers": 5 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /config/config_rgb.json: -------------------------------------------------------------------------------- 1 | { 2 | "encoding": { 3 | "otype": "HashGrid", 4 | "n_levels": 16, 5 | "n_features_per_level": 4, 6 | "log2_hashmap_size": 22, 7 | "base_resolution": 8, 8 | "per_level_scale": 1.5 9 | }, 10 | "network": { 11 | "otype": "FullyFusedMLP", 12 | "activation": "ReLU", 13 | "output_activation": "None", 14 | "n_neurons": 128, 15 | "n_hidden_layers": 5 16 | } 17 | } -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-computational-imaging/SoaP/857dda0f7578126ed9feb8410eedeef053679f9e/data/__init__.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | commentjson>=0.9.0 2 | matplotlib>=3.6.2 3 | numpy>=1.22.3 4 | pytorch_lightning>=1.8.3.post1 5 | torch>=1.13.0 6 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import commentjson as json 3 | import numpy as np 4 | import os 5 | 6 | import 
tinycudann as tcnn 7 | 8 | from utils import utils 9 | from utils.utils import debatch 10 | 11 | import torch 12 | from torch.nn import functional as F 13 | from torch.utils.data import Dataset 14 | from torch.utils.data import DataLoader 15 | import pytorch_lightning as pl 16 | 17 | ######################################################################################################### 18 | ################################################ DATASET ################################################ 19 | ######################################################################################################### 20 | 21 | class BundleDataset(Dataset): 22 | def __init__(self, args): 23 | bundle = dict(np.load(args.bundle_path, allow_pickle=True)) 24 | utils.de_item(bundle) 25 | 26 | if not args.no_raw: 27 | raw_frames = torch.tensor(np.array([bundle[f'raw_{i}']['raw'] for i in range(bundle['num_raw_frames'])]).astype(np.int32))[None] # B,T,H,W 28 | if args.no_shade_map or args.no_raw: 29 | pass # no shade map needed 30 | else: 31 | shade_map = torch.tensor(np.load(os.path.join(os.getcwd(), "data/shade_map.npy")))[None,None,:,:] # 1,1,H,W, compensation for lens shading 32 | raw_frames = raw_frames * shade_map 33 | 34 | self.motion = bundle['motion'] 35 | if args.no_device_rotations: 36 | self.frame_timestamps = torch.tensor(np.linspace(0,1, bundle['num_rgb_frames'])) 37 | self.motion_timestamps = torch.tensor(np.linspace(0,1, bundle['num_rgb_frames'])) 38 | self.quaternions = torch.tensor(np.repeat([[0,0,0,1.0]], bundle['num_rgb_frames'], axis=0)).float() 39 | else: 40 | self.frame_timestamps = torch.tensor([bundle[f'raw_{i}']['timestamp'] for i in range(bundle['num_rgb_frames'])]) 41 | self.motion_timestamps = torch.tensor(self.motion['timestamp']) 42 | self.quaternions = torch.tensor(self.motion['quaternion']) # T',4, has different timestamps from frames 43 | 44 | self.reference_quaternion = utils.multi_interp(self.frame_timestamps[0:1], self.motion_timestamps, self.quaternions) # quaternion at frame 0 45 | self.reference_rotation = utils.convert_quaternions_to_rot(self.reference_quaternion) 46 | 47 | self.processed_rgb_volume = torch.tensor(np.array([bundle[f'rgb_{i}']['rgb'] for i in range(bundle['num_rgb_frames'])])) 48 | self.processed_rgb_volume = (self.processed_rgb_volume[:,:,:,:3].permute(0,3,1,2)).float() # remove alpha, make: T,C,H,W 49 | self.processed_rgb_volume = self.processed_rgb_volume / self.processed_rgb_volume[0].max() # scale 0-1 50 | 51 | intrinsics_ratio = 1.0 52 | if args.no_phone_depth and not args.no_raw: # intrinsics from RGB, but img from RAW, rescale 53 | intrinsics_ratio = bundle['raw_0']['height'] / bundle['rgb_0']['height'] 54 | elif not args.no_phone_depth and args.no_raw: # intrinsics from depth, img from processed RGB 55 | intrinsics_ratio = bundle['rgb_0']['height'] / bundle['raw_0']['height'] 56 | 57 | if args.no_phone_depth: 58 | self.intrinsics = torch.tensor(np.array([bundle[f'rgb_{i}']['intrinsics'] for i in range(bundle['num_rgb_frames'])])).float() # T,3,3 59 | else: 60 | self.intrinsics = torch.tensor(np.array([bundle[f'depth_{i}']['intrinsics'] for i in range(bundle['num_depth_frames'])])) 61 | 62 | self.intrinsics[:,:3,:2] = self.intrinsics[:,:3,:2] * intrinsics_ratio 63 | 64 | if args.no_raw: # use processed RGB 65 | self.rgb_volume = (self.processed_rgb_volume).float() 66 | self.rgb_volume = self.rgb_volume - self.rgb_volume.min() 67 | self.rgb_volume = self.rgb_volume/self.rgb_volume.max() 68 | 69 | else: # use minimally processed RAW 70 | 
self.rgb_volume = (utils.raw_to_rgb(raw_frames)).float() # T,C,H,W 71 | self.rgb_volume = self.rgb_volume - self.rgb_volume.min() 72 | self.rgb_volume = self.rgb_volume/self.rgb_volume.max() 73 | if args.dark: # cut off highlights for scaling (long-tail-distribution) 74 | self.rgb_volume = self.rgb_volume/np.percentile(self.rgb_volume, 98) 75 | self.rgb_volume = self.rgb_volume.clamp(0,1) 76 | 77 | self.reference_intrinsics = self.intrinsics[0:1] 78 | 79 | if args.no_phone_depth: 80 | self.depth_volume = torch.zeros(bundle['num_rgb_frames'], 1, 64, 64, dtype=torch.float32) # placeholder depth 81 | else: 82 | self.depth_volume = torch.tensor(np.array([bundle[f'depth_{i}']['depth'] for i in range(bundle['num_depth_frames'])])) 83 | self.depth_volume = 1/(self.depth_volume[:,:,:,None].permute(0,3,1,2)).float() # T,C,H,W; lidar has inverse depth 84 | 85 | T,C,H,W = self.rgb_volume.shape 86 | self.num_frames, self.img_channels, self.img_height, self.img_width = T,C,H,W 87 | 88 | self.point_batch_size = args.point_batch_size 89 | self.num_batches = args.num_batches 90 | 91 | def __len__(self): 92 | return self.num_batches # arbitrary as we continuously generate random samples 93 | 94 | def __getitem__(self, idx): 95 | # create uniform u,v between 0.025 and 0.975 to preserve edges 96 | uv = torch.rand(self.point_batch_size, 2) * torch.tensor([[0.95,0.95]]) + torch.tensor([[0.025,0.025]]) 97 | 98 | # t is time for all frames, looks like [0, 0,... 0, 1/41, 1/41, ..., 1/41, 2/41, 2/41, ..., 2/41, etc.] 99 | t = torch.linspace(0,1,self.num_frames).repeat_interleave(uv.shape[0])[:,None] # num_frames * point_batch_size, 1 100 | 101 | return self.sample_grid(uv, t, frame=0, sample_depth=True, sample_rgb=True, sample_processed_rgb=False) 102 | 103 | def sample_grid(self, uv, t, frame, sample_depth=False, sample_rgb=False, sample_processed_rgb=False): 104 | """ Return TUV grid, interpolated rotation, intrinsics, depth, rgb samples 105 | """ 106 | 107 | lidar_samples, rgb_samples, rgb_processed_samples = -1, -1, -1 108 | 109 | # convert to frame times [0-1] -> (seconds) 110 | t_frame = torch.tensor(np.interp(t, np.linspace(0,1,len(self.frame_timestamps)), self.frame_timestamps)).squeeze() 111 | # grab linearly interpolated quaternions at those timestamps 112 | quaternions = utils.multi_interp(t_frame, self.motion_timestamps, self.quaternions) 113 | # grab linearly interpolated intrinsics at those timestamps 114 | intrinsics = utils.multi_interp(t_frame, self.frame_timestamps, self.intrinsics.view(-1,9)).reshape(-1,3,3) 115 | 116 | if sample_depth: 117 | # grid_sample uses coordinates [-1,1] whereas MLP uses [0,1], hence rescaling 118 | grid_uv = ((uv - 0.5) * 2)[None,:,None,:] # 1,point_batch_size,1,2 119 | lidar_samples = F.grid_sample(self.depth_volume[frame:frame+1], grid_uv, mode="bilinear", padding_mode="border", align_corners=True) 120 | lidar_samples = lidar_samples.squeeze()[:,None] # point_batch_size, C 121 | 122 | if sample_rgb: 123 | grid_uv = ((uv - 0.5) * 2)[None,:,None,:] # 1,point_batch_size,1,2 124 | rgb_samples = F.grid_sample(self.rgb_volume[frame:frame+1], grid_uv, mode="bilinear", padding_mode="border", align_corners=True) 125 | rgb_samples = rgb_samples.squeeze().permute(1,0) # point_batch_size, C 126 | 127 | if sample_processed_rgb: 128 | grid_uv = ((uv - 0.5) * 2)[None,:,None,:] # 1,point_batch_size,1,2 129 | rgb_processed_samples = F.grid_sample(self.processed_rgb_volume[frame:frame+1], grid_uv, mode="bilinear", padding_mode="border", align_corners=True) 130 | rgb_processed_samples 
= rgb_processed_samples.squeeze().permute(1,0) # point_batch_size, C 131 | 132 | return t, uv, quaternions, intrinsics, lidar_samples, rgb_samples, rgb_processed_samples 133 | 134 | ######################################################################################################### 135 | ################################################ MODELS #################$############################### 136 | ######################################################################################################### 137 | 138 | class PlaneModel(pl.LightningModule): 139 | def __init__(self, depth): 140 | super().__init__() 141 | # ax + by + c 142 | self.plane_coefs = torch.nn.Parameter(data=torch.tensor([1/10,1/10,depth/5]), requires_grad=True) 143 | # increase effective learning rate of plane without custom lr scheduler 144 | self.scale_factor = torch.nn.Parameter(data=torch.tensor([5.0,5.0,5.0]), requires_grad=False) 145 | 146 | def forward(self, uv): 147 | uv_homogenous = torch.cat((uv, torch.ones_like(uv[:,:1])), dim=1) 148 | plane = uv_homogenous * self.plane_coefs * self.scale_factor 149 | return torch.sum(plane, dim=1, keepdims=True) 150 | 151 | 152 | class LearnedRotationModel(pl.LightningModule): 153 | def __init__(self, args): 154 | super().__init__() 155 | self.args = args 156 | self.rotation_betas = torch.nn.Parameter(data=torch.zeros(args.control_points_motion, 3, 1, dtype=torch.float32), requires_grad=True) 157 | 158 | def forward(self, quaternions, t): 159 | # use de casteljau algorithm for interpolation 160 | rotation_deltas = utils.de_casteljau(self.rotation_betas, t) 161 | rx, ry, rz = rotation_deltas[:,0], rotation_deltas[:,1], rotation_deltas[:,2] 162 | r1 = torch.ones_like(rx) 163 | 164 | # identity rotation eye(3) plus small rotational offsets 165 | rotations = torch.stack([torch.stack([ r1, -rz, ry], dim=-1), 166 | torch.stack([ rz, r1, -rx], dim=-1), 167 | torch.stack([-ry, rx, r1], dim=-1)], dim=-1) 168 | 169 | return rotations 170 | 171 | 172 | class DeviceRotationModel(pl.LightningModule): 173 | def __init__(self, args, reference_rotation): 174 | super().__init__() 175 | self.args = args 176 | self.reference_rotation = reference_rotation 177 | self.rotation_betas = torch.nn.Parameter(data=torch.zeros(args.control_points_motion, 3, 1, dtype=torch.float32), requires_grad=True) 178 | 179 | def forward(self, quaternions, t): 180 | rotations = torch.inverse(self.reference_rotation) @ utils.convert_quaternions_to_rot(quaternions) # from gyro 181 | 182 | rotation_deltas = utils.de_casteljau(self.rotation_betas, t) 183 | rx, ry, rz = rotation_deltas[:,0], rotation_deltas[:,1], rotation_deltas[:,2] 184 | r0 = torch.zeros_like(rx) 185 | 186 | rotation_offsets = torch.stack([torch.stack([ r0, -rz, ry], dim=-1), 187 | torch.stack([ rz, r0, -rx], dim=-1), 188 | torch.stack([-ry, rx, r0], dim=-1)], dim=-1) 189 | 190 | return rotations + self.args.rotation_weight * rotation_offsets 191 | 192 | 193 | class TranslationModel(pl.LightningModule): 194 | def __init__(self, args): 195 | super().__init__() 196 | self.args = args 197 | self.translation_betas = torch.nn.Parameter(data=torch.zeros(args.control_points_motion, 3, 1, dtype=torch.float32), requires_grad=True) 198 | 199 | def forward(self, t): 200 | return self.args.translation_weight * utils.de_casteljau(self.translation_betas, t) 201 | 202 | class IntrinsicsModel(pl.LightningModule): 203 | def __init__(self, args, reference_intrinsics): 204 | super().__init__() 205 | self.args = args 206 | self.intrinsic_betas = 
torch.nn.Parameter(data=torch.zeros(args.control_points_intrinsics, 1, 1, dtype=torch.float32), requires_grad=True) 207 | self.focal = torch.nn.Parameter(data=torch.tensor([reference_intrinsics[0,0,0]]), requires_grad=True) 208 | self.cy = reference_intrinsics[0,2,0] 209 | self.cx = reference_intrinsics[0,2,1] 210 | 211 | def forward(self, t): 212 | f_deltas = utils.de_casteljau(self.intrinsic_betas, t) 213 | 214 | cy = self.cy * torch.ones_like(t) 215 | cx = self.cx * torch.ones_like(t) 216 | f = (self.focal * torch.ones_like(t)) + f_deltas 217 | f0 = torch.zeros_like(t) 218 | f1 = torch.ones_like(t) 219 | 220 | intrinsics = torch.stack([torch.stack([f, f0, cy], dim=-1), 221 | torch.stack([f0, f, cx], dim=-1), 222 | torch.stack([f0, f0, f1], dim=-1)], dim=-1) 223 | return intrinsics.squeeze(dim=1) 224 | 225 | ######################################################################################################### 226 | ################################################ NETWORK ################################################ 227 | ######################################################################################################### 228 | 229 | class BundleMLP(pl.LightningModule): 230 | def __init__(self, args): 231 | super().__init__() 232 | # load network configs 233 | with open(args.config_path_depth) as config_depth: 234 | config_depth = json.load(config_depth) 235 | with open(args.config_path_rgb) as config_rgb: 236 | config_rgb = json.load(config_rgb) 237 | 238 | self.args = args 239 | 240 | self.encoding_depth = tcnn.Encoding(n_input_dims=2, encoding_config=config_depth["encoding"]) 241 | self.network_depth = tcnn.Network(n_input_dims=self.encoding_depth.n_output_dims, n_output_dims=1, network_config=config_depth["network"]) 242 | 243 | self.encoding_rgb = tcnn.Encoding(n_input_dims=2, encoding_config=config_rgb["encoding"]) 244 | self.network_rgb = tcnn.Network(n_input_dims=self.encoding_rgb.n_output_dims, n_output_dims=3, network_config=config_rgb["network"]) 245 | 246 | self.model_translation = TranslationModel(args) 247 | self.model_plane = PlaneModel(depth=1.0) 248 | 249 | self.mask = torch.ones(self.encoding_depth.n_output_dims, dtype=torch.float32) 250 | self.save_hyperparameters() 251 | 252 | bundle = BundleDataset(args) 253 | self.bundle = bundle 254 | self.rgb_volume = bundle.rgb_volume 255 | self.processed_rgb_volume = bundle.processed_rgb_volume 256 | self.reference_intrinsics = bundle.reference_intrinsics 257 | self.reference_rotation = bundle.reference_rotation 258 | 259 | if args.no_device_rotations: # learn rotations from scratch 260 | self.model_rotation = LearnedRotationModel(args) 261 | else: # use gyro data 262 | self.model_rotation = DeviceRotationModel(args, self.reference_rotation) 263 | 264 | self.model_intrinsics = IntrinsicsModel(args, self.reference_intrinsics) 265 | 266 | def sample_volume(self, uv, volume, frame=None): 267 | """ Grid sample from 2D image volume at coordinates (u,v) 268 | If frame=None, sample from all frames, else single frame 269 | """ 270 | pbs = self.bundle.point_batch_size 271 | grid_uv = ((uv - 0.5) * 2) 272 | 273 | if frame is None: 274 | grid_uv = grid_uv.reshape(self.bundle.num_frames, pbs, 1, -1) # frames, pbs, 1, 2 275 | rgb_samples = F.grid_sample(volume, grid_uv, mode="bilinear", padding_mode="border", align_corners=True) 276 | rgb_samples = rgb_samples.squeeze().permute(0,2,1).reshape(pbs * self.bundle.num_frames, -1) 277 | else: 278 | grid_uv = grid_uv[None,:,None,:] # frames, pbs, 1, 2 279 | rgb_samples = 
F.grid_sample(volume[frame:frame+1], grid_uv, mode="bilinear", padding_mode="border", align_corners=True) 280 | rgb_samples = rgb_samples.squeeze().permute(1,0) 281 | 282 | return rgb_samples 283 | 284 | def configure_optimizers(self): 285 | optimizer = torch.optim.Adam(self.parameters(), lr=self.args.lr, betas=(0.9, 0.99), eps=1e-15, weight_decay=0) 286 | scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=self.args.gamma) 287 | return [optimizer], [scheduler] 288 | 289 | def forward(self, t, uv, quaternions, lidar_samples, rgb_samples): 290 | """ Forward model pass, estimate motion, implicit depth + image. 291 | """ 292 | translation = self.model_translation(t) 293 | rotation = self.model_rotation(quaternions, t) 294 | 295 | uv_depth = self.encoding_depth(uv) 296 | uv_rgb = self.encoding_rgb(uv) 297 | 298 | mask = self.mask.to(self.device)[None,:] 299 | 300 | if self.args.allow_negative_depth: # no ReLUs, no clamps, depth can go hog-wild 301 | plane = self.model_plane(uv) 302 | depth = (plane - self.network_depth(uv_depth * mask)) 303 | else: # clamp depth between 0.01 and 10, depth offset must be *in front* of plane 304 | plane = F.relu(self.model_plane(uv)).clamp(0.01, 10.0) 305 | depth = (plane - F.relu(self.network_depth(uv_depth * mask))).clamp(0.01, 10) 306 | 307 | if self.args.fixed_image: # just sample static reference frame 308 | rgb = rgb_samples 309 | else: # sample from RGB MLP 310 | rgb = F.relu(0.5 + self.network_rgb(uv_rgb)).float() 311 | 312 | return rgb, depth, plane, rotation, translation 313 | 314 | def reproject(self, t, uv, depth, rotation, translation, intrinsics): 315 | """ Reproject uv coordinates to new reference frame 316 | """ 317 | if self.args.no_intrinsics: # use learned model 318 | intrinsics = self.model_intrinsics(t) 319 | reference_intrinsics = self.model_intrinsics(torch.zeros_like(t)[0:1]) 320 | else: # use stored intrinsics 321 | reference_intrinsics = self.reference_intrinsics 322 | 323 | uvz = torch.cat((uv, depth), dim=1) 324 | xyz = utils.uvz_to_xyz(uvz, reference_intrinsics, img_width=self.bundle.img_width, img_height=self.bundle.img_height) 325 | xyz = (torch.inverse(rotation) @ xyz[:,:,None])[:,:,0] + translation # project to query 326 | uvz_reprojected = utils.xyz_to_uvz(xyz, intrinsics, img_width=self.bundle.img_width, img_height=self.bundle.img_height) 327 | tuv_reprojected = torch.cat((t, uvz_reprojected[:,0:2]), dim=1) 328 | 329 | return tuv_reprojected 330 | 331 | def training_step(self, train_batch, batch_idx): 332 | N = self.bundle.num_frames 333 | pbs = self.args.point_batch_size 334 | 335 | t, uv, quaternions, intrinsics, lidar_samples, rgb_samples, _ = train_batch # collapse batch + point dimensions 336 | t, uv, quaternions, intrinsics, lidar_samples, rgb_samples = debatch(t), debatch(uv), debatch(quaternions), debatch(intrinsics), debatch(lidar_samples), debatch(rgb_samples) 337 | 338 | rgb, depth, plane, rotation, translation = self.forward(t, uv, quaternions, lidar_samples, rgb_samples) 339 | uv, rgb, depth, plane = uv.repeat(N,1), rgb.repeat(N,1), depth.repeat(N,1), plane.repeat(N,1) 340 | 341 | tuv_plane_reprojected = self.reproject(t, uv, plane, rotation, translation, intrinsics) 342 | tuv_depth_reprojected = self.reproject(t, uv, depth, rotation, translation, intrinsics) 343 | 344 | rgb_plane_reprojected = self.sample_volume(tuv_plane_reprojected[:,1:], self.rgb_volume) # sample all timesteps with u,v 345 | rgb_depth_reprojected = self.sample_volume(tuv_depth_reprojected[:,1:], self.rgb_volume) 346 | 347 | 
loss = 0.0 348 | 349 | # overall depth loss 350 | depth_rgb_loss = ((rgb/(rgb.detach() + 0.001)) - (rgb_depth_reprojected/(rgb.detach() + 0.001))) ** 2 351 | depth_rgb_loss = depth_rgb_loss.mean(dim=1, keepdims=True) # mean over RGB channels 352 | loss += depth_rgb_loss.mean() 353 | 354 | # plane-only loss 355 | plane_rgb_loss = ((rgb/(rgb.detach() + 0.001)) - (rgb_plane_reprojected/(rgb.detach() + 0.001))) ** 2 356 | plane_rgb_loss = plane_rgb_loss.mean(dim=1, keepdims=True) # mean over RGB channels 357 | 358 | # weighted plane loss 359 | plane_depth_loss = (depth/plane - 1) ** 2 360 | weighted_plane_depth_loss = plane_rgb_loss/(depth_rgb_loss + 0.001) * plane_depth_loss 361 | loss += self.args.plane_weight * weighted_plane_depth_loss.mean() 362 | 363 | self.log('loss', loss) 364 | return loss 365 | 366 | def make_grid(self, height, width, u_lims, v_lims): 367 | """ Create (u,v) meshgrid with size (height,width) extent (u_lims, v_lims) 368 | """ 369 | u = torch.linspace(u_lims[0], u_lims[1], width) 370 | v = torch.linspace(v_lims[0], v_lims[1], height) 371 | u_grid, v_grid = torch.meshgrid([u, v], indexing="xy") # u/v grid 372 | return torch.stack((u_grid.flatten(), v_grid.flatten())).t() 373 | 374 | def generate_imgs(self, frame, height=960, width=720, u_lims=[0,1], v_lims=[0,1]): 375 | """ Produce reference images and depth maps for tensorboard/visualization 376 | """ 377 | device = self.device 378 | uv = self.make_grid(height, width, u_lims, v_lims) 379 | t = torch.tensor(frame/(self.bundle.num_frames - 1)).repeat(uv.shape[0])[:,None] # num_points, 1 380 | 381 | batch = self.bundle.sample_grid(uv, t, frame, sample_depth=True, sample_rgb=True, sample_processed_rgb=True) 382 | batch = [elem.to(device) for elem in batch] 383 | t, uv, quaternions, intrinsics, lidar_samples, rgb_samples, rgb_processed_samples = batch 384 | 385 | rgb_raw = rgb_samples.reshape(height, width, 3).permute(2,0,1) # channel first 386 | rgb_processed = rgb_processed_samples.reshape(height, width, 3).permute(2,0,1) # channel first 387 | depth_lidar = lidar_samples.reshape(height, width) 388 | depth_lidar_img = utils.colorize_tensor(depth_lidar, vmin=lidar_samples.min(), vmax=lidar_samples.max(), cmap="RdYlBu") 389 | 390 | return rgb_raw, rgb_processed, depth_lidar, depth_lidar_img 391 | 392 | def generate_outputs(self, frame, height=960, width=720, u_lims=[0,1], v_lims=[0,1]): 393 | """ Use forward model to sample implicit image I(u,v), depth D(u,v) and raw/processed images 394 | at reprojected u,v, coordinates. 
Results should be aligned (sampled at (u',v')) 395 | """ 396 | device = self.device 397 | uv = self.make_grid(height, width, u_lims, v_lims) 398 | t = torch.tensor(frame/(self.bundle.num_frames - 1)).repeat(uv.shape[0])[:,None] # num_points, 1 399 | 400 | batch = self.bundle.sample_grid(uv, t, frame, sample_depth=True, sample_rgb=True, sample_processed_rgb=True) 401 | batch = [elem.to(device) for elem in batch] 402 | t, uv, quaternions, intrinsics, lidar_samples, rgb_samples, rgb_processed_samples = batch 403 | 404 | with torch.no_grad(): 405 | rgb, depth, plane, rotation, translation = self.forward(t, uv, quaternions, lidar_samples, rgb_samples) 406 | tuv_reprojected = self.reproject(t, uv, depth, rotation, translation, intrinsics) 407 | rgb_raw = self.sample_volume(tuv_reprojected[:,1:], self.rgb_volume, frame=frame) 408 | rgb_processed = self.sample_volume(tuv_reprojected[:,1:], self.processed_rgb_volume, frame=frame) 409 | 410 | rgb = rgb.reshape(height, width, 3).permute(2,0,1) # channel first 411 | rgb_raw = rgb_raw.reshape(height, width, 3).permute(2,0,1) # channel first 412 | rgb_processed = rgb_processed.reshape(height, width, 3).permute(2,0,1) 413 | 414 | depth = depth.reshape(height, width) 415 | depth_img = utils.colorize_tensor(depth, vmin=0, vmax=depth.max(), cmap="RdYlBu") 416 | 417 | return rgb, rgb_raw, rgb_processed, depth, depth_img 418 | 419 | ######################################################################################################### 420 | ############################################### VALIDATION ############################################## 421 | ######################################################################################################### 422 | 423 | class ValidationCallback(pl.Callback): 424 | def __init__(self): 425 | super().__init__() 426 | 427 | def on_train_epoch_start(self, trainer, model): 428 | args = model.args 429 | coef = ((model.current_epoch/model.args.max_epochs) * args.mask_k_max) + ((1 - model.current_epoch/model.args.max_epochs) * args.mask_k_min) 430 | model.mask = torch.sigmoid(torch.linspace(args.mask_k_max, coef, len(model.mask))) 431 | print("Mask mean:", model.mask.mean()) 432 | 433 | # let plane train on its own for 10 epochs 434 | if model.current_epoch == 10: 435 | # start training depth 436 | model.encoding_depth.requires_grad_(True) 437 | model.encoding_depth.train(True) 438 | model.network_depth.requires_grad_(True) 439 | model.network_depth.train(True) 440 | if args.no_intrinsics: 441 | model.model_intrinsics.requires_grad_(True) 442 | model.model_intrinsics.train(True) 443 | 444 | for i, frame in enumerate([0]): # can sample more frames 445 | rgb, rgb_raw, rgb_processed, depth, depth_img = model.generate_outputs(frame) 446 | model.logger.experiment.add_image(f'pred/{i}_rgb', rgb, global_step=trainer.global_step) 447 | model.logger.experiment.add_image(f'pred/{i}_raw', rgb_raw, global_step=trainer.global_step) 448 | model.logger.experiment.add_image(f'pred/{i}_processed', rgb_processed, global_step=trainer.global_step) 449 | model.logger.experiment.add_image(f'pred/{i}_depth', depth_img, global_step=trainer.global_step) 450 | 451 | if model.args.save_video: # save the evolution of the model 452 | if i == 0: # save first frame 453 | np.save(f"video/{model.args.name}/{model.current_epoch}_depth.npy", depth.detach().cpu().numpy()) 454 | np.save(f"video/{model.args.name}/{model.current_epoch}_rgb.npy", rgb.detach().cpu().numpy()) 455 | 456 | # zoomed images 457 | # rgb, rgb_raw, rgb_processed, depth, depth_img, 
depth_lidar, depth_lidar_img = model.generate_imgs(frame, u_lims=[0.4,0.6], v_lims=[0.4,0.6]) 458 | # model.logger.experiment.add_image(f'pred/{i}_rgb_zoom', rgb, global_step=trainer.global_step) 459 | # model.logger.experiment.add_image(f'pred/{i}_depth_zoom', depth_img, global_step=trainer.global_step) 460 | 461 | def on_train_start(self, trainer, model): 462 | pl.seed_everything(42) 463 | 464 | # pl doesn't put non-parameters on the right device 465 | model.rgb_volume = model.rgb_volume.to(model.device) 466 | model.processed_rgb_volume = model.processed_rgb_volume.to(model.device) 467 | model.reference_intrinsics = model.reference_intrinsics.to(model.device) 468 | if not model.args.no_device_rotations: 469 | model.model_rotation.reference_rotation = model.model_rotation.reference_rotation.to(model.device) 470 | model.model_intrinsics.focal = model.model_intrinsics.focal.to(model.device) 471 | 472 | model.logger.experiment.add_text("args", str(model.args)) 473 | 474 | rgb_raw, rgb_processed, depth_lidar, depth_lidar_img = model.generate_imgs(0) 475 | model.logger.experiment.add_image('gt/lidar', depth_lidar_img, global_step=trainer.global_step) 476 | 477 | for i, frame in enumerate([0]): 478 | rgb_raw, rgb_processed, depth_lidar, depth_lidar_img = model.generate_imgs(frame) 479 | model.logger.experiment.add_image(f'gt/{i}_rgb_raw', rgb_raw, global_step=trainer.global_step) 480 | model.logger.experiment.add_image(f'gt/{i}_rgb_processed', rgb_processed, global_step=trainer.global_step) 481 | # zoomed images 482 | # rgb, rgb_raw, rgb_processed, depth, depth_img, depth_lidar, depth_lidar_img = model.generate_imgs(frame, u_lims=[0.4,0.6], v_lims=[0.4,0.6]) 483 | # model.logger.experiment.add_image(f'gt/{i}_rgb_raw_zoom', rgb_raw, global_step=trainer.global_step) 484 | # model.logger.experiment.add_image(f'gt/{i}_rgb_processed_zoom', rgb_processed, global_step=trainer.global_step) 485 | 486 | if model.args.save_video: 487 | os.makedirs(f"video/{model.args.name}", exist_ok=True) 488 | 489 | def on_train_end(self, trainer, model): 490 | checkpoint_dir = os.path.join("checkpoints", args.name, "last.ckpt") 491 | trainer.save_checkpoint(checkpoint_dir) 492 | 493 | if __name__ == "__main__": 494 | 495 | # argparse 496 | parser = argparse.ArgumentParser() 497 | 498 | # data 499 | parser.add_argument('--point_batch_size', type=int, default=1024, help="Number of points to sample per dataloader index.") 500 | parser.add_argument('--num_batches', type=int, default=256, help="Number of training batches.") 501 | parser.add_argument('--no_shade_map', action='store_true', help="Don't use shade map, useful for low-light captures.") 502 | parser.add_argument('--no_raw', action='store_true', help="No RAW data available, use RGB volume instead.") 503 | parser.add_argument('--no_device_rotations', action='store_true', help="Learn rotations from scratch, useful if no gyro data available.") 504 | parser.add_argument('--no_intrinsics', action='store_true', help="Learn camera intrinsics from scratch, useful if no camera intrinsics available.") 505 | parser.add_argument('--no_phone_depth', action='store_true', help="No phone depth data in bundle.") 506 | parser.add_argument('--allow_negative_depth', action='store_true', help="Allow negative depth solutions, useful for weird or digitally stabilized data.") 507 | parser.add_argument('--dark', action='store_true', help="Low-light capture, automatically also turns off shade map.") 508 | 509 | # model 510 | parser.add_argument('--control_points_motion', type=int, 
default=21, help="Spline control points for translation/rotation model.") 511 | parser.add_argument('--control_points_intrinsics', type=int, default=4, help="Spline control points for intrinsics model.") 512 | parser.add_argument('--config_path_depth', type=str, default="config/config_depth.json", help="Depth model config.") 513 | parser.add_argument('--config_path_rgb', type=str, default="config/config_rgb.json", help="RGB model config.") 514 | parser.add_argument('--plane_weight', type=float, default=1e-4, help="Depth regularization.") 515 | parser.add_argument('--rotation_weight', type=float, default=1e-1, help="Scale learned rotation.") 516 | parser.add_argument('--translation_weight', type=float, default=1e-1, help="Scale learned translation.") 517 | parser.add_argument('--mask_k_min', type=float, default=-100, help="Mask weight evolution parameter.") 518 | parser.add_argument('--mask_k_max', type=float, default=100, help="Mask weight evolution parameter.") 519 | parser.add_argument('--fixed_image', action='store_true', help="Fix I(u,v) to be the zero-th frame during training.") 520 | 521 | 522 | # training 523 | parser.add_argument('--bundle_path', type=str, required=True, help="Path to frame_bundle.npz") 524 | parser.add_argument('--name', type=str, required=True, help="Experiment name for logs and checkpoints.") 525 | parser.add_argument('--max_epochs', type=int, default=100, help="Number of training epochs.") 526 | parser.add_argument('--gamma', type=float, default=0.98, help="Learning rate decay gamma.") 527 | parser.add_argument('--lr', type=float, default=1e-4, help="Learning rate.") 528 | parser.add_argument('--save_video', action='store_true', help="Store training outputs at each epoch for visualization.") 529 | 530 | 531 | args = parser.parse_args() 532 | if args.dark: 533 | args.no_shade_map = True 534 | 535 | print(args) 536 | 537 | # dataset 538 | bundle_dataset = BundleDataset(args) 539 | train_loader = DataLoader(bundle_dataset, batch_size=1, num_workers=os.cpu_count(), shuffle=True, pin_memory=True) 540 | 541 | # model 542 | model = BundleMLP(args) 543 | # let plane train on its own at the start 544 | model.network_depth.requires_grad_(False) 545 | model.encoding_depth.requires_grad_(False) 546 | model.model_intrinsics.requires_grad_(False) 547 | 548 | # training 549 | # checkpoint_callback = pl.callbacks.ModelCheckpoint(dirpath=os.path.join("checkpoints", args.name), save_top_k=1, save_last=True, monitor="loss") 550 | lr_callback = pl.callbacks.LearningRateMonitor() 551 | logger = pl.loggers.TensorBoardLogger(save_dir=os.getcwd(), version=args.name, name="lightning_logs") 552 | validation_callback = ValidationCallback() 553 | trainer = pl.Trainer(accelerator="auto", strategy="auto", max_epochs=args.max_epochs, 554 | logger=logger, callbacks=[validation_callback, lr_callback], enable_checkpointing=False) 555 | trainer.fit(model, train_loader) 556 | -------------------------------------------------------------------------------- /tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "4J5E1CJukt0H" 7 | }, 8 | "source": [ 9 | "### Section 0: Setup\n", 10 | "If running this in Google Colab, make sure that you are connected to a GPU instance and run the install script below. It should (hopefully) take about 2-5mins to execute." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "id": "AI4QE-aXkt0J" 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import subprocess\n", 22 | "import os\n", 23 | "# Check if GPU exists\n", 24 | "\n", 25 | "try:\n", 26 | " subprocess.check_output('nvidia-smi')\n", 27 | " print(\"GPU is enabled.\")\n", 28 | " # Check if running in Google Colab\n", 29 | " if 'COLAB_GPU' in os.environ:\n", 30 | " # Instal TinyCuda\n", 31 | " %cd /content/\n", 32 | " # cursed one-line wheel download/install\n", 33 | " !curl -L \"https://github.com/Ilya-Muromets/TinyCudaColab/releases/latest/download/tinycudann-colab-gpu.zip\" -o tinycudann-colab-gpu.zip && unzip -o tinycudann-colab-gpu.zip && WHEEL=$(find . -maxdepth 1 -name \"*.whl\" | head -n 1) && echo \"Found wheel: $WHEEL\" && pip install \"$WHEEL\" --force-reinstall\n", 34 | " !pip install commentjson\n", 35 | " !pip install pytorch_lightning\n", 36 | " !pip install matplotlib==3.7.0\n", 37 | " # broken cuda version\n", 38 | " !pip uninstall -y torchaudio\n", 39 | " else:\n", 40 | " print(\"COLAB_GPU not detected\")\n", 41 | "except FileNotFoundError as e:\n", 42 | " print(\"GPU is not enabled in this notebook.\")\n", 43 | " print(\"Please select 'Runtime -> Change runtime type' and set the hardware accelerator to GPU.\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "\n", 51 | "## WARNING:\n", 52 | "### Colab will ask to restart the session after running the above cell (because it pre-loads matplotlib for some reason). You should first restart the session, then continue running the cells below. Do not re-run the cell above after restarting the session." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "id": "AfKFeJKQPPmc" 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "import os\n", 64 | "# Clone repo (Colab only)\n", 65 | "if 'COLAB_GPU' in os.environ:\n", 66 | " !git clone https://github.com/princeton-computational-imaging/SoaP/\n", 67 | " %cd /content/SoaP/" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "id": "866qbOL9kt0K" 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "!wget https://soap.cs.princeton.edu/shade_map.npy -P data/\n", 79 | "!wget https://soap.cs.princeton.edu/demo.zip -P data/\n", 80 | "!unzip data/demo.zip -d data/" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "id": "GB_OzWSbcd83" 87 | }, 88 | "source": [ 89 | "### Section 1: (Optional) What is a `frame_bundle.npz`?\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "id": "ut2yAwYZc1Ei" 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "%matplotlib inline\n", 101 | "\n", 102 | "import torch\n", 103 | "import numpy as np\n", 104 | "import matplotlib.pyplot as plt\n", 105 | "from utils import utils" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "id": "r_NBzSevc4pP" 112 | }, 113 | "source": [ 114 | "Load data from disk:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "id": "bRhSQ01dc6m8" 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "bundle_path = \"data/demo/dragon/compressed_frame_bundle.npz\"\n", 126 | "# convert to dictionary - important, by default npz load as a namespace which can have odd behaviour\n", 127 | "bundle = dict(np.load(bundle_path, allow_pickle=True))\n", 128 | 
"# remove extra dimensions\n", 129 | "utils.de_item(bundle)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "id": "v0FAm169dL_d" 136 | }, 137 | "source": [ 138 | "Our bundle contains four sets of data: \n", 139 | "1. `motion` : device motion data including rotation, gravity, and acceleration \n", 140 | "2. `raw_[x]` : Bayer RAW frames enumerated from `0` to `num_raw_frames - 1`, with associated metadata \n", 141 | "3. `rgb_[x]` : Processed Apple RGB frames enumerated from `0` to `num_rgb_frames - 1`, with associated metadata \n", 142 | "4. `depth_[x]` : Apple depth maps enumerated from `0` to `num_depth_frames - 1`, with associated metadata \n", 143 | "\n", 144 | "Lets take a closer look at this data:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "id": "RYldYeSxdLCZ" 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "bundle[\"motion\"].keys()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "id": "PbsVcU-LdQBt" 162 | }, 163 | "source": [ 164 | "The motion data `motion` contains: \n", 165 | "1. `frame_count` : what frame was being recorded when the associated motion data was recorded. There can be multiple motion values for the same frame as the frequency of the accelerometer/gyroscope (100Hz) is higher than the framerate we're recording at (21fps).\n", 166 | "2. `timestamp` : absolute device time at which measurements were recorded\n", 167 | "3. `quaternion` : device relative rotation expressed in quaternion format\n", 168 | "4. `rotation_rate` : velocity of device rotation expressed in roll-pitch-yaw\n", 169 | "5. `roll_pitch_yaw` : device relative rotation expressed in roll-pitch-yaw\n", 170 | "6. `acceleration` : device relative acceleration (with gravity removed) expressed in x-y-z\n", 171 | "7. `gravity` : acceleration due to gravity expressed in x-y-z\n", 172 | "8. 
`num_motion_frames` : number of recorded measurements\n", 173 | "\n", 174 | "As an example let's plot the device roll over time:" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "id": "8xxPIJ6IdYZS" 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "roll_pitch_yaw = bundle[\"motion\"][\"roll_pitch_yaw\"] # [N,3]\n", 186 | "timestamp = bundle[\"motion\"][\"timestamp\"] # [N]\n", 187 | "roll = roll_pitch_yaw[:,0]\n", 188 | "pitch = roll_pitch_yaw[:,1]\n", 189 | "yaw = roll_pitch_yaw[:,2]\n", 190 | "\n", 191 | "plt.plot(timestamp, roll)\n", 192 | "plt.ylabel(\"Roll [Rad]\")\n", 193 | "plt.xlabel(\"Device Time [s]\")\n", 194 | "plt.show()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": { 200 | "id": "oF1jRJgudb3g" 201 | }, 202 | "source": [ 203 | "RAW image data:" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "id": "c9pcNfyEdaTP" 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "frame = 0 # change this to view other frames\n", 215 | "raw = bundle[f\"raw_{frame}\"]\n", 216 | "rgb = bundle[f\"rgb_{frame}\"]\n", 217 | "depth = bundle[f\"depth_{frame}\"]" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "id": "R9B2Q6kjdPiK" 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "print(raw.keys())\n", 229 | "print(\"height:\", raw['height'], \"width:\", raw['width'])" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": { 235 | "id": "7x0KZnv9dgDg" 236 | }, 237 | "source": [ 238 | "Each `raw` frame consists of:\n", 239 | "1. `frame_count` : frame number, ranges from 0 - `num_raw_frames`\n", 240 | "2. `timestamp` : absolute device time at which frame was recorded\n", 241 | "3. `height, width` : frame dimensions (**WARNING**: these may not match the expected orientation of the frame, i.e. if you are recording with the phone vertical or horizontal, the `width` does not change and always refers to the long side of the capture)\n", 242 | "4. `ISO`, `exposure_time`, `aperture` : camera ISO, exposure time (seconds), and f-stop used to capture the image\n", 243 | "5. `brightness` : the estimated 'brightness' of the scene, honestly not sure what this is (pls message me if you know)\n", 244 | "6. `shutter_speed` : inverse of `exposure_time`\n", 245 | "7. `black_level`, `white_level`: min and max real RAW values\n", 246 | "8. 
`raw`, 4032 x 3024 single channel, 14-bit mosaiced bayer CFA frame\n", 247 | "\n", 248 | "Lets look at the RAW image data:" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": { 255 | "id": "aaAoETy-dhrm" 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "raw_img = raw[\"raw\"]\n", 260 | "\n", 261 | "# use simple demosaicing the fill gap values (see paper supplemental)\n", 262 | "raw_demosaiced = utils.raw_to_rgb(torch.tensor(raw_img[None,None].astype(np.int32)))[0].permute(1,2,0)\n", 263 | "raw_demosaiced = raw_demosaiced/raw_demosaiced.max()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "id": "Z-yzxCOrdvK6" 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))\n", 275 | "axes[0].imshow(raw_img, cmap=\"gray\")\n", 276 | "axes[0].set_title(f\"Frame {frame} Mosaiced Raw\")\n", 277 | "im = axes[1].imshow(raw_demosaiced)\n", 278 | "axes[1].set_title(f\"Frame {frame} De-Mosaiced Raw\")\n", 279 | "\n", 280 | "fig.subplots_adjust(right=0.7)\n", 281 | "plt.show()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "id": "kDdX44-4dyM8" 288 | }, 289 | "source": [ 290 | "If we zoom into a small patch of the above mosaiced RAW we can see the Bayer CFA pattern:" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "id": "EE62TZPGdxxb" 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "plt.imshow(raw_img[:8,:8], cmap=\"gray\")\n", 302 | "plt.show()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "id": "l_s4Xiosd2Op" 309 | }, 310 | "source": [ 311 | "Applying the shade map to this data we see how it corrects for the vignetting on the edges of the scene:" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "id": "jMMBTfHFkt0L" 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "shade_map = np.load(\"data/shade_map.npy\")\n", 323 | "raw_img_deshade = raw[\"raw\"] * shade_map" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "id": "V4RkBotcd4jn" 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))\n", 335 | "axes[0].imshow(shade_map, cmap=\"gray\")\n", 336 | "axes[0].set_title(f\"Shade Map\")\n", 337 | "im = axes[1].imshow(raw_img_deshade, cmap=\"gray\")\n", 338 | "axes[1].set_title(f\"Frame {frame} Mosaiced Raw + Shade Map\")\n", 339 | "\n", 340 | "fig.subplots_adjust(right=0.7)\n", 341 | "plt.show()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": { 347 | "id": "i2yFj30BeBTG" 348 | }, 349 | "source": [ 350 | "Processed RGB and depth data:" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "id": "nVzXhZXheEhv" 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "print(rgb.keys())\n", 362 | "print(\"height:\", rgb['height'], \"width:\", rgb['width'])" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "id": "oHDdV9gXeGa0" 369 | }, 370 | "source": [ 371 | "Each `rgb` frame contains:\n", 372 | "1. `frame_count`, `timestamp`, `height`, `width` : see `raw` documentation\n", 373 | "2. 
`intrinsics`: 3x3 camera intrinsics, see: [documentation](https://developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881135-intrinsicmatrix)\n", 374 | "3. `rgb`, 1920 x 1440 3 channel, 8-bit processed RGB frame" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "id": "jdU-m4v4eIzo" 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "print(depth.keys())\n", 386 | "print(\"height:\", depth['height'], \"width:\", depth['width'])" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": { 392 | "id": "rMYQNoSueK__" 393 | }, 394 | "source": [ 395 | "Each `depth` frame contains:\n", 396 | "1. `frame_count`, `timestamp`, `height`, `width` : see `raw` documentation\n", 397 | "2. `intrinsic_height`, `intrinsic_width`, `intrinsics` : 3x3 camera intrinsics, with associated frame height and width\n", 398 | "3. `lens_distortion` : look-up table for radial distortion correction, see: [documentation](https://developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881129-lensdistortionlookuptable)\n", 399 | "4. `lens_undistortion` : inverse of `lens_distortion`\n", 400 | "5. `depth_accuracy` : [accuracy of depth measurements](https://developer.apple.com/documentation/avfoundation/avdepthdata/accuracy), depends on iPhone/iOS version. `1` -> metric depth, `0` -> relative depth\n", 401 | "6. `depth`, 320 x 240 inverse depth map from monocular cues + LiDAR measurements\n", 402 | "\n", 403 | "Here's a preview of what the RGB and iPhone depth data look like:" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "id": "72Ar-jykeP_a" 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 8))\n", 415 | "axes[0].imshow(rgb['rgb'])\n", 416 | "axes[0].set_title(\"Frame {0} Image\".format(frame))\n", 417 | "im = axes[1].imshow(depth['depth'], cmap='RdYlBu')\n", 418 | "axes[1].set_title(\"Frame {0} iPhone Depth\".format(frame))\n", 419 | "\n", 420 | "fig.subplots_adjust(right=0.82)\n", 421 | "cbar_ax = fig.add_axes([0.85, 0.15, 0.02, 0.7])\n", 422 | "fig.colorbar(im, cax=cbar_ax, label='Depth [m]')\n", 423 | "plt.show()\n", 424 | "\n", 425 | "print()\n", 426 | "print(\"Camera Info at Frame {0}: \\n\".format(frame))\n", 427 | "print(\"Timestamp:\", rgb['timestamp'], \"\\n\")\n", 428 | "print(\"Camera Intrinsics: \\n\", rgb['intrinsics'])" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "id": "GWrA79XSeTJ4" 435 | }, 436 | "source": [ 437 | "### Section 2: Training on a `frame_bundle.npz`\n", 438 | "This section will cover how to fit our model to an input RAW frame_bundle.npz, monitor the model's training, and plot its outputs.\n" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "id": "Xg4HzP48et2G" 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "import torch\n", 450 | "import numpy as np\n", 451 | "import matplotlib.pyplot as plt\n", 452 | "\n", 453 | "from train import *\n", 454 | "from utils import utils" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": { 460 | "id": "TqJAuR2Sf1tt" 461 | }, 462 | "source": [ 463 | "Lets begin by taking a look at the images in our `compressed_frame_bundle.npz` (this is a sub-sampled `frame_bundle.npz` with 9 images instead of 42 to speed up training/download time)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | 
"execution_count": null, 469 | "metadata": { 470 | "id": "Hma153d3fbOI" 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "bundle = dict(np.load(\"data/demo/dragon/compressed_frame_bundle.npz\", allow_pickle=True))\n", 475 | "utils.de_item(bundle)\n", 476 | "\n", 477 | "# plot the first 5 images, downsample 2x for speed\n", 478 | "fig, ax = plt.subplots(1,5, figsize=(19.5,5))\n", 479 | "for i in range(5):\n", 480 | " ax[i].imshow(bundle[f\"rgb_{i}\"][\"rgb\"][::2,::2])\n", 481 | " ax[i].set_title(f\"Image {i}\")\n", 482 | "\n", 483 | "# remove ticks\n", 484 | "for a in ax:\n", 485 | " a.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)\n", 486 | "# adjust spacing\n", 487 | "plt.subplots_adjust(wspace=0.0)\n", 488 | "plt.show()" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "id": "0N382cN3eyP9" 495 | }, 496 | "source": [ 497 | "While they barely appear to change, there's actually still more than enough parallax here to recover meaningful depth. \n", 498 | "\n", 499 | "We begin by launching tensorboard so we can see our training progress:" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": { 506 | "id": "_YhuEtN7eyWa" 507 | }, 508 | "outputs": [], 509 | "source": [ 510 | "%load_ext tensorboard\n", 511 | "%tensorboard --logdir lightning_logs" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": { 517 | "id": "sZyyIi2Bkt0L" 518 | }, 519 | "source": [ 520 | "Next we run `train.py`. On an RTX 4090 this should train in a couple minutes, on Colab this will be quite a bit slower. \n", 521 | "\n", 522 | "You can refresh the tensorboard window above to watch the training progress." 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": { 529 | "id": "jz3HSxV5g7Gy" 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "# only run to 30 epochs to save time, remove the flag to run for default 100 epochs\n", 534 | "!python3 train.py --name dragon-test --bundle_path data/demo/dragon/compressed_frame_bundle.npz --max_epochs 30" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": { 540 | "id": "0EHa5z5djQlI" 541 | }, 542 | "source": [ 543 | "To view our reconstruction we load the model from disk:" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": { 550 | "id": "-mWQko-gjT8w" 551 | }, 552 | "outputs": [], 553 | "source": [ 554 | "model = BundleMLP.load_from_checkpoint(\"checkpoints/dragon-test/last.ckpt\", device=\"cuda\")" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": { 561 | "id": "qcG0vZwfjW4A" 562 | }, 563 | "outputs": [], 564 | "source": [ 565 | "# move model components to GPU\n", 566 | "model = model.eval()\n", 567 | "model = model.to('cuda')\n", 568 | "model.rgb_volume = model.rgb_volume.to('cuda')\n", 569 | "model.processed_rgb_volume = model.processed_rgb_volume.to('cuda')\n", 570 | "model.model_rotation = model.model_rotation.to('cuda')\n", 571 | "model.model_translation = model.model_translation.to('cuda')\n", 572 | "model.reference_intrinsics = model.reference_intrinsics.to('cuda')\n", 573 | "model.model_rotation.reference_rotation = model.model_rotation.reference_rotation.to('cuda')\n", 574 | "\n", 575 | "# use all encoding levels for inference\n", 576 | "model.mask = torch.ones_like(model.mask)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": { 582 | "id": 
"pPbUQXv0jaTa" 583 | }, 584 | "source": [ 585 | "And use `model.generate_outputs` to generate the outputs:" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": { 592 | "id": "F9uc5Rnyjgap" 593 | }, 594 | "outputs": [], 595 | "source": [ 596 | "rgb, rgb_raw, rgb_processed, depth, depth_img = model.generate_outputs(frame=0, height=1920, width=1440, u_lims=[0.025,0.975], v_lims=[0.025,0.975])" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": { 602 | "id": "nvxRR70Sjksj" 603 | }, 604 | "source": [ 605 | "Outputs:\n", 606 | "1. `rgb` : color values I(u,v) output by implicit image model\n", 607 | "2. `rgb_raw` : corresponding sampled values from bayer RAW volume\n", 608 | "3. `rgb_processed` : corresponding sampled values from processed RGB volume\n", 609 | "4. `depth` : depth values D(u,v) from shakes-on-a-plane implicit depth model\n", 610 | "5. `depth_img` : same as `depth` but with colormap applied for tensorboard visualization" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": { 617 | "id": "C-uH6IaejnEV" 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 8))\n", 622 | "axes[0].imshow((rgb.permute(1,2,0).cpu()).clip(0,1)) # increase brightness\n", 623 | "axes[0].set_title(\"Reconstructed Image I(u,v)\")\n", 624 | "axes[1].imshow(rgb_processed.permute(1,2,0).cpu())\n", 625 | "axes[1].set_title(\"Processed RGB\")\n", 626 | "axes[2].imshow(depth.cpu(), cmap=\"RdYlBu\")\n", 627 | "axes[2].set_title(\"Reconstructed Depth D(u,v)\")\n", 628 | "plt.show()" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": { 634 | "id": "su3XUolojxfm" 635 | }, 636 | "source": [ 637 | "### Section 3: Training on PNGs\n", 638 | "This section is almost identical to the previous one, except we will learn how to convert a stack of `PNGs` into a `frame_bundle.npz` before fitting our model to it." 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "id": "TJ0UL1GYkIaj" 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "import numpy as np\n", 650 | "import matplotlib.pyplot as plt\n", 651 | "import utils.utils as utils\n", 652 | "from glob import glob\n", 653 | "from train import *" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": { 659 | "id": "VWi3iakjkhvE" 660 | }, 661 | "source": [ 662 | "You can replace the code below with any filetype (e.g., load an MP4 with OpenCV), as long as `imgs` is a `NxHxWxC` array, where N is the number of frames." 
663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": null, 668 | "metadata": { 669 | "id": "8NosYD8BkU_8" 670 | }, 671 | "outputs": [], 672 | "source": [ 673 | "imgs = sorted(glob(\"data/demo/dragon-rgb/*.png\")) # change file extension to match your filetypes\n", 674 | "imgs = np.array([plt.imread(img)[:,:,:3] for img in imgs]) # remove alpha channel and load\n", 675 | "\n", 676 | "print(\"Number of images: \", len(imgs))\n", 677 | "# plot first image, last image\n", 678 | "fig, ax = plt.subplots(1,2, figsize=(10,5))\n", 679 | "ax[0].imshow(imgs[0])\n", 680 | "ax[0].set_title(\"Image 0\")\n", 681 | "ax[1].imshow(imgs[-1])\n", 682 | "ax[1].set_title(f\"Image {len(imgs)-1}\")\n", 683 | "plt.show()" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": { 689 | "id": "XLyPF5h3k8Qe" 690 | }, 691 | "source": [ 692 | "For our projective camera model to work we'll need to supply it with [camera intrinsics](https://en.wikipedia.org/wiki/Camera_matrix). Here we'll assume we don't have and calibrated intrinsics and will have to create our own.\n", 693 | "\n", 694 | "We'll set the camera centers `cx` and `cy` to be the center of the image:" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": { 701 | "id": "sy72i80WkX5z" 702 | }, 703 | "outputs": [], 704 | "source": [ 705 | "cy = imgs.shape[1] // 2 # set centers to the middle of the image\n", 706 | "cx = imgs.shape[2] // 2\n", 707 | "print(\"Center y: \", cy, \"\\nCenter x: \", cx)" 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": { 713 | "id": "y4KSGYJzlZl1" 714 | }, 715 | "source": [ 716 | "If we don't know the focal length of the camera, we can use a best guess of its FOV (around 70 degrees for a standard phone camera) to calculate it:" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": { 723 | "id": "wlZVRRrplY_i" 724 | }, 725 | "outputs": [], 726 | "source": [ 727 | "focal = min(cx, cy)/np.tan(70 * (np.pi/180/2)) # 70 degree field of view\n", 728 | "print(\"Focal length (pixels): \", focal)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": { 735 | "id": "3-P3IkZ2lbtF" 736 | }, 737 | "outputs": [], 738 | "source": [ 739 | "intrinsics = np.array([[focal, 0, 0],\n", 740 | " [0, focal, 0],\n", 741 | " [cx, cy, 1]])" 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": { 747 | "id": "awjacroSlgeq" 748 | }, 749 | "source": [ 750 | "These and the images are all we need to make our custom frame bundle, which we save to the same folder as the input data:" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": { 757 | "id": "97_QQvqhle9E" 758 | }, 759 | "outputs": [], 760 | "source": [ 761 | "rgb_bundle = {}\n", 762 | "for i in range(len(imgs)):\n", 763 | " rgb = {\"rgb\": imgs[i], \"intrinsics\": intrinsics, \"height\": imgs.shape[2], \"width\": imgs.shape[1]}\n", 764 | " rgb_bundle[f'rgb_{i}'] = rgb\n", 765 | "rgb_bundle['num_rgb_frames'] = len(imgs)\n", 766 | "rgb_bundle['num_raw_frames'] = 0\n", 767 | "rgb_bundle['num_depth_frames'] = 0\n", 768 | "rgb_bundle['motion'] = None\n", 769 | "np.savez('data/demo/dragon-rgb/frame_bundle.npz', **rgb_bundle)" 770 | ] 771 | }, 772 | { 773 | "cell_type": "markdown", 774 | "metadata": { 775 | "id": "HfxU3mMQlpWv" 776 | }, 777 | "source": [ 778 | "Now we can train our model as before:" 779 | ] 780 | }, 781 | { 782 | "cell_type": 
"code", 783 | "execution_count": null, 784 | "metadata": { 785 | "id": "JjRx9O-Jlm1K" 786 | }, 787 | "outputs": [], 788 | "source": [ 789 | "%load_ext tensorboard\n", 790 | "%tensorboard --logdir lightning_logs" 791 | ] 792 | }, 793 | { 794 | "cell_type": "markdown", 795 | "metadata": { 796 | "id": "Qv-Ss6PBl6hS" 797 | }, 798 | "source": [ 799 | "However we now have to add flags `--no_device_rotations`, `--no_phone_depth`, and `--no_raw` to let the training code know that we're only passing in RGB data and nothing else." 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": null, 805 | "metadata": { 806 | "id": "O-RQ4woClzLv" 807 | }, 808 | "outputs": [], 809 | "source": [ 810 | "# only run to 30 epochs to save time, remove the flag to run for default 100 epochs\n", 811 | "!python3 train.py --name dragon-rgb-test --bundle_path data/demo/dragon-rgb/frame_bundle.npz --max_epochs 30 --no_device_rotations --no_phone_depth --no_raw" 812 | ] 813 | }, 814 | { 815 | "cell_type": "markdown", 816 | "metadata": { 817 | "id": "Qv85okr3lzL5" 818 | }, 819 | "source": [ 820 | "To view our reconstruction we load the model from disk:" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": null, 826 | "metadata": { 827 | "id": "ZQCjH4B_lzL5" 828 | }, 829 | "outputs": [], 830 | "source": [ 831 | "model = BundleMLP.load_from_checkpoint(\"checkpoints/dragon-rgb-test/last.ckpt\", device=\"cuda\")" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "metadata": { 838 | "id": "vUr4jWsTlzL5" 839 | }, 840 | "outputs": [], 841 | "source": [ 842 | "# move model components to GPU\n", 843 | "model = model.eval()\n", 844 | "model = model.to('cuda')\n", 845 | "model.rgb_volume = model.rgb_volume.to('cuda')\n", 846 | "model.processed_rgb_volume = model.processed_rgb_volume.to('cuda')\n", 847 | "model.model_rotation = model.model_rotation.to('cuda')\n", 848 | "model.model_translation = model.model_translation.to('cuda')\n", 849 | "model.reference_intrinsics = model.reference_intrinsics.to('cuda')\n", 850 | "# model.model_rotation.reference_rotation = model.model_rotation.reference_rotation.to('cuda') # doesnt exist\n", 851 | "\n", 852 | "# use all encoding levels for inference\n", 853 | "model.mask = torch.ones_like(model.mask)" 854 | ] 855 | }, 856 | { 857 | "cell_type": "markdown", 858 | "metadata": { 859 | "id": "BEg6nQvGlzL5" 860 | }, 861 | "source": [ 862 | "And use `model.generate_outputs` to generate the outputs:" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": null, 868 | "metadata": { 869 | "id": "E_EQO9S1lzL5" 870 | }, 871 | "outputs": [], 872 | "source": [ 873 | "rgb, rgb_raw, rgb_processed, depth, depth_img = model.generate_outputs(frame=0, height=1920, width=1440, u_lims=[0.025,0.975], v_lims=[0.025,0.975])" 874 | ] 875 | }, 876 | { 877 | "cell_type": "markdown", 878 | "metadata": { 879 | "id": "H2e610IylzL5" 880 | }, 881 | "source": [ 882 | "Outputs:\n", 883 | "1. `rgb` : color values I(u,v) output by implicit image model\n", 884 | "2. `rgb_raw` : corresponding sampled values from bayer RAW volume\n", 885 | "3. `rgb_processed` : corresponding sampled values from processed RGB volume\n", 886 | "4. `depth` : depth values D(u,v) from shakes-on-a-plane implicit depth model\n", 887 | "5. 
`depth_img` : same as `depth` but with colormap applied for tensorboard visualization" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": null, 893 | "metadata": { 894 | "id": "OdliAs7blzL5" 895 | }, 896 | "outputs": [], 897 | "source": [ 898 | "fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 8))\n", 899 | "axes[0].imshow((rgb.permute(1,2,0).cpu()).clip(0,1)) # increase brightness\n", 900 | "axes[0].set_title(\"Reconstructed Image I(u,v)\")\n", 901 | "axes[1].imshow(rgb_processed.permute(1,2,0).cpu())\n", 902 | "axes[1].set_title(\"Processed RGB\")\n", 903 | "axes[2].imshow(depth.cpu(), cmap=\"RdYlBu\")\n", 904 | "axes[2].set_title(\"Reconstructed Depth D(u,v)\")\n", 905 | "plt.show()" 906 | ] 907 | } 908 | ], 909 | "metadata": { 910 | "accelerator": "GPU", 911 | "colab": { 912 | "collapsed_sections": [ 913 | "4J5E1CJukt0H", 914 | "GB_OzWSbcd83", 915 | "GWrA79XSeTJ4", 916 | "su3XUolojxfm" 917 | ], 918 | "gpuType": "T4", 919 | "provenance": [] 920 | }, 921 | "kernelspec": { 922 | "display_name": "Python 3", 923 | "name": "python3" 924 | }, 925 | "language_info": { 926 | "codemirror_mode": { 927 | "name": "ipython", 928 | "version": 3 929 | }, 930 | "file_extension": ".py", 931 | "mimetype": "text/x-python", 932 | "name": "python", 933 | "nbconvert_exporter": "python", 934 | "pygments_lexer": "ipython3", 935 | "version": "3.10.9" 936 | } 937 | }, 938 | "nbformat": 4, 939 | "nbformat_minor": 0 940 | } 941 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import torch 4 | 5 | def de_casteljau(betas, t): 6 | """ castle interpolation, for knights 7 | see: https://en.wikipedia.org/wiki/De_Casteljau%27s_algorithm 8 | assumes t in [0,1] 9 | """ 10 | t = t[None,None,:,0] # 1,1,T 11 | 12 | out = betas.clone() 13 | N = betas.shape[0] # number of points 14 | for i in range(1, N): 15 | out = out[:-1,:] * (1-t) + out[1:,:] * t 16 | return out.squeeze(0).permute(1,0) 17 | 18 | def uvz_to_xyz(uvz, intrinsics, img_width, img_height): 19 | """ Get xyz coordinates in meters from uv coordinates [0-1] 20 | iPhone poses are right-handed system, +x is right towards power button, +y is up towards front camera, +z is towards user's face 21 | images are opencv convention right-handed, x to the right, y down, and z into the world (away from face) 22 | """ 23 | u = uvz[:,0:1] * img_width 24 | v = uvz[:,1:2] * img_height 25 | z = uvz[:,2:3] 26 | 27 | # intrinsics are for landscape sensor, top row: y, middle row: x, bottom row: z 28 | fy, cy, fx, cx = intrinsics[:,0,0,None], intrinsics[:,2,0,None], intrinsics[:,1,1,None], intrinsics[:,2,1,None] 29 | 30 | x = (u - cx) * (z/fx) 31 | y = (v - cy) * (z/fy) 32 | 33 | # rotate around the camera's x-axis by 180 degrees 34 | # now point cloud is in y up and z towards face convention 35 | y = -y 36 | z = -z 37 | 38 | # match pose convention (y,x,z) 39 | return torch.cat((x,y,z), dim=1) 40 | 41 | def xyz_to_uvz(uvz, intrinsics, img_width, img_height): 42 | """ Get uv coordinates [0-1] from coordinates rays in meters 43 | """ 44 | fy, cy, fx, cx = intrinsics[:,0,0,None], intrinsics[:,2,0,None], intrinsics[:,1,1,None], intrinsics[:,2,1,None] 45 | x, y, z = uvz[:,0:1], uvz[:,1:2], uvz[:,2:3] 46 | 47 | # undo rotation from convert_px_rays_to_m 48 | y = -y 49 | z = -z 50 | 51 | u = (x * (fx/z) + cx) / img_width 52 | v = (y * (fy/z) + cy) / img_height 53 | 54 | 
return torch.cat((u,v,z), dim=1) 55 | 56 | def convert_quaternions_to_rot(quaternions): 57 | """ Convert quaternions (xyzw) to 3x3 rotation matrices. 58 | Adapted from: https://automaticaddison.com/how-to-convert-a-quaternion-to-a-rotation-matrix 59 | """ 60 | 61 | qx, qy, qz, qw = quaternions[:,0], quaternions[:,1], quaternions[:,2], quaternions[:,3] 62 | 63 | R00 = 2 * ((qw * qw) + (qx * qx)) - 1 64 | R01 = 2 * ((qx * qy) - (qw * qz)) 65 | R02 = 2 * ((qx * qz) + (qw * qy)) 66 | 67 | R10 = 2 * ((qx * qy) + (qw * qz)) 68 | R11 = 2 * ((qw * qw) + (qy * qy)) - 1 69 | R12 = 2 * ((qy * qz) - (qw * qx)) 70 | 71 | R20 = 2 * ((qx * qz) - (qw * qy)) 72 | R21 = 2 * ((qy * qz) + (qw * qx)) 73 | R22 = 2 * ((qw * qw) + (qz * qz)) - 1 74 | 75 | R = torch.stack([R00, R01, R02, R10, R11, R12, R20, R21, R22], dim=-1) 76 | R = R.reshape(-1,3,3) 77 | 78 | return R 79 | 80 | def multi_interp(x, xp, fp): 81 | """ Simple extension of np.interp for independent 82 | linear interpolation of all axes of fp 83 | """ 84 | if torch.is_tensor(fp): 85 | out = [torch.tensor(np.interp(x, xp, fp[:,i]), dtype=fp.dtype) for i in range(fp.shape[-1])] 86 | return torch.stack(out, dim=-1) 87 | else: 88 | out = [np.interp(x, xp, fp[:,i]) for i in range(fp.shape[-1])] 89 | return np.stack(out, axis=-1) 90 | 91 | def raw_to_rgb(raw_frames): 92 | """ Convert RAW mosaic into three-channel RGB volume 93 | by only in-filling empty pixels. 94 | Returns volume of shape: (T, C, H, W) 95 | """ 96 | 97 | B = raw_frames[:,:,0::2,1::2].float() 98 | G1 = raw_frames[:,:,0::2,0::2].float() 99 | G2 = raw_frames[:,:,1::2,1::2].float() 100 | R = raw_frames[:,:,1::2,0::2].float() 101 | 102 | # Blue 103 | B_upsampled = torch.zeros_like(B).repeat(1,1,2,2) 104 | B_left = torch.roll(B, 1, dims=3) 105 | B_down = torch.roll(B, -1, dims=2) 106 | B_diag = torch.roll(B, [-1,1], dims=[2,3]) 107 | 108 | B_upsampled[:,:,0::2,1::2] = B 109 | B_upsampled[:,:,0::2,0::2] = (B + B_left)/2 110 | B_upsampled[:,:,1::2,1::2] = (B + B_down)/2 111 | B_upsampled[:,:,1::2,0::2] = (B + B_down + B_left + B_diag)/4 112 | 113 | # Green 114 | G_upsampled = torch.zeros_like(G1).repeat(1,1,2,2) 115 | G1_right = torch.roll(G1, -1, dims=3) 116 | G1_down = torch.roll(G1, -1, dims=2) 117 | 118 | G2_left = torch.roll(G2, 1, dims=3) 119 | G2_up = torch.roll(G2, 1, dims=2) 120 | 121 | G_upsampled[:,:,0::2,0::2] = G1 122 | G_upsampled[:,:,0::2,1::2] = (G1 + G1_right + G2 + G2_up)/4 123 | G_upsampled[:,:,1::2,0::2] = (G1 + G1_down + G2 + G2_left)/4 124 | G_upsampled[:,:,1::2,1::2] = G2 125 | G_upsampled = G_upsampled 126 | 127 | # Red 128 | R_upsampled = torch.zeros_like(R).repeat(1,1,2,2) 129 | R_right = torch.roll(R, -1, dims=3) 130 | R_up = torch.roll(R, 1, dims=2) 131 | R_diag = torch.roll(R, [1,-1], dims=[2,3]) 132 | 133 | R_upsampled[:,:,1::2,0::2] = R 134 | R_upsampled[:,:,1::2,1::2] = (R + R_right)/2 135 | R_upsampled[:,:,0::2,0::2] = (R + R_up)/2 136 | R_upsampled[:,:,0::2,1::2] = (R + R_up + R_right + R_diag)/4 137 | 138 | rgb_volume = torch.concat([R_upsampled, G_upsampled, B_upsampled], dim=0).permute(1,0,2,3) # T, C, H, W 139 | 140 | return rgb_volume 141 | 142 | def de_item(bundle): 143 | """ Call .item() on all dictionary items 144 | removes unnecessary extra dimension 145 | """ 146 | 147 | bundle['motion'] = bundle['motion'].item() 148 | 149 | if 'num_rgb_frames' not in bundle: 150 | return # motion bundle 151 | 152 | for i in range(bundle['num_rgb_frames']): 153 | bundle[f'rgb_{i}'] = bundle[f'rgb_{i}'].item() 154 | 155 | for i in range(bundle['num_raw_frames']): 156 | 
bundle[f'raw_{i}'] = bundle[f'raw_{i}'].item() 157 | 158 | for i in range(bundle['num_depth_frames']): 159 | bundle[f'depth_{i}'] = bundle[f'depth_{i}'].item() 160 | 161 | def debatch(x): 162 | """ Collapse batch and point dimensions together 163 | """ 164 | 165 | if len(x.shape) <= 1: 166 | raise Exception("This tensor is too small to debatch.") 167 | elif len(x.shape) == 2: 168 | return x.reshape(x.shape[0] * x.shape[1]) 169 | else: 170 | return x.reshape(x.shape[0] * x.shape[1], *x.shape[2:]) 171 | 172 | def colorize_tensor(value, vmin=None, vmax=None, cmap=None, colorbar=False, height=9.6, width=7.2): 173 | """ Convert tensor to 3 channel RGB array according to colors from cmap 174 | similar usage to plt.imshow 175 | """ 176 | assert len(value.shape) == 2 # H x W 177 | 178 | fig, ax = plt.subplots(1,1) 179 | fig.set_size_inches(width,height) 180 | a = ax.imshow(value.detach().cpu(), vmin=vmin, vmax=vmax, cmap=cmap) 181 | ax.set_axis_off() 182 | if colorbar: 183 | cbar = plt.colorbar(a, fraction=0.05) 184 | cbar.ax.tick_params(labelsize=30) 185 | plt.tight_layout() 186 | plt.close() 187 | 188 | # Draw figure on canvas 189 | fig.canvas.draw() 190 | 191 | # Convert the figure to numpy array, read the pixel values and reshape the array 192 | img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) 193 | img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 194 | 195 | # Normalize into 0-1 range for TensorBoard(X). Swap axes for newer versions where API expects colors in first dim 196 | img = img / 255.0 197 | 198 | return torch.tensor(img).permute(2,0,1).float() --------------------------------------------------------------------------------
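Note on utils.de_casteljau: the translation, rotation, and intrinsics models in train.py all turn a small set of Bezier control points (defaults: 21 for motion, 4 for intrinsics) into smooth trajectories over t in [0,1] by evaluating this routine. The snippet below is a minimal usage sketch, not part of the repository; the 5 random control points and 7 query times are arbitrary, chosen only to mirror the (control_points, 3, 1) layout that TranslationModel uses for its translation_betas.

    import torch
    from utils import utils

    # Illustrative control points: 5 knots of a 3-D Bezier curve, shape (control_points, 3, 1)
    betas = torch.randn(5, 3, 1)
    t = torch.linspace(0, 1, 7)[:, None]  # query times in [0, 1], shape (T, 1)

    curve = utils.de_casteljau(betas, t)  # -> (T, 3) points along the curve

    # De Casteljau evaluation interpolates its endpoints:
    # the curve at t=0 is the first control point, at t=1 the last.
    assert torch.allclose(curve[0], betas[0, :, 0])
    assert torch.allclose(curve[-1], betas[-1, :, 0])

Because the curve is smooth in t, the motion and intrinsics models get temporally smooth offsets from very few parameters, and the number of control points bounds how much high-frequency motion they can express.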