├── test ├── __init__.py └── test_sampling.py ├── autolabel ├── __init__.py ├── features │ ├── __init__.py │ ├── dino.py │ ├── fcn50.py │ └── lseg.py ├── constants.py ├── visualization.py ├── utils │ ├── feature_utils.py │ └── __init__.py ├── model_utils.py ├── models.py └── trainer.py ├── setup.py ├── assets └── teaser.jpg ├── .gitmodules ├── .gitignore ├── setup.cfg ├── .yapf.vim ├── LICENSE ├── configs ├── scannet_mapping.json └── label_map.csv ├── scripts ├── compute_sam_mask.py ├── data │ ├── convert_scanner.py │ ├── convert_arkitscenes.py │ ├── convert_replica.py │ ├── convert_hypersim.py │ └── convert_scannet.py ├── language │ ├── pointcloud.py │ └── evaluate.py ├── evaluate.py ├── export.py ├── train.py ├── compute_feature_maps.py ├── render.py ├── convert_to_instant_ngp.py ├── demo_ui.py └── mapping.py ├── docs ├── data.md └── vision-language.md └── README.md /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autolabel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /assets/teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-asl/pvlff/HEAD/assets/teaser.jpg -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "torch_ngp"] 2 | path = torch_ngp 3 | url = git@github.com:ethz-asl/torch-ngp.git 4 | -------------------------------------------------------------------------------- /autolabel/features/__init__.py: -------------------------------------------------------------------------------- 1 | from autolabel.features.fcn50 import FCN50 2 | from autolabel.features.dino import Dino 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/**/* 2 | __pycache__ 3 | .DS_Store 4 | *.egg-info 5 | Hierarchical-Localization/ 6 | ops/maplab/maplab 7 | out*/ -------------------------------------------------------------------------------- /autolabel/constants.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import cm 3 | 4 | colors = (cm.tab10(np.linspace(0, 1, 10)) * 255.0)[:, :3].astype(np.uint8) 5 | COLORS = np.concatenate([colors, colors, colors, colors], axis=0) 6 | -------------------------------------------------------------------------------- /autolabel/visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import cm 3 | 4 | 5 | def visualize_depth(nparray, maxdepth=10.0): 6 | """ 7 | Takes metric scale np.array and returns a colormapped np.array with type np.uint8. 
8 | """ 9 | normalized_depth = 1.0 - np.clip(nparray, 0.0, maxdepth) / maxdepth 10 | return (cm.inferno(normalized_depth) * 255).astype(np.uint8) 11 | -------------------------------------------------------------------------------- /autolabel/utils/feature_utils.py: -------------------------------------------------------------------------------- 1 | def get_feature_extractor(features, checkpoint=None): 2 | if features == 'fcn50': 3 | from autolabel.features import FCN50 4 | return FCN50() 5 | elif features == 'dino': 6 | from autolabel.features import Dino 7 | return Dino() 8 | elif features == 'lseg': 9 | from autolabel.features import lseg 10 | return lseg.LSegFE(checkpoint) 11 | else: 12 | raise NotImplementedError() 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = autolabel 3 | version = 0.1 4 | description = A project for inferring the structure and semantics of scenes. 5 | license = MIT License 6 | 7 | [options] 8 | packages = autolabel 9 | install_requires = 10 | rich 11 | numpy 12 | pillow 13 | tqdm 14 | tensorboardX 15 | opencv-python 16 | ; open3d 17 | torch 18 | torch_ngp 19 | trimesh 20 | matplotlib 21 | scipy 22 | ; PyQt6 23 | numba 24 | scikit-video 25 | scikit-image 26 | 27 | [yapf] 28 | based_on_style = google 29 | indent_width = 4 30 | column_limit = 80 31 | -------------------------------------------------------------------------------- /.yapf.vim: -------------------------------------------------------------------------------- 1 | function! yapf#YAPF() range 2 | " Determine range to format. 3 | let l:cmd = 'yapf' 4 | 5 | " Call YAPF with the current buffer 6 | let l:formatted_text = systemlist(l:cmd, join(getline(1, '$'), "\n") . "\n") 7 | 8 | if v:shell_error 9 | echohl ErrorMsg 10 | echomsg printf('"%s" returned error: %s', l:cmd, l:formatted_text[-1]) 11 | echohl None 12 | return 13 | endif 14 | 15 | " Update the buffer. 16 | execute '1,' . string(line('$')) . 'delete' 17 | call setline(1, l:formatted_text) 18 | 19 | " Reset cursor to first line of the formatted range. 
20 | call cursor(a:firstline, 1) 21 | endfunction 22 | -------------------------------------------------------------------------------- /autolabel/features/dino.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | from torchvision.io.image import read_image 4 | from torchvision.transforms.functional import to_pil_image 5 | from torchvision.models import feature_extraction 6 | from torch.nn import functional as F 7 | 8 | 9 | class Dino: 10 | 11 | def __init__(self): 12 | self.normalize = normalize = transforms.Normalize( 13 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]).cuda() 14 | self.model = torch.hub.load('facebookresearch/dino:main', 15 | 'dino_vits8').half().cuda() 16 | self.model.eval() 17 | 18 | def shape(self, *args): 19 | return (90, 120) 20 | 21 | def __call__(self, x): 22 | B, C, H, W = x.shape 23 | x = self.normalize(x) 24 | x = self.model.get_intermediate_layers(x.half()) 25 | width_out = W // 8 26 | height_out = H // 8 27 | return x[0][:, 1:, :].reshape(B, height_out, width_out, 384).detach() 28 | -------------------------------------------------------------------------------- /autolabel/features/fcn50.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | from torchvision.models.segmentation import fcn_resnet50 4 | from torchvision.transforms.functional import to_pil_image 5 | from torchvision.models import feature_extraction 6 | from torch.nn import functional as F 7 | 8 | 9 | class FCN50: 10 | 11 | def __init__(self): 12 | self.model = fcn_resnet50(pretrained=True) 13 | self.model.eval() 14 | self.model = self.model.cuda() 15 | self.extractor = feature_extraction.create_feature_extractor( 16 | self.model, return_nodes={'classifier.2': 'features'}) 17 | self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 18 | std=[0.229, 0.224, 0.225]).cuda() 19 | 20 | @property 21 | def shape(self): 22 | return (90, 120) 23 | 24 | def __call__(self, x): 25 | batch = self.normalize(x) 26 | out = self.extractor(batch) 27 | 28 | return out['features'].detach().cpu().half().numpy().transpose( 29 | [0, 2, 3, 1]) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Autonomous Systems Lab, ETH Zurich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/scannet_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "remap": { 3 | "18": 1, 4 | "35": 1, 5 | "45": 1, 6 | "102": 1, 7 | "132": 1, 8 | "197": 1, 9 | "413": 1, 10 | "414": 1, 11 | "450": 1, 12 | "563": 1, 13 | "572": 1, 14 | "577": 1, 15 | "16": 7, 16 | "37": 7, 17 | "44": 7, 18 | "54": 7, 19 | "172": 7, 20 | "231": 7, 21 | "232": 7, 22 | "341": 7, 23 | "392": 7, 24 | "594": 7, 25 | "32": 15, 26 | "34": 15, 27 | "47": 15, 28 | "75": 15, 29 | "119": 15, 30 | "177": 15, 31 | "359": 15, 32 | "372": 15, 33 | "427": 15, 34 | "496": 15, 35 | "513": 15, 36 | "13": 4, 37 | "76": 4, 38 | "87": 4, 39 | "125": 4, 40 | "128": 4, 41 | "175": 4, 42 | "282": 4, 43 | "420": 4, 44 | "509": 4, 45 | "555": 4, 46 | "561": 4, 47 | "39": 22, 48 | "296": 22, 49 | "329": 22, 50 | "331": 22, 51 | "369": 22, 52 | "389": 22, 53 | "411": 22, 54 | "560": 22, 55 | "604": 22 56 | }, 57 | "prompts": { 58 | "1": "chair; stool; office chair; armchair", 59 | "4": "door; sliding door; doorframe", 60 | "7": "table; desk; coffee table; nightstand; dining table; side table", 61 | "15": "cabinet; kitchen cabinet; file cabinet; cabinet door; bathroom cabinet", 62 | "21": "sink; bathroom sink; kitchen sink", 63 | "22": "backpack; bag" 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /test/test_sampling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from autolabel.dataset import IndexSampler 4 | 5 | 6 | class SamplingTestCase(unittest.TestCase): 7 | 8 | def test_find_classes(self): 9 | semantics = np.zeros((2, 10), int) 10 | sampler = IndexSampler() 11 | sampler.update(semantics) 12 | self.assertFalse(sampler.has_semantics) 13 | 14 | self.assertEqual(len(sampler.classes), 0) 15 | semantics[0, 5] = 1 16 | semantics[0, 6] = 2 17 | sampler.update(semantics) 18 | self.assertEqual(len(sampler.classes), 2) 19 | self.assertEqual(sampler.classes[0], 1) 20 | self.assertEqual(sampler.classes[1], 2) 21 | 22 | def test_sampling(self): 23 | semantics = np.zeros((2, 10), int) 24 | semantics[0, 5] = 1 25 | semantics[0, 0] = 2 26 | semantics[1, 5] = 3 27 | sampler = IndexSampler() 28 | sampler.update(semantics) 29 | random_class = sampler.sample_class() 30 | self.assertIn(random_class, [1, 2, 3]) 31 | 32 | random_image, random_index = sampler.sample(1, 1) 33 | self.assertEqual(random_image, 0) 34 | self.assertEqual(random_index[0], 5) 35 | random_image, random_index = sampler.sample(2, 1) 36 | self.assertEqual(random_image, 0) 37 | self.assertEqual(random_index[0], 0) 38 | 39 | random_image, random_indices = sampler.sample(3, 5) 40 | self.assertEqual(random_image, 1) 41 | self.assertEqual(len(random_indices), 5) 42 | self.assertEqual(np.random.choice(random_indices), 5) 43 | self.assertTrue(sampler.has_semantics) 44 | 45 | def test_semantic_indices(self): 46 | semantics = np.zeros((5, 10), int) 47 | semantics[0, 5] = 1 48 | semantics[2, 0] = 2 49 | semantics[4, 5] = 3 50 | sampler = IndexSampler() 51 | sampler.update(semantics) 52 | indices = sampler.semantic_indices() 53 | 
self.assertEqual(len(indices), 3) 54 | self.assertIn(0, indices) 55 | self.assertIn(2, indices) 56 | self.assertIn(4, indices) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /configs/label_map.csv: -------------------------------------------------------------------------------- 1 | ,id,prompt,evaluated,type 2 | 0,1,backpack,0,-1 3 | 1,2,base-cabinet,0,-1 4 | 2,3,basket,0,-1 5 | 3,4,bathtub,0,-1 6 | 4,5,beam,0,-1 7 | 5,6,beanbag,0,-1 8 | 6,7,bed,1,1 9 | 7,8,bench,0,-1 10 | 8,9,bike,0,-1 11 | 9,10,bin,0,-1 12 | 10,11,blanket,0,-1 13 | 11,12,blinds,0,-1 14 | 12,13,book,0,-1 15 | 13,14,bottle,0,-1 16 | 14,15,box,0,-1 17 | 15,16,bowl,0,-1 18 | 16,17,camera,0,-1 19 | 17,18,cabinet,1,0 20 | 18,19,candle,0,-1 21 | 19,20,chair,1,1 22 | 20,21,chopping-board,0,-1 23 | 21,22,clock,0,-1 24 | 22,23,cloth,0,-1 25 | 23,24,clothing,0,-1 26 | 24,25,coaster,0,-1 27 | 25,26,comforter,0,-1 28 | 26,27,computer-keyboard,0,-1 29 | 27,28,cup,0,-1 30 | 28,29,cushion,0,-1 31 | 29,30,curtain,1,0 32 | 30,31,ceiling,1,0 33 | 31,32,cooktop,0,-1 34 | 32,33,countertop,1,0 35 | 33,34,desk,0,-1 36 | 34,35,desk-organizer,0,-1 37 | 35,36,desktop-computer,0,-1 38 | 36,37,door,1,0 39 | 37,38,exercise-ball,0,-1 40 | 38,39,faucet,0,-1 41 | 39,40,floor,1,0 42 | 40,41,handbag,0,-1 43 | 41,42,hair-dryer,0,-1 44 | 42,43,handrail,0,-1 45 | 43,44,indoor-plant,0,-1 46 | 44,45,knife-block,0,-1 47 | 45,46,kitchen-utensil,0,-1 48 | 46,47,lamp,1,0 49 | 47,48,laptop,0,-1 50 | 48,49,major-appliance,0,-1 51 | 49,50,mat,0,-1 52 | 50,51,microwave,0,-1 53 | 51,52,monitor,0,-1 54 | 52,53,mouse,0,-1 55 | 53,54,nightstand,0,-1 56 | 54,55,pan,0,-1 57 | 55,56,panel,0,-1 58 | 56,57,paper-towel,0,-1 59 | 57,58,phone,0,-1 60 | 58,59,picture,0,-1 61 | 59,60,pillar,0,-1 62 | 60,61,pillow,0,-1 63 | 61,62,pipe,0,-1 64 | 62,63,plant-stand,0,-1 65 | 63,64,plate,0,-1 66 | 64,65,pot,0,-1 67 | 65,66,rack,0,-1 68 | 66,67,refrigerator,0,-1 69 | 67,68,remote-control,0,-1 70 | 68,69,scarf,0,-1 71 | 69,70,sculpture,0,-1 72 | 70,71,shelf,1,0 73 | 71,72,shoe,0,-1 74 | 72,73,shower-stall,0,-1 75 | 73,74,sink,1,1 76 | 74,75,small-appliance,0,-1 77 | 75,76,sofa,1,1 78 | 76,77,stair,0,-1 79 | 77,78,stool,0,-1 80 | 78,79,switch,0,-1 81 | 79,80,table,1,0 82 | 80,81,table-runner,0,-1 83 | 81,82,tablet,0,-1 84 | 82,83,tissue-paper,0,-1 85 | 83,84,toilet,1,1 86 | 84,85,toothbrush,0,-1 87 | 85,86,towel,0,-1 88 | 86,87,tv-screen,1,1 89 | 87,88,tv-stand,0,-1 90 | 88,89,umbrella,0,-1 91 | 89,90,utensil-holder,0,-1 92 | 90,91,vase,0,-1 93 | 91,92,vent,0,-1 94 | 92,93,wall,1,0 95 | 93,94,wall-cabinet,0,-1 96 | 94,95,wall-plug,0,-1 97 | 95,96,wardrobe,0,-1 98 | 96,97,window,1,0 99 | 97,98,rug,0,-1 100 | 98,99,logo,0,-1 101 | 99,100,bag,1,1 102 | 100,101,set-of-clothing,0,-1 103 | -------------------------------------------------------------------------------- /autolabel/features/lseg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import clip 3 | from torch.nn import functional as F 4 | from modules.lseg_module import LSegModule 5 | from additional_utils.models import LSeg_MultiEvalModule 6 | from torchvision import transforms 7 | 8 | 9 | class LSegFE: 10 | 11 | def __init__(self, checkpoint): 12 | module = LSegModule.load_from_checkpoint(checkpoint_path=checkpoint, 13 | backbone='clip_vitl16_384', 14 | data_path=None, 15 | num_features=256, 16 | batch_size=1, 17 | base_lr=1e-3, 18 | max_epochs=100, 19 | augment=False, 20 | 
aux=True, 21 | aux_weight=0, 22 | ignore_index=255, 23 | dataset='ade20k', 24 | se_loss=False, 25 | se_weight=0, 26 | arch_option=0, 27 | block_depth=0, 28 | activation='lrelu') 29 | # Skip totensor operation. 30 | self.transform = transforms.Compose(module.val_transform.transforms[1:]) 31 | net = module.net.cuda() 32 | scales = [1.0] 33 | self.evaluator = LSeg_MultiEvalModule(module, scales=scales, 34 | flip=False).half().cuda().eval() 35 | self.text_encoder = module.net.clip_pretrained.to(torch.float32).cuda() 36 | 37 | def shape(self, input_shape): 38 | return (input_shape[0] // 2, input_shape[1] // 2) 39 | 40 | def encode_text(self, text): 41 | """ 42 | text: list of N strings to encode 43 | returns: torch tensor size N x 512 44 | """ 45 | with torch.inference_mode(): 46 | tokenized = clip.tokenize(text).cuda() 47 | features = [] 48 | for item in tokenized: 49 | f = self.text_encoder.encode_text(item[None])[0] 50 | features.append(f) 51 | features = torch.stack(features, dim=0) 52 | return features / torch.norm(features, dim=-1, keepdim=True) 53 | 54 | def __call__(self, x): 55 | x = self.transform(x) 56 | _, _, H, W = x.shape 57 | # Return half size features 58 | H_out, W_out = H // 2, W // 2 59 | out = [] 60 | x = [F.interpolate(image[None], [H_out, W_out]) for image in x] 61 | for image in x: 62 | out.append(self.evaluator.compute_features(image.half())) 63 | 64 | out = torch.cat(out, dim=0) 65 | 66 | return out.permute(0, 2, 3, 1) 67 | -------------------------------------------------------------------------------- /scripts/compute_sam_mask.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["OPENCV_IO_ENABLE_OPENEXR"]="1" 3 | import argparse 4 | import cv2 5 | from pathlib import Path 6 | import numpy as np 7 | from tqdm import tqdm 8 | from segment_anything import sam_model_registry, SamAutomaticMaskGenerator 9 | 10 | 11 | def read_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('scene') 14 | parser.add_argument('--sam-vit-h-checkpoint', type=str, required=True) 15 | parser.add_argument('--prefer-union-mask', action='store_true') 16 | return parser.parse_args() 17 | 18 | def _iou(image_vector1, image_vector2): 19 | intersection = np.logical_and(image_vector1, image_vector2).sum() 20 | union = np.logical_or(image_vector1, image_vector2).sum() 21 | iou = intersection / union 22 | return iou, intersection, union 23 | 24 | def generate_float32_mask(masks, prefer_union_mask=True): 25 | indices = [] 26 | for i in np.random.permutation(list(range(len(masks)))): 27 | if len(indices) >= 32: 28 | break 29 | 30 | overlapped = False 31 | for j, ind in enumerate(indices): 32 | iou, intersection, union = _iou(masks[i]['segmentation'].reshape(-1), masks[ind]['segmentation'].reshape(-1)) 33 | if iou > 0.8: 34 | overlapped = True 35 | break 36 | if prefer_union_mask: 37 | if intersection / masks[ind]['area'] > 0.8: 38 | indices[j] = i 39 | overlapped = True 40 | break 41 | elif intersection / masks[i]['area'] > 0.8: 42 | overlapped = True 43 | break 44 | if not overlapped: 45 | indices.append(i) 46 | 47 | one = 1 48 | uint32_mask = np.zeros_like(masks[0]['segmentation'], dtype=np.uint32) 49 | for i, ind in enumerate(indices): 50 | mask = masks[ind] 51 | number = (one << i) 52 | uint32_mask += (number * mask['segmentation']).astype(np.uint32) 53 | return uint32_mask.view(np.float32) 54 | 55 | def main(): 56 | flags = read_args() 57 | sam_checkpoint = flags.sam_vit_h_checkpoint 58 | model_type = "vit_h" 59 | 
device = "cuda"
60 | 
61 |     scene_folder = Path(flags.scene)
62 | 
63 |     image_folder = scene_folder / "rgb"
64 |     output_folder = scene_folder / "sam_mask"
65 |     output_folder.mkdir(parents=True, exist_ok=True)
66 | 
67 |     sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
68 |     sam.to(device=device)
69 |     mask_generator = SamAutomaticMaskGenerator(sam)
70 | 
71 |     image_files = os.listdir(image_folder)
72 |     image_files.sort()
73 | 
74 |     for image_file in tqdm(image_files):
75 |         image = cv2.imread(
76 |             os.path.join(image_folder, image_file)
77 |         )
78 |         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
79 |         masks = mask_generator.generate(image)
80 |         masks.sort(key=lambda x: x['area'], reverse=True)
81 |         masks = [mask for mask in masks if mask['area'] > 100]
82 | 
83 |         sam_mask = generate_float32_mask(masks, prefer_union_mask=flags.prefer_union_mask)
84 |         cv2.imwrite(
85 |             os.path.join(output_folder, os.path.splitext(image_file)[0] + '.exr'),
86 |             sam_mask
87 |         )
88 | 
89 | if __name__ == "__main__":
90 |     main()
91 | 
--------------------------------------------------------------------------------
/docs/data.md:
--------------------------------------------------------------------------------
1 | 
2 | # Importing data
3 | 
4 | ## Capturing your own datasets
5 | 
6 | If you have a LiDAR-enabled iOS device, you can use the [Stray Scanner](https://apps.apple.com/us/app/stray-scanner/id1557051662) app to record data. The script at `scripts/data/convert_scanner.py` lets you convert a scene recorded with the app into the Autolabel scene format. You can then run the `mapping.py` script to run structure from motion and compute the other outputs.
7 | 
8 | After capturing scenes and moving them over to your computer, convert them to the Autolabel format with:
9 | ```
10 | python scripts/data/convert_scanner.py <path-to-scanner-scene> --out scenes/scene_name/
11 | 
12 | # Compute camera poses.
13 | python scripts/mapping.py scenes/scene_name/
14 | ```
15 | 
16 | ## Importing ARKitScenes scenes
17 | 
18 | Here are the steps required to download and import scenes from the ARKitScenes dataset.
19 | 
20 | ```
21 | # Clone ARKitScenes repository
22 | git clone https://github.com/apple/ARKitScenes.git arkit-scenes
23 | cd arkit-scenes
24 | 
25 | # Create directories for the raw and converted scenes
26 | mkdir -p scenes/raw/ && mkdir -p scenes/converted
27 | 
28 | # Download the required parts of the dataset
29 | # For now, we only download the low resolution RGB images (256x192), but higher
30 | # resolution frames could be used.
31 | python download_data.py raw --split Training \
32 |     --video_id_csv depth_upsampling/upsampling_train_val_splits.csv \
33 |     --download_dir scenes/raw \
34 |     --raw_dataset_assets lowres_wide lowres_depth lowres_wide.traj confidence lowres_wide_intrinsics
35 | 
36 | # Convert the ARKitScenes scenes to the format used by Autolabel
37 | python scripts/data/convert_arkitscenes.py scenes/raw/ --out scenes/converted/
38 | ```
39 | 
40 | ## Importing Replica renders
41 | 
42 | We have written data conversion scripts for different publicly available datasets.
43 | 
44 | Renders from the [Replica](https://github.com/facebookresearch/Replica-Dataset) dataset published by [SemanticNeRF](https://github.com/Harry-Zhi/semantic_nerf/) can be converted using the `scripts/data/convert_replica.py` script.
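The exact flags of `convert_replica.py` are not documented here; assuming it follows the same pattern as the other converters in `scripts/data/` (a positional input directory plus `--out`), an invocation would look roughly like the sketch below. Check `python scripts/data/convert_replica.py --help` for the actual arguments.
```
# Hypothetical example: the input path and flags are assumptions, not taken from the docs above.
python scripts/data/convert_replica.py <replica-render-dir> --out scenes/converted/
```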
45 | 
46 | ## Importing HyperSim scenes
47 | 
48 | Download the scenes from [HyperSim](https://github.com/apple/ml-hypersim) using their official [download script](https://github.com/apple/ml-hypersim/blob/main/code/python/tools/dataset_download_images.py) (you can specify which scenes you want to download in the script).
49 | 
50 | Download the semantic label file [here](https://github.com/apple/ml-hypersim/blob/main/code/cpp/tools/scene_annotation_tool/semantic_label_descs.csv).
51 | 
52 | Download the file of camera parameters [here](https://github.com/apple/ml-hypersim/blob/main/contrib/mikeroberts3000/metadata_camera_parameters.csv).
53 | 
54 | 
55 | Then, run the conversion script as:
56 | ```
57 | python scripts/data/convert_hypersim.py <hypersim-scene-dir> \
58 |     --out <output-directory> \
59 |     --ori-semantic-labels <semantic-label-file> \
60 |     --camera-parameter-file <camera-parameter-file>
61 | ```
62 | 
63 | 
64 | ## Importing ScanNet scenes
65 | 
66 | ScanNet scenes can be imported with the `scripts/data/convert_scannet.py` script.
67 | 
68 | It is used as:
69 | ```
70 | python scripts/data/convert_scannet.py <scannet-directory> --label-map <label-map-file> --out <output-directory> --stride <N>
71 | ```
72 | 
73 | - `<scannet-directory>` is the path to the raw ScanNet dataset, which contains each scene as a subdirectory. Each scene in turn contains the `*-label-filt.zip`, `*.sens` etc. files in the ScanNet format.
74 | - `--out` specifies the output directory. Each scene will be stored as a subdirectory.
75 | - `--stride` is an integer parameter specifying how many frames to keep. Only every Nth frame is kept in the scan.
76 | 
77 | This will additionally create the `mesh.ply` and `mesh_labels.npy` files used by the vision-language evaluation scripts.
78 | 
79 | 
--------------------------------------------------------------------------------
/scripts/data/convert_scanner.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import numpy as np
4 | import json
5 | import cv2
6 | from skvideo import io
7 | from tqdm import tqdm
8 | 
9 | 
10 | def read_args():
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument('scan', type=str, help="Scan directory")
13 |     parser.add_argument('--out', type=str, help="Output directory")
14 |     parser.add_argument("--rotate",
15 |                         action="store_true",
16 |                         help="Rotate frames 90 degrees")
17 | 
18 |     parser.add_argument("--subsample",
19 |                         type=int,
20 |                         default=1,
21 |                         help="Subsample: use every nth frame from the dataset")
22 |     return parser.parse_args()
23 | 
24 | 
25 | def write_frames(scan_dir, rgb_out_dir, rotate=False, subsample=1):
26 |     rgb_video = os.path.join(scan_dir, 'rgb.mp4')
27 |     video = io.vreader(rgb_video)
28 |     img_idx = 0
29 |     for i, frame in tqdm(enumerate(video), desc="Writing RGB"):
30 |         frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
31 |         if i % subsample != 0:
32 |             continue
33 |         if rotate:
34 |             frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
35 | 
36 |         frame_path = os.path.join(rgb_out_dir, f"{img_idx:05}.jpg")
37 |         img_idx += 1
38 |         params = [int(cv2.IMWRITE_JPEG_QUALITY), 90]
39 |         cv2.imwrite(frame_path, frame, params)
40 | 
41 | 
42 | def write_depth(scan_dir, depth_out_dir, rotate=False, subsample=1):
43 |     depth_dir_in = os.path.join(scan_dir, 'depth')
44 |     confidence_dir = os.path.join(scan_dir, 'confidence')
45 |     files = sorted(os.listdir(depth_dir_in))
46 |     img_idx = 0
47 |     for i, filename in tqdm(enumerate(files), desc="Writing Depth"):
48 |         if '.png' not in filename:
49 |             continue
50 |         number, _ = filename.split('.')
51 | 
52 |         if i % subsample != 0:
53 |             continue
54 | 
55 |         depth = 
cv2.imread(os.path.join(depth_dir_in, filename), -1) 56 | 57 | confidence = cv2.imread(os.path.join(confidence_dir, 58 | number + '.png'))[:, :, 0] 59 | if rotate: 60 | depth = cv2.rotate(depth, cv2.ROTATE_90_CLOCKWISE) 61 | confidence = cv2.rotate(confidence, cv2.ROTATE_90_CLOCKWISE) 62 | 63 | depth[confidence < 2] = 0 64 | cv2.imwrite(os.path.join(depth_out_dir, f"{int(img_idx):05}" + '.png'), 65 | depth) 66 | img_idx += 1 67 | return img_idx 68 | 69 | 70 | def write_intrinsics(scan_dir, out_dir, rotate=False): 71 | intrinsics = np.loadtxt(os.path.join(scan_dir, 'camera_matrix.csv'), 72 | delimiter=',') 73 | fx = intrinsics[0, 0] 74 | fy = intrinsics[1, 1] 75 | cx = intrinsics[0, 2] 76 | cy = intrinsics[1, 2] 77 | if rotate: 78 | out_intrinsics = np.array([[fy, 0, cy], [0, fx, cx], [0, 0, 1]]) 79 | else: 80 | out_intrinsics = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]) 81 | np.savetxt(os.path.join(out_dir, 'intrinsics.txt'), out_intrinsics) 82 | 83 | 84 | def main(): 85 | flags = read_args() 86 | rgb_out = os.path.join(flags.out, 'raw_rgb/') 87 | depth_out = os.path.join(flags.out, 'raw_depth/') 88 | os.makedirs(rgb_out, exist_ok=True) 89 | os.makedirs(depth_out, exist_ok=True) 90 | scan_dir = flags.scan 91 | 92 | write_intrinsics(scan_dir, flags.out, rotate=flags.rotate) 93 | write_depth(scan_dir, 94 | depth_out, 95 | rotate=flags.rotate, 96 | subsample=flags.subsample) 97 | write_frames(scan_dir, 98 | rgb_out, 99 | rotate=flags.rotate, 100 | subsample=flags.subsample) 101 | print("Done") 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /autolabel/model_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import glob 4 | import argparse 5 | import pickle 6 | import os 7 | from autolabel.models import ALNetwork 8 | 9 | 10 | def load_checkpoint(model, checkpoint_dir, device='cuda:0'): 11 | checkpoint_list = sorted(glob.glob(f'{checkpoint_dir}/*.pth')) 12 | best = [c for c in checkpoint_list if 'best.pth' in c] 13 | if len(best) != 0: 14 | checkpoint = best[0] 15 | else: 16 | checkpoint = checkpoint_list[-1] 17 | checkpoint_dict = torch.load(checkpoint, map_location=device) 18 | model.load_state_dict(checkpoint_dict['model']) 19 | 20 | instance_centers_list = sorted(glob.glob(f'{checkpoint_dir}/*.npy')) 21 | if len(instance_centers_list) > 0: 22 | model.set_instance_centers( 23 | np.load(instance_centers_list[-1]) 24 | ) 25 | instance_cluster_list = sorted(glob.glob(f'{checkpoint_dir}/*.pkl')) 26 | if len(instance_cluster_list) > 0: 27 | with open(instance_cluster_list[-1], 'rb') as inp: 28 | clust = pickle.load(inp) 29 | model.set_instance_clusterer(clust) 30 | return model 31 | 32 | 33 | def model_flag_parser(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('--lr', type=float, default=5e-3) 36 | parser.add_argument('--geometric-features', '-g', type=int, default=15) 37 | parser.add_argument('--encoding', 38 | default='hg+freq', 39 | choices=['freq', 'hg', 'hg+freq'], 40 | type=str, 41 | help="Network positional encoding to use.") 42 | parser.add_argument('--features', 43 | type=str, 44 | default=None, 45 | choices=[None, 'fcn50', 'dino', 'lseg'], 46 | help="Use semantic feature supervision.") 47 | parser.add_argument('--rgb-weight', default=1.0, type=float) 48 | parser.add_argument('--semantic-weight', default=1.0, type=float) 49 | parser.add_argument('--feature-weight', default=0.5, 
type=float) 50 | parser.add_argument('--depth-weight', default=0.1, type=float) 51 | parser.add_argument('--feature-dim', default=64, type=int) 52 | parser.add_argument('--contrastive-weight', default=0.1, type=float) 53 | parser.add_argument('--contrastive-feat-dim', default=8, type=int) 54 | parser.add_argument('--contrastive-temperature', default=0.1, type=float) 55 | return parser 56 | 57 | 58 | def model_hash(flags): 59 | features = 'plain' 60 | if flags.features is not None: 61 | features = flags.features 62 | string = f"g{flags.geometric_features}_{flags.encoding}_{features}" 63 | string += f"_rgb{flags.rgb_weight}_d{flags.depth_weight}_s{flags.semantic_weight}" 64 | string += f"_f{flags.feature_weight}" 65 | string += f"_c{flags.contrastive_weight}" 66 | return string 67 | 68 | 69 | def model_dir(scene_path, flags): 70 | mhash = model_hash(flags) 71 | if flags.workspace is None: 72 | return os.path.join(scene_path, 'nerf', mhash) 73 | scene_name = os.path.basename(os.path.normpath(flags.scene)) 74 | return os.path.join(flags.workspace, scene_name, mhash) 75 | 76 | 77 | def create_model(min_bounds, max_bounds, n_classes, flags, bound_scale=1.25): 78 | bound = np.max([np.abs(min_bounds), np.abs(max_bounds)], axis=0).max() * bound_scale 79 | return ALNetwork(num_layers=2, 80 | num_layers_color=2, 81 | hidden_dim_color=128, 82 | hidden_dim=128, 83 | geo_feat_dim=flags.geometric_features, 84 | encoding=flags.encoding, 85 | bound=float(bound), 86 | hidden_dim_semantic=flags.feature_dim, 87 | contrastive_feat_dim=flags.contrastive_feat_dim, 88 | cuda_ray=False, 89 | density_scale=1, 90 | semantic_classes=n_classes) 91 | 92 | 93 | def read_params(workspace): 94 | with open(os.path.join(workspace, 'params.pkl'), 'rb') as f: 95 | return pickle.load(f) 96 | 97 | 98 | def write_params(workspace, flags): 99 | os.makedirs(workspace, exist_ok=True) 100 | with open(os.path.join(workspace, 'params.pkl'), 'wb') as f: 101 | pickle.dump(flags, f) 102 | 103 | 104 | def get_nerf_dir(scene, flags): 105 | scene_name = os.path.basename(os.path.normpath(scene)) 106 | if flags.workspace is None: 107 | return os.path.join(scene, 'nerf') 108 | else: 109 | return os.path.join(flags.workspace, scene_name) 110 | -------------------------------------------------------------------------------- /scripts/language/pointcloud.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import json 5 | import pandas 6 | import torch 7 | from tqdm import tqdm 8 | import open3d as o3d 9 | from autolabel.dataset import SceneDataset, LenDataset 10 | from autolabel import utils, model_utils 11 | from autolabel.utils.feature_utils import get_feature_extractor 12 | 13 | 14 | def read_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('scene') 17 | parser.add_argument('--batch-size', default=8182, type=int) 18 | parser.add_argument('--workspace', type=str, default=None) 19 | parser.add_argument('--out', 20 | type=str, 21 | help="Resulting pointcloud path.", 22 | required=True) 23 | parser.add_argument('--feature-checkpoint', '-f', type=str, required=True) 24 | parser.add_argument( 25 | '--stride', 26 | type=int, 27 | default=1, 28 | help="Only evaluate every Nth frame to save time or for debugging.") 29 | parser.add_argument('--vis', action='store_true') 30 | parser.add_argument('--features', type=str, default='lseg') 31 | parser.add_argument('--heatmap', 32 | type=str, 33 | help="Prompt for generating heatmap.") 34 | 
return parser.parse_args() 35 | 36 | 37 | def get_nerf_dir(scene, flags): 38 | scene_name = os.path.basename(os.path.normpath(scene)) 39 | if flags.workspace is None: 40 | return os.path.join(scene, 'nerf') 41 | else: 42 | return os.path.join(flags.workspace, scene_name) 43 | 44 | 45 | def get_model(flags, scene_dir): 46 | nerf_dir = get_nerf_dir(scene_dir, flags) 47 | for model in os.listdir(nerf_dir): 48 | checkpoint_dir = os.path.join(nerf_dir, model, 'checkpoints') 49 | if os.path.exists(checkpoint_dir): 50 | return model 51 | 52 | 53 | def render(model, batch, T_CW, dataset, features): 54 | rays_o = torch.tensor(batch['rays_o']).cuda() 55 | rays_d = torch.tensor(batch['rays_d']).cuda() 56 | direction_norms = torch.tensor(batch['direction_norms']).cuda() 57 | depth = torch.tensor(batch['depth']).cuda() 58 | output = model.render(rays_o, 59 | rays_d, 60 | direction_norms, 61 | staged=True, 62 | perturb=False, 63 | num_steps=512, 64 | upsample_steps=0) 65 | variance = output['depth_variance'].cpu().numpy() 66 | cutoff = np.percentile(variance, 50) 67 | mask = variance < cutoff 68 | cm_C = output['coordinates_map'] 69 | H, W, _ = cm_C.shape 70 | cm_C = cm_C.cpu().numpy()[mask] 71 | rgb = output['image'].cpu().numpy()[mask] 72 | return cm_C[:, :3], rgb 73 | 74 | 75 | def main(flags): 76 | scene_name = os.path.basename(os.path.normpath(flags.scene)) 77 | scene = flags.scene 78 | print(f"Evaluating scene {scene_name}") 79 | nerf_dir = get_nerf_dir(scene, flags) 80 | model = get_model(flags, scene) 81 | model_path = os.path.join(nerf_dir, model) 82 | params = model_utils.read_params(model_path) 83 | dataset = SceneDataset('test', 84 | scene, 85 | factor=4.0, 86 | batch_size=flags.batch_size, 87 | lazy=True) 88 | 89 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 90 | 606, params).cuda() 91 | 92 | checkpoint_dir = os.path.join(model_path, 'checkpoints') 93 | if not os.path.exists(checkpoint_dir) or len( 94 | os.listdir(checkpoint_dir)) == 0: 95 | print("Now checkpoint path") 96 | exit() 97 | 98 | model_utils.load_checkpoint(model, checkpoint_dir) 99 | model = model.eval() 100 | feature_extractor = get_feature_extractor(flags.features, 101 | flags.feature_checkpoint) 102 | 103 | points = [] 104 | colors = [] 105 | for frame_index in tqdm(dataset.indices[::flags.stride]): 106 | batch = dataset._get_test(frame_index) 107 | T_CW = dataset.poses[frame_index] 108 | points_W, rgb = render(model, batch, T_CW, dataset, feature_extractor) 109 | points.append(points_W) 110 | colors.append(rgb) 111 | rgb = np.concatenate(colors, axis=0) 112 | p_W = np.concatenate(points, axis=0) 113 | pc = o3d.geometry.PointCloud(o3d.utility.Vector3dVector(p_W)) 114 | pc.colors = o3d.utility.Vector3dVector(rgb) 115 | o3d.io.write_point_cloud(flags.out, pc) 116 | 117 | 118 | if __name__ == "__main__": 119 | with torch.inference_mode(): 120 | with torch.cuda.amp.autocast(enabled=True): 121 | main(read_args()) 122 | -------------------------------------------------------------------------------- /scripts/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import json 5 | from autolabel.evaluation import Evaluator 6 | from autolabel.dataset import SceneDataset, LenDataset 7 | from autolabel import utils, model_utils 8 | 9 | 10 | def read_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('scenes', nargs='+') 13 | parser.add_argument('--batch-size', default=8182, type=int) 14 | 
parser.add_argument('--vis', action='store_true') 15 | parser.add_argument('--workspace', type=str, default=None) 16 | parser.add_argument('--write-images', type=str, default=None) 17 | parser.add_argument('--out', 18 | default=None, 19 | type=str, 20 | help="Where to write results as json, if anywhere.") 21 | return parser.parse_args() 22 | 23 | 24 | def gather_models(flags): 25 | models = set() 26 | for scene in flags.scenes: 27 | nerf_dir = model_utils.get_nerf_dir(scene, flags) 28 | if not os.path.exists(nerf_dir): 29 | continue 30 | for model in os.listdir(nerf_dir): 31 | checkpoint_dir = os.path.join(nerf_dir, model, 'checkpoints') 32 | if os.path.exists(checkpoint_dir): 33 | models.add(model) 34 | return list(models) 35 | 36 | 37 | def write_results(out, results): 38 | with open(out, 'wt') as f: 39 | f.write(json.dumps(results, indent=2)) 40 | 41 | 42 | def main(flags): 43 | models = gather_models(flags) 44 | classes = ["Background", "Class 1"] 45 | scene_names = [os.path.basename(os.path.normpath(p)) for p in flags.scenes] 46 | scenes = [(s, n) for s, n in zip(flags.scenes, scene_names)] 47 | scenes = sorted(scenes, key=lambda x: x[1]) 48 | ious = np.ones((len(scenes), len(models))) * -1. 49 | results = [] 50 | for scene_index, (scene, scene_name) in enumerate(scenes): 51 | print(f"Evaluating scene {scene_name}") 52 | 53 | nerf_dir = model_utils.get_nerf_dir(scene, flags) 54 | 55 | for model_hash in models: 56 | model_path = os.path.join(nerf_dir, model_hash) 57 | if not os.path.exists(model_path): 58 | continue 59 | params = model_utils.read_params(model_path) 60 | dataset = SceneDataset('test', 61 | scene, 62 | factor=4.0, 63 | batch_size=flags.batch_size, 64 | lazy=True) 65 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 66 | model = model_utils.create_model(dataset.min_bounds, 67 | dataset.max_bounds, n_classes, 68 | params).cuda() 69 | model = model.eval() 70 | 71 | checkpoint_dir = os.path.join(model_path, 'checkpoints') 72 | if not os.path.exists(checkpoint_dir) or len( 73 | os.listdir(checkpoint_dir)) == 0: 74 | continue 75 | 76 | model_utils.load_checkpoint(model, checkpoint_dir) 77 | model = model.eval() 78 | 79 | save_figure_dir = None 80 | if flags.write_images is not None: 81 | save_figure_dir = os.path.join(flags.write_images, scene_name) 82 | evaluator = Evaluator(model, 83 | classes, 84 | name=model_hash, 85 | save_figures=save_figure_dir) 86 | model_index = models.index(model_hash) 87 | assert model_index >= 0 88 | result = evaluator.eval(dataset, flags.vis) 89 | 90 | if len(result.values()) == 0: 91 | continue 92 | miou = np.mean([v for v in result.values()]) 93 | assert ious[scene_index, model_index] < 0.0 94 | ious[scene_index, model_index] = miou 95 | result = dict(vars(params)) 96 | result['scene'] = scene_name 97 | result['iou'] = miou 98 | results.append(result) 99 | 100 | if flags.out is not None: 101 | write_results(flags.out, results) 102 | 103 | from rich.table import Table 104 | from rich.console import Console 105 | table = Table() 106 | table.add_column('Scene') 107 | for model in models: 108 | table.add_column(model) 109 | for scene_name, scene_ious in zip(scene_names, ious): 110 | table.add_row(scene_name, *[f"{v:.03f}" for v in scene_ious]) 111 | total_row = ['Total'] + [f"{v:.03f}" for v in ious.mean(axis=0)] 112 | table.add_row(*total_row, end_section=True) 113 | 114 | console = Console() 115 | console.print(table) 116 | 117 | 118 | if __name__ == "__main__": 119 | main(read_args()) 120 | 
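# Usage sketch (not from the original docs; the flags correspond to read_args() above,
# and the angle-bracketed values are placeholders):
#
#   python scripts/evaluate.py <scene1> <scene2> ... \
#       --workspace <workspace> --write-images <image-output-dir> --out <results.json>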
-------------------------------------------------------------------------------- /scripts/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script will export semantic segmentation maps once you are 3 | done with annotating and fitting a scene. 4 | 5 | usage: python scripts/export.py ... --workspace 6 | 7 | params: 8 | workspace: The workspace to lookup trained models from. 9 | Else uses /nerf/. 10 | 11 | Output frames are saved at /output/semantic/ 12 | """ 13 | import cv2 14 | import numpy as np 15 | import os 16 | from skimage import measure 17 | import torch 18 | from tqdm import tqdm 19 | 20 | from autolabel import model_utils 21 | from autolabel.dataset import SceneDataset 22 | from autolabel.utils import Scene 23 | 24 | MAX_WIDTH = 640 25 | 26 | 27 | def read_args(): 28 | parser = model_utils.model_flag_parser() 29 | parser.add_argument('scenes', nargs='+') 30 | parser.add_argument('--workspace', type=str) 31 | parser.add_argument('--objects', 32 | type=int, 33 | default=None, 34 | help=""" 35 | If specified, find the specified number of largest connected components per class in the 36 | produced semantic maps as a post-processing step, removing noise from the segmentation maps. 37 | """) 38 | return parser.parse_args() 39 | 40 | 41 | def lookup_frame_size(scene): 42 | scene = Scene(scene) 43 | width, height = scene.peak_image_size() 44 | if width > MAX_WIDTH: 45 | scale = MAX_WIDTH / width 46 | width *= scale 47 | height *= scale 48 | return (int(np.round(width)), int(np.round(height))) 49 | 50 | 51 | def find_largest_components(p_semantic, class_id, object_count): 52 | p_semantic = p_semantic.copy() 53 | p_semantic[p_semantic != class_id] = 0 54 | labels = measure.label(p_semantic) 55 | counts = np.bincount(labels.flat)[1:] 56 | largest = [] 57 | sorted_counts = np.argsort(counts)[::-1] 58 | for i in range(object_count): 59 | nth_largest_label = sorted_counts[i] + 1 60 | largest.append(labels == nth_largest_label) 61 | return largest 62 | 63 | 64 | def post_process(flags, p_semantic): 65 | out = np.zeros_like(p_semantic) 66 | class_ids = np.unique(p_semantic) 67 | for class_id in class_ids: 68 | if class_id == 0: 69 | # Skip background class. 70 | continue 71 | components = find_largest_components(p_semantic, class_id, 72 | flags.objects) 73 | for component in components: 74 | out[component] = class_id 75 | return out 76 | 77 | 78 | def render_frame(model, batch): 79 | rays_o = torch.tensor(batch['rays_o']).cuda() 80 | rays_d = torch.tensor(batch['rays_d']).cuda() 81 | direction_norms = torch.tensor(batch['direction_norms']).cuda() 82 | depth = torch.tensor(batch['depth']).cuda() 83 | outputs = model.render(rays_o, 84 | rays_d, 85 | direction_norms, 86 | staged=True, 87 | perturb=False, 88 | num_steps=512, 89 | upsample_steps=0) 90 | return outputs['semantic'].argmax(dim=-1).cpu().numpy() 91 | 92 | 93 | def export_labels(flags, scene): 94 | if scene[-1] == os.path.sep: 95 | scene = scene[:-1] 96 | scene_name = os.path.basename(scene) 97 | if flags.workspace is not None: 98 | model_dir = os.path.join(flags.workspace, scene_name) 99 | else: 100 | model_dir = os.path.join(scene, 'nerf') 101 | models = os.listdir(model_dir) 102 | if len(models) > 1: 103 | print( 104 | f"Warning: scene {scene} has more than 1 model directory. Using {models[0]}." 105 | ) 106 | elif len(models) == 0: 107 | print(f"Warning: scene {scene} has no trained models. 
Skipping.") 108 | return 109 | model_dir = os.path.join(model_dir, models[0]) 110 | model_params = model_utils.read_params(model_dir) 111 | 112 | frame_size = lookup_frame_size(scene) 113 | 114 | dataset = SceneDataset('train', 115 | scene, 116 | size=frame_size, 117 | batch_size=16384, 118 | features=model_params.features, 119 | load_semantic=False) 120 | 121 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 122 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 123 | n_classes, model_params).cuda() 124 | model = model.eval() 125 | model_utils.load_checkpoint(model, os.path.join(model_dir, 'checkpoints')) 126 | 127 | output_path = os.path.join(scene, 'output', 'semantic') 128 | os.makedirs(output_path, exist_ok=True) 129 | 130 | with torch.inference_mode(): 131 | with torch.cuda.amp.autocast(enabled=True): 132 | for frame_index, rgb_path in zip(tqdm(dataset.indices), 133 | dataset.scene.rgb_paths()): 134 | batch = dataset._get_test(frame_index) 135 | frame = render_frame(model, batch) 136 | 137 | if flags.objects is not None: 138 | frame = post_process(flags, frame) 139 | 140 | frame_name = os.path.splitext(os.path.basename(rgb_path))[0] 141 | frame_path = os.path.join(output_path, f"{frame_name}.png") 142 | cv2.imwrite(frame_path, frame) 143 | 144 | 145 | def main(): 146 | flags = read_args() 147 | 148 | for scene in flags.scenes: 149 | export_labels(flags, scene) 150 | 151 | 152 | if __name__ == "__main__": 153 | main() 154 | -------------------------------------------------------------------------------- /scripts/train.py: -------------------------------------------------------------------------------- 1 | from argparse import Namespace 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import optim 6 | 7 | from autolabel import model_utils 8 | from autolabel.dataset import SceneDataset, LenDataset 9 | from autolabel.trainer import SimpleTrainer 10 | 11 | 12 | def read_args(): 13 | parser = model_utils.model_flag_parser() 14 | parser.add_argument('scene') 15 | parser.add_argument('--factor-train', type=float, default=2.0) 16 | parser.add_argument('--factor-test', type=float, default=2.0) 17 | parser.add_argument('--batch-size', '-b', type=int, default=4096) 18 | parser.add_argument('--sample-chunk-size', type=int, default=512) 19 | parser.add_argument('--iters', type=int, default=10000) 20 | parser.add_argument('--workers', '-w', type=int, default=1) 21 | parser.add_argument('--eval', action='store_true') 22 | parser.add_argument('--contrastive', action='store_true') 23 | parser.add_argument('--use-semantic', action='store_true') 24 | parser.add_argument('--sam-sampling', 25 | default='proportional', 26 | choices=['proportional', 'uniform', None], 27 | type=str, 28 | help="SAM sampling method.") 29 | parser.add_argument('--slow-center', action='store_true') 30 | parser.add_argument('--cluster-instance-features', action='store_true') 31 | parser.add_argument( 32 | '--workspace', 33 | type=str, 34 | default=None, 35 | help="Save results in this directory instead of the scene directory.") 36 | return parser.parse_args() 37 | 38 | 39 | def main(): 40 | flags = read_args() 41 | 42 | dataset = SceneDataset('train', 43 | flags.scene, 44 | factor=flags.factor_train, 45 | batch_size=flags.batch_size, 46 | sample_chunk_size=flags.sample_chunk_size, 47 | features=flags.features, 48 | load_semantic=flags.use_semantic, 49 | sam_sampling=flags.sam_sampling) 50 | 51 | n_classes = dataset.n_classes if dataset.n_classes is not None else 
2 52 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 53 | n_classes, flags) 54 | 55 | opt = Namespace(rand_pose=-1, 56 | color_space='srgb', 57 | feature_loss=flags.features is not None, 58 | feature_constrastive_learning=flags.contrastive, 59 | rgb_weight=flags.rgb_weight, 60 | depth_weight=flags.depth_weight, 61 | semantic_weight=flags.semantic_weight, 62 | feature_weight=flags.feature_weight, 63 | contrastive_weight=flags.contrastive_weight, 64 | contrastive_temperature=flags.contrastive_temperature, 65 | sam_sampling=flags.sam_sampling is not None, 66 | slow_center=flags.slow_center) 67 | 68 | optimizer = lambda model: torch.optim.Adam([ 69 | { 70 | 'name': 'encoding', 71 | 'params': model.encoder_parameters() 72 | }, 73 | { 74 | 'name': 'net', 75 | 'params': model.network_parameters(), 76 | 'weight_decay': 1e-6 77 | }, 78 | ], 79 | lr=flags.lr, 80 | betas=(0.9, 0.99), 81 | eps=1e-15) 82 | 83 | train_dataloader = torch.utils.data.DataLoader(LenDataset(dataset, 1000), 84 | batch_size=None, 85 | num_workers=flags.workers) 86 | train_dataloader._data = dataset 87 | 88 | criterion = torch.nn.MSELoss(reduction='none') 89 | gamma = 0.5 90 | steps = math.log(1e-4 / flags.lr, gamma) 91 | step_size = max(flags.iters // steps // 1000, 1) 92 | scheduler = lambda optimizer: optim.lr_scheduler.StepLR( 93 | optimizer, gamma=gamma, step_size=step_size) 94 | 95 | epochs = int(np.ceil(flags.iters / 1000)) 96 | model_dir = model_utils.model_dir(flags.scene, flags) 97 | model_utils.write_params(model_dir, flags) 98 | trainer = SimpleTrainer('ngp', 99 | opt, 100 | model, 101 | device='cuda:0', 102 | workspace=model_dir, 103 | optimizer=optimizer, 104 | criterion=criterion, 105 | fp16=True, 106 | ema_decay=0.95, 107 | lr_scheduler=scheduler, 108 | scheduler_update_every_step=False, 109 | metrics=[], 110 | use_checkpoint='latest') 111 | trainer.train(train_dataloader, epochs) 112 | 113 | trainer.save_checkpoint() 114 | 115 | if flags.cluster_instance_features: 116 | del dataset 117 | dataset = SceneDataset('test', 118 | flags.scene, 119 | factor=4.0, 120 | batch_size=8182, 121 | lazy=True) 122 | trainer.compute_instance_centers(dataset) 123 | trainer.save_instance_centers(save_cluster=True) 124 | 125 | if flags.eval: 126 | testset = SceneDataset('test', 127 | flags.scene, 128 | factor=flags.factor_test, 129 | batch_size=flags.batch_size * 2) 130 | test_dataloader = torch.utils.data.DataLoader(LenDataset( 131 | testset, testset.rotations.shape[0]), 132 | batch_size=None, 133 | num_workers=0) 134 | trainer.evaluate(test_dataloader) 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | -------------------------------------------------------------------------------- /scripts/data/convert_arkitscenes.py: -------------------------------------------------------------------------------- 1 | description = """ 2 | This script converts scenes from the ARKitScenes dataset (https://github.com/apple/ARKitScenes) format to 3 | the format used by autolabel. 4 | 5 | Usage: 6 | python scripts/convert_arkitscenes.py --out 7 | 8 | After running this script, scripts/compute_scene_bounds.py needs to be run to compute the scene bounding box. 9 | 10 | This script uses the lowres_wide, lowres_depth, lowres_wide.traj, confidence, lowres_wide_intrinsics parts of the dataset. 11 | 12 | See Apple's instructions here for details https://github.com/apple/ARKitScenes/blob/main/DATA.md. 
13 | 14 | The script to download the ARKitScenes dataset can be found here https://github.com/apple/ARKitScenes/blob/main/download_data.py. 15 | 16 | To download the required parts use it like this: 17 | python download_data.py raw --split Training --video_id_csv depth_upsampling/upsampling_train_val_splits.csv --download_dir /tmp/arkit_scenes/ --raw_dataset_assets lowres_wide lowres_depth lowres_wide.traj confidence lowres_wide_intrinsics 18 | 19 | """ 20 | import argparse 21 | from argparse import RawTextHelpFormatter 22 | import os 23 | import cv2 24 | import numpy as np 25 | from scipy.spatial.transform import Rotation 26 | 27 | 28 | def read_args(): 29 | parser = argparse.ArgumentParser(description=description, 30 | formatter_class=RawTextHelpFormatter) 31 | parser.add_argument('arkit_scenes') 32 | parser.add_argument('--out') 33 | return parser.parse_args() 34 | 35 | 36 | def read_trajectory(path): 37 | return np.loadtxt(path) 38 | 39 | 40 | def extract_name(filename): 41 | return filename.replace('.png', '') 42 | 43 | 44 | def collect_images(dir_path): 45 | filenames = os.listdir(dir_path) 46 | out = {} 47 | for filename in filenames: 48 | name = extract_name(filename) 49 | out[name] = os.path.join(dir_path, filename) 50 | return out 51 | 52 | 53 | def read_intrinsics(dir_path): 54 | intrinsic_files = os.listdir(dir_path) 55 | intrinsic_path = os.path.join(dir_path, intrinsic_files[0]) 56 | _, _, fx, fy, cx, cy = np.loadtxt(intrinsic_path) 57 | C = np.eye(3) 58 | C[0, 0] = fx 59 | C[1, 1] = fy 60 | C[0, 2] = cx 61 | C[1, 2] = cy 62 | return C 63 | 64 | 65 | def to_ts(filename): 66 | _, ts = filename.split('_') 67 | seconds, ms = [int(v) for v in ts.split('.')] 68 | return seconds + ms * 1e-3 69 | 70 | 71 | def find_pose(trajectory, rgb_name): 72 | timestamp = to_ts(rgb_name) 73 | errors = np.abs(trajectory[:, 0] - timestamp) 74 | closest = errors.argmin() 75 | return trajectory[closest], errors[closest] 76 | 77 | 78 | def to_transform(pose): 79 | rotvec = pose[1:4] 80 | translation = pose[4:] 81 | T_CW = np.eye(4) 82 | R_CW = Rotation.from_rotvec(rotvec) 83 | T_CW[:3, :3] = R_CW.as_matrix() 84 | T_CW[:3, 3] = translation 85 | return T_CW 86 | 87 | 88 | def write_scene(flags, scene_name, trajectory, rgb_images, depth_images, 89 | confidence_images, intrinsics): 90 | eps = 1.0 / 90.0 91 | rgb_out = os.path.join(flags.out, scene_name, 'rgb') 92 | depth_out = os.path.join(flags.out, scene_name, 'depth') 93 | pose_out = os.path.join(flags.out, scene_name, 'pose') 94 | os.makedirs(rgb_out, exist_ok=True) 95 | os.makedirs(depth_out, exist_ok=True) 96 | os.makedirs(pose_out, exist_ok=True) 97 | 98 | images = [(n, p) for n, p in rgb_images.items()] 99 | images.sort(key=lambda x: to_ts(x[0])) 100 | for i, (rgb_name, rgb_path_in) in enumerate(images): 101 | print(f"Writing {rgb_name}", end='\r') 102 | if rgb_name not in depth_images or rgb_name not in confidence_images: 103 | print(f"Skipping image {rgb_name}") 104 | continue 105 | 106 | pose, time_diff = find_pose(trajectory, rgb_name) 107 | if time_diff > eps: 108 | print(f"Skipping {rgb_name} due to time diff {time_diff:.03}", 109 | end='\r') 110 | continue 111 | else: 112 | print(f"Including {rgb_name} time diff {time_diff:.03}", end='\r') 113 | 114 | T_CW = to_transform(pose) 115 | 116 | image_name = f"{i:06}" 117 | pose_path = os.path.join(pose_out, image_name + '.txt') 118 | rgb_path = os.path.join(rgb_out, image_name + '.png') 119 | depth_path = os.path.join(depth_out, image_name + '.png') 120 | 121 | rgb = cv2.imread(rgb_path_in, -1) 
122 |         depth = cv2.imread(depth_images[rgb_name], -1)
123 |         confidence = cv2.imread(confidence_images[rgb_name], -1)
124 |         depth[confidence < 2] = 0
125 |         cv2.imwrite(depth_path, depth)
126 |         cv2.imwrite(rgb_path, rgb)
127 |         np.savetxt(pose_path, T_CW)
128 |     np.savetxt(os.path.join(flags.out, scene_name, 'intrinsics.txt'),
129 |                intrinsics)
130 | 
131 | 
132 | def main():
133 |     flags = read_args()
134 | 
135 |     scenes = os.listdir(flags.arkit_scenes)
136 | 
137 |     for scene in scenes:
138 |         traj_file = os.path.join(flags.arkit_scenes, scene, 'lowres_wide.traj')
139 |         confidence_dir = os.path.join(flags.arkit_scenes, scene, 'confidence')
140 |         depth_dir = os.path.join(flags.arkit_scenes, scene, 'lowres_depth')
141 |         rgb_dir = os.path.join(flags.arkit_scenes, scene, 'lowres_wide')
142 |         intrinsics_dir = os.path.join(flags.arkit_scenes, scene,
143 |                                       'lowres_wide_intrinsics')
144 | 
145 |         if not os.path.exists(traj_file) or not os.path.exists(
146 |                 confidence_dir) or not os.path.exists(
147 |                     rgb_dir) or not os.path.exists(intrinsics_dir):
148 |             print(f"Missing files in {scene}")
149 |             continue
150 | 
151 |         trajectory = read_trajectory(traj_file)
152 | 
153 |         rgb_images = collect_images(rgb_dir)
154 |         depth_images = collect_images(depth_dir)
155 |         confidence_images = collect_images(confidence_dir)
156 |         intrinsics = read_intrinsics(intrinsics_dir)
157 | 
158 |         write_scene(flags, scene, trajectory, rgb_images, depth_images,
159 |                     confidence_images, intrinsics)
160 | 
161 | 
162 | if __name__ == "__main__":
163 |     main()
164 | 
--------------------------------------------------------------------------------
/docs/vision-language.md:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | 
3 | ## Installing LSeg
4 | 
5 | In addition to the regular installation instructions, you also need to install LSeg. This can be done by running the following commands with your Python environment loaded.
6 | ```
7 | git clone https://github.com/kekeblom/lang-seg
8 | cd lang-seg
9 | pip install -e .
10 | ```
11 | 
12 | ## Data conversion
13 | 
14 | Follow the instructions in `docs/data.md` to convert scenes from the original datasets into our format.
15 | 
16 | ---
17 | ---
18 | 
19 | # [Neural Implicit Vision-language Feature Fields](https://arxiv.org/abs/2303.10962)
20 | 
21 | Check out branch `lseg`.
22 | 
23 | ## Running ScanNet experiment
24 | 
25 | Use the following commands to compute vision-language features, fit the scene representation and evaluate against the ground truth:
26 | ```
27 | # Train. This has to be run separately for each scene.
28 | python scripts/compute_feature_maps.py <dataset-dir>/<scene> --features lseg --checkpoint <lseg-weights>
29 | python scripts/train.py --features lseg --feature-dim 512 --iters 25000 <dataset-dir>/<scene>
30 | 
31 | # Once trained on all scenes, evaluate.
32 | # 3D queries evaluated against the 3D pointcloud
33 | python scripts/language/evaluate.py --pc --label-map <label-map> --feature-checkpoint <lseg-weights>
34 | # 2D queries against the ground truth semantic segmentation maps
35 | python scripts/language/evaluate.py --label-map <label-map> --feature-checkpoint <lseg-weights>
36 | ```
37 | 
38 | `dataset-dir` is the path to the converted ScanNet scenes, `scene` is the name of the scene, and `lseg-weights` is the path to the LSeg checkpoint.
39 | 
40 | ## Running the real-time ROS node
41 | 
42 | The `scripts/ros/` directory contains ROS nodes which can be used to integrate with a real-time SLAM system. These have been tested under ROS Noetic.
43 | 44 | `scripts/ros/node.py` is the node which listens to keyframes and integrates the volumetric representation as they come in. It listens to the following topics: 45 | - `/slam/rgb` image messages. 46 | - `/slam/depth` depth frames encoded as uint16 values in millimeters. 47 | - `/slam/keyframe` PoseStamped messages which correspond to camera poses for the rgb and depth messages. 48 | - `/slam/camera_info` CameraInfo message containing the intrinsic parameters. 49 | - `/slam/odometry` (optional) PoseStamped messages. Each time a message comes in, it renders an rgb frame and semantic segmentation map which is published at `/autolabel/image` and `/autolabel/features` respectively. 50 | - `/autolabel/segmentation_classes` segmentation class prompts as a String message published by the `class_input.py` node. 51 | 52 | It can be run with `python scripts/ros/node.py --checkpoint -b `. The bound parameter is optional and defaults to 2.5 meters. It defines the size of the volume, extending `bound` meters from `[-bound, -bound, -bound]` to `[bound, bound, bound]` in the x, y and z directions. 53 | 54 | For an implementation of the SLAM node, you can use the ROS node from the [SpectacularAI SDK examples](https://github.com/SpectacularAI/sdk-examples/blob/main/python/oak/mapping_ros.py), in case you have an OAK-D stereo camera. 55 | 56 | `scripts/ros/class_input.py` presents a graphical user interface which can be used to define the segmentation classes used by the ROS node. It published class at `/autolabel/segmentation_classes`. 57 | 58 | --- 59 | --- 60 | 61 | # [Panoptic Vision-Language Feature Fields](https://arxiv.org/abs/2309.05448) 62 | 63 | checkout to branch `panoptic` 64 | 65 | ## Training 66 | To begin the training process, first run the precomputing steps: 67 | 68 | ``` 69 | # compute the vision-language features 70 | python scripts/compute_feature_maps.py / \ 71 | --features lseg \ 72 | --checkpoint \ 73 | --dim 512 74 | 75 | # compute the instance masks using SAM 76 | python scripts/compute_sam_mask.py / \ 77 | --sam-vit-h-checkpoint 78 | ``` 79 | 80 | `dataset-dir` is the path to the scannet converted scenes, `scene` is the name of the scene. `lseg-weights` is the path to the lseg checkpoint. `sam-weights` is the path to the SAM checkpoint (which can be downloaded [here](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth)). 81 | 82 | Then, fit the scene representation using the same training script with additional flags on: 83 | ``` 84 | python scripts/train.py / \ 85 | --batch-size 2048 \ 86 | --iters 20000 \ 87 | --workspace \ 88 | --feature-dim 512 \ 89 | --features lseg \ 90 | --contrastive \ 91 | --sam-sampling \ 92 | --slow-center \ 93 | --cluster-instance-features 94 | ``` 95 | 96 | 97 | `workspace` is the folder where the model is saved. `--contrastive` is the option to train instance feature field using contrastive learning. `--sam-sampling` denotes the strategy to sample the SAM masks for training. The strategies include `proportional`, `uniform` and `None`, where `proportional` means sampling the masks according to their areas, `uniform` means sampling these masks uniformly, and `None` means not using sampling strategy and training with multiple positive pairs. `--slow-center` denotes whether to use "slow center strategy". `--cluster-instance-features` denotes to run the clustering after the training and save the cluster centers together with the clusterer itself. 
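To make the sampling strategies concrete, here is a minimal, hypothetical sketch (not the code used by `scripts/train.py`) of how a batch of SAM masks could be drawn under the `proportional` and `uniform` options; the mask format and the helper name `sample_mask_indices` are assumptions.

```
import numpy as np


def sample_mask_indices(masks, strategy, n_samples, rng=np.random.default_rng(0)):
    """Choose which SAM masks to draw contrastive samples from.

    masks: list of boolean H x W arrays, one per SAM segment (assumed format).
    strategy: 'proportional' weights each mask by its pixel area,
    'uniform' gives every mask the same probability.
    """
    areas = np.array([m.sum() for m in masks], dtype=np.float64)
    if strategy == 'proportional':
        probabilities = areas / areas.sum()
    elif strategy == 'uniform':
        probabilities = np.full(len(masks), 1.0 / len(masks))
    else:
        raise ValueError(f"Unknown sampling strategy: {strategy}")
    return rng.choice(len(masks), size=n_samples, p=probabilities)
```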
98 | 99 | ## Evaluation 100 | 101 | Scene-level Panoptic Quality and 2D Semantic Segmentation 102 | ``` 103 | python scripts/language/evaluate.py \ 104 | --vis \ # the folder to save the visualization results. 105 | --workspace \ 106 | --out \ # the folder to save the evaluation results. 107 | --label-map \ 108 | --feature-checkpoint \ 109 | --panoptic # the flag to evaluate scene-level PQ and 2D semantic segmentation. 110 | # --debug # whether to save the visualization images. 111 | ``` 112 | 113 | 3D Semantic Segmentation (only for ScanNet) 114 | ``` 115 | python scripts/language/evaluate.py \ 116 | --vis \ # the folder to save the visualization results. 117 | --workspace \ 118 | --out \ # the folder to save the evaluation results. 119 | --label-map \ 120 | --feature-checkpoint \ 121 | --pc # the flag to 3D semantic segmentation. 122 | ``` 123 | 124 | -------------------------------------------------------------------------------- /scripts/compute_feature_maps.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import h5py 3 | import numpy as np 4 | import os 5 | import pickle 6 | import math 7 | import torch 8 | from torch.nn import functional as F 9 | from torchvision.io.image import read_image 10 | from PIL import Image 11 | from autolabel.utils import Scene 12 | from autolabel.utils.feature_utils import get_feature_extractor 13 | from autolabel.models import Autoencoder 14 | from sklearn import decomposition 15 | from tqdm import tqdm 16 | 17 | 18 | def read_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('scene') 21 | parser.add_argument('--vis', action='store_true') 22 | parser.add_argument('--video', 23 | type=str, 24 | help="Create video of maps and write to this path.") 25 | parser.add_argument('--features', 26 | type=str, 27 | choices=['fcn50', 'dino', 'lseg']) 28 | parser.add_argument('--checkpoint', 29 | type=str, 30 | help="Which model weights to use.") 31 | parser.add_argument('--dim', type=int, default=64) 32 | parser.add_argument('--autoencode', action='store_true') 33 | return parser.parse_args() 34 | 35 | 36 | def compress_features(features, dim): 37 | features = np.stack(features) 38 | N, H, W, C = features.shape 39 | coder = Autoencoder(C, dim).cuda() 40 | optimizer = torch.optim.Adam(coder.parameters(), lr=1e-3) 41 | dataset = torch.utils.data.TensorDataset( 42 | torch.tensor(features.reshape(N * H * W, C))) 43 | loader = torch.utils.data.DataLoader(dataset, batch_size=2048, shuffle=True) 44 | for _ in range(5): 45 | bar = tqdm(loader) 46 | for batch in bar: 47 | batch = batch[0].cuda() 48 | reconstructed, code = coder(batch) 49 | loss = F.mse_loss(reconstructed, 50 | batch) + 0.01 * torch.abs(code).mean() 51 | bar.set_description(f"Loss: {loss.item()}") 52 | loss.backward() 53 | optimizer.step() 54 | optimizer.zero_grad() 55 | 56 | with torch.inference_mode(): 57 | features_out = np.zeros((N, H, W, dim), dtype=np.float16) 58 | for i, feature in enumerate(features): 59 | feature = torch.tensor(feature).view(H * W, C).cuda() 60 | _, out = coder(feature.view(H * W, C)) 61 | features_out[i] = out.detach().cpu().numpy().reshape(H, W, dim) 62 | return features_out 63 | 64 | 65 | def compute_size(image_path, feature): 66 | image = read_image(image_path) 67 | _, H, W = image.shape 68 | short_side = min(H, W) 69 | if feature in ['fcn50', 'dino']: 70 | target_size = 720 71 | elif feature == 'lseg': 72 | target_size = 242 73 | scale_factor = target_size / short_side 74 | return int(H * 
scale_factor), int(W * scale_factor) 75 | 76 | 77 | def extract_features(extractor, scene, output_file, flags): 78 | paths = scene.rgb_paths() 79 | H, W = compute_size(paths[0], flags.features) 80 | 81 | shape = extractor.shape((H, W)) 82 | dataset = output_file.create_dataset(flags.features, 83 | (len(paths), *shape, flags.dim), 84 | dtype=np.float16, 85 | compression='lzf') 86 | 87 | extracted = [] 88 | with torch.inference_mode(): 89 | batch_size = 2 90 | for i in tqdm(range(math.ceil(len(paths) / batch_size))): 91 | index = slice(i * batch_size, (i + 1) * batch_size) 92 | batch = paths[index] 93 | image = torch.stack([read_image(p) for p in batch]).cuda() 94 | image = F.interpolate(image, [H, W]) 95 | features = extractor(image / 255.).cpu().numpy() 96 | 97 | if flags.autoencode: 98 | extracted += [f for f in features] 99 | else: 100 | dataset[index] = features[..., :flags.dim] 101 | 102 | if flags.autoencode: 103 | features = compress_features(extracted, flags.dim) 104 | dataset[:] = features 105 | 106 | N, H, W, C = dataset[:].shape 107 | X = dataset[:].reshape(N * H * W, C) 108 | pca = decomposition.PCA(n_components=3) 109 | indices = np.random.randint(0, X.shape[0], size=50000) 110 | subset = X[indices] 111 | transformed = pca.fit_transform(subset) 112 | minimum = transformed.min(axis=0) 113 | maximum = transformed.max(axis=0) 114 | diff = maximum - minimum 115 | 116 | dataset.attrs['pca'] = np.void(pickle.dumps(pca)) 117 | dataset.attrs['min'] = minimum 118 | dataset.attrs['range'] = diff 119 | 120 | 121 | def visualize_features(features): 122 | pca = pickle.loads(features.attrs['pca'].tobytes()) 123 | N, H, W, C = features[:].shape 124 | 125 | from matplotlib import pyplot 126 | feature_maps = features[:] 127 | for fm in feature_maps[::10]: 128 | mapped = pca.transform(fm.reshape(H * W, C)).reshape(H, W, 3) 129 | normalized = np.clip( 130 | (mapped - features.attrs['min']) / features.attrs['range'], 0, 1) 131 | pyplot.imshow(normalized) 132 | pyplot.show() 133 | 134 | 135 | def write_video(features, out): 136 | from skvideo.io.ffmpeg import FFmpegWriter 137 | pca = pickle.loads(features.attrs['pca'].tobytes()) 138 | N, H, W, C = features[:].shape 139 | writer = FFmpegWriter(out, 140 | inputdict={'-framerate': '5'}, 141 | outputdict={ 142 | '-c:v': 'libx264', 143 | '-r': '5', 144 | '-pix_fmt': 'yuv420p' 145 | }) 146 | for feature in tqdm(features, desc="Encoding frames"): 147 | mapped = pca.transform(feature.reshape(H * W, C)).reshape(H, W, 3) 148 | normalized = np.clip( 149 | (mapped - features.attrs['min']) / features.attrs['range'], 0, 1) 150 | frame = (normalized * 255.0).astype(np.uint8) 151 | writer.writeFrame(frame) 152 | 153 | 154 | def main(): 155 | flags = read_args() 156 | np.random.seed(0) 157 | torch.manual_seed(0) 158 | 159 | scene = Scene(flags.scene) 160 | output_file = h5py.File(os.path.join(scene.path, 'features.hdf'), 161 | 'w', 162 | libver='latest') 163 | group = output_file.create_group('features') 164 | 165 | extractor = get_feature_extractor(flags.features, flags.checkpoint) 166 | 167 | extract_features(extractor, scene, group, flags) 168 | if flags.vis: 169 | visualize_features(group[flags.features]) 170 | if flags.video: 171 | write_video(group[flags.features], flags.video) 172 | 173 | 174 | if __name__ == "__main__": 175 | main() 176 | -------------------------------------------------------------------------------- /autolabel/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import json 
3 | import numpy as np 4 | import os 5 | from pathlib import Path 6 | 7 | 8 | class Camera: 9 | 10 | def __init__(self, camera_matrix, size): 11 | self.camera_matrix = camera_matrix 12 | self.size = size 13 | 14 | def scale(self, new_size): 15 | scale_x = new_size[0] / self.size[0] 16 | scale_y = new_size[1] / self.size[1] 17 | camera_matrix = self.camera_matrix.copy() 18 | camera_matrix[0, :] = scale_x * self.camera_matrix[0, :] 19 | camera_matrix[1, :] = scale_y * self.camera_matrix[1, :] 20 | return Camera(camera_matrix, new_size) 21 | 22 | @property 23 | def fx(self): 24 | return self.camera_matrix[0, 0] 25 | 26 | @property 27 | def fy(self): 28 | return self.camera_matrix[1, 1] 29 | 30 | @property 31 | def cx(self): 32 | return self.camera_matrix[0, 2] 33 | 34 | @property 35 | def cy(self): 36 | return self.camera_matrix[1, 2] 37 | 38 | @classmethod 39 | def from_path(self, path, size): 40 | return Camera(np.loadtxt(path), size) 41 | 42 | def write(self, path): 43 | np.savetxt(path, self.camera_matrix) 44 | 45 | 46 | class Scene: 47 | 48 | def __init__(self, scene_path): 49 | self.path = scene_path 50 | self.rgb_path = os.path.join(scene_path, 'rgb') 51 | self.raw_rgb_path = os.path.join(scene_path, 'raw_rgb') 52 | self.depth_path = os.path.join(scene_path, 'depth') 53 | self.raw_depth_path = os.path.join(scene_path, 'raw_depth') 54 | self.pose_path = os.path.join(scene_path, 'pose') 55 | self._read_poses() 56 | intrinsics_path = os.path.join(scene_path, 'intrinsics.txt') 57 | image_size = self.peak_image_size() 58 | if os.path.exists(intrinsics_path): 59 | self.camera = Camera.from_path(intrinsics_path, image_size) 60 | self._n_classes = None 61 | self._metadata = None 62 | 63 | def peak_image_size(self): 64 | if os.path.exists(self.raw_rgb_path): 65 | path = self.raw_rgb_path 66 | elif os.path.exists(self.rgb_path): 67 | path = self.rgb_path 68 | else: 69 | raise ValueError("Doesn't appear to be a valid scene.") 70 | image = cv2.imread(os.path.join(path, os.listdir(path)[0])) 71 | return (image.shape[1], image.shape[0]) 72 | 73 | def _read_poses(self): 74 | if not os.path.exists(self.pose_path): 75 | self.poses = [] 76 | return 77 | pose_files = os.listdir(self.pose_path) 78 | pose_files = sorted([p for p in pose_files if p[0] != '.'], 79 | key=lambda p: int(p.split('.')[0])) 80 | self.poses = [] 81 | for pose_file in pose_files: 82 | T_CW = np.loadtxt(os.path.join(self.pose_path, pose_file)) 83 | self.poses.append(T_CW) 84 | 85 | def __iter__(self): 86 | rgb_frames = self.rgb_paths() 87 | depth_frames = self.depth_paths() 88 | for pose, rgb, depth in zip(self.poses, rgb_frames, depth_frames): 89 | yield (pose, rgb, depth) 90 | 91 | def __len__(self): 92 | return len(self.poses) 93 | 94 | def _get_paths(self, directory): 95 | frames = os.listdir(directory) 96 | frames = sorted(frames, key=lambda x: int(x.split('.')[0])) 97 | return [os.path.join(directory, f) for f in frames] 98 | 99 | def rgb_paths(self): 100 | return self._get_paths(self.rgb_path) 101 | 102 | def depth_paths(self): 103 | return self._get_paths(self.depth_path) 104 | 105 | def semantic_paths(self): 106 | return self._get_paths(os.path.join(self.path, 'semantic')) 107 | 108 | def raw_rgb_paths(self): 109 | return self._get_paths(self.raw_rgb_path) 110 | 111 | def raw_depth_paths(self): 112 | return self._get_paths(self.raw_depth_path) 113 | 114 | def gt_semantic(self): 115 | return self._get_paths(os.path.join(self.path, 'gt_semantic')) 116 | 117 | def gt_instance(self): 118 | return 
self._get_paths(os.path.join(self.path, 'gt_instance')) 119 | 120 | def image_names(self): 121 | """ 122 | Returns the filenames of rgb images without file extensions. 123 | """ 124 | rgb_frames = os.listdir(self.rgb_path) 125 | rgb_frames = sorted(rgb_frames, key=lambda x: int(x.split('.')[0])) 126 | return [f.split('.')[0] for f in rgb_frames] 127 | 128 | def bbox(self): 129 | return np.loadtxt(os.path.join(self.path, 'bbox.txt'))[:6].reshape(2, 3) 130 | 131 | def gt_masks(self, size): 132 | """ 133 | Returns a list of numpy arrays of ground truth segmentation masks, 134 | if available. Returns an empty list if no masks have been annotated. 135 | size: the desired size for the masks. 136 | returns: list of H x W numpy arrays 137 | """ 138 | gt_masks_dir = os.path.join(self.path, 'gt_masks') 139 | if not os.path.exists(gt_masks_dir): 140 | return [] 141 | masks = [] 142 | mask_files = [ 143 | os.path.join(gt_masks_dir, f) for f in os.listdir(gt_masks_dir) 144 | ] 145 | for mask_file in mask_files: 146 | frame_number = int(os.path.basename(mask_file).split('.')[0]) 147 | mask = _read_gt_mask(mask_file, size) 148 | masks.append((frame_number, _read_gt_mask(mask_file, size))) 149 | return sorted(masks, key=lambda x: x[0]) 150 | 151 | def depth_size(self): 152 | """ 153 | Return: the size (width, height) of the depth images. 154 | """ 155 | depth_paths = self.raw_depth_paths() 156 | if len(depth_paths) == 0: 157 | depth_paths = self.depth_paths() 158 | image = cv2.imread(depth_paths[0], -1) 159 | return (image.shape[1], image.shape[0]) 160 | 161 | @property 162 | def metadata(self): 163 | if self._metadata is None: 164 | metadata_path = os.path.join(self.path, 'metadata.json') 165 | if not os.path.exists(metadata_path): 166 | return None 167 | with open(metadata_path) as f: 168 | self._metadata = json.load(f) 169 | return self._metadata 170 | 171 | @property 172 | def n_classes(self): 173 | if self._n_classes is None: 174 | self._n_classes = self.metadata['n_classes'] 175 | return self._n_classes 176 | 177 | 178 | def transform_points(T, points): 179 | R = T[:3, :3] 180 | t = T[:3, 3] 181 | return (R @ points[..., :, None])[..., :, 0] + t 182 | 183 | 184 | def _read_gt_mask(path, size): 185 | image = np.zeros((size[1], size[0]), dtype=np.uint8) 186 | with open(path, 'rt') as f: 187 | data = json.load(f) 188 | scaling_factor = np.array( 189 | [size[0] / data['imageWidth'], size[1] / data['imageHeight']]) 190 | for shape in data['shapes']: 191 | polygon = (np.stack(shape['points']) * scaling_factor).astype(np.int32) 192 | #TODO: handle multiple classes. 193 | image = cv2.fillPoly(image, polygon[None], 1) 194 | return image 195 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Panoptic Vision-Language Feature Fields
2 | 
3 | 
4 | Haoran Chen,
5 | Kenneth Blomqvist,
6 | Francesco Milano, Roland Siegwart
7 | 
8 | 
9 | IEEE RA-L 2024
10 | [Paper](https://arxiv.org/abs/2309.05448) | Video | Project Page
11 | 
12 | 
13 | 
14 | ![Panoptic Vision-Language Feature Fields](assets/teaser.jpg)
15 | 
16 | 

17 | 18 | Recently, methods have been proposed for 3D _open-vocabulary_ semantic segmentation. Such methods are able to segment scenes into arbitrary classes based on text descriptions provided during runtime. In this paper, we propose to the best of our knowledge the first algorithm for _open-vocabulary panoptic_ segmentation in 3D scenes. Our algorithm, Panoptic Vision-Language Feature Fields (PVLFF), learns a semantic feature field of the scene by distilling vision-language features from a pretrained 2D model, and jointly fits an instance feature field through contrastive learning using 2D instance segments on input frames. Despite not being trained on the target classes, our method achieves panoptic segmentation performance similar to the state-of-the-art _closed-set_ 3D systems on the HyperSim, ScanNet and Replica dataset and additionally outperforms current 3D open-vocabulary systems in terms of semantic segmentation. We ablate the components of our method to demonstrate the effectiveness of our model architecture. 19 | 20 | ## Table of Contents 21 | 22 | 1. [Installation](#installation) 23 | 2. [Running experiments](#running-experiments) 24 | 3. [Citation](#citation) 25 | 4. [Acknowledgements](#acknowledgements) 26 | 27 | ## Installation 28 | 29 | The installation instructions were tested for Python 3.8, 3.9 and 3.10. Some dependencies are recommended to be installed through Anaconda and we assume you are using an Anaconda environment for these instructions. 30 | 31 | The software uses CUDA and compiling `tiny-cuda-nn` requires `nvcc`. If you don't have CUDA >= version 11.3, including `nvcc`, installed on your system, you can install it in your anaconda env with: 32 | ``` 33 | conda install -c conda-forge cudatoolkit-dev=11.4 34 | ``` 35 | 36 | To install PyTorch and ffmpeg, run: 37 | ``` 38 | conda install pytorch torchvision cudatoolkit=11.3 -c pytorch 39 | conda install ffmpeg 40 | ``` 41 | 42 | Install into your desired Python environment with the following commands: 43 | ``` 44 | pip install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch 45 | 46 | git submodule update --init --recursive 47 | pushd torch_ngp 48 | git submodule update --init --recursive 49 | pip install -e . 50 | bash scripts/install_ext.sh 51 | popd 52 | 53 | # To use LSeg features for vision-language feature fields. 54 | git clone https://github.com/kekeblom/lang-seg 55 | pushd lang-seg 56 | pip install -e . 57 | popd 58 | 59 | # Finally install Autolabel. 60 | pip install -e . 61 | ``` 62 | 63 | 64 | ## Running experiments 65 | 66 | ### Data conversion 67 | 68 | Follow the instructions in `docs/data.md` to convert the scenes from original datasets into the our format. 69 | 70 | ### Training 71 | To begin the training process, first run the precomputing steps: 72 | 73 | ``` 74 | # Compute the vision-language features. 75 | python scripts/compute_feature_maps.py / \ 76 | --features lseg \ 77 | --checkpoint \ 78 | --dim 512 79 | 80 | # Compute the instance masks using SAM. 81 | python scripts/compute_sam_mask.py / \ 82 | --sam-vit-h-checkpoint 83 | ``` 84 | 85 | where `` is the path to the converted scenes, `` is the name of the scene. `` is the path to the LSeg checkpoint, `` is the path to the SAM checkpoint (which can be downloaded [here](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth)). 
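Both precompute scripts operate on one scene at a time, so it can be convenient to drive them from a small loop. The following is only an illustrative sketch, assuming a directory of converted scenes and checkpoint paths of your own choosing; it simply shells out to the two commands above.

```
import os
import subprocess

# Assumed paths: point these at your converted scenes and downloaded checkpoints.
dataset_dir = "/data/replica_converted"
lseg_checkpoint = "/checkpoints/lseg.ckpt"
sam_checkpoint = "/checkpoints/sam_vit_h_4b8939.pth"

for scene in sorted(os.listdir(dataset_dir)):
    scene_path = os.path.join(dataset_dir, scene)
    if not os.path.isdir(scene_path):
        continue
    # Vision-language feature maps, written to <scene>/features.hdf.
    subprocess.run([
        "python", "scripts/compute_feature_maps.py", scene_path,
        "--features", "lseg", "--checkpoint", lseg_checkpoint, "--dim", "512",
    ], check=True)
    # SAM instance masks used for the contrastive instance feature field.
    subprocess.run([
        "python", "scripts/compute_sam_mask.py", scene_path,
        "--sam-vit-h-checkpoint", sam_checkpoint,
    ], check=True)
```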
86 | 87 | Then, fit the scene representation using the following training script: 88 | ``` 89 | python scripts/train.py / \ 90 | --batch-size 2048 \ 91 | --iters 20000 \ 92 | --workspace \ 93 | --feature-dim 512 \ 94 | --features lseg \ 95 | --contrastive \ 96 | --sam-sampling \ 97 | --slow-center \ 98 | --cluster-instance-features 99 | ``` 100 | 101 | 102 | where `` is the folder where the model is saved. The flag `--contrastive` enables training the instance feature field using contrastive learning. The flag `--sam-sampling` sets the strategy to sample the SAM masks for training; the strategies include `proportional` and `uniform`, where `proportional` means sampling the masks according to their areas, and `uniform` means sampling these masks uniformly. The flag `--slow-center` enables the use of "slow center strategy". The flag `--cluster-instance-features` enables running the clustering after the training and saving the cluster centers together with the object instance of the HDBSCAN clustering class. 103 | 104 | Here we provide some [checkpoints](https://doi.org/10.3929/ethz-b-000656499) trained on Replica scenes. 105 | 106 | ### Inference 107 | Render the scene views after training by running the following script: 108 | ``` 109 | python scripts/render.py / \ 110 | --stride 1 \ 111 | --model-dir \ 112 | --out \ 113 | --checkpoint \ 114 | --label-map 115 | ``` 116 | where `` is the folder where the model checkpoint is saved (e.g. `//g15_hg+freq_lseg_rgb1.0_d0.1_s1.0_f0.5_c0.1`). `` sets the output folder where the rendered results are saved. `` is the label mapping from id to semantic class of the scene (here is an example [label-map](./configs/label_map.csv) file that we used for replica scenes). 117 | 118 | ### Evaluation 119 | 120 | Scene-level Panoptic Quality and 2D Semantic Segmentation 121 | ``` 122 | python scripts/language/evaluate.py \ 123 | --vis \ # the folder to save the visualization results. 124 | --workspace \ 125 | --out \ # the folder to save the evaluation results. 126 | --label-map \ 127 | --feature-checkpoint \ 128 | --panoptic # the flag to evaluate scene-level PQ and 2D semantic segmentation. 129 | # --debug # whether to save the visualization images. 130 | ``` 131 | 132 | 3D Semantic Segmentation (only for ScanNet) 133 | ``` 134 | python scripts/language/evaluate.py \ 135 | --vis \ # the folder to save the visualization results. 136 | --workspace \ 137 | --out \ # the folder to save the evaluation results. 138 | --label-map \ 139 | --feature-checkpoint \ 140 | --pc # the flag to 3D semantic segmentation. 141 | ``` 142 | 143 | ### 3D interactive segmentation 144 | We provide a demo UI script of interactive open-vocabulary segmentation on pointclouds of ScanNet scenes. 145 | ``` 146 | python scripts/demo_ui.py / \ 147 | --workspace \ 148 | --checkpoint 149 | ``` 150 | 151 | https://github.com/ethz-asl/pvlff/assets/33897834/1c31a03a-c7e9-43dc-af83-de1cf471893e 152 | 153 | ## Citation 154 | 155 | If you find our code or paper useful, please cite: 156 | 157 | ```bibtex 158 | @article{Chen2024PVLFF, 159 | author = {Chen, Haoran and Blomqvist, Kenneth and Milano, Francesco and Siegwart, Roland}, 160 | title = {Panoptic Vision-Language Feature Fields}, 161 | journal = {IEEE Robotics and Automation Letters (RA-L)}, 162 | volume = {9}, 163 | number = {3}, 164 | pages = {2144--2151}, 165 | year = {2024} 166 | } 167 | ``` 168 | 169 | ## Acknowledgements 170 | 171 | A large part of the code is based on [Autolabel](https://github.com/ethz-asl/autolabel): 172 | 173 | - K. 
Blomqvist, L. Ott, J. J. Chung, and R. Siegwart, "Baking in the Feature: Accelerating Volumetric Segmentation by Rendering Feature Maps", in IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2023 - [Link](https://keke.dev/baking-in-the-feature) 174 | 175 | - K. Blomqvist, F. Milano, J. J. Chung, L. Ott, and R. Siegwart, "Neural Implicit Vision-Language Feature Fields", in IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2023 - [Link](https://arxiv.org/abs/2303.10962) 176 | 177 | Our code uses our customized version of [`torch-ngp`](https://github.com/ashawkey/torch-ngp) as the underlying NeRF framework. Big thanks to [Jiaxiang Tang](https://me.kiui.moe/) for releasing the initial implementation. 178 | -------------------------------------------------------------------------------- /scripts/render.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import pandas 4 | import cv2 5 | import os 6 | import pickle 7 | from tqdm import tqdm 8 | import torch 9 | 10 | from autolabel.dataset import SceneDataset 11 | from autolabel import model_utils 12 | from autolabel import visualization 13 | from autolabel.utils.feature_utils import get_feature_extractor 14 | from pathlib import Path 15 | from sklearn import decomposition 16 | from sklearn.metrics.pairwise import cosine_similarity 17 | 18 | 19 | def read_args(): 20 | parser = model_utils.model_flag_parser() 21 | parser.add_argument('scene') 22 | parser.add_argument('--fps', type=int, default=5) 23 | parser.add_argument('--stride', type=int, default=1) 24 | parser.add_argument('--model-dir', type=str, required=True) 25 | parser.add_argument( 26 | '--max-depth', 27 | type=float, 28 | default=7.5, 29 | help="The maximum depth used in colormapping the depth frames.") 30 | parser.add_argument('--checkpoint', type=str) 31 | parser.add_argument('--out', 32 | type=str, 33 | required=True, 34 | help="Where to save the video.") 35 | parser.add_argument('--classes', 36 | default=None, 37 | type=str, 38 | nargs='+', 39 | help="Which classes to segment the scene into.") 40 | parser.add_argument('--label-map', 41 | default=None, 42 | type=str, 43 | help="Path to list of labels.") 44 | return parser.parse_args() 45 | 46 | 47 | class FeatureTransformer: 48 | 49 | def __init__(self, scene_path, feature_name, classes, checkpoint=None, without_features=False): 50 | if not without_features: 51 | with h5py.File(os.path.join(scene_path, 'features.hdf'), 'r') as f: 52 | features = f[f'features/{feature_name}'] 53 | blob = features.attrs['pca'].tobytes() 54 | self.pca = pickle.loads(blob) 55 | self.feature_min = features.attrs['min'] 56 | self.feature_range = features.attrs['range'] 57 | self.first_fit = False 58 | else: 59 | self.pca = decomposition.PCA(n_components=3) 60 | self.feature_min = None 61 | self.feature_range = None 62 | self.first_fit = True 63 | 64 | 65 | if feature_name is not None: 66 | extractor = get_feature_extractor(feature_name, checkpoint) 67 | self.text_features = self._encode_text(extractor, classes) 68 | 69 | def _encode_text(self, extractor, text): 70 | return extractor.encode_text(text) 71 | 72 | def __call__(self, p_features): 73 | H, W, C = p_features.shape 74 | if self.first_fit: 75 | features = self.pca.fit_transform(p_features.reshape(H * W, C)) 76 | self.first_fit = False 77 | else: 78 | features = self.pca.transform(p_features.reshape(H * W, C)) 79 | 80 | if (self.feature_min is not None) and 
(self.feature_range is not None): 81 | features = np.clip((features - self.feature_min) / self.feature_range, 82 | 0., 1.) 83 | else: 84 | features = np.clip((features - np.min(features)) / (np.max(features) - np.min(features)), 85 | 0., 1.) 86 | return (features.reshape(H, W, 3) * 255.).astype(np.uint8) 87 | 88 | 89 | def compute_semantics(outputs, classes, feature_transform): 90 | if classes is not None: 91 | features = outputs['semantic_features'] 92 | features = (features / torch.norm(features, dim=-1, keepdim=True)) 93 | text_features = feature_transform.text_features 94 | H, W, D = features.shape 95 | C = text_features.shape[0] 96 | similarities = torch.zeros((H, W, C), dtype=features.dtype) 97 | for i in range(H): 98 | similarities[i, :, :] = (features[i, :, None] * 99 | text_features).sum(dim=-1).cpu() 100 | return similarities.argmax(dim=-1) 101 | else: 102 | return outputs['semantic'].argmax(dim=-1).cpu().numpy() 103 | 104 | def compute_instances(outputs, feature_centers): 105 | instance_feature = outputs['contrastive_features'].cpu().numpy() 106 | image_height, image_width, feature_dim = instance_feature.shape 107 | instance_feature = instance_feature.reshape(-1, feature_dim) 108 | sim_mat = cosine_similarity(instance_feature, feature_centers) 109 | pred_instance = np.argmax(sim_mat, axis=1) 110 | pred_instance = pred_instance.reshape(image_height, image_width) 111 | return pred_instance 112 | 113 | def render(model, 114 | batch, 115 | feature_transform, 116 | semantic_color_map, 117 | instance_color_map, 118 | size=(480, 360), 119 | maxdepth=10.0, 120 | classes=None, 121 | con_feature_transform=None): 122 | rays_o = torch.tensor(batch['rays_o']).cuda() 123 | rays_d = torch.tensor(batch['rays_d']).cuda() 124 | direction_norms = torch.tensor(batch['direction_norms']).cuda() 125 | outputs = model.render(rays_o, 126 | rays_d, 127 | direction_norms, 128 | staged=True, 129 | perturb=False, 130 | num_steps=512, 131 | upsample_steps=0) 132 | p_semantic = compute_semantics(outputs, classes, feature_transform) 133 | p_instance = compute_instances(outputs, model.instance_centers) 134 | frame = np.zeros((2 * size[1], 3 * size[0], 3), dtype=np.uint8) 135 | h_mid = size[1] 136 | w_ot, w_tt = size[0], size[0] * 2 137 | p_rgb = (outputs['image'].cpu().numpy() * 255.0).astype(np.uint8) 138 | p_depth = outputs['depth'] 139 | frame[:h_mid, :w_ot, :] = p_rgb 140 | frame[h_mid:, :w_ot] = visualization.visualize_depth( 141 | p_depth.cpu().numpy(), maxdepth=maxdepth)[:, :, :3] 142 | frame[:h_mid, w_tt:] = semantic_color_map[p_semantic] 143 | frame[h_mid:, w_tt:] = instance_color_map[p_instance] 144 | 145 | if feature_transform is not None: 146 | p_features = feature_transform( 147 | outputs['semantic_features'].cpu().numpy()) 148 | frame[:h_mid, w_ot:w_tt] = p_features 149 | 150 | if con_feature_transform is not None: 151 | p_con_features = con_feature_transform( 152 | outputs['contrastive_features'].cpu().numpy()) 153 | frame[h_mid:, w_ot:w_tt] = p_con_features 154 | 155 | return frame 156 | 157 | 158 | def main(): 159 | flags = read_args() 160 | model_params = model_utils.read_params(flags.model_dir) 161 | 162 | view_size = (480, 360) 163 | dataset = SceneDataset('test', 164 | flags.scene, 165 | size=view_size, 166 | batch_size=16384, 167 | features=model_params.features, 168 | load_semantic=False, 169 | lazy=True) 170 | 171 | classes = flags.classes 172 | if flags.label_map is not None: 173 | label_map = pandas.read_csv(flags.label_map) 174 | classes = label_map['prompt'].values 175 | 
semantic_color_map = (np.random.rand(len(classes), 3) * 255).astype(np.uint8) 176 | 177 | feature_transform = None 178 | if model_params.features is not None: 179 | feature_transform = FeatureTransformer(flags.scene, 180 | model_params.features, classes, 181 | flags.checkpoint) 182 | 183 | con_feature_transform = FeatureTransformer(flags.scene, 184 | None, classes, 185 | without_features=True) 186 | 187 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 188 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 189 | n_classes, model_params).cuda() 190 | model = model.eval() 191 | model_utils.load_checkpoint(model, 192 | os.path.join(flags.model_dir, 'checkpoints')) 193 | 194 | instance_color_map = (np.random.rand(model.instance_centers.shape[0], 3) * 255).astype(np.uint8) 195 | 196 | Path(flags.out).mkdir(exist_ok=True, parents=True) 197 | with torch.inference_mode(): 198 | with torch.cuda.amp.autocast(enabled=True): 199 | for frame_index in tqdm(dataset.indices[::flags.stride]): 200 | batch = dataset._get_test(frame_index) 201 | frame = render(model, 202 | batch, 203 | feature_transform, 204 | semantic_color_map=semantic_color_map, 205 | instance_color_map=instance_color_map, 206 | size=view_size, 207 | maxdepth=flags.max_depth, 208 | classes=classes, 209 | con_feature_transform=con_feature_transform) 210 | cv2.imwrite( 211 | os.path.join(flags.out, f"{frame_index}.png"), 212 | cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 213 | ) 214 | 215 | 216 | if __name__ == "__main__": 217 | main() 218 | -------------------------------------------------------------------------------- /scripts/convert_to_instant_ngp.py: -------------------------------------------------------------------------------- 1 | """Converts .txt world-to-camera poses created by `autolabel` to transforms.json 2 | files that can be used in `instant-ngp`/`torch-ngp`. 3 | 4 | A large part of this code is based on `instant-ngp/scripts/colmap2nerf.py` from 5 | https://github.com/NVlabs/instant-ngp. 6 | """ 7 | import argparse 8 | import cv2 9 | import glob 10 | import json 11 | import math 12 | import numpy as np 13 | import os 14 | 15 | 16 | def variance_of_laplacian(image): 17 | return cv2.Laplacian(image, cv2.CV_64F).var() 18 | 19 | 20 | def sharpness(image_path): 21 | image = cv2.imread(image_path) 22 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 23 | fm = variance_of_laplacian(gray) 24 | return fm 25 | 26 | 27 | def rotmat(a, b): 28 | a, b = a / np.linalg.norm(a), b / np.linalg.norm(b) 29 | v = np.cross(a, b) 30 | c = np.dot(a, b) 31 | s = np.linalg.norm(v) 32 | kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]]) 33 | return np.eye(3) + kmat + kmat.dot(kmat) * ((1 - c) / (s**2 + 1e-10)) 34 | 35 | 36 | def closest_point_2_lines(oa, da, ob, db): 37 | r"""Returns point closest to both rays of form o+t*d, and a weight factor 38 | that goes to 0 if the lines are parallel. 39 | """ 40 | da = da / np.linalg.norm(da) 41 | db = db / np.linalg.norm(db) 42 | c = np.cross(da, db) 43 | denom = np.linalg.norm(c)**2 44 | t = ob - oa 45 | ta = np.linalg.det([t, db, c]) / (denom + 1e-10) 46 | tb = np.linalg.det([t, da, c]) / (denom + 1e-10) 47 | if ta > 0: 48 | ta = 0 49 | if tb > 0: 50 | tb = 0 51 | return (oa + ta * da + ob + tb * db) * 0.5, denom 52 | 53 | 54 | parser = argparse.ArgumentParser() 55 | 56 | parser.add_argument( 57 | '--dataset_folder', 58 | type=str, 59 | required=True, 60 | help= 61 | ("Path to the dataset folder. 
It is expected to contain a `rgb` subfolder " 62 | "with .png images, a `pose` subfolder with world-to-camera poses as .txt " 63 | "files, each corresponding to an image in `rgb`, and an `intrinsics.txt` " 64 | "file. A `transforms.json` file will be created in it.")) 65 | 66 | args = parser.parse_args() 67 | 68 | _aabb_scale = 8 69 | _dataset_folder = args.dataset_folder 70 | _image_folder = os.path.join(_dataset_folder, "rgb") 71 | _pose_folder = os.path.join(_dataset_folder, "pose") 72 | _intrinsics_file_path = os.path.join(_dataset_folder, "intrinsics.txt") 73 | _output_transform_file = os.path.join(_dataset_folder, "transforms.json") 74 | # List of supported image extensions. 75 | _image_extensions = ["png", "jpg", "jpeg"] 76 | 77 | if (not os.path.exists(_image_folder)): 78 | raise (OSError(f"The image folder '{_image_folder}' could not be found.")) 79 | if (not os.path.exists(_pose_folder)): 80 | raise (OSError(f"The pose folder '{_pose_folder}' could not be found.")) 81 | if (not os.path.exists(_intrinsics_file_path)): 82 | raise (OSError(f"The intrinsics file '{_intrinsics_file_path}' could not " 83 | "be found.")) 84 | if (os.path.exists(_output_transform_file)): 85 | raise (OSError( 86 | f"The output transform file '{_output_transform_file}' " 87 | "already exists. Please remove it or rename to avoid overriding it.")) 88 | 89 | # Find the actual extension of the input images and verify that there is exactly 90 | # one pose for each image. 91 | curr_image_extension_idx = 0 92 | image_list = [] 93 | while len( 94 | image_list) == 0 and curr_image_extension_idx < len(_image_extensions): 95 | image_extension = _image_extensions[curr_image_extension_idx] 96 | image_list = sorted( 97 | glob.glob(os.path.join(_image_folder, f"*.{image_extension}"))) 98 | curr_image_extension_idx += 1 99 | assert (len(image_list) > 0), f"Found no images in '{_image_folder}'." 100 | pose_list = sorted(glob.glob(os.path.join(_pose_folder, "*.txt"))) 101 | assert ( 102 | [os.path.basename(f).split(f'.{image_extension}')[0] for f in image_list 103 | ] == [os.path.basename(f).split('.txt')[0] for f in pose_list] 104 | ), f"Found non-matching images-poses in '{_image_folder}' and '{_pose_folder}'." 105 | 106 | # Read an example image to find the image dimensions. 107 | example_image = cv2.imread(image_list[0]) 108 | H, W = example_image.shape[:2] 109 | 110 | # Read the camera intrinsics. NOTE: A pinhole camera is assumed. 111 | K = np.loadtxt(_intrinsics_file_path) 112 | f_x = K[0, 0] 113 | f_y = K[1, 1] 114 | c_x = K[0, 2] 115 | c_y = K[1, 2] 116 | 117 | angle_x = math.atan(W / (f_x * 2)) * 2 118 | angle_y = math.atan(H / (f_y * 2)) * 2 119 | 120 | # Bottom and up vectors. 121 | bottom = np.array([0.0, 0.0, 0.0, 1.0]).reshape([1, 4]) 122 | up = np.zeros(3) 123 | out = { 124 | "camera_angle_x": angle_x, 125 | "camera_angle_y": angle_y, 126 | "f_x": f_x, 127 | "f_y": f_y, 128 | "k1": 0.0, 129 | "k2": 0.0, 130 | "p1": 0.0, 131 | "p2": 0.0, 132 | "cx": c_x, 133 | "cy": c_y, 134 | "w": W, 135 | "h": H, 136 | "aabb_scale": _aabb_scale, 137 | "frames": [], 138 | } 139 | 140 | print( 141 | f"\033[94mCreating output transform file '{_output_transform_file}'.\033[0m" 142 | ) 143 | 144 | for image_file_path, pose_file_path in zip(image_list, pose_list): 145 | image_rel = os.path.relpath(_image_folder) 146 | relative_image_file_path = f"./rgb/{os.path.basename(image_file_path)}" 147 | sharpness_value = sharpness(image_file_path) 148 | # Read world-to-camera pose. 
149 | T_CW = np.loadtxt(pose_file_path).reshape(4, 4) 150 | T_WC = np.linalg.inv(T_CW) 151 | # Apply transformations required by the NeRF convention. 152 | # - Flip the y and z axes. 153 | T_WC[0:3, 2] *= -1 154 | T_WC[0:3, 1] *= -1 155 | # - Swap y and z. 156 | T_WC = T_WC[[1, 0, 2, 3], :] 157 | # - Flip the whole world upside down. 158 | T_WC[2, :] *= -1 159 | 160 | # Update the up vector using the original z axis. 161 | up += T_WC[0:3, 1] 162 | 163 | frame = { 164 | "file_path": relative_image_file_path, 165 | "sharpness": sharpness_value, 166 | "transform_matrix": T_WC 167 | } 168 | out["frames"].append(frame) 169 | num_frames = len(out["frames"]) 170 | up = up / np.linalg.norm(up) 171 | print(f"Found up vector {up}") 172 | 173 | # Rotate up vector to [0, 0, 1]. 174 | R = rotmat(up, [0, 0, 1]) 175 | R = np.pad(R, [0, 1]) 176 | R[-1, -1] = 1 177 | 178 | # Rotate the transforms so that the up vector is the z axis. 179 | for f in out["frames"]: 180 | f["transform_matrix"] = np.matmul(R, f["transform_matrix"]) 181 | 182 | # Find a central point all cameras are looking at. 183 | print("Computing center of attention...") 184 | total_weight = 0.0 185 | center_point = np.array([0.0, 0.0, 0.0]) 186 | for f in out["frames"]: 187 | mf = f["transform_matrix"][0:3, :] 188 | for g in out["frames"]: 189 | mg = g["transform_matrix"][0:3, :] 190 | p, W = closest_point_2_lines(mf[:, 3], mf[:, 2], mg[:, 3], mg[:, 2]) 191 | if W > 0.01: 192 | center_point += p * W 193 | total_weight += W 194 | center_point /= total_weight 195 | # Translate the cameras so that the world origin coincides with the central 196 | # point computed above. 197 | for f in out["frames"]: 198 | f["transform_matrix"][0:3, 3] -= center_point 199 | 200 | # Scale the world coordinate frame (i.e., scale the translation part of the 201 | # camera-to-world transforms) so that the scene fits within a "standard NeRF" 202 | # size. 203 | # In practice: 204 | # - `scale` is a value that gets multiplied to the translation part of the 205 | # poses when training, and scales the scene to a "standard NeRF size". 206 | # - Denoting as UOM the unit of measure of the training coordinates 207 | # resulting from the above scaling, the equivalent in meters of 1 UOM is 208 | # given by the value of one_uom_scene_to_one_m. 209 | # - During training, the pipeline will assume the scene to be bounded within 210 | # a cube [-bound, bound]^3 centered at the scene center (e.g., the object 211 | # center for object-centric scenes), where `bound` is a parameter that can 212 | # be set. This means that the scene will be assumed to be contained within 213 | # a (L1) distance of: 214 | # 215 | # bound [UOM] 216 | # = (bound * one_uom_scene_to_one_m) [m] 217 | # = (bound * 1 / scale) [m] 218 | # = bound * 1 / (1. / (avg_len[m])) 219 | # = (bound * avg_len) [m], 220 | # 221 | # where `avg_len[m]` is the average distance of the camera origins from 222 | # the scene center in meters. As an example, for an average distance of 80 cm, 223 | # the size of the cube containing the scene would be bound * 80 [cm]. 224 | # Setting `scale` to be equal to 1.0 / avg_len is an arbitrary decision to 225 | # fit the scene properly. Originally, it was 4.0 / avg_len in `instant-ngp`, 226 | # where it was described as scaling the scene to be "NeRF-sized". 227 | # The `bound` parameter can be set in the training pipeline. 228 | avg_len = 0. 
229 | for f in out["frames"]: 230 | avg_len += np.linalg.norm(f["transform_matrix"][0:3, 3]) 231 | avg_len /= num_frames 232 | 233 | scale = 1.0 / avg_len 234 | one_uom_scene_to_one_m = 1.0 / scale 235 | print(f"\033[94mAverage camera distance from origin = {avg_len} m (NOTE: " 236 | "Assuming the input UOM of the transforms was meters, which is the case " 237 | "when using `autolabel` to extract the poses).\033[0m") 238 | 239 | # Write the transforms to file. 240 | for f in out["frames"]: 241 | f["transform_matrix"] = f["transform_matrix"].tolist() 242 | 243 | out["scale"] = scale 244 | out["one_uom_scene_to_one_m"] = one_uom_scene_to_one_m 245 | 246 | with open(_output_transform_file, "w") as outfile: 247 | json.dump(out, outfile, indent=4) -------------------------------------------------------------------------------- /scripts/data/convert_replica.py: -------------------------------------------------------------------------------- 1 | """ 2 | Converts rendered replica scenes from https://github.com/Harry-Zhi/semantic_nerf 3 | to the autolabel scene format. 4 | 5 | usage: 6 | python scripts/data/convert_replica.py --out 7 | """ 8 | import pandas 9 | import argparse 10 | import cv2 11 | import json 12 | import tempfile 13 | import math 14 | import numpy as np 15 | import open3d as o3d 16 | import os 17 | import shutil 18 | import subprocess 19 | from tqdm import tqdm 20 | 21 | from autolabel.utils import Scene, transform_points 22 | 23 | 24 | def read_args(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("dataset") 27 | parser.add_argument("--out", type=str, required=True) 28 | return parser.parse_args() 29 | 30 | 31 | class SceneConverter: 32 | 33 | def __init__(self, scene, out_scene, metadata): 34 | self.out_scene = out_scene 35 | self.in_scene = scene 36 | self.metadata = metadata 37 | self._collect_paths() 38 | 39 | def _collect_paths(self): 40 | rgb_path = os.path.join(self.in_scene, 'rgb') 41 | depth_path = os.path.join(self.in_scene, 'depth') 42 | semantic_path = os.path.join(self.in_scene, 'semantic_class') 43 | instance_path = os.path.join(self.in_scene, 'instance') 44 | rgb_frames = [f for f in os.listdir(rgb_path) if f[0] != '.'] 45 | depth_frames = [f for f in os.listdir(depth_path) if f[0] != '.'] 46 | semantic_frames = [ 47 | f for f in os.listdir(semantic_path) 48 | if f[0] != '.' and 'semantic' in f 49 | ] 50 | instance_frames = [ 51 | f for f in os.listdir(instance_path) 52 | if f[0] != '.' 
and 'semantic_instance' in f 53 | ] 54 | rgb_frames = sorted(rgb_frames, 55 | key=lambda x: int(x.split('_')[-1].split('.')[0])) 56 | depth_frames = sorted(depth_frames, 57 | key=lambda x: int(x.split('_')[-1].split('.')[0])) 58 | semantic_frames = sorted( 59 | semantic_frames, key=lambda x: int(x.split('_')[-1].split('.')[0])) 60 | instance_frames = sorted( 61 | instance_frames, key=lambda x: int(x.split('_')[-1].split('.')[0])) 62 | self.rgb_frames = [] 63 | self.depth_frames = [] 64 | self.semantic_frames = [] 65 | self.instance_frames = [] 66 | for rgb, depth, semantic, instance in zip(rgb_frames, depth_frames, 67 | semantic_frames, instance_frames): 68 | self.rgb_frames.append(os.path.join(rgb_path, rgb)) 69 | self.depth_frames.append(os.path.join(depth_path, depth)) 70 | self.semantic_frames.append(os.path.join(semantic_path, semantic)) 71 | self.instance_frames.append(os.path.join(instance_path, instance)) 72 | 73 | def _copy_frames(self): 74 | self.rgb_out = os.path.join(self.out_scene, 'rgb') 75 | self.depth_out = os.path.join(self.out_scene, 'depth') 76 | self.semantic_out = os.path.join(self.out_scene, 'gt_semantic') 77 | self.instance_out = os.path.join(self.out_scene, 'gt_instance') 78 | os.makedirs(self.rgb_out, exist_ok=True) 79 | os.makedirs(self.depth_out, exist_ok=True) 80 | os.makedirs(self.semantic_out, exist_ok=True) 81 | os.makedirs(self.instance_out, exist_ok=True) 82 | 83 | semantic_frames = [] 84 | for i, (rgb, depth, semantic, instance) in enumerate( 85 | zip(tqdm(self.rgb_frames, desc="Copying frames"), 86 | self.depth_frames, self.semantic_frames, self.instance_frames)): 87 | rgb_out_path = os.path.join(self.rgb_out, f"{i:06}.png") 88 | depth_out_path = os.path.join(self.depth_out, f"{i:06}.png") 89 | semantic_out = os.path.join(self.semantic_out, f"{i:06}.png") 90 | instance_out_path = os.path.join(self.instance_out, f"{i:06}.png") 91 | shutil.copy(rgb, rgb_out_path) 92 | shutil.copy(depth, depth_out_path) 93 | shutil.copy(semantic, self.semantic_out) 94 | shutil.copy(instance, instance_out_path) 95 | 96 | metadata = { 'n_classes': int(self.metadata['id'].max() + 1) } 97 | metadata_path = os.path.join(self.out_scene, 'metadata.json') 98 | with open(metadata_path, 'w') as f: 99 | f.write(json.dumps(metadata, indent=2)) 100 | 101 | def _copy_trajectory(self): 102 | pose_dir = os.path.join(self.out_scene, 'pose') 103 | os.makedirs(pose_dir, exist_ok=True) 104 | trajectory = np.loadtxt(os.path.join(self.in_scene, 'traj_w_c.txt'), 105 | delimiter=' ').reshape(-1, 4, 4) 106 | for i, T_CW in enumerate(trajectory): 107 | pose_out = os.path.join(pose_dir, f"{i:06}.txt") 108 | np.savetxt(pose_out, np.linalg.inv(T_CW)) 109 | 110 | def _copy_intrinsics(self): 111 | width = 640 112 | height = 480 113 | hfov = 90.0 114 | fx = width / 2.0 / math.tan(math.radians(hfov / 2.0)) 115 | cx = (width - 1.0) / 2.0 116 | cy = (height - 1.0) / 2.0 117 | camera_matrix = np.eye(3) 118 | camera_matrix[0, 0] = fx 119 | camera_matrix[1, 1] = fx 120 | camera_matrix[0, 2] = cx 121 | camera_matrix[1, 2] = cy 122 | np.savetxt(os.path.join(self.out_scene, 'intrinsics.txt'), 123 | camera_matrix) 124 | 125 | def _compute_bounds(self): 126 | scene = Scene(self.out_scene) 127 | depth_frame = o3d.io.read_image(scene.depth_paths()[0]) 128 | depth_size = np.asarray(depth_frame).shape[::-1] 129 | K = scene.camera.scale(depth_size).camera_matrix 130 | intrinsics = o3d.camera.PinholeCameraIntrinsic(int(depth_size[0]), 131 | int(depth_size[1]), 132 | K[0, 0], K[1, 1], 133 | K[0, 2], K[1, 2]) 134 | pc = 
o3d.geometry.PointCloud() 135 | 136 | poses = scene.poses[::10] 137 | depths = scene.depth_paths()[::10] 138 | for T_CW, depth in zip(poses, tqdm(depths, desc="Computing bounds")): 139 | T_WC = np.linalg.inv(T_CW) 140 | depth = o3d.io.read_image(depth) 141 | 142 | pc_C = o3d.geometry.PointCloud.create_from_depth_image( 143 | depth, depth_scale=1000.0, intrinsic=intrinsics) 144 | pc_C = np.asarray(pc_C.points) 145 | pc_W = transform_points(T_WC, pc_C) 146 | 147 | pc += o3d.geometry.PointCloud( 148 | o3d.utility.Vector3dVector(pc_W)).uniform_down_sample(50) 149 | filtered, _ = pc.remove_statistical_outlier(nb_neighbors=20, 150 | std_ratio=2.0) 151 | aabb = filtered.get_axis_aligned_bounding_box() 152 | with open(os.path.join(scene.path, 'bbox.txt'), 'wt') as f: 153 | min_str = " ".join([str(x) for x in aabb.get_min_bound()]) 154 | max_str = " ".join([str(x) for x in aabb.get_max_bound()]) 155 | f.write(f"{min_str} {max_str} 0.01") 156 | 157 | def run(self): 158 | self._copy_frames() 159 | self._copy_trajectory() 160 | self._copy_intrinsics() 161 | self._compute_bounds() 162 | 163 | def create_labelmap(semantic_info_dir, out): 164 | metadata = os.path.join(semantic_info_dir, 'room_0', 'info_semantic.json') 165 | with open(metadata, 'r') as f: 166 | metadata = json.load(f) 167 | ids = [] 168 | prompts = [] 169 | for class_info in metadata['classes']: 170 | ids.append(class_info['id']) 171 | prompts.append(class_info['name']) 172 | data = pandas.DataFrame({'id': ids, 'name': prompts}) 173 | data.to_csv(out, index=False) 174 | return data 175 | 176 | 177 | def main(): 178 | flags = read_args() 179 | 180 | zip_files = [f for f in os.listdir(flags.dataset) if '.zip' in f] 181 | instance_zip = [f for f in zip_files if 'Instance' in f][0] 182 | 183 | tmpdir = tempfile.mkdtemp() 184 | try: 185 | success = subprocess.run(['unzip', os.path.join(flags.dataset, instance_zip), '-d', tmpdir]) 186 | if success.returncode != 0: 187 | raise RuntimeError("Failed to extract instance segmentation") 188 | success = subprocess.run(['unzip', os.path.join(flags.dataset, 'semantic_info.zip'), '-d', tmpdir]) 189 | if success.returncode != 0: 190 | raise RuntimeError("Failed to extract segmentation metadata") 191 | metadata = create_labelmap(os.path.join(tmpdir, 'semantic_info'), os.path.join(flags.out, 'label_map.csv')) 192 | 193 | for file in zip_files: 194 | if 'semantic_info' in file or 'Instance' in file or 'replica' in file: 195 | continue 196 | print("Extracting", file) 197 | scene_name = file.split('.')[0] 198 | tmp_scene_dir = os.path.join(tmpdir, scene_name) 199 | success = subprocess.run(['unzip', os.path.join(flags.dataset, file), '-d', tmpdir]) 200 | if success.returncode != 0: 201 | raise RuntimeError("Failed to extract scene") 202 | out_scene = os.path.join(flags.out, scene_name) 203 | os.makedirs(out_scene, exist_ok=True) 204 | in_scene = os.path.join(tmp_scene_dir, 'Sequence_1') 205 | scene_instance_zip = os.path.join(flags.dataset, 'Replica_Instance_Segmentation', scene_name, 'Sequence_1', 'semantic_instance.zip') 206 | success = subprocess.run(['unzip', scene_instance_zip, '-d', tmp_scene_dir]) 207 | if success.returncode != 0: 208 | raise RuntimeError("Failed to extract scene") 209 | success = subprocess.run(['mv', os.path.join(tmp_scene_dir, 'semantic_instance'), os.path.join(tmp_scene_dir, 'Sequence_1', 'instance')]) 210 | if success.returncode != 0: 211 | raise RuntimeError("Failed to move instance folder") 212 | converter = SceneConverter(in_scene, out_scene, metadata) 213 | converter.run() 214 
| shutil.rmtree(tmp_scene_dir) 215 | finally: 216 | shutil.rmtree(tmpdir) 217 | 218 | # Exporter(flags).run() 219 | 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /scripts/data/convert_hypersim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Converts hypersim scenes from https://github.com/apple/ml-hypersim 3 | to the autolabel scene format. 4 | 5 | usage: 6 | python scripts/data/convert_hypersim.py \ 7 | --out \ 8 | --ori-semantic-labels \ 9 | --camera-parameter-file 10 | """ 11 | import pandas as pd 12 | import argparse 13 | import cv2 14 | import json 15 | import math 16 | import numpy as np 17 | import open3d as o3d 18 | import os 19 | import glob 20 | from natsort import os_sorted 21 | import shutil 22 | from tqdm import tqdm 23 | import h5py 24 | 25 | 26 | from autolabel.utils import Scene, transform_points 27 | 28 | 29 | def read_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("dataset") 32 | parser.add_argument("--out", type=str, required=True) 33 | parser.add_argument("--ori-semantic-labels", type=str, required=True) 34 | parser.add_argument("--camera-parameter-file", type=str, required=True) 35 | return parser.parse_args() 36 | 37 | def load_distance_meters_to_depth(hdf_file, width=1024, height=768, focal=886.81): 38 | with h5py.File(hdf_file, "r") as f: 39 | depth_meters = f["dataset"][:].astype(np.float32) 40 | 41 | npyImageplaneX = np.linspace((-0.5 * width) + 0.5, (0.5 * width) - 0.5, width).reshape(1, width).repeat(height, 0).astype(np.float32)[:, :, None] 42 | npyImageplaneY = np.linspace((-0.5 * height) + 0.5, (0.5 * height) - 0.5, height).reshape(height, 1).repeat(width, 1).astype(np.float32)[:, :, None] 43 | npyImageplaneZ = np.full([height, width, 1], focal, np.float32) 44 | npyImageplane = np.concatenate([npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2) 45 | 46 | npyDepth = depth_meters / np.linalg.norm(npyImageplane, 2, 2) * focal 47 | return npyDepth 48 | 49 | def load_camera_poses(hdf_orientation, hdf_position, scale): 50 | 51 | with h5py.File(hdf_orientation, "r") as f: 52 | orientations = f["dataset"][:].astype(np.float32) 53 | 54 | with h5py.File(hdf_position, "r") as f: 55 | positions = f["dataset"][:].astype(np.float32) 56 | 57 | positions *= scale 58 | 59 | poses = [] 60 | trans = np.eye(3) 61 | trans[1, 1] = -1 62 | trans[2, 2] = -1 63 | 64 | for orientation, position in zip(orientations, positions): 65 | T_WC = np.eye(4) 66 | T_WC[:3, :3] = orientation @ trans 67 | T_WC[:3, 3] = position 68 | poses.append(T_WC) 69 | return poses 70 | 71 | 72 | class SceneConverter: 73 | 74 | def __init__(self, scene, out_scene, camera_settings, semantic_label_mapping): 75 | self.out_scene = out_scene 76 | self.in_scene = scene 77 | self._load_camera(camera_settings) 78 | self.semantic_label_mapping = pd.read_csv(semantic_label_mapping) # NYU 40 classes 79 | 80 | self._load_meta_data() 81 | 82 | def _load_camera(self, camera_settings): 83 | height = camera_settings['settings_output_img_height'] 84 | width = camera_settings['settings_output_img_width'] 85 | rate_unit_to_meter = camera_settings['settings_units_info_meters_scale'] 86 | fov_x = camera_settings['settings_camera_fov'] 87 | fx = width / 2.0 / math.tan(fov_x / 2) 88 | 89 | cx = (width - 1.0) / 2.0 90 | cy = (height - 1.0) / 2.0 91 | intrinsic = np.eye(3) 92 | intrinsic[0, 0] = fx 93 | intrinsic[1, 1] = fx 94 | intrinsic[0, 2] = cx 95 | intrinsic[1, 2] = cy 
96 | self.camera = { 97 | 'height': int(height), 'width': int(width), 98 | 'rate_unit_to_meter': rate_unit_to_meter, 99 | 'focal_length': fx, 100 | 'intrinsic': intrinsic 101 | } 102 | 103 | def _load_meta_data(self): 104 | cam_list = pd.read_csv(os.path.join(self.in_scene, '_detail', 'metadata_cameras.csv')) 105 | cam_list = cam_list['camera_name'].values.tolist() 106 | 107 | self.meta_data = { 108 | 'cam_list': cam_list 109 | } 110 | 111 | def _save_scene_metadata(self): 112 | metadata = { 'n_classes': int(self.semantic_label_mapping['id'].max()) } 113 | metadata_path = os.path.join(self.out_scene, 'metadata.json') 114 | with open(metadata_path, 'w') as f: 115 | f.write(json.dumps(metadata, indent=2)) 116 | 117 | def _collect_paths(self, cam): 118 | rgb_path = os.path.join(self.in_scene, 'images', f'scene_{cam}_final_preview', 'frame.*.color.jpg') 119 | depth_path = os.path.join(self.in_scene, 'images', f'scene_{cam}_geometry_hdf5', 'frame.*.depth_meters.hdf5') 120 | semantic_path = os.path.join(self.in_scene, 'images', f'scene_{cam}_geometry_hdf5', 'frame.*.semantic.hdf5') 121 | instance_path = os.path.join(self.in_scene, 'images', f'scene_{cam}_geometry_hdf5', 'frame.*.semantic_instance.hdf5') 122 | 123 | rgb_frames = glob.glob(rgb_path) 124 | depth_frames = glob.glob(depth_path) 125 | semantic_frames = glob.glob(semantic_path) 126 | instance_frames = glob.glob(instance_path) 127 | 128 | rgb_frames = os_sorted(rgb_frames) 129 | depth_frames = os_sorted(depth_frames) 130 | semantic_frames = os_sorted(semantic_frames) 131 | instance_frames = os_sorted(instance_frames) 132 | 133 | poses_T_WC = load_camera_poses( 134 | hdf_orientation=os.path.join(self.in_scene, '_detail', f'{cam}', 'camera_keyframe_orientations.hdf5'), 135 | hdf_position=os.path.join(self.in_scene, '_detail', f'{cam}', 'camera_keyframe_positions.hdf5'), 136 | scale=self.camera['rate_unit_to_meter'] 137 | ) 138 | return rgb_frames, depth_frames, semantic_frames, instance_frames, poses_T_WC 139 | 140 | def _copy_frames_and_trajectory(self, cam, rgb_frames, depth_frames, semantic_frames, instance_frames, poses_T_WC): 141 | rgb_out = os.path.join(self.out_scene, 'rgb') 142 | depth_out = os.path.join(self.out_scene, 'depth') 143 | semantic_out = os.path.join(self.out_scene, 'gt_semantic') 144 | instance_out = os.path.join(self.out_scene, 'gt_instance') 145 | os.makedirs(rgb_out, exist_ok=True) 146 | os.makedirs(depth_out, exist_ok=True) 147 | os.makedirs(semantic_out, exist_ok=True) 148 | os.makedirs(instance_out, exist_ok=True) 149 | 150 | pose_dir = os.path.join(self.out_scene, 'pose') 151 | os.makedirs(pose_dir, exist_ok=True) 152 | 153 | for (rgb, depth, semantic, instance, pose_T_WC) in zip(tqdm(rgb_frames, desc=f"Copying {cam} frames"), 154 | depth_frames, semantic_frames, instance_frames, poses_T_WC): 155 | rgb_out_path = os.path.join(rgb_out, f"{self.frame_index:06}.jpg") 156 | depth_out_path = os.path.join(depth_out, f"{self.frame_index:06}.png") 157 | semantic_out_path = os.path.join(semantic_out, f"{self.frame_index:06}.png") 158 | instance_out_path = os.path.join(instance_out, f"{self.frame_index:06}.png") 159 | 160 | shutil.copy(rgb, rgb_out_path) 161 | 162 | depth_img = load_distance_meters_to_depth( 163 | depth, self.camera['width'], self.camera['height'], self.camera['focal_length']) 164 | depth_img = (depth_img * 1000).astype(np.uint16) 165 | cv2.imwrite(depth_out_path, depth_img) 166 | 167 | with h5py.File(semantic, "r") as f: 168 | semantic_img = f["dataset"][:].astype(np.int16) 169 | semantic_img = 
(semantic_img + 1).astype(np.uint16) 170 | cv2.imwrite(semantic_out_path, semantic_img) 171 | 172 | with h5py.File(instance, "r") as f: 173 | instance_img = f["dataset"][:].astype(np.int16) 174 | instance_img = (instance_img + 1).astype(np.uint16) 175 | cv2.imwrite(instance_out_path, instance_img) 176 | 177 | pose_out_path = os.path.join(pose_dir, f"{self.frame_index:06}.txt") 178 | np.savetxt(pose_out_path, np.linalg.inv(pose_T_WC)) 179 | 180 | self.frame_index += 1 181 | 182 | def _copy_intrinsics(self): 183 | np.savetxt(os.path.join(self.out_scene, 'intrinsics.txt'), self.camera['intrinsic']) 184 | 185 | def _compute_bounds(self): 186 | scene = Scene(self.out_scene) 187 | depth_frame = o3d.io.read_image(scene.depth_paths()[0]) 188 | depth_size = np.asarray(depth_frame).shape[::-1] 189 | K = scene.camera.scale(depth_size).camera_matrix 190 | intrinsics = o3d.camera.PinholeCameraIntrinsic(int(depth_size[0]), 191 | int(depth_size[1]), 192 | K[0, 0], K[1, 1], 193 | K[0, 2], K[1, 2]) 194 | pc = o3d.geometry.PointCloud() 195 | 196 | poses = scene.poses#[::10] 197 | depths = scene.depth_paths()#[::10] 198 | for T_CW, depth in zip(poses, tqdm(depths, desc="Computing bounds")): 199 | T_WC = np.linalg.inv(T_CW) 200 | depth = o3d.io.read_image(depth) 201 | 202 | pc_C = o3d.geometry.PointCloud.create_from_depth_image( 203 | depth, depth_scale=1000.0, intrinsic=intrinsics) 204 | pc_C = np.asarray(pc_C.points) 205 | pc_W = transform_points(T_WC, pc_C) 206 | 207 | pc += o3d.geometry.PointCloud( 208 | o3d.utility.Vector3dVector(pc_W)).uniform_down_sample(50) 209 | filtered, _ = pc.remove_statistical_outlier(nb_neighbors=20, 210 | std_ratio=2.0) 211 | aabb = filtered.get_axis_aligned_bounding_box() 212 | with open(os.path.join(scene.path, 'bbox.txt'), 'wt') as f: 213 | min_str = " ".join([str(x) for x in aabb.get_min_bound()]) 214 | max_str = " ".join([str(x) for x in aabb.get_max_bound()]) 215 | f.write(f"{min_str} {max_str} 0.01") 216 | 217 | def run(self): 218 | self._save_scene_metadata() 219 | self._copy_intrinsics() 220 | 221 | self.frame_index = 0 222 | for cam in self.meta_data['cam_list']: 223 | rgb_frames, depth_frames, semantic_frames, instance_frames, poses_T_WC = self._collect_paths(cam) 224 | self._copy_frames_and_trajectory(cam, rgb_frames, depth_frames, semantic_frames, instance_frames, poses_T_WC) 225 | 226 | self._compute_bounds() 227 | 228 | 229 | def create_labelmap(semantic_labels, out): 230 | semantic_labels = pd.read_csv(semantic_labels) 231 | ids = [] 232 | prompts = [] 233 | for semantic_id, semantic_name in zip(semantic_labels['semantic_id '], semantic_labels[' semantic_name ']): 234 | ids.append(semantic_id + 1) 235 | prompts.append(semantic_name) 236 | data = pd.DataFrame({'id': ids, 'name': prompts}) 237 | data.to_csv(out, index=False) 238 | return data 239 | 240 | def main(): 241 | flags = read_args() 242 | 243 | os.makedirs(flags.out, exist_ok=True) 244 | 245 | label_map = create_labelmap( 246 | flags.ori_semantic_labels, 247 | os.path.join(flags.out, 'label_map.csv') 248 | ) 249 | 250 | all_camera_settings = pd.read_csv(flags.camera_parameter_file, 251 | index_col="scene_name") 252 | 253 | scene_names = os.listdir(flags.dataset) 254 | 255 | for scene_name in scene_names: 256 | print(f"Converting scene [{scene_name}] ...") 257 | in_scene = os.path.join(flags.dataset, scene_name) 258 | out_scene = os.path.join(flags.out, scene_name) 259 | os.makedirs(out_scene, exist_ok=True) 260 | 261 | converter = SceneConverter( 262 | scene=in_scene, out_scene=out_scene, 263 | 
camera_settings=all_camera_settings.loc[scene_name], 264 | semantic_label_mapping=os.path.join(flags.out, 'label_map.csv')) 265 | converter.run() 266 | 267 | 268 | if __name__ == "__main__": 269 | main() -------------------------------------------------------------------------------- /autolabel/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from torch_ngp.gridencoder import GridEncoder 7 | from torch_ngp.encoding import get_encoder 8 | from torch_ngp.activation import trunc_exp 9 | from torch_ngp.ffmlp import FFMLP 10 | import tinycudann as tcnn 11 | 12 | from torch_ngp.nerf.renderer import NeRFRenderer 13 | 14 | 15 | class FreqEncoder(nn.Module): 16 | 17 | def __init__(self, input_dim): 18 | super().__init__() 19 | self.encoder = tcnn.Encoding(input_dim, { 20 | "otype": "Frequency", 21 | "n_frequencies": 10 22 | }) 23 | self.n_output_dims = self.encoder.n_output_dims 24 | 25 | def forward(self, x, bound): 26 | normalized = (x + bound) / (2.0 * bound) 27 | return self.encoder(normalized) 28 | 29 | 30 | class HGFreqEncoder(nn.Module): 31 | 32 | def __init__(self, input_dim): 33 | super().__init__() 34 | self.encoder = tcnn.Encoding(input_dim, { 35 | "otype": "Frequency", 36 | "n_frequencies": 2 37 | }) 38 | self.grid_encoding = tcnn.Encoding( 39 | input_dim, { 40 | "otype": "Grid", 41 | "type": "Hash", 42 | "n_levels": 16, 43 | "n_features_per_level": 2, 44 | "log2_hashmap_size": 19, 45 | "base_resolution": 16, 46 | "per_level_scale": 2.0, 47 | "interpolation": "Linear" 48 | }) 49 | self.n_output_dims = self.encoder.n_output_dims + self.grid_encoding.n_output_dims 50 | 51 | def forward(self, x, bound): 52 | freq = self.encoder(x) 53 | normalized = (x + bound) / (2.0 * bound) 54 | # Sometimes samples might leak a bit outside the bounds. 55 | # This produces NaNs in the grid encoding, so we simply clip those points 56 | # assuming there aren't many of these. 
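        # (No clamp is applied in this snippet even though the comment above
        # mentions clipping; one way to realize it, offered here only as an
        # assumption and not taken from the original code, would be
        # `normalized = normalized.clamp(0.0, 1.0)`.)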
57 | grid = self.grid_encoding(normalized) 58 | return torch.cat([freq, grid], dim=-1) 59 | 60 | 61 | class ALNetwork(NeRFRenderer): 62 | 63 | def __init__(self, 64 | encoding='hg', 65 | num_layers=2, 66 | hidden_dim=64, 67 | geo_feat_dim=15, 68 | num_layers_color=3, 69 | hidden_dim_color=64, 70 | hidden_dim_semantic=64, 71 | contrastive_feat_dim=8, 72 | semantic_classes=2, 73 | bound=1, 74 | **kwargs): 75 | super().__init__(bound, **kwargs) 76 | 77 | # sigma network 78 | self.num_layers = num_layers 79 | self.hidden_dim = hidden_dim 80 | self.geo_feat_dim = geo_feat_dim 81 | 82 | # instance centers and clusterer 83 | self.instance_centers = None 84 | self.instance_clusterer = None 85 | 86 | self.encoder, self.in_dim = self._get_encoder(encoding) 87 | 88 | self.sigma_net = tcnn.Network(n_input_dims=self.in_dim, 89 | n_output_dims=1 + self.geo_feat_dim, 90 | network_config={ 91 | "otype": "FullyFusedMLP", 92 | "activation": "ReLU", 93 | "output_activation": "None", 94 | "n_neurons": self.hidden_dim, 95 | "n_hidden_layers": self.num_layers 96 | }) 97 | 98 | # color network 99 | self.num_layers_color = num_layers_color 100 | self.hidden_dim_color = hidden_dim_color 101 | self.encoder_dir = tcnn.Encoding(n_input_dims=3, 102 | encoding_config={ 103 | "otype": "SphericalHarmonics", 104 | "degree": 4 105 | }) 106 | self.color_features = self.encoder_dir.n_output_dims + self.geo_feat_dim 107 | 108 | self.color_net = tcnn.Network( 109 | n_input_dims=self.color_features, 110 | n_output_dims=3, 111 | network_config={ 112 | "otype": "FullyFusedMLP", 113 | "activation": "ReLU", 114 | "output_activation": "None", 115 | "n_neurons": self.hidden_dim_color, 116 | "n_hidden_layers": self.num_layers_color 117 | }) 118 | 119 | # hash encoding for features 120 | self.feature_encoder, self.feature_in_dim = self._get_encoder(encoding) 121 | 122 | # semantic features 123 | self.hidden_dim_semantic = hidden_dim_semantic 124 | self.semantic_classes = semantic_classes 125 | self.semantic_features = tcnn.Network( 126 | n_input_dims=self.geo_feat_dim, 127 | n_output_dims=self.hidden_dim_semantic, 128 | network_config={ 129 | "otype": "CutlassMLP", 130 | "activation": "ReLU", 131 | "output_activation": "None", 132 | "n_neurons": self.hidden_dim_semantic, 133 | "n_hidden_layers": 2 134 | }) 135 | 136 | # contrastive features 137 | self.contrastive_feat_dim = contrastive_feat_dim 138 | self.contrastive_features = tcnn.Network( 139 | n_input_dims=self.feature_in_dim, 140 | n_output_dims=self.contrastive_feat_dim, 141 | network_config={ 142 | "otype": "CutlassMLP", 143 | "activation": "ReLU", 144 | "output_activation": "None", 145 | "n_neurons": self.hidden_dim_semantic, 146 | "n_hidden_layers": 2 147 | }) 148 | 149 | def _get_encoder(self, encoding): 150 | if encoding == 'freq': 151 | encoder = FreqEncoder(3) 152 | return encoder, encoder.n_output_dims 153 | elif encoding == 'hg': 154 | return get_encoder('hashgrid', desired_resolution=2**18) 155 | elif encoding == 'hg+freq': 156 | encoder = HGFreqEncoder(3) 157 | return encoder, encoder.n_output_dims 158 | else: 159 | raise NotImplementedError(f"Unknown input encoding {encoding}") 160 | 161 | def forward(self, x, d): 162 | """ 163 | x: [N, 3], in [-bound, bound] points 164 | d: [N, 3], normalized to [-1, 1] viewing directions 165 | """ 166 | x_enc = self.encoder(x, bound=self.bound) 167 | h = self.sigma_net(x_enc) 168 | 169 | sigma = trunc_exp(h[..., 0]) 170 | geo_feat = F.relu(h[..., 1:]) 171 | 172 | d = self.encoder_dir(d) 173 | 174 | h = torch.cat([d, geo_feat], 
dim=-1) 175 | h = self.color_net(h) 176 | 177 | rgb = torch.sigmoid(h) 178 | 179 | x_feat = self.feature_encoder(x, bound=self.bound) 180 | semantic_features = self.semantic_features(geo_feat) 181 | 182 | contrastive_features = self.contrastive_features(x_feat) 183 | contrastive_features = F.normalize(contrastive_features) 184 | 185 | return sigma, rgb, semantic_features, contrastive_features 186 | 187 | def density(self, x): 188 | """ 189 | x: [N, 3] points in [-bound, bound] 190 | """ 191 | x = self.encoder(x, bound=self.bound) 192 | h = self.sigma_net(x) 193 | 194 | sigma = trunc_exp(h[..., 0]) 195 | geo_feat = h[..., 1:] 196 | 197 | return { 198 | 'sigma': sigma, 199 | 'geo_feat': geo_feat, 200 | } 201 | 202 | def color(self, x, d, mask=None, geo_feat=None, **kwargs): 203 | """ 204 | x: [N, 3] in [-bound, bound] 205 | mask: [N,], bool, indicates where we actually needs to compute rgb. 206 | """ 207 | if mask is not None: 208 | rgbs = torch.zeros(mask.shape[0], 3, dtype=x.dtype, 209 | device=x.device) # [N, 3] 210 | # in case of empty mask 211 | if not mask.any(): 212 | return rgbs 213 | x = x[mask] 214 | d = d[mask] 215 | geo_feat = geo_feat[mask] 216 | 217 | # TinyCudaNN SH encoding requires inputs to be in [0, 1]. 218 | d = (d + 1) / 2 219 | d = self.encoder_dir(d) 220 | 221 | h = torch.cat([d, geo_feat], dim=-1) 222 | 223 | h = self.color_net(h) 224 | 225 | h = torch.sigmoid(h) 226 | 227 | if mask is not None: 228 | rgbs[mask] = h.to(rgbs.dtype) 229 | else: 230 | rgbs = h 231 | 232 | return rgbs 233 | 234 | def get_params(self, lr): 235 | params = [{ 236 | 'params': self.encoder.parameters(), 237 | 'lr': lr 238 | }, { 239 | 'params': self.sigma_net.parameters(), 240 | 'lr': lr 241 | }, { 242 | 'params': self.encoder_dir.parameters(), 243 | 'lr': lr 244 | }, { 245 | 'params': self.color_net.parameters(), 246 | 'lr': lr 247 | }, { 248 | 'params': self.semantic_features.parameters(), 249 | 'lr': lr 250 | }, { 251 | 'params': self.feature_encoder.parameters(), 252 | 'lr': lr 253 | }, { 254 | 'params': self.contrastive_features.parameters(), 255 | 'lr': lr 256 | }] 257 | if self.bg_radius > 0: 258 | params.append({'params': self.encoder_bg.parameters(), 'lr': lr}) 259 | params.append({'params': self.bg_net.parameters(), 'lr': lr}) 260 | 261 | return params 262 | 263 | def semantic(self, geo_features): 264 | """ 265 | features: [N, D] geometric features 266 | sigma: [N, 1] density outputs 267 | returns: [N, C] semantic head outputs 268 | """ 269 | sem_features = self.semantic_features(geo_features) 270 | return sem_features 271 | 272 | def contrastive(self, x_feature_encoding, contrastive_ema=None): 273 | """ 274 | x: [N, 3] points in [-bound, bound] 275 | returns: [N, C] contrastive features 276 | """ 277 | if contrastive_ema is not None: 278 | with contrastive_ema.average_parameters(): 279 | con_features = self.contrastive_features(x_feature_encoding) 280 | else: 281 | con_features = self.contrastive_features(x_feature_encoding) 282 | # con_features = F.normalize(con_features) 283 | return con_features 284 | 285 | def network_parameters(self): 286 | """ 287 | return: list of parameters in the neural networks, excluding encoder parameters 288 | """ 289 | return (list(self.sigma_net.parameters()) + 290 | list(self.color_net.parameters()) + 291 | list(self.semantic_features.parameters()) + 292 | list(self.contrastive_features.parameters())) 293 | 294 | def encoder_parameters(self): 295 | """ 296 | return: list of parameters in the encoders 297 | """ 298 | return 
(list(self.encoder.parameters()) + 299 | list(self.feature_encoder.parameters())) 300 | 301 | def set_instance_centers(self, instance_centers): 302 | self.instance_centers = instance_centers 303 | 304 | def set_instance_clusterer(self, clusterer): 305 | self.instance_clusterer = clusterer 306 | 307 | 308 | class Autoencoder(nn.Module): 309 | 310 | def __init__(self, in_features, bottleneck): 311 | super().__init__() 312 | self.encoder = tcnn.Network(n_input_dims=in_features, 313 | n_output_dims=bottleneck, 314 | network_config={ 315 | "otype": "CutlassMLP", 316 | "activation": "ReLU", 317 | "output_activation": "ReLU", 318 | "n_neurons": 128, 319 | "n_hidden_layers": 1 320 | }) 321 | self.decoder = tcnn.Network(n_input_dims=bottleneck, 322 | n_output_dims=in_features, 323 | network_config={ 324 | "otype": "CutlassMLP", 325 | "activation": "ReLU", 326 | "output_activation": "None", 327 | "n_neurons": 128, 328 | "n_hidden_layers": 1 329 | }) 330 | 331 | def forward(self, x, p=0.1): 332 | code = self.encoder(x) 333 | out = self.decoder(F.dropout(code, 0.1)) 334 | return out, code 335 | -------------------------------------------------------------------------------- /scripts/demo_ui.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | import open3d as o3d 5 | from plyfile import PlyData 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | import threading 10 | import multiprocessing as mp 11 | from PyQt6 import QtWidgets 12 | from PyQt6 import QtCore 13 | from sklearn.metrics.pairwise import cosine_similarity 14 | from autolabel.constants import COLORS 15 | from autolabel.utils.feature_utils import get_feature_extractor 16 | from autolabel.dataset import SceneDataset 17 | from autolabel import utils, model_utils 18 | 19 | 20 | class PointCloudVisualizer: 21 | 22 | def __init__(self, flags, queue): 23 | self.flags = flags 24 | self.queue = queue 25 | self.visualizer = o3d.visualization.Visualizer() 26 | self.label_mapping = dict() 27 | self._load_scene_model() 28 | self._load_pointcloud() 29 | self._load_text_model() 30 | self._load_point_features() 31 | self._load_point_instance_ids() 32 | 33 | def _load_pointcloud(self): 34 | mesh_path = os.path.join(self.flags.scene, "mesh.ply") 35 | if not os.path.exists(mesh_path): 36 | raise ValueError(f"Mesh file {mesh_path} does not exist.") 37 | plydata = PlyData.read(mesh_path) 38 | points = np.hstack([ 39 | plydata['vertex']['x'].reshape(-1, 1), 40 | plydata['vertex']['y'].reshape(-1, 1), 41 | plydata['vertex']['z'].reshape(-1, 1) 42 | ]) 43 | points_rgb = np.hstack([ 44 | plydata['vertex']['red'].reshape(-1, 1), 45 | plydata['vertex']['green'].reshape(-1, 1), 46 | plydata['vertex']['blue'].reshape(-1, 1) 47 | ]) 48 | points_rgb = points_rgb.astype(np.float32) / 255.0 49 | aabb = np.loadtxt( 50 | os.path.join(self.flags.scene, 'bbox.txt') 51 | )[:6].reshape(2, 3) 52 | scene_center = (aabb[0] + aabb[1]) / 2 53 | points = points - scene_center 54 | fixed = np.zeros_like(points) 55 | fixed[:, 0] = points[:, 1] 56 | fixed[:, 1] = points[:, 2] 57 | fixed[:, 2] = points[:, 0] 58 | self.points = torch.tensor(fixed, dtype=torch.float16) 59 | self.point_infos = {'ori_rgb': points_rgb} 60 | self.pc = o3d.geometry.PointCloud() 61 | self.pc.points = o3d.utility.Vector3dVector(fixed) 62 | self.pc.colors = o3d.utility.Vector3dVector(points_rgb) 63 | # self.pc.paint_uniform_color([0.5, 0.5, 0.5]) 64 | self.visualizer.create_window() 65 | 
self.visualizer.add_geometry(self.pc) 66 | 67 | def _load_scene_model(self): 68 | models = list() 69 | nerf_dir = model_utils.get_nerf_dir(self.flags.scene, self.flags) 70 | if not os.path.exists(nerf_dir): 71 | raise ValueError(f"Model directory {nerf_dir} does not exist.") 72 | for model in os.listdir(nerf_dir): 73 | checkpoint_dir = os.path.join(nerf_dir, model, 'checkpoints') 74 | if os.path.exists(checkpoint_dir): 75 | models.append(model) 76 | model_path = os.path.join(nerf_dir, models[0]) 77 | print("Loading models: ", model_path) 78 | params = model_utils.read_params(model_path) 79 | dataset = SceneDataset('test', 80 | self.flags.scene, 81 | factor=4.0, 82 | batch_size=self.flags.batch_size, 83 | lazy=True) 84 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 85 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 86 | n_classes, params).cuda() 87 | checkpoint_dir = os.path.join(model_path, 'checkpoints') 88 | model_utils.load_checkpoint(model, checkpoint_dir) 89 | self.model = model.eval() 90 | 91 | def _load_text_model(self): 92 | self.extractor = get_feature_extractor('lseg', self.flags.checkpoint) 93 | 94 | def _load_point_features(self): 95 | semantic_features = self._point_features(points=self.points) 96 | self.point_infos['semantic'] = semantic_features 97 | 98 | def _load_point_instance_ids(self): 99 | instance_ids = self._point_instance_ids(points=self.points) 100 | self.point_infos['instance_id'] = instance_ids 101 | instance_colors = np.zeros((len(instance_ids), 3)) 102 | ins_ids = np.unique(instance_ids) 103 | for ins_id in ins_ids: 104 | if ins_id == 0: 105 | continue 106 | instance_colors[instance_ids == ins_id] = np.random.rand(3, ) 107 | self.point_infos['instance_colors'] = instance_colors 108 | 109 | def _denoise_semantic(self, pred_semantic_labels, pred_instance_labels): 110 | pred_semantic_denoised = np.copy(pred_semantic_labels) 111 | instance_ids = np.unique(pred_instance_labels) 112 | for ins_id in instance_ids: 113 | if ins_id == 0: 114 | continue 115 | 116 | semantic_ids = pred_semantic_labels[pred_instance_labels == ins_id] 117 | ids, cnts = np.unique(semantic_ids, return_counts=True) 118 | pred_semantic_denoised[pred_instance_labels == ins_id] = ids[np.argmax(cnts)] 119 | return pred_semantic_denoised 120 | 121 | def _update_colors(self, msg): 122 | print(msg) 123 | if isinstance(msg, list): 124 | prompts = msg 125 | if len(prompts) > 0: 126 | # prompts.append("others") 127 | text_features = self.extractor.encode_text(prompts) 128 | semantic_features = self._point_features() 129 | pred_instance_labels = self._point_instance_ids() 130 | similarities = torch.zeros( 131 | (semantic_features.shape[0], text_features.shape[0]), 132 | dtype=torch.float32, 133 | device=semantic_features.device) 134 | batch_size = 50000 135 | for i in range(0, semantic_features.shape[0], batch_size): 136 | batch = semantic_features[i:i + batch_size] 137 | for prompt_index in range(text_features.shape[0]): 138 | similarities[i:i + batch_size, prompt_index] = ( 139 | batch * text_features[prompt_index][None]).sum(dim=-1) 140 | 141 | update_mask, _ = similarities.max(dim=-1) 142 | update_mask = update_mask.cpu().numpy() > 0.85 143 | closest_prompt = similarities.argmax(dim=-1).cpu().numpy() 144 | denoised_closest_prompt = self._denoise_semantic(closest_prompt, pred_instance_labels) 145 | 146 | colors = np.asarray(self.pc.colors) 147 | colors[update_mask] = COLORS[denoised_closest_prompt[update_mask] % COLORS.shape[0]] / 255. 
148 | else: 149 | colors = self.point_infos['ori_rgb'] 150 | elif isinstance(msg, str): 151 | if msg == "show_instance": 152 | colors = self.point_infos['instance_colors'] 153 | else: 154 | raise ValueError("Not support msg type {}".format(type(msg))) 155 | self.pc.colors = o3d.utility.Vector3dVector(colors) 156 | self.visualizer.update_geometry(self.pc) 157 | 158 | def _point_features(self, points=None): 159 | if points is not None: 160 | out = [] 161 | for i in range(0, len(points), self.flags.batch_size): 162 | batch = points[i:i + self.flags.batch_size] 163 | batch = batch.cuda() 164 | with torch.no_grad(): 165 | density = self.model.density(batch) 166 | features = self.model.semantic(density['geo_feat']) 167 | features = features / torch.norm(features, dim=-1, keepdim=True) 168 | features = features.to(torch.float32) 169 | out.append(features) 170 | semantic_features = torch.cat(out, dim=0) 171 | else: 172 | semantic_features = self.point_infos['semantic'] 173 | return semantic_features 174 | 175 | def _point_instance_ids(self, points=None): 176 | if points is not None: 177 | pred_instances = [] 178 | for i in range(0, len(points), self.flags.batch_size): 179 | batch = points[i:i + self.flags.batch_size] 180 | batch = batch.cuda() 181 | with torch.no_grad(): 182 | xyz_feature_encoding = self.model.feature_encoder(batch, bound=self.model.bound) 183 | instance_feature = self.model.contrastive(xyz_feature_encoding, None) 184 | # instance_feature = instance_feature.reshape(-1, feature_dim) 185 | instance_feature = instance_feature.cpu().numpy() 186 | sim_mat = cosine_similarity(instance_feature, self.model.instance_centers) 187 | pred_instance = np.argmax(sim_mat, axis=1) + 1 # start from 1, 0 means noise 188 | pred_instances.append(pred_instance) 189 | instance_ids = np.concatenate(pred_instances, axis=0) 190 | else: 191 | instance_ids = self.point_infos['instance_id'] 192 | return instance_ids 193 | 194 | def run(self): 195 | while True: 196 | if not self.queue.empty(): 197 | msg = self.queue.get(False) 198 | self._update_colors(msg) 199 | self.visualizer.update_geometry(self.pc) 200 | if not self.visualizer.poll_events(): 201 | return 202 | self.visualizer.update_renderer() 203 | 204 | 205 | def run_visualizer(flags, queue): 206 | visualizer = PointCloudVisualizer(flags, queue) 207 | visualizer.run() 208 | 209 | 210 | class ListView(QtWidgets.QWidget): 211 | 212 | def __init__(self, parent=None): 213 | super().__init__(parent) 214 | self.layout = QtWidgets.QVBoxLayout() 215 | self.setLayout(self.layout) 216 | self.items = [] 217 | 218 | def add_item(self, item): 219 | index = len(self.items) 220 | color = COLORS[index % len(COLORS)] 221 | self.items.append(item) 222 | label = QtWidgets.QLabel(item) 223 | label.setMargin(20) 224 | label.setStyleSheet( 225 | f"background-color: rgb({color[0]}, {color[1]}, {color[2]});") 226 | self.layout.addWidget(label) 227 | self.update() 228 | 229 | def get_items(self): 230 | return self.items 231 | 232 | def reset(self): 233 | self.items = [] 234 | for i in reversed(range(self.layout.count())): 235 | self.layout.itemAt(i).widget().setParent(None) 236 | 237 | 238 | class SegmentingApplication(QtWidgets.QMainWindow): 239 | 240 | def __init__(self, queue): 241 | super().__init__() 242 | self.classes = [] 243 | self.setWindowTitle("Segmentation Classes") 244 | self.input_button = QtWidgets.QPushButton("Add") 245 | self.input_button.clicked.connect(self._add_class) 246 | self.reset_button = QtWidgets.QPushButton("Reset") 247 | 
self.reset_button.clicked.connect(self._reset_classes) 248 | self.show_instance_button = QtWidgets.QPushButton("Show all instances") 249 | self.show_instance_button.clicked.connect(self._show_all_instances) 250 | self.list_view = ListView() 251 | input_line = self._create_input_line() 252 | layout = QtWidgets.QVBoxLayout() 253 | layout.addWidget(self.list_view) 254 | layout.addWidget(input_line) 255 | main_widget = QtWidgets.QWidget() 256 | main_widget.setLayout(layout) 257 | self.setCentralWidget(main_widget) 258 | self.class_queue = queue 259 | 260 | def _create_input_line(self): 261 | layout = QtWidgets.QHBoxLayout() 262 | self.line_edit = QtWidgets.QLineEdit() 263 | self.line_edit.setPlaceholderText("Class description prompt") 264 | self.line_edit.returnPressed.connect(self._add_class) 265 | layout.addWidget(self.line_edit) 266 | layout.addWidget(self.input_button) 267 | layout.addWidget(self.reset_button) 268 | layout.addWidget(self.show_instance_button) 269 | widget = QtWidgets.QWidget() 270 | widget.setLayout(layout) 271 | return widget 272 | 273 | def keyPressEvent(self, event): 274 | if event.key() == QtCore.Qt.Key.Key_Escape: 275 | self.close() 276 | 277 | def _add_class(self): 278 | self.list_view.add_item(self.line_edit.text()) 279 | self.line_edit.clear() 280 | self._publish_classes() 281 | 282 | def _reset_classes(self): 283 | self.list_view.reset() 284 | self._publish_classes() 285 | 286 | def _show_all_instances(self): 287 | self.class_queue.put("show_instance") 288 | 289 | def _publish_classes(self): 290 | self.class_queue.put(self.list_view.get_items()) 291 | 292 | 293 | def main(): 294 | parser = argparse.ArgumentParser() 295 | parser.add_argument("scene", type=str) 296 | parser.add_argument('--workspace', default=None) 297 | parser.add_argument('--checkpoint', 298 | type=str, 299 | required=True, 300 | help='path to feature model checkpoint') 301 | # parser.add_argument('--model', type=str, default='model.pth') 302 | parser.add_argument('--batch-size', type=int, default=1024) 303 | flags = parser.parse_args() 304 | 305 | app = QtWidgets.QApplication(sys.argv) 306 | 307 | queue = mp.Queue() 308 | window = SegmentingApplication(queue) 309 | window.show() 310 | 311 | thread = threading.Thread(target=run_visualizer, args=(flags, queue)) 312 | thread.start() 313 | app.exec() 314 | thread.join() 315 | 316 | 317 | if __name__ == "__main__": 318 | main() -------------------------------------------------------------------------------- /scripts/data/convert_scannet.py: -------------------------------------------------------------------------------- 1 | description = """ 2 | """ 3 | import subprocess 4 | import math 5 | import argparse 6 | import shutil 7 | import json 8 | import pandas 9 | import zlib 10 | import imageio 11 | from argparse import RawTextHelpFormatter 12 | import os, struct 13 | import cv2 14 | import numpy as np 15 | import trimesh 16 | from scipy.spatial.transform import Rotation 17 | import open3d as o3d 18 | 19 | SCANNET20_IDS = [ 20 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39 21 | ] 22 | 23 | 24 | def read_args(): 25 | parser = argparse.ArgumentParser(description=description, 26 | formatter_class=RawTextHelpFormatter) 27 | parser.add_argument('scannet_scan_dir') 28 | parser.add_argument( 29 | '--label-map', 30 | required=True, 31 | help="Path to label map .tsv file with semantic label names and ids.") 32 | parser.add_argument('--out', required=True) 33 | parser.add_argument('--max-frames', 34 | type=int, 35 | default=750, 36 | 
help="Maximum number of frames to keep.") 37 | parser.add_argument('--stride', 38 | type=int, 39 | default=5, 40 | help="Use only every s-th frame.") 41 | parser.add_argument('--nyu40', 42 | action='store_true', 43 | help="Use NYU40 label map.") 44 | parser.add_argument('--scannet20', 45 | action='store_true', 46 | help="Use ScanNet20 evaluation label map.") 47 | return parser.parse_args() 48 | 49 | 50 | class LabelHelper: 51 | 52 | def __init__(self, label_path, flags): 53 | self.remapping = {} 54 | self.prompt_remap = {} 55 | label_map = pandas.read_csv(label_path, sep='\t') 56 | mapping = np.zeros(label_map['id'].values.max() + 1, np.uint16) 57 | if flags.nyu40 or flags.scannet20: 58 | ids = np.arange(1, 41) 59 | texts = [] 60 | for i in ids: 61 | text = label_map['nyu40class'][label_map['nyu40id'] == 62 | i].values[0] 63 | texts.append(text) 64 | for i, num in zip(label_map['id'].values, 65 | label_map['nyu40id'].values): 66 | mapping[i] = num 67 | else: 68 | texts = label_map['raw_category'].values.tolist() 69 | ids = np.arange(1, len(texts) + 1) 70 | for i, num in zip(label_map['id'].values, ids): 71 | mapping[i] = num 72 | 73 | if flags.scannet20: 74 | mapping[np.isin(mapping, SCANNET20_IDS) == False] = 0 75 | texts = [text for text, i in zip(texts, ids) if i in SCANNET20_IDS] 76 | ids = ids[np.isin(ids, SCANNET20_IDS)] 77 | 78 | self.label_text_to_id = {} 79 | for num, text in zip(label_map['id'], label_map['raw_category']): 80 | self.label_text_to_id[text] = num 81 | self.mapping = mapping 82 | 83 | self.label_map = pandas.DataFrame({'id': ids, 'prompt': texts}) 84 | self.classes_in_scene = set() 85 | 86 | def reset(self): 87 | self.classes_in_scene = set() 88 | 89 | def _read_config(self, path): 90 | with open(path, 'rt') as f: 91 | return json.load(f) 92 | 93 | def write_labelmap(self, out): 94 | label_map_out = os.path.join(out, 'label_map.csv') 95 | self.label_map.to_csv(label_map_out, index=False) 96 | 97 | def map_semantics(self, semantic_frame): 98 | return self.mapping[semantic_frame] 99 | 100 | def register_frame(self, frame): 101 | for i in np.unique(frame): 102 | self.classes_in_scene.add(int(i)) 103 | 104 | def label_ids(self): 105 | return self.label_map['id'].values 106 | 107 | def label_to_id(self, label_name): 108 | scannet_id = self.label_text_to_id[label_name] 109 | return self.mapping[scannet_id] 110 | 111 | 112 | def write_intrinsics(out, sensor_reader): 113 | intrinsics = sensor_reader.intrinsic_color 114 | intrinsics_path = os.path.join(out, "intrinsics.txt") 115 | np.savetxt(intrinsics_path, intrinsics) 116 | 117 | 118 | def write_metadata(out, label_helper): 119 | metadata_path = os.path.join(out, "metadata.json") 120 | metadata = { 121 | "n_classes": int(label_helper.label_ids().max()), 122 | 'classes': list(sorted(label_helper.classes_in_scene)) 123 | } 124 | with open(metadata_path, 'w') as f: 125 | f.write(json.dumps(metadata, indent=2)) 126 | 127 | 128 | def read_aggregation(filename): 129 | """From https://github.com/ScanNet/ScanNet""" 130 | assert os.path.isfile(filename) 131 | object_id_to_segs = {} 132 | label_to_segs = {} 133 | with open(filename) as f: 134 | data = json.load(f) 135 | num_objects = len(data['segGroups']) 136 | for i in range(num_objects): 137 | object_id = data['segGroups'][i][ 138 | 'objectId'] + 1 # instance ids should be 1-indexed 139 | label = data['segGroups'][i]['label'] 140 | segs = data['segGroups'][i]['segments'] 141 | object_id_to_segs[object_id] = segs 142 | if label in label_to_segs: 143 | 
label_to_segs[label].extend(segs) 144 | else: 145 | label_to_segs[label] = segs 146 | return object_id_to_segs, label_to_segs 147 | 148 | 149 | def read_segmentation(filename): 150 | """From https://github.com/ScanNet/ScanNet""" 151 | assert os.path.isfile(filename) 152 | seg_to_verts = {} 153 | with open(filename) as f: 154 | data = json.load(f) 155 | num_verts = len(data['segIndices']) 156 | for i in range(num_verts): 157 | seg_id = data['segIndices'][i] 158 | if seg_id in seg_to_verts: 159 | seg_to_verts[seg_id].append(i) 160 | else: 161 | seg_to_verts[seg_id] = [i] 162 | return seg_to_verts, num_verts 163 | 164 | 165 | def copy_3d_semantics(scene_in, scene, scene_out, label_helper): 166 | mesh_path = os.path.join(scene_in, f"{scene}_vh_clean_2.ply") 167 | aggregation = os.path.join(scene_in, f"{scene}.aggregation.json") 168 | segments = os.path.join(scene_in, f"{scene}_vh_clean_2.0.010000.segs.json") 169 | mesh = trimesh.load(mesh_path) 170 | label_ids = np.zeros((mesh.vertices.shape[0],), dtype=np.uint16) 171 | object_id_to_seg, label_to_segs = read_aggregation(aggregation) 172 | seg_to_vertex, num_vertices = read_segmentation(segments) 173 | for label, segs in label_to_segs.items(): 174 | label_id = label_helper.label_to_id(label) 175 | for seg in segs: 176 | verts = seg_to_vertex[seg] 177 | for vertex in verts: 178 | try: 179 | label_ids[vertex] = label_id 180 | except IndexError: 181 | print( 182 | f"Index error for {scene} vertex {vertex} and seg: {seg}" 183 | ) 184 | 185 | out_mesh = os.path.join(scene_out, 'mesh.ply') 186 | mesh.export(out_mesh) 187 | out_mesh_semantics = os.path.join(scene_out, 'mesh_labels.npy') 188 | np.save(out_mesh_semantics, label_ids) 189 | 190 | 191 | class RGBDFrame(): 192 | 193 | def load(self, file_handle): 194 | self.camera_to_world = np.asarray(struct.unpack( 195 | 'f' * 16, file_handle.read(16 * 4)), 196 | dtype=np.float32).reshape(4, 4) 197 | self.timestamp_color = struct.unpack('Q', file_handle.read(8))[0] 198 | self.timestamp_depth = struct.unpack('Q', file_handle.read(8))[0] 199 | self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0] 200 | self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0] 201 | self.color_data = b''.join( 202 | struct.unpack('c' * self.color_size_bytes, 203 | file_handle.read(self.color_size_bytes))) 204 | self.depth_data = b''.join( 205 | struct.unpack('c' * self.depth_size_bytes, 206 | file_handle.read(self.depth_size_bytes))) 207 | 208 | 209 | class SensReader: 210 | 211 | def __init__(self, sens_file): 212 | self.file = sens_file 213 | self.file_handle = None 214 | self.num_frames = None 215 | self.rgb_size = None 216 | self.depth_size = None 217 | 218 | def __enter__(self): 219 | self.file_handle = open(self.file, 'rb') 220 | f = self.file_handle 221 | version = struct.unpack('I', f.read(4))[0] 222 | assert version == 4 223 | strlen = struct.unpack('Q', f.read(8))[0] 224 | self.sensor_name = ''.join([ 225 | c.decode('utf-8') 226 | for c in struct.unpack('c' * strlen, f.read(strlen)) 227 | ]) 228 | self.intrinsic_color = np.asarray(struct.unpack('f' * 16, 229 | f.read(16 * 4)), 230 | dtype=np.float32).reshape(4, 4) 231 | self.extrinsic_color = np.asarray(struct.unpack('f' * 16, 232 | f.read(16 * 4)), 233 | dtype=np.float32).reshape(4, 4) 234 | self.intrinsic_depth = np.asarray(struct.unpack('f' * 16, 235 | f.read(16 * 4)), 236 | dtype=np.float32).reshape(4, 4) 237 | self.extrinsic_depth = np.asarray(struct.unpack('f' * 16, 238 | f.read(16 * 4)), 239 | dtype=np.float32).reshape(4, 4) 240 | 
color_compression_type = struct.unpack('i', f.read(4))[0] 241 | depth_compression_type = struct.unpack('i', f.read(4))[0] 242 | color_width = struct.unpack('I', f.read(4))[0] 243 | color_height = struct.unpack('I', f.read(4))[0] 244 | self.rgb_size = (color_width, color_height) 245 | depth_width = struct.unpack('I', f.read(4))[0] 246 | depth_height = struct.unpack('I', f.read(4))[0] 247 | self.depth_size = (depth_width, depth_height) 248 | depth_shift = struct.unpack('f', f.read(4))[0] 249 | self.num_frames = struct.unpack('Q', f.read(8))[0] 250 | return self 251 | 252 | def __exit__(self, *args): 253 | self.file_handle.close() 254 | 255 | def read(self): 256 | for i in range(self.num_frames): 257 | frame = RGBDFrame() 258 | frame.load(self.file_handle) 259 | rgb_frame = imageio.v3.imread(frame.color_data) 260 | depth_frame = zlib.decompress(frame.depth_data) 261 | depth_frame = np.frombuffer(depth_frame, dtype=np.uint16).reshape( 262 | self.depth_size[1], self.depth_size[0]) 263 | yield frame.camera_to_world, rgb_frame, depth_frame 264 | 265 | 266 | def main(): 267 | flags = read_args() 268 | 269 | os.makedirs(flags.out, exist_ok=True) 270 | 271 | label_helper = LabelHelper(flags.label_map, flags) 272 | label_helper.write_labelmap(flags.out) 273 | 274 | scenes = os.listdir(flags.scannet_scan_dir) 275 | 276 | for scene in scenes: 277 | # Reset classes in scene. 278 | label_helper.reset() 279 | scene_dir_in = os.path.join(flags.scannet_scan_dir, scene) 280 | sensor_file = os.path.join(flags.scannet_scan_dir, scene, 281 | f"{scene}.sens") 282 | semantic_dir_in = os.path.join(flags.scannet_scan_dir, scene, 283 | "label-filt") 284 | if not os.path.exists(semantic_dir_in): 285 | label_filt_zip = os.path.join(flags.scannet_scan_dir, scene, 286 | f"{scene}_2d-label-filt.zip") 287 | subprocess.call(['unzip', label_filt_zip, '-d', scene_dir_in]) 288 | 289 | instance_dir_in = os.path.join(flags.scannet_scan_dir, scene, 290 | "instance-filt") 291 | if not os.path.exists(instance_dir_in): 292 | instance_filt_zip = os.path.join(flags.scannet_scan_dir, scene, 293 | f"{scene}_2d-instance-filt.zip") 294 | subprocess.call(['unzip', instance_filt_zip, '-d', scene_dir_in]) 295 | 296 | rgb_dir = os.path.join(flags.out, scene, "rgb") 297 | depth_dir = os.path.join(flags.out, scene, "depth") 298 | pose_dir = os.path.join(flags.out, scene, "pose") 299 | semantic_dir = os.path.join(flags.out, scene, "gt_semantic") 300 | instance_dir = os.path.join(flags.out, scene, "gt_instance") 301 | os.makedirs(rgb_dir, exist_ok=True) 302 | os.makedirs(depth_dir, exist_ok=True) 303 | os.makedirs(pose_dir, exist_ok=True) 304 | os.makedirs(semantic_dir, exist_ok=True) 305 | os.makedirs(instance_dir, exist_ok=True) 306 | 307 | copy_3d_semantics(os.path.join(flags.scannet_scan_dir, scene), scene, 308 | os.path.join(flags.out, scene), label_helper) 309 | 310 | semantic_files = os.listdir(semantic_dir_in) 311 | semantic_files = sorted(semantic_files, 312 | key=lambda x: int(x.split('.')[0])) 313 | 314 | instance_files = os.listdir(instance_dir_in) 315 | instance_files = sorted(instance_files, 316 | key=lambda x: int(x.split('.')[0])) 317 | 318 | scene_out = os.path.join(flags.out, scene) 319 | max_frames = 750 320 | with SensReader(sensor_file) as reader: 321 | 322 | write_intrinsics(scene_out, reader) 323 | stride = max(math.ceil(reader.num_frames / max_frames), 324 | flags.stride) 325 | for i, ((T_WC, rgb, depth), semantic_file, 326 | instance_file) in enumerate( 327 | zip(reader.read(), semantic_files, instance_files)): 328 | if 
i % flags.stride != 0: 329 | continue 330 | print("Processing frame %d" % i, end='\r') 331 | if np.isnan(T_WC).any() or np.isinf(T_WC).any(): 332 | print("Skipping frame %d" % i, "because of nan or inf.") 333 | continue 334 | T_CW = np.linalg.inv(T_WC) 335 | number = f"{i:06}" 336 | rgb_path = os.path.join(rgb_dir, f"{number}.jpg") 337 | depth_path = os.path.join(depth_dir, f"{number}.png") 338 | pose_path = os.path.join(pose_dir, f"{number}.txt") 339 | imageio.imwrite(rgb_path, rgb) 340 | cv2.imwrite(depth_path, depth) 341 | np.savetxt(pose_path, T_CW) 342 | 343 | semantic_path = os.path.join(semantic_dir, f"{number}.png") 344 | semantic_frame = cv2.imread( 345 | os.path.join(semantic_dir_in, semantic_file), -1) 346 | out_semantic = label_helper.map_semantics(semantic_frame) 347 | label_helper.register_frame(out_semantic) 348 | cv2.imwrite(semantic_path, out_semantic) 349 | 350 | instance_out = os.path.join(instance_dir, f"{number}.png") 351 | instance_path = os.path.join(instance_dir_in, instance_file) 352 | instance_frame = cv2.imread(instance_path, -1) 353 | # Remove instances which belong to classes which are not in the label set. 354 | # 0 means undefined, and shot not be evaluated on. 355 | # An object not in the labelset could easily occlude a labelset object. 356 | instance_frame[out_semantic <= 0] = 0 357 | cv2.imwrite(instance_out, instance_frame) 358 | 359 | write_metadata(scene_out, label_helper) 360 | subprocess.call([ 361 | 'python', 'scripts/compute_scene_bounds.py', 362 | os.path.join(flags.out, scene) 363 | ]) 364 | 365 | shutil.rmtree(os.path.join(semantic_dir_in)) 366 | shutil.rmtree(os.path.join(instance_dir_in)) 367 | 368 | 369 | if __name__ == "__main__": 370 | main() 371 | -------------------------------------------------------------------------------- /scripts/mapping.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import numpy as np 5 | import pycolmap 6 | import tempfile 7 | import cv2 8 | import open3d as o3d 9 | from pathlib import Path 10 | from autolabel.utils import Scene, transform_points, Camera 11 | from autolabel.undistort import ImageUndistorter 12 | from hloc import (extract_features, match_features, reconstruction, 13 | pairs_from_exhaustive, pairs_from_retrieval) 14 | from hloc.utils import viz_3d 15 | 16 | 17 | def read_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('scene', help="Scene to infer poses for.") 20 | parser.add_argument('--debug', action='store_true') 21 | parser.add_argument('--vis', action='store_true') 22 | return parser.parse_args() 23 | 24 | 25 | class HLoc: 26 | 27 | def __init__(self, tmp_dir, scene, flags): 28 | self.flags = flags 29 | self.scene = scene 30 | self.scene_path = Path(self.scene.path) 31 | self.exhaustive = len((self.scene.raw_rgb_paths())) < 250 32 | 33 | self.tmp_dir = Path(tmp_dir) 34 | self.sfm_pairs = self.tmp_dir / 'sfm-pairs.txt' 35 | self.loc_pairs = self.tmp_dir / 'sfm-pairs-loc.txt' 36 | self.features = self.tmp_dir / 'features.h5' 37 | self.matches = self.tmp_dir / 'matches.h5' 38 | self.feature_conf = extract_features.confs['superpoint_aachen'] 39 | self.retrieval_conf = extract_features.confs['netvlad'] 40 | self.matcher_conf = match_features.confs['superglue'] 41 | 42 | def _run_sfm(self): 43 | image_dir = Path(self.scene.path) / 'raw_rgb' 44 | image_list = [] 45 | image_paths = self.scene.raw_rgb_paths() 46 | image_list_path = [] 47 | indices = np.arange(len(image_paths)) 48 | for 
index in indices: 49 | image_list.append(image_paths[index]) 50 | image_list_path.append( 51 | str(Path(image_paths[index]).relative_to(image_dir))) 52 | if self.exhaustive: 53 | extract_features.main(self.feature_conf, 54 | image_dir, 55 | feature_path=self.features, 56 | image_list=image_list_path) 57 | pairs_from_exhaustive.main(self.sfm_pairs, 58 | image_list=image_list_path) 59 | match_features.main(self.matcher_conf, 60 | self.sfm_pairs, 61 | features=self.features, 62 | matches=self.matches) 63 | model = reconstruction.main( 64 | self.tmp_dir, 65 | image_dir, 66 | self.sfm_pairs, 67 | self.features, 68 | self.matches, 69 | image_list=image_list_path, 70 | camera_mode=pycolmap.CameraMode.SINGLE, 71 | image_options={'camera_model': "OPENCV"}, 72 | mapper_options={ 73 | 'ba_refine_principal_point': True, 74 | 'ba_refine_extra_params': True, 75 | 'ba_refine_focal_length': True 76 | }) 77 | else: 78 | retrieval_path = extract_features.main(self.retrieval_conf, 79 | image_dir, 80 | self.tmp_dir, 81 | image_list=image_list_path) 82 | pairs_from_retrieval.main(retrieval_path, 83 | self.sfm_pairs, 84 | num_matched=50) 85 | feature_path = extract_features.main(self.feature_conf, 86 | image_dir, 87 | self.tmp_dir, 88 | image_list=image_list_path) 89 | match_path = match_features.main(self.matcher_conf, 90 | self.sfm_pairs, 91 | self.feature_conf['output'], 92 | self.tmp_dir, 93 | matches=self.matches) 94 | model = reconstruction.main( 95 | self.tmp_dir, 96 | image_dir, 97 | self.sfm_pairs, 98 | feature_path, 99 | match_path, 100 | image_list=image_list_path, 101 | camera_mode=pycolmap.CameraMode.SINGLE, 102 | image_options={'camera_model': "OPENCV"}, 103 | mapper_options={ 104 | 'ba_refine_principal_point': True, 105 | 'ba_refine_extra_params': True, 106 | 'ba_refine_focal_length': True 107 | }) 108 | 109 | if self.flags.vis: 110 | fig = viz_3d.init_figure() 111 | viz_3d.plot_reconstruction(fig, 112 | model, 113 | color='rgba(255,0,0,0.5)', 114 | name="mapping") 115 | fig.show() 116 | 117 | if self.flags.debug: 118 | # Save mapping metadata if running in debug mode. 119 | colmap_output_dir = os.path.join(self.scene.path, 'colmap_output') 120 | os.makedirs(colmap_output_dir, exist_ok=True) 121 | model.write_text(colmap_output_dir) 122 | 123 | # Save the intrinsics matrix and the distortion parameters. 
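        # COLMAP's OPENCV camera model stores its parameters in the order
        # (fx, fy, cx, cy, k1, k2, p1, p2); they are unpacked below into the
        # 3x3 camera matrix and the distortion coefficient vector.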
124 | assert (len(model.cameras) == 1 and 1 in model.cameras) 125 | (focal_length_x, focal_length_y, c_x, c_y, k_1, k_2, p_1, 126 | p_2) = model.cameras[1].params 127 | self.colmap_K = np.eye(3) 128 | self.colmap_K[0, 0] = focal_length_x 129 | self.colmap_K[1, 1] = focal_length_y 130 | self.colmap_K[0, 2] = c_x 131 | self.colmap_K[1, 2] = c_y 132 | self.colmap_distortion_params = np.array([k_1, k_2, p_1, p_2]) 133 | np.savetxt(fname=os.path.join(self.scene.path, 'intrinsics.txt'), 134 | X=self.colmap_K) 135 | np.savetxt(fname=os.path.join(self.scene.path, 136 | 'distortion_parameters.txt'), 137 | X=self.colmap_distortion_params) 138 | 139 | def _undistort_images(self): 140 | print("Undistorting images according to the estimated intrinsics...") 141 | undistorted_image_folder = os.path.join(self.scene.path, "rgb") 142 | undistorted_depth_folder = os.path.join(self.scene.path, "depth") 143 | os.makedirs(undistorted_image_folder, exist_ok=True) 144 | os.makedirs(undistorted_depth_folder, exist_ok=True) 145 | 146 | color_undistorter = ImageUndistorter(K=self.colmap_K, 147 | D=self.colmap_distortion_params, 148 | H=self.scene.camera.size[1], 149 | W=self.scene.camera.size[0]) 150 | 151 | depth_camera = Camera(self.colmap_K, self.scene.camera.size).scale( 152 | self.scene.depth_size()) 153 | depth_undistorter = ImageUndistorter(K=depth_camera.camera_matrix, 154 | D=self.colmap_distortion_params, 155 | H=depth_camera.size[1], 156 | W=depth_camera.size[0]) 157 | 158 | # Undistort all the images and save the undistorted versions. 159 | image_paths = self.scene.raw_rgb_paths() 160 | for image_path in image_paths: 161 | image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) 162 | 163 | undistorted_image = color_undistorter.undistort_image(image=image) 164 | cv2.imwrite(img=undistorted_image, 165 | filename=os.path.join(undistorted_image_folder, 166 | os.path.basename(image_path))) 167 | 168 | depth_paths = self.scene.raw_depth_paths() 169 | for depth_path in depth_paths: 170 | depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED) 171 | 172 | undistorted_depth = depth_undistorter.undistort_image(image=depth) 173 | cv2.imwrite(img=undistorted_depth, 174 | filename=os.path.join(undistorted_depth_folder, 175 | os.path.basename(depth_path))) 176 | 177 | def run(self): 178 | self._run_sfm() 179 | self._undistort_images() 180 | 181 | 182 | class ScaleEstimation: 183 | min_depth = 0.05 184 | 185 | def __init__(self, scene, colmap_dir): 186 | self.scene = scene 187 | self.colmap_dir = colmap_dir 188 | self.reconstruction = pycolmap.Reconstruction(colmap_dir) 189 | self._read_trajectory() 190 | self._read_depth_maps() 191 | 192 | def _read_depth_maps(self): 193 | self.depth_maps = {} 194 | for path in self.scene.depth_paths(): 195 | frame_name = os.path.basename(path).split('.')[0] 196 | self.depth_maps[frame_name] = cv2.imread(path, -1) / 1000.0 197 | depth_shape = next(iter(self.depth_maps.values())).shape 198 | depth_size = np.array([depth_shape[1], depth_shape[0]], 199 | dtype=np.float64) 200 | self.depth_to_color_ratio = depth_size / np.array( 201 | self.scene.camera.size, dtype=np.float64) 202 | 203 | def _read_trajectory(self): 204 | poses = [] 205 | for image in self.reconstruction.images.values(): 206 | T_CW = np.eye(4) 207 | T_CW[:3, :3] = image.rotmat() 208 | T_CW[:3, 3] = image.tvec 209 | frame_name = image.name.split('.')[0] 210 | poses.append((frame_name, T_CW)) 211 | self.poses = dict(poses) 212 | 213 | def _lookup_depth(self, frame, xy): 214 | xy_depth = np.floor(self.depth_to_color_ratio * 
xy).astype(int) 215 | return self.depth_maps[frame][xy_depth[1], xy_depth[0]] 216 | 217 | def _estimate_scale(self): 218 | images = self.reconstruction.images 219 | point_depths = [] 220 | measured_depths = [] 221 | for image in images.values(): 222 | frame_name = image.name.split('.')[0] 223 | points = image.get_valid_points2D() 224 | points3D = self.reconstruction.points3D 225 | for point in points: 226 | depth_map_value = self._lookup_depth(frame_name, point.xy) 227 | 228 | if depth_map_value < self.min_depth: 229 | continue 230 | 231 | T_CW = self.poses[frame_name] 232 | point3D = points3D[point.point3D_id] 233 | 234 | p_C = transform_points(T_CW, point3D.xyz) 235 | measured_depths.append(depth_map_value) 236 | point_depths.append(p_C[2]) 237 | 238 | point_depths = np.stack(point_depths) 239 | measured_depths = np.stack(measured_depths) 240 | scales = measured_depths / point_depths 241 | return self._ransac(scales) 242 | 243 | def _ransac(self, scales): 244 | best_set = None 245 | best_inlier_count = 0 246 | indices = np.arange(0, scales.shape[0]) 247 | inlier_threshold = np.median(scales) * 1e-2 248 | for i in range(10000): 249 | selected = np.random.choice(indices) 250 | estimate = scales[selected] 251 | inliers = np.abs(scales - estimate) < inlier_threshold 252 | inlier_count = inliers.sum() 253 | if inlier_count > best_inlier_count: 254 | best_set = scales[inliers] 255 | best_inlier_count = inlier_count 256 | print( 257 | f"Scale estimation inlier count: {best_inlier_count} / {scales.size}" 258 | ) 259 | return np.mean(best_set) 260 | 261 | def _scale_poses(self, ratio): 262 | scaled_poses = {} 263 | for key, pose in self.poses.items(): 264 | new_pose = pose.copy() 265 | new_pose[:3, 3] *= ratio 266 | scaled_poses[key] = new_pose 267 | return scaled_poses 268 | 269 | def run(self): 270 | scale_ratio = self._estimate_scale() 271 | return self._scale_poses(scale_ratio) 272 | 273 | 274 | class PoseSaver: 275 | 276 | def __init__(self, scene, scaled_poses): 277 | self.scene = scene 278 | self.poses = scaled_poses 279 | 280 | def compute_bbox(self, poses): 281 | """ 282 | poses: Metrically scaled transforms from camera to world frame. 283 | """ 284 | # Compute axis-aligned bounding box of the depth values in world frame. 285 | # Then get the center. 
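        # In outline: back-project a subsampled set of depth frames into world
        # coordinates, accumulate them into one point cloud, drop statistical
        # outliers, fit an oriented bounding box, and re-express the bounds
        # relative to the box center so the scene is roughly origin-centered.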
286 | min_bounds = np.zeros(3) 287 | max_bounds = np.zeros(3) 288 | depth_frame = o3d.io.read_image(self.scene.depth_paths()[0]) 289 | depth_size = np.asarray(depth_frame).shape[::-1] 290 | K = self.scene.camera.scale(depth_size).camera_matrix 291 | intrinsics = o3d.camera.PinholeCameraIntrinsic(int(depth_size[0]), 292 | int(depth_size[1]), 293 | K[0, 0], K[1, 1], 294 | K[0, 2], K[1, 2]) 295 | pc = o3d.geometry.PointCloud() 296 | depth_frames = dict([(os.path.basename(p).split('.')[0], p) 297 | for p in self.scene.depth_paths()]) 298 | items = [item for item in poses.items()] 299 | stride = max(len(self.scene.depth_paths()) // 100, 1) 300 | for key, T_WC in items[::stride]: 301 | if key not in depth_frames: 302 | print("WARNING: Can't find depth image {key}.png") 303 | continue 304 | depth = o3d.io.read_image(f"{depth_frames[key]}") 305 | 306 | pc_C = o3d.geometry.PointCloud.create_from_depth_image( 307 | depth, depth_scale=1000.0, intrinsic=intrinsics) 308 | pc_C = np.asarray(pc_C.points) 309 | pc_W = transform_points(T_WC, pc_C) 310 | 311 | min_bounds = np.minimum(min_bounds, pc_W.min(axis=0)) 312 | max_bounds = np.maximum(max_bounds, pc_W.max(axis=0)) 313 | pc += o3d.geometry.PointCloud( 314 | o3d.utility.Vector3dVector(pc_W)).uniform_down_sample(50) 315 | 316 | filtered, _ = pc.remove_statistical_outlier(nb_neighbors=20, 317 | std_ratio=2.0) 318 | bbox = filtered.get_oriented_bounding_box(robust=True) 319 | T = np.eye(4) 320 | T[:3, :3] = bbox.R.T 321 | o3d_aabb = o3d.geometry.PointCloud(filtered).transform( 322 | T).get_axis_aligned_bounding_box() 323 | center = o3d_aabb.get_center() 324 | T[:3, 3] = -center 325 | aabb = np.zeros((2, 3)) 326 | aabb[0, :] = o3d_aabb.get_min_bound() - center 327 | aabb[1, :] = o3d_aabb.get_max_bound() - center 328 | return T, aabb, filtered 329 | 330 | def _write_poses(self, poses): 331 | pose_dir = os.path.join(self.scene.path, 'pose') 332 | os.makedirs(pose_dir, exist_ok=True) 333 | for key, T_CW in poses.items(): 334 | pose_file = os.path.join(pose_dir, f'{key}.txt') 335 | np.savetxt(pose_file, T_CW) 336 | 337 | def _write_bounds(self, bounds): 338 | with open(os.path.join(self.scene.path, 'bbox.txt'), 'wt') as f: 339 | min_str = " ".join([str(x) for x in bounds[0]]) 340 | max_str = " ".join([str(x) for x in bounds[1]]) 341 | f.write(f"{min_str} {max_str} 0.01") 342 | 343 | def run(self): 344 | T_WCs = {} 345 | for key, T_CW in self.poses.items(): 346 | T_WCs[key] = np.linalg.inv(T_CW) 347 | T, aabb, point_cloud = self.compute_bbox(T_WCs) 348 | 349 | T_CWs = {} 350 | for key, T_WC in T_WCs.items(): 351 | T_CWs[key] = np.linalg.inv(T @ T_WC) 352 | self._write_poses(T_CWs) 353 | self._write_bounds(aabb) 354 | 355 | 356 | class Pipeline: 357 | 358 | def __init__(self, flags): 359 | self.tmp_dir = tempfile.mkdtemp() 360 | self.flags = flags 361 | self.scene = Scene(flags.scene) 362 | 363 | def run(self): 364 | hloc = HLoc(self.tmp_dir, self.scene, self.flags) 365 | hloc.run() 366 | 367 | # Camera intrinsics might have changed so reload the scene. 
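        # (The SfM step refines focal length, principal point and distortion
        # through the ba_refine_* mapper options and writes a new
        # intrinsics.txt, so the Scene is rebuilt here to pick up those
        # values.)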
368 | self.scene = Scene(self.scene.path) 369 | 370 | scale_estimation = ScaleEstimation(self.scene, self.tmp_dir) 371 | scaled_poses = scale_estimation.run() 372 | pose_saver = PoseSaver(self.scene, scaled_poses) 373 | pose_saver.run() 374 | 375 | if self.flags.debug: 376 | shutil.move(str(self.tmp_dir), "/tmp/sfm_debug") 377 | else: 378 | shutil.rmtree(self.tmp_dir) 379 | 380 | 381 | if __name__ == "__main__": 382 | Pipeline(read_args()).run() 383 | -------------------------------------------------------------------------------- /scripts/language/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | import numpy as np 5 | import json 6 | import pickle 7 | import pandas 8 | from rich.table import Table 9 | from rich.console import Console 10 | from rich import print as rprint 11 | from autolabel.evaluation import OpenVocabEvaluator2D, OpenVocabEvaluator3D 12 | from autolabel.evaluation import OpenVocabInstancePQEvaluator, PanopticStat 13 | from autolabel.dataset import SceneDataset, LenDataset 14 | from autolabel import utils, model_utils 15 | 16 | 17 | def read_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('scenes', nargs='+') 20 | parser.add_argument('--batch-size', default=8182, type=int) 21 | parser.add_argument('--vis', default=None, type=str) 22 | parser.add_argument('--workspace', type=str, default=None) 23 | parser.add_argument('--out', 24 | default=None, 25 | type=str, 26 | help="Where to write results as json, if anywhere.") 27 | parser.add_argument('--label-map', type=str, required=True) 28 | parser.add_argument('--feature-checkpoint', '-f', type=str, required=True) 29 | parser.add_argument( 30 | '--stride', 31 | type=int, 32 | default=1, 33 | help="Only evaluate every Nth frame to save time or for debugging.") 34 | parser.add_argument( 35 | '--pc', 36 | action='store_true', 37 | help= 38 | "Evaluate point cloud segmentation accuracy instead of 2D segmentation maps." 
39 | ) 40 | parser.add_argument( 41 | '--panoptic', 42 | action='store_true', 43 | help='Evaluate panoptic segmenation.') 44 | parser.add_argument('--print-verbose', action='store_true') 45 | parser.add_argument('--debug', action='store_true') 46 | parser.add_argument('--only-scene-classes', action='store_true') 47 | parser.add_argument('--random', 48 | action='store_true', 49 | help="Randomize the order of the scenes.") 50 | parser.add_argument('--time', action='store_true') 51 | parser.add_argument('--denoise-method', 52 | type=str, 53 | default='average_similarity', 54 | choices=['majority_voting', 'average_similarity', 'average_feature'], 55 | help="The denoise method for semantics.") 56 | return parser.parse_args() 57 | 58 | 59 | def gather_models(flags, scene_dirs): 60 | models = set() 61 | for scene in scene_dirs: 62 | nerf_dir = model_utils.get_nerf_dir(scene, flags) 63 | if not os.path.exists(nerf_dir): 64 | continue 65 | for model in os.listdir(nerf_dir): 66 | checkpoint_dir = os.path.join(nerf_dir, model, 'checkpoints') 67 | if os.path.exists(checkpoint_dir): 68 | models.add(model) 69 | return list(models) 70 | 71 | 72 | def read_label_map(path): 73 | return pandas.read_csv(path) 74 | 75 | 76 | class NumpyEncoder(json.JSONEncoder): 77 | """ Special json encoder for numpy types """ 78 | def default(self, obj): 79 | if isinstance(obj, np.integer): 80 | return int(obj) 81 | elif isinstance(obj, np.floating): 82 | return float(obj) 83 | elif isinstance(obj, np.ndarray): 84 | return obj.tolist() 85 | return json.JSONEncoder.default(self, obj) 86 | 87 | def write_results(out, tables, json_result, panoptic_stat=None): 88 | out = Path(out) 89 | out.mkdir(parents=True, exist_ok=True) 90 | dumped = json.dumps(json_result, cls=NumpyEncoder, indent=2) 91 | with open(out / 'results.json', 'w') as f: 92 | f.write(dumped) 93 | 94 | with open(out / 'table.txt', 'w') as f: 95 | for table in tables: 96 | rprint(table, file=f) 97 | rprint('\n\n', file=f) 98 | 99 | if panoptic_stat is not None: 100 | with open(out / 'panoptic_stat.pkl', 'wb') as outp: 101 | pickle.dump(panoptic_stat, outp, pickle.HIGHEST_PROTOCOL) 102 | 103 | 104 | def main(flags): 105 | if len(flags.scenes) == 1 and not os.path.exists( 106 | os.path.join(flags.scenes[0], 'rgb')): 107 | # We are dealing with a directory full of scenes and not a list of scenes 108 | scene_dir = flags.scenes[0] 109 | scene_dirs = [ 110 | os.path.join(scene_dir, scene) 111 | for scene in os.listdir(scene_dir) 112 | if os.path.exists(os.path.join(scene_dir, scene, 'rgb')) 113 | ] 114 | else: 115 | scene_dirs = flags.scenes 116 | 117 | original_labels = read_label_map(flags.label_map) 118 | n_classes = len(original_labels) 119 | 120 | scene_names = [os.path.basename(os.path.normpath(p)) for p in scene_dirs] 121 | scenes = [(s, n) for s, n in zip(scene_dirs, scene_names)] 122 | if flags.random: 123 | import random 124 | random.shuffle(scenes) 125 | else: 126 | scenes = sorted(scenes, key=lambda x: x[1]) 127 | 128 | if flags.panoptic: 129 | panoptic_stats = PanopticStat() 130 | else: 131 | ious = [] 132 | accs = [] 133 | ious_d = [] 134 | accs_d = [] 135 | evaluator = None 136 | 137 | for scene_index, (scene, scene_name) in enumerate(scenes): 138 | model = gather_models(flags, [scene]) 139 | if len(model) == 0: 140 | print(f"Skipping scene {scene_name} because no models were found.") 141 | continue 142 | else: 143 | model = model[0] 144 | print(f"Using model {model}") 145 | 146 | print(f"Evaluating scene {scene_name}") 147 | 148 | nerf_dir = 
model_utils.get_nerf_dir(scene, flags) 149 | model_path = os.path.join(nerf_dir, model) 150 | if not os.path.exists(model_path): 151 | print(f"Skipping scene {scene_name} because no models were found.") 152 | continue 153 | params = model_utils.read_params(model_path) 154 | dataset = SceneDataset('test', 155 | scene, 156 | factor=4.0, 157 | batch_size=flags.batch_size, 158 | lazy=True) 159 | if flags.only_scene_classes: 160 | classes_in_scene = dataset.scene.metadata.get('classes', None) 161 | if classes_in_scene is None: 162 | label_map = original_labels 163 | else: 164 | mask = original_labels['id'].isin(classes_in_scene) 165 | label_map = original_labels[mask] 166 | else: 167 | label_map = original_labels 168 | 169 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 170 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 171 | n_classes, params).cuda() 172 | 173 | checkpoint_dir = os.path.join(model_path, 'checkpoints') 174 | if not os.path.exists(checkpoint_dir) or len( 175 | os.listdir(checkpoint_dir)) == 0: 176 | continue 177 | 178 | model_utils.load_checkpoint(model, checkpoint_dir) 179 | model = model.eval() 180 | if flags.vis is not None: 181 | vis_path = os.path.join(flags.vis, scene_name) 182 | else: 183 | vis_path = None 184 | 185 | if evaluator is None: 186 | if flags.panoptic: 187 | evaluator = OpenVocabInstancePQEvaluator( 188 | features=params.features, 189 | name=scene_name, 190 | checkpoint=flags.feature_checkpoint, 191 | debug=flags.debug, 192 | stride=flags.stride, 193 | save_figures=vis_path, 194 | time=flags.time, 195 | denoise_method=flags.denoise_method 196 | ) 197 | else: 198 | if flags.pc: 199 | evaluator = OpenVocabEvaluator3D( 200 | features=params.features, 201 | name=scene_name, 202 | checkpoint=flags.feature_checkpoint, 203 | stride=flags.stride, 204 | debug=flags.debug, 205 | time=flags.time) 206 | else: 207 | evaluator = OpenVocabEvaluator2D( 208 | features=params.features, 209 | name=scene_name, 210 | checkpoint=flags.feature_checkpoint, 211 | debug=flags.debug, 212 | stride=flags.stride, 213 | save_figures=vis_path, 214 | time=flags.time) 215 | assert evaluator.features == params.features 216 | evaluator.reset(model, label_map, vis_path) 217 | if flags.panoptic: 218 | panoptic_stat = evaluator.eval(dataset) 219 | panoptic_stats += panoptic_stat 220 | tables, json_result = print_panoptic_results(panoptic_stat, 221 | categories=evaluator.evaluated_labels, 222 | label_mapping=evaluator.label_mapping, 223 | label_type_mapping=evaluator.label_type_mapping, 224 | verbose=flags.print_verbose) 225 | if flags.out: 226 | write_results( 227 | os.path.join(flags.out, scene_name), tables, json_result, panoptic_stat) 228 | else: 229 | iou, acc, iou_d, acc_d = evaluator.eval(dataset) 230 | ious.append(iou) 231 | accs.append(acc) 232 | ious_d.append(iou_d) 233 | accs_d.append(acc_d) 234 | table = print_iou_acc_results([iou], [acc]) 235 | table_d = print_iou_acc_results([iou_d], [acc_d], table_title="Denoised") 236 | if flags.out: 237 | write_results( 238 | os.path.join(flags.out, scene_name), 239 | [table, table_d], 240 | {'iou': iou, 'acc': acc, 'iou_d': iou_d, 'acc_d': acc_d}) 241 | del model 242 | if flags.panoptic: 243 | final_tables, final_json_result = print_panoptic_results(panoptic_stats, 244 | categories=evaluator.evaluated_labels, 245 | label_mapping=evaluator.label_mapping, 246 | label_type_mapping=evaluator.label_type_mapping, 247 | verbose=flags.print_verbose) 248 | if flags.out: 249 | write_results( 250 | 
os.path.join(flags.out, 'final'), final_tables, final_json_result, panoptic_stats) 251 | else: 252 | table = print_iou_acc_results(ious, accs) 253 | table_d = print_iou_acc_results(ious_d, accs_d, table_title="Denoised") 254 | if flags.out: 255 | write_results( 256 | os.path.join(flags.out, 'final'), 257 | [table, table_d], 258 | {'ious': ious, 'accs': accs, 'ious_d': ious_d, 'accs_d': accs_d}) 259 | 260 | 261 | def print_panoptic_results(panoptic_stat, categories, label_mapping, label_type_mapping, verbose=False): 262 | 263 | json_result = {} 264 | print_tables = [] 265 | 266 | def percentage_to_string(num): 267 | if num is None: 268 | return "N/A" 269 | else: 270 | v = num * 100 271 | return f"{v:.1f}" 272 | 273 | console = Console() 274 | # panoptic segmentation 275 | pq_total_result, pq_per_class_result = panoptic_stat.pq_average(categories, label_type_mapping, verbose=verbose) 276 | table = Table(show_lines=True, caption_justify='left') 277 | table.add_column('Class') 278 | table.add_column('PQ') 279 | table.add_column('SQ') 280 | table.add_column('RQ') 281 | if verbose: 282 | table.add_column('tp') 283 | table.add_column('fp') 284 | table.add_column('fn') 285 | 286 | table.title = "Panoptic Evaluation" 287 | json_result['panoptic'] = {} 288 | per_class_result = {} 289 | for category_id in categories: 290 | pq_info = pq_per_class_result[category_id] 291 | if pq_info['valid']: 292 | if verbose: 293 | table.add_row(label_mapping[category_id], 294 | percentage_to_string(pq_info['pq']), 295 | percentage_to_string(pq_info['sq']), 296 | percentage_to_string(pq_info['rq']), 297 | str(pq_info['tp']), 298 | str(pq_info['fp']), 299 | str(pq_info['fn'])) 300 | per_class_result[label_mapping[category_id]] = { 301 | 'PQ': pq_info['pq'] * 100, 'SQ': pq_info['sq'] * 100, 'RQ': pq_info['rq'] * 100, 302 | 'tp': pq_info['tp'], 'fp': pq_info['fp'], 'fn': pq_info['fn'] 303 | } 304 | 305 | else: 306 | table.add_row(label_mapping[category_id], 307 | percentage_to_string(pq_info['pq']), 308 | percentage_to_string(pq_info['sq']), 309 | percentage_to_string(pq_info['rq'])) 310 | per_class_result[label_mapping[category_id]] = { 311 | 'PQ': pq_info['pq'] * 100, 'SQ': pq_info['sq'] * 100, 'RQ': pq_info['rq'] * 100 312 | } 313 | json_result['panoptic']['per_class_result'] = per_class_result 314 | if verbose: 315 | table.add_row('Total:\n{} valid panoptic categories.'.format( 316 | pq_total_result['n']), 317 | percentage_to_string(pq_total_result['pq']), 318 | percentage_to_string(pq_total_result['sq']), 319 | percentage_to_string(pq_total_result['rq']), 320 | '{:.1f}'.format(pq_total_result['tp']), 321 | '{:.1f}'.format(pq_total_result['fp']), 322 | '{:.1f}'.format(pq_total_result['fn'])) 323 | json_result['panoptic']['total'] = { 324 | 'PQ': pq_total_result['pq'] * 100, 'SQ': pq_total_result['sq'] * 100, 'RQ': pq_total_result['rq'] * 100, 325 | 'tp': pq_total_result['tp'], 'fp': pq_total_result['fp'], 'fn': pq_total_result['fn'] 326 | } 327 | else: 328 | table.add_row('Total:\n{} valid panoptic categories.'.format( 329 | pq_total_result['n']), 330 | percentage_to_string(pq_total_result['pq']), 331 | percentage_to_string(pq_total_result['sq']), 332 | percentage_to_string(pq_total_result['rq'])) 333 | json_result['panoptic']['total'] = { 334 | 'PQ': pq_total_result['pq'] * 100, 'SQ': pq_total_result['sq'] * 100, 'RQ': pq_total_result['rq'] * 100 335 | } 336 | console.print(table) 337 | print_tables.append(table) 338 | 339 | # semantic segmentation 340 | semantic_total_result, semantic_per_class_result = 
panoptic_stat.semantic_average(categories) 341 | table = Table(show_lines=True, caption_justify='left') 342 | table.add_column('Class') 343 | table.add_column('S_iou') 344 | table.add_column('S_acc') 345 | table.add_column('S_iou_d') 346 | table.add_column('S_acc_d') 347 | 348 | table.title = "Semantic Evaluation" 349 | 350 | json_result['semantic'] = {} 351 | per_class_result = {} 352 | for category_id in categories: 353 | semantic = semantic_per_class_result[category_id] 354 | if semantic['valid']: 355 | table.add_row(label_mapping[category_id], 356 | percentage_to_string(semantic['iou']), 357 | percentage_to_string(semantic['acc']), 358 | percentage_to_string(semantic['iou_d']), 359 | percentage_to_string(semantic['acc_d'])) 360 | per_class_result[label_mapping[category_id]] = { 361 | 'S_iou': semantic['iou'] * 100, 'S_acc': semantic['acc'] * 100, 'S_iou_d': semantic['iou_d'] * 100, 'S_acc_d': semantic['acc_d'] * 100 362 | } 363 | json_result['semantic']['per_class_result'] = per_class_result 364 | 365 | table.add_row('Total:\n{} valid semantic categories'.format( 366 | semantic_total_result['n']), 367 | percentage_to_string(semantic_total_result['iou']), 368 | percentage_to_string(semantic_total_result['acc']), 369 | percentage_to_string(semantic_total_result['iou_d']), 370 | percentage_to_string(semantic_total_result['acc_d'])) 371 | json_result['semantic']['total'] = { 372 | 'S_iou': semantic_total_result['iou'] * 100, 'S_acc': semantic_total_result['acc'] * 100, 373 | 'S_iou_d': semantic_total_result['iou_d'] * 100, 'S_acc_d': semantic_total_result['acc_d'] * 100 374 | } 375 | console.print(table) 376 | print_tables.append(table) 377 | 378 | # instance segmentation 379 | instance_result = panoptic_stat.instance_average() 380 | table = Table(show_lines=True, caption_justify='left') 381 | table.add_column('mCov') 382 | table.add_column('mWCov') 383 | table.add_column('mPrec') 384 | table.add_column('mRec') 385 | 386 | table.title = "Instance Evaluation" 387 | table.add_row( 388 | percentage_to_string(instance_result['mCov']), 389 | percentage_to_string(instance_result['mWCov']), 390 | percentage_to_string(instance_result['mPrec']), 391 | percentage_to_string(instance_result['mRec'])) 392 | json_result['instance'] = { 393 | 'mCov': instance_result['mCov'] * 100, 'mWCov': instance_result['mWCov'] * 100, 394 | 'mPrec': instance_result['mPrec'] * 100, 'mRec': instance_result['mRec'] * 100 395 | } 396 | console.print(table) 397 | print_tables.append(table) 398 | return print_tables, json_result 399 | 400 | 401 | def print_iou_acc_results(ious, accs, table_title="Direct"): 402 | table = Table() 403 | table.add_column('Class') 404 | table.add_column('mIoU') 405 | table.add_column('mAcc') 406 | table.title = table_title 407 | 408 | def percentage_to_string(iou): 409 | if iou is None: 410 | return "N/A" 411 | else: 412 | v = iou * 100 413 | return f"{v:.1f}" 414 | 415 | reduced_iou = {} 416 | for iou in ious: 417 | for key, value in iou.items(): 418 | if key not in reduced_iou: 419 | reduced_iou[key] = [] 420 | if value is None: 421 | continue 422 | reduced_iou[key].append(value) 423 | reduced_acc = {} 424 | for acc in accs: 425 | for key, value in acc.items(): 426 | if key not in reduced_acc: 427 | reduced_acc[key] = [] 428 | if value is None: 429 | continue 430 | reduced_acc[key].append(value) 431 | for key, values in reduced_iou.items(): 432 | if key == 'total': 433 | continue 434 | mIoU = np.mean(values) 435 | mAcc = np.mean(reduced_acc[key]) 436 | table.add_row(key, 
percentage_to_string(mIoU), 437 | percentage_to_string(mAcc)) 438 | 439 | scene_total = percentage_to_string( 440 | np.mean([r['total'] for r in ious if 'total' in r])) 441 | scene_total_acc = percentage_to_string( 442 | np.mean([r['total'] for r in accs if 'total' in r])) 443 | table.add_row('Total', scene_total, scene_total_acc) 444 | 445 | console = Console() 446 | console.print(table) 447 | return table 448 | 449 | 450 | if __name__ == "__main__": 451 | main(read_args()) 452 | -------------------------------------------------------------------------------- /autolabel/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import hdbscan 4 | import pickle 5 | import tensorboardX 6 | import torch 7 | from torch.nn import functional as F 8 | from torch import optim 9 | import tqdm 10 | from tqdm import tqdm 11 | from torch_ema import ExponentialMovingAverage 12 | 13 | from torch_ngp.nerf.utils import Trainer 14 | from autolabel.dataset import SAM_BIT_LEN 15 | 16 | DEPTH_EPSILON = 0.01 17 | 18 | def sim_matrix(a, b, eps=1e-8): 19 | """ 20 | added eps for numerical stability 21 | """ 22 | a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None] 23 | a_norm = a / torch.clamp(a_n, min=eps) 24 | b_norm = b / torch.clamp(b_n, min=eps) 25 | sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1)) 26 | return sim_mt 27 | 28 | 29 | class SimpleTrainer(Trainer): 30 | 31 | def train(self, dataloader, epochs): 32 | if self.use_tensorboardX and self.local_rank == 0: 33 | self.writer = tensorboardX.SummaryWriter( 34 | os.path.join(self.workspace, "run", self.name)) 35 | 36 | if self.model.cuda_ray: 37 | self.model.mark_untrained_grid(dataloader._data.poses, 38 | dataloader._data.intrinsics) 39 | 40 | if not hasattr(self, 'con_ema'): 41 | self.con_ema = ExponentialMovingAverage(self.model.contrastive_features.parameters(), 42 | decay=self.ema_decay if self.ema_decay is not None else 0.95) 43 | 44 | for i in range(0, epochs): 45 | self.train_iterations(dataloader, 1000, epoch=i+1) 46 | if self.opt.slow_center: 47 | self.update_sam_centers(dataloader) 48 | self.epoch += 1 49 | 50 | if self.use_tensorboardX and self.local_rank == 0: 51 | self.writer.close() 52 | 53 | def update_sam_centers(self, dataloader): 54 | dataset = dataloader._data 55 | self.model.eval() 56 | 57 | bar = tqdm(dataset.indices, desc="Updating SAM centers") 58 | with torch.inference_mode(): 59 | for image_index in bar: 60 | data = dataset._next_update(image_index) 61 | rays_o = torch.tensor(data['rays_o']).to(self.device) # [B, 3] 62 | rays_d = torch.tensor(data['rays_d']).to(self.device) # [B, 3] 63 | direction_norms = torch.tensor(data['direction_norms']).to(self.device) # [B, 1] 64 | num_masks = data['num_masks'] 65 | sample_mask_size = data['sample_mask_size'] 66 | 67 | outputs = self.model.render(rays_o, 68 | rays_d, 69 | direction_norms, 70 | staged=False, 71 | bg_color=None, 72 | perturb=True, 73 | contrastive_ema=self.con_ema, 74 | **vars(self.opt)) 75 | contrastive_features = outputs['contrastive_features'] 76 | contrastive_features = contrastive_features.reshape(num_masks, sample_mask_size, -1) 77 | sam_center = torch.mean(contrastive_features, dim=1) 78 | dataset.update_sam_centers(image_index, sam_center.cpu().numpy()) 79 | 80 | def train_iterations(self, dataloader, iterations, epoch): 81 | self.model.train() 82 | if self.model.cuda_ray: 83 | self.model.mark_untrained_grid(dataloader._data.poses, 84 | dataloader._data.intrinsics) 85 | iterator = 
iter(dataloader) 86 | bar = tqdm(range(iterations), desc=f"[Epoch {epoch}] Loss: N/A") 87 | for _ in bar: 88 | data = next(iterator) 89 | self.global_step += 1 90 | for opt in self.optimizers: 91 | opt.zero_grad() 92 | with torch.cuda.amp.autocast(enabled=self.fp16): 93 | _, _, loss = self.train_step(data) 94 | if self.use_tensorboardX: 95 | self.writer.add_scalar("train/loss", loss.item(), 96 | self.global_step) 97 | self.scaler.scale(loss).backward() 98 | for opt in self.optimizers: 99 | self.scaler.step(opt) 100 | self.scaler.update() 101 | self.con_ema.update() 102 | bar.set_description(f"[Epoch {epoch}] Loss: {loss:.04f}") 103 | if self.ema is not None: 104 | self.ema.update() 105 | self._step_scheduler(loss) 106 | 107 | def compute_contrastive_loss(self, 108 | features, 109 | sam_sampling=True, 110 | anchor_indices=None, 111 | positive_indices=None, 112 | negative_indices=None, 113 | sam_centers=None, 114 | sam_labels=None, 115 | batch_size=None, 116 | chunk_size=None): 117 | if sam_sampling: 118 | assert anchor_indices is not None 119 | assert positive_indices is not None 120 | assert negative_indices is not None 121 | 122 | if not hasattr(self, 'con_loss_fn'): 123 | self.con_loss_fn = torch.nn.CrossEntropyLoss() 124 | 125 | # loss_all = 0 126 | anchor_features = features[anchor_indices] 127 | positive_features = features[positive_indices].detach() 128 | negative_features = features[negative_indices].detach() 129 | 130 | logits_pos = F.cosine_similarity(anchor_features, positive_features, dim=-1) 131 | logits_neg = F.cosine_similarity(anchor_features[:, None, :], negative_features, dim=-1) 132 | logits = torch.cat((logits_pos[:, None], logits_neg), dim=1) 133 | 134 | labels = torch.zeros(anchor_features.shape[0], dtype=torch.int64).to(self.device) 135 | loss_all = self.con_loss_fn(logits/self.opt.contrastive_temperature, labels) 136 | 137 | if sam_centers is not None: 138 | loss_center = F.l1_loss(anchor_features, sam_centers) 139 | loss_center += (1 - F.cosine_similarity(anchor_features, sam_centers, dim=-1)).mean() 140 | loss_all += 0.5 * loss_center 141 | 142 | else: 143 | assert sam_labels is not None 144 | assert batch_size is not None 145 | assert chunk_size is not None 146 | assert features.shape[0] == sam_labels.shape[0] 147 | 148 | loss_all = 0 149 | chunks = batch_size // chunk_size 150 | for chunk in range(chunks): 151 | start = chunk * chunk_size 152 | end = (chunk + 1) * chunk_size 153 | 154 | contrastive_features = features[start: end] 155 | labels = sam_labels[start: end] 156 | 157 | num_features = chunk_size 158 | con_feature_sim_mat = sim_matrix(contrastive_features, contrastive_features) 159 | 160 | loss_contrastive = 0 161 | ONE = torch.tensor(1, device=con_feature_sim_mat.device) 162 | for i in range(SAM_BIT_LEN): 163 | label = torch.bitwise_and(ONE << i, labels).bool() 164 | label_mask = (label.expand(num_features, num_features).transpose(1, 0) 165 | == label.expand(num_features, num_features)) 166 | 167 | self_mask = torch.eye( 168 | con_feature_sim_mat.shape[0], dtype=torch.bool, device=con_feature_sim_mat.device) 169 | 170 | zero_mask = torch.logical_or( 171 | torch.sum(label_mask, dim=1) <= 1, 172 | torch.sum(label_mask, dim=1) >= num_features - 1 173 | ) 174 | zero_mask = zero_mask.expand(num_features, num_features).transpose(1, 0) 175 | 176 | label_mask = label_mask / self.opt.contrastive_temperature 177 | loss = - torch.logsumexp( 178 | con_feature_sim_mat.masked_fill(torch.logical_not( 179 | label_mask), -6e4).masked_fill(self_mask, 
-6e4).masked_fill(zero_mask, 0), 180 | dim=-1 181 | ) + torch.logsumexp( 182 | con_feature_sim_mat.masked_fill(zero_mask, 0), 183 | dim=-1 184 | ) 185 | loss_contrastive += loss.mean() 186 | loss_contrastive = loss_contrastive / SAM_BIT_LEN 187 | loss_all += loss_contrastive 188 | loss_all = loss_all / chunks 189 | return loss_all 190 | 191 | def train_step(self, data): 192 | rays_o = data['rays_o'].to(self.device) # [B, 3] 193 | rays_d = data['rays_d'].to(self.device) # [B, 3] 194 | direction_norms = data['direction_norms'].to(self.device) # [B, 1] 195 | gt_rgb = data['pixels'].to(self.device) # [B, 3] 196 | gt_depth = data['depth'].to(self.device) # [B, 3] 197 | 198 | outputs = self.model.render(rays_o, 199 | rays_d, 200 | direction_norms, 201 | staged=False, 202 | bg_color=None, 203 | perturb=True, 204 | **vars(self.opt)) 205 | 206 | pred_rgb = outputs['image'] 207 | 208 | loss = self.opt.rgb_weight * self.criterion(pred_rgb, gt_rgb).mean() 209 | 210 | pred_depth = outputs['depth'] 211 | has_depth = (gt_depth > DEPTH_EPSILON) 212 | depth_loss = torch.abs(pred_depth[has_depth] - gt_depth[has_depth]) 213 | 214 | loss = loss + self.opt.depth_weight * depth_loss.mean() 215 | 216 | if self.opt.feature_loss: 217 | gt_features = data['features'].to(self.device) 218 | p_features = outputs['semantic_features'] 219 | loss_feature = F.l1_loss( 220 | p_features[:, :gt_features.shape[1]], gt_features) 221 | loss += self.opt.feature_weight * loss_feature 222 | if self.use_tensorboardX: 223 | self.writer.add_scalar("train/loss_feature", loss_feature.item(), 224 | self.global_step) 225 | 226 | if self.opt.feature_constrastive_learning: 227 | if self.opt.sam_sampling: 228 | anchor_indices = data['anchor_indices'].to(self.device) 229 | positive_indices = data['positive_indices'].to(self.device) 230 | negative_indices = data['negative_indices'].to(self.device) 231 | sam_centers = data['sam_centers'] 232 | if sam_centers is not None: 233 | sam_centers = sam_centers.to(self.device) 234 | loss_contrastive = self.compute_contrastive_loss( 235 | features=outputs['contrastive_features'], 236 | sam_sampling=self.opt.sam_sampling, 237 | anchor_indices=anchor_indices, 238 | positive_indices=positive_indices, 239 | negative_indices=negative_indices, 240 | sam_centers=sam_centers 241 | ) 242 | else: 243 | sam = data['sam'].to(self.device) 244 | chunk_size = data['chunk_size'] 245 | batch_size = len(sam) 246 | loss_contrastive = self.compute_contrastive_loss( 247 | features=outputs['contrastive_features'], 248 | sam_sampling=self.opt.sam_sampling, 249 | sam_labels=sam, 250 | batch_size=batch_size, 251 | chunk_size=chunk_size 252 | ) 253 | 254 | loss += self.opt.contrastive_weight * loss_contrastive 255 | 256 | if self.use_tensorboardX: 257 | self.writer.add_scalar("train/loss_contrastive", loss_contrastive.item(), 258 | self.global_step) 259 | 260 | return pred_rgb, gt_rgb, loss 261 | 262 | def compute_instance_centers(self, dataset): 263 | self.log("Start computing instance centers ...") 264 | with torch.inference_mode(): 265 | with torch.cuda.amp.autocast(enabled=True): 266 | self.model.eval() 267 | instance_features = [] 268 | for i in tqdm(dataset.indices, desc="Processing contrastive features"): 269 | batch = dataset._get_test(i) 270 | # get instance and semantic features 271 | rays_o = torch.tensor(batch['rays_o']).to(self.device) 272 | rays_d = torch.tensor(batch['rays_d']).to(self.device) 273 | direction_norms = torch.tensor(batch['direction_norms']).to(self.device) 274 | outputs = self.model.render(rays_o, 
275 | rays_d, 276 | direction_norms, 277 | staged=True, 278 | perturb=False) 279 | instance_feature = outputs['contrastive_features'].cpu().numpy() 280 | instance_features.append(instance_feature) 281 | instance_features = np.stack(instance_features, axis=0) 282 | 283 | # feature clustering 284 | self.log("Clustering features ...") 285 | num_image, image_height, image_width, feature_dim = instance_features.shape 286 | instance_features = instance_features.reshape(-1, feature_dim) 287 | clust = hdbscan.HDBSCAN(min_cluster_size=100, gen_min_span_tree=True) # cluster size depends on the image size 288 | sample_indices = np.random.permutation(instance_features.shape[0])[:200000] 289 | clust.fit(instance_features[sample_indices, :]) 290 | 291 | exemplar = [np.mean(exemplars, axis=0) for exemplars in clust.exemplars_] 292 | exemplar = np.vstack(exemplar) 293 | self.log(f"Total {len(clust.exemplars_)} instance centers.") 294 | 295 | self.model.set_instance_centers(exemplar) 296 | self.model.set_instance_clusterer(clust) 297 | 298 | def save_instance_centers(self, save_cluster=True): 299 | name = f'{self.name}_ep{self.epoch:04d}_instance_centers' 300 | file_path = f"{self.ckpt_path}/{name}.npy" 301 | np.save(file_path, self.model.instance_centers) 302 | 303 | if save_cluster: 304 | name = f'{self.name}_ep{self.epoch:04d}_cluster' 305 | file_path = f"{self.ckpt_path}/{name}.pkl" 306 | with open(file_path, 'wb') as outp: 307 | pickle.dump(self.model.instance_clusterer, outp, pickle.HIGHEST_PROTOCOL) 308 | 309 | def test_step(self, data): 310 | rays_o = data['rays_o'] # [B, N, 3] 311 | rays_d = data['rays_d'] # [B, N, 3] 312 | direction_norms = data['direction_norms'] # [B, N, 1] 313 | H, W = data['H'], data['W'] 314 | 315 | outputs = self.model.render(rays_o, 316 | rays_d, 317 | direction_norms, 318 | staged=True, 319 | perturb=False, 320 | **vars(self.opt)) 321 | 322 | pred_rgb = outputs['image'].reshape(-1, H, W, 3) 323 | pred_depth = outputs['depth'].reshape(-1, H, W) 324 | pred_semantic = outputs['semantic'] 325 | pred_features = outputs['semantic_features'] 326 | _, _, C = pred_semantic.shape 327 | pred_semantic = pred_semantic.reshape(-1, H, W, C) 328 | 329 | return pred_rgb, pred_depth, pred_semantic, pred_features 330 | 331 | def eval_step(self, data): 332 | rays_o = data['rays_o'].to(self.device) # [B, 3] 333 | rays_d = data['rays_d'].to(self.device) # [B, 3] 334 | direction_norms = data['direction_norms'].to(self.device) # [B, 1] 335 | gt_rgb = data['pixels'].to(self.device) # [B, H, W, 3] 336 | gt_depth = data['depth'].to(self.device) # [B, H, W] 337 | gt_semantic = data['semantic'].to(self.device) # [B, H, W] 338 | H, W, _ = gt_rgb.shape 339 | 340 | outputs = self.model.render(rays_o, 341 | rays_d, 342 | direction_norms, 343 | staged=True, 344 | bg_color=None, 345 | perturb=False, 346 | **vars(self.opt)) 347 | 348 | pred_rgb = outputs['image'].reshape(H, W, 3) 349 | pred_depth = outputs['depth'].reshape(H, W) 350 | pred_semantic = outputs['semantic'] 351 | 352 | loss = self.criterion(pred_rgb, gt_rgb).mean() 353 | has_depth = gt_depth > DEPTH_EPSILON 354 | loss += self.opt.depth_weight * torch.abs(pred_depth[has_depth] - 355 | gt_depth[has_depth]).mean() 356 | 357 | has_semantic = gt_semantic >= 0 358 | if has_semantic.sum().item() > 0: 359 | semantic_loss = F.cross_entropy(pred_semantic[has_semantic, :], 360 | gt_semantic[has_semantic]) 361 | loss += self.opt.semantic_weight * semantic_loss 362 | 363 | pred_semantic = pred_semantic.reshape(H, W, pred_semantic.shape[-1]) 364 | 365 | 
return pred_rgb[None], pred_depth[None], pred_semantic[None], gt_rgb[ 366 | None], loss 367 | 368 | def _step_scheduler(self, loss): 369 | if isinstance(self.lr_schedulers[0], 370 | optim.lr_scheduler.ReduceLROnPlateau): 371 | [s.step(loss) for s in self.lr_schedulers] 372 | else: 373 | [s.step() for s in self.lr_schedulers] 374 | 375 | 376 | class InteractiveTrainer(SimpleTrainer): 377 | 378 | def __init__(self, *args, **kwargs): 379 | lr_scheduler = kwargs['lr_scheduler'] 380 | kwargs['lr_scheduler'] = None 381 | super().__init__(*args, **kwargs) 382 | self.loader = None 383 | self.lr_scheduler = lr_scheduler(self.optimizer) 384 | 385 | def init(self, loader): 386 | self.model.train() 387 | self.iterator = iter(loader) 388 | self.step = 0 389 | self.model.mark_untrained_grid(loader._data.poses, 390 | loader._data.intrinsics) 391 | 392 | def train(self, loader): 393 | while True: 394 | self.model.train() 395 | self.train_one_epoch(loader) 396 | 397 | def train_one_epoch(self, loader): 398 | iterator = iter(loader) 399 | bar = tqdm(range(1000), desc="Loss: N/A") 400 | for _ in bar: 401 | data = next(iterator) 402 | self.optimizer.zero_grad() 403 | with torch.cuda.amp.autocast(enabled=self.fp16): 404 | _, _, loss = self.train_step(data) 405 | self.scaler.scale(loss).backward() 406 | self.scaler.step(self.optimizer) 407 | self.scaler.update() 408 | bar.set_description(f"Loss: {loss:.04f}") 409 | if self.ema is not None: 410 | self.ema.update() 411 | self._step_scheduler(loss) 412 | 413 | def take_step(self): 414 | data = next(self.iterator) 415 | self.optimizer.zero_grad() 416 | 417 | with torch.cuda.amp.autocast(enabled=self.fp16): 418 | _, _, loss = self.train_step(data) 419 | 420 | self.scaler.scale(loss).backward() 421 | self.scaler.step(self.optimizer) 422 | self.scaler.update() 423 | 424 | self.step += 1 425 | if self.step % 100 == 0: 426 | self.ema.update() 427 | self._step_scheduler(loss) 428 | return loss 429 | 430 | def dataset_updated(self, loader): 431 | self.loader = loader 432 | --------------------------------------------------------------------------------
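
A short, self-contained sketch (not a file from this repository) of the anchor/positive/negative contrastive objective that SimpleTrainer.compute_contrastive_loss applies when SAM-based sampling is enabled: cosine similarities between each anchor and its positive and negative features are stacked into logits, scaled by a temperature, and scored with cross-entropy against class 0, the positive. The trainer additionally detaches the positive and negative features before computing the similarities. Tensor shapes, the temperature value, and the function name sam_contrastive_loss are illustrative assumptions.

import torch
import torch.nn.functional as F

def sam_contrastive_loss(anchor, positive, negatives, temperature=0.1):
    # anchor, positive: [N, D]; negatives: [N, K, D] (shapes assumed for illustration).
    logits_pos = F.cosine_similarity(anchor, positive, dim=-1)               # [N]
    logits_neg = F.cosine_similarity(anchor[:, None, :], negatives, dim=-1)  # [N, K]
    logits = torch.cat((logits_pos[:, None], logits_neg), dim=1)             # [N, 1 + K]
    # The positive similarity sits in column 0, so the target class is 0 for every row.
    labels = torch.zeros(anchor.shape[0], dtype=torch.int64, device=anchor.device)
    return F.cross_entropy(logits / temperature, labels)

# Example with 8 anchors, 16 negatives each, and 32-dimensional features.
anchor = torch.randn(8, 32)
positive = anchor + 0.01 * torch.randn(8, 32)
negatives = torch.randn(8, 16, 32)
print(sam_contrastive_loss(anchor, positive, negatives).item())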
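
A similarly self-contained sketch of the instance-center extraction performed by SimpleTrainer.compute_instance_centers: rendered contrastive features are subsampled, clustered with HDBSCAN, and the mean of each cluster's exemplars is kept as one instance center. The synthetic feature data, dimensions, and cluster sizes below are illustrative assumptions; in the trainer the features come from rendering every training view.

import hdbscan
import numpy as np

rng = np.random.default_rng(0)
# Synthetic stand-in for per-pixel contrastive features: five well-separated clusters.
modes = 5.0 * rng.normal(size=(5, 8))
features = np.concatenate(
    [m + 0.1 * rng.normal(size=(2000, 8)) for m in modes]).astype(np.float32)

# Subsample before clustering, as the trainer does for the full feature maps.
sample = features[rng.permutation(len(features))[:5000]]
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, gen_min_span_tree=True)
clusterer.fit(sample)

# One center per cluster: the mean of that cluster's exemplar points.
instance_centers = np.vstack(
    [np.mean(exemplars, axis=0) for exemplars in clusterer.exemplars_])
print(f"{len(clusterer.exemplars_)} instance centers of dimension {instance_centers.shape[1]}")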