├── test ├── __init__.py └── test_sampling.py ├── autolabel ├── __init__.py ├── features │ ├── __init__.py │ ├── dino.py │ ├── fcn50.py │ └── lseg.py ├── constants.py ├── visualization.py ├── utils │ ├── feature_utils.py │ └── __init__.py ├── model_utils.py ├── models.py └── trainer.py ├── setup.py ├── assets └── teaser.jpg ├── .gitmodules ├── .gitignore ├── setup.cfg ├── .yapf.vim ├── LICENSE ├── configs ├── scannet_mapping.json └── label_map.csv ├── scripts ├── compute_sam_mask.py ├── data │ ├── convert_scanner.py │ ├── convert_arkitscenes.py │ ├── convert_replica.py │ ├── convert_hypersim.py │ └── convert_scannet.py ├── language │ ├── pointcloud.py │ └── evaluate.py ├── evaluate.py ├── export.py ├── train.py ├── compute_feature_maps.py ├── render.py ├── convert_to_instant_ngp.py ├── demo_ui.py └── mapping.py ├── docs ├── data.md └── vision-language.md └── README.md /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autolabel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /assets/teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-asl/pvlff/HEAD/assets/teaser.jpg -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "torch_ngp"] 2 | path = torch_ngp 3 | url = git@github.com:ethz-asl/torch-ngp.git 4 | -------------------------------------------------------------------------------- /autolabel/features/__init__.py: -------------------------------------------------------------------------------- 1 | from autolabel.features.fcn50 import FCN50 2 | from autolabel.features.dino import Dino 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/**/* 2 | __pycache__ 3 | .DS_Store 4 | *.egg-info 5 | Hierarchical-Localization/ 6 | ops/maplab/maplab 7 | out*/ -------------------------------------------------------------------------------- /autolabel/constants.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import cm 3 | 4 | colors = (cm.tab10(np.linspace(0, 1, 10)) * 255.0)[:, :3].astype(np.uint8) 5 | COLORS = np.concatenate([colors, colors, colors, colors], axis=0) 6 | -------------------------------------------------------------------------------- /autolabel/visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import cm 3 | 4 | 5 | def visualize_depth(nparray, maxdepth=10.0): 6 | """ 7 | Takes metric scale np.array and returns a colormapped np.array with type np.uint8. 
8 | """ 9 | normalized_depth = 1.0 - np.clip(nparray, 0.0, maxdepth) / maxdepth 10 | return (cm.inferno(normalized_depth) * 255).astype(np.uint8) 11 | -------------------------------------------------------------------------------- /autolabel/utils/feature_utils.py: -------------------------------------------------------------------------------- 1 | def get_feature_extractor(features, checkpoint=None): 2 | if features == 'fcn50': 3 | from autolabel.features import FCN50 4 | return FCN50() 5 | elif features == 'dino': 6 | from autolabel.features import Dino 7 | return Dino() 8 | elif features == 'lseg': 9 | from autolabel.features import lseg 10 | return lseg.LSegFE(checkpoint) 11 | else: 12 | raise NotImplementedError() 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = autolabel 3 | version = 0.1 4 | description = A project for inferring the structure and semantics of scenes. 5 | license = MIT License 6 | 7 | [options] 8 | packages = autolabel 9 | install_requires = 10 | rich 11 | numpy 12 | pillow 13 | tqdm 14 | tensorboardX 15 | opencv-python 16 | ; open3d 17 | torch 18 | torch_ngp 19 | trimesh 20 | matplotlib 21 | scipy 22 | ; PyQt6 23 | numba 24 | scikit-video 25 | scikit-image 26 | 27 | [yapf] 28 | based_on_style = google 29 | indent_width = 4 30 | column_limit = 80 31 | -------------------------------------------------------------------------------- /.yapf.vim: -------------------------------------------------------------------------------- 1 | function! yapf#YAPF() range 2 | " Determine range to format. 3 | let l:cmd = 'yapf' 4 | 5 | " Call YAPF with the current buffer 6 | let l:formatted_text = systemlist(l:cmd, join(getline(1, '$'), "\n") . "\n") 7 | 8 | if v:shell_error 9 | echohl ErrorMsg 10 | echomsg printf('"%s" returned error: %s', l:cmd, l:formatted_text[-1]) 11 | echohl None 12 | return 13 | endif 14 | 15 | " Update the buffer. 16 | execute '1,' . string(line('$')) . 'delete' 17 | call setline(1, l:formatted_text) 18 | 19 | " Reset cursor to first line of the formatted range. 
20 | call cursor(a:firstline, 1) 21 | endfunction 22 | -------------------------------------------------------------------------------- /autolabel/features/dino.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | from torchvision.io.image import read_image 4 | from torchvision.transforms.functional import to_pil_image 5 | from torchvision.models import feature_extraction 6 | from torch.nn import functional as F 7 | 8 | 9 | class Dino: 10 | 11 | def __init__(self): 12 | self.normalize = normalize = transforms.Normalize( 13 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]).cuda() 14 | self.model = torch.hub.load('facebookresearch/dino:main', 15 | 'dino_vits8').half().cuda() 16 | self.model.eval() 17 | 18 | def shape(self, *args): 19 | return (90, 120) 20 | 21 | def __call__(self, x): 22 | B, C, H, W = x.shape 23 | x = self.normalize(x) 24 | x = self.model.get_intermediate_layers(x.half()) 25 | width_out = W // 8 26 | height_out = H // 8 27 | return x[0][:, 1:, :].reshape(B, height_out, width_out, 384).detach() 28 | -------------------------------------------------------------------------------- /autolabel/features/fcn50.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | from torchvision.models.segmentation import fcn_resnet50 4 | from torchvision.transforms.functional import to_pil_image 5 | from torchvision.models import feature_extraction 6 | from torch.nn import functional as F 7 | 8 | 9 | class FCN50: 10 | 11 | def __init__(self): 12 | self.model = fcn_resnet50(pretrained=True) 13 | self.model.eval() 14 | self.model = self.model.cuda() 15 | self.extractor = feature_extraction.create_feature_extractor( 16 | self.model, return_nodes={'classifier.2': 'features'}) 17 | self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 18 | std=[0.229, 0.224, 0.225]).cuda() 19 | 20 | @property 21 | def shape(self): 22 | return (90, 120) 23 | 24 | def __call__(self, x): 25 | batch = self.normalize(x) 26 | out = self.extractor(batch) 27 | 28 | return out['features'].detach().cpu().half().numpy().transpose( 29 | [0, 2, 3, 1]) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Autonomous Systems Lab, ETH Zurich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/scannet_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "remap": { 3 | "18": 1, 4 | "35": 1, 5 | "45": 1, 6 | "102": 1, 7 | "132": 1, 8 | "197": 1, 9 | "413": 1, 10 | "414": 1, 11 | "450": 1, 12 | "563": 1, 13 | "572": 1, 14 | "577": 1, 15 | "16": 7, 16 | "37": 7, 17 | "44": 7, 18 | "54": 7, 19 | "172": 7, 20 | "231": 7, 21 | "232": 7, 22 | "341": 7, 23 | "392": 7, 24 | "594": 7, 25 | "32": 15, 26 | "34": 15, 27 | "47": 15, 28 | "75": 15, 29 | "119": 15, 30 | "177": 15, 31 | "359": 15, 32 | "372": 15, 33 | "427": 15, 34 | "496": 15, 35 | "513": 15, 36 | "13": 4, 37 | "76": 4, 38 | "87": 4, 39 | "125": 4, 40 | "128": 4, 41 | "175": 4, 42 | "282": 4, 43 | "420": 4, 44 | "509": 4, 45 | "555": 4, 46 | "561": 4, 47 | "39": 22, 48 | "296": 22, 49 | "329": 22, 50 | "331": 22, 51 | "369": 22, 52 | "389": 22, 53 | "411": 22, 54 | "560": 22, 55 | "604": 22 56 | }, 57 | "prompts": { 58 | "1": "chair; stool; office chair; armchair", 59 | "4": "door; sliding door; doorframe", 60 | "7": "table; desk; coffee table; nightstand; dining table; side table", 61 | "15": "cabinet; kitchen cabinet; file cabinet; cabinet door; bathroom cabinet", 62 | "21": "sink; bathroom sink; kitchen sink", 63 | "22": "backpack; bag" 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /test/test_sampling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from autolabel.dataset import IndexSampler 4 | 5 | 6 | class SamplingTestCase(unittest.TestCase): 7 | 8 | def test_find_classes(self): 9 | semantics = np.zeros((2, 10), int) 10 | sampler = IndexSampler() 11 | sampler.update(semantics) 12 | self.assertFalse(sampler.has_semantics) 13 | 14 | self.assertEqual(len(sampler.classes), 0) 15 | semantics[0, 5] = 1 16 | semantics[0, 6] = 2 17 | sampler.update(semantics) 18 | self.assertEqual(len(sampler.classes), 2) 19 | self.assertEqual(sampler.classes[0], 1) 20 | self.assertEqual(sampler.classes[1], 2) 21 | 22 | def test_sampling(self): 23 | semantics = np.zeros((2, 10), int) 24 | semantics[0, 5] = 1 25 | semantics[0, 0] = 2 26 | semantics[1, 5] = 3 27 | sampler = IndexSampler() 28 | sampler.update(semantics) 29 | random_class = sampler.sample_class() 30 | self.assertIn(random_class, [1, 2, 3]) 31 | 32 | random_image, random_index = sampler.sample(1, 1) 33 | self.assertEqual(random_image, 0) 34 | self.assertEqual(random_index[0], 5) 35 | random_image, random_index = sampler.sample(2, 1) 36 | self.assertEqual(random_image, 0) 37 | self.assertEqual(random_index[0], 0) 38 | 39 | random_image, random_indices = sampler.sample(3, 5) 40 | self.assertEqual(random_image, 1) 41 | self.assertEqual(len(random_indices), 5) 42 | self.assertEqual(np.random.choice(random_indices), 5) 43 | self.assertTrue(sampler.has_semantics) 44 | 45 | def test_semantic_indices(self): 46 | semantics = np.zeros((5, 10), int) 47 | semantics[0, 5] = 1 48 | semantics[2, 0] = 2 49 | semantics[4, 5] = 3 50 | sampler = IndexSampler() 51 | sampler.update(semantics) 52 | indices = sampler.semantic_indices() 53 | 
self.assertEqual(len(indices), 3) 54 | self.assertIn(0, indices) 55 | self.assertIn(2, indices) 56 | self.assertIn(4, indices) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /configs/label_map.csv: -------------------------------------------------------------------------------- 1 | ,id,prompt,evaluated,type 2 | 0,1,backpack,0,-1 3 | 1,2,base-cabinet,0,-1 4 | 2,3,basket,0,-1 5 | 3,4,bathtub,0,-1 6 | 4,5,beam,0,-1 7 | 5,6,beanbag,0,-1 8 | 6,7,bed,1,1 9 | 7,8,bench,0,-1 10 | 8,9,bike,0,-1 11 | 9,10,bin,0,-1 12 | 10,11,blanket,0,-1 13 | 11,12,blinds,0,-1 14 | 12,13,book,0,-1 15 | 13,14,bottle,0,-1 16 | 14,15,box,0,-1 17 | 15,16,bowl,0,-1 18 | 16,17,camera,0,-1 19 | 17,18,cabinet,1,0 20 | 18,19,candle,0,-1 21 | 19,20,chair,1,1 22 | 20,21,chopping-board,0,-1 23 | 21,22,clock,0,-1 24 | 22,23,cloth,0,-1 25 | 23,24,clothing,0,-1 26 | 24,25,coaster,0,-1 27 | 25,26,comforter,0,-1 28 | 26,27,computer-keyboard,0,-1 29 | 27,28,cup,0,-1 30 | 28,29,cushion,0,-1 31 | 29,30,curtain,1,0 32 | 30,31,ceiling,1,0 33 | 31,32,cooktop,0,-1 34 | 32,33,countertop,1,0 35 | 33,34,desk,0,-1 36 | 34,35,desk-organizer,0,-1 37 | 35,36,desktop-computer,0,-1 38 | 36,37,door,1,0 39 | 37,38,exercise-ball,0,-1 40 | 38,39,faucet,0,-1 41 | 39,40,floor,1,0 42 | 40,41,handbag,0,-1 43 | 41,42,hair-dryer,0,-1 44 | 42,43,handrail,0,-1 45 | 43,44,indoor-plant,0,-1 46 | 44,45,knife-block,0,-1 47 | 45,46,kitchen-utensil,0,-1 48 | 46,47,lamp,1,0 49 | 47,48,laptop,0,-1 50 | 48,49,major-appliance,0,-1 51 | 49,50,mat,0,-1 52 | 50,51,microwave,0,-1 53 | 51,52,monitor,0,-1 54 | 52,53,mouse,0,-1 55 | 53,54,nightstand,0,-1 56 | 54,55,pan,0,-1 57 | 55,56,panel,0,-1 58 | 56,57,paper-towel,0,-1 59 | 57,58,phone,0,-1 60 | 58,59,picture,0,-1 61 | 59,60,pillar,0,-1 62 | 60,61,pillow,0,-1 63 | 61,62,pipe,0,-1 64 | 62,63,plant-stand,0,-1 65 | 63,64,plate,0,-1 66 | 64,65,pot,0,-1 67 | 65,66,rack,0,-1 68 | 66,67,refrigerator,0,-1 69 | 67,68,remote-control,0,-1 70 | 68,69,scarf,0,-1 71 | 69,70,sculpture,0,-1 72 | 70,71,shelf,1,0 73 | 71,72,shoe,0,-1 74 | 72,73,shower-stall,0,-1 75 | 73,74,sink,1,1 76 | 74,75,small-appliance,0,-1 77 | 75,76,sofa,1,1 78 | 76,77,stair,0,-1 79 | 77,78,stool,0,-1 80 | 78,79,switch,0,-1 81 | 79,80,table,1,0 82 | 80,81,table-runner,0,-1 83 | 81,82,tablet,0,-1 84 | 82,83,tissue-paper,0,-1 85 | 83,84,toilet,1,1 86 | 84,85,toothbrush,0,-1 87 | 85,86,towel,0,-1 88 | 86,87,tv-screen,1,1 89 | 87,88,tv-stand,0,-1 90 | 88,89,umbrella,0,-1 91 | 89,90,utensil-holder,0,-1 92 | 90,91,vase,0,-1 93 | 91,92,vent,0,-1 94 | 92,93,wall,1,0 95 | 93,94,wall-cabinet,0,-1 96 | 94,95,wall-plug,0,-1 97 | 95,96,wardrobe,0,-1 98 | 96,97,window,1,0 99 | 97,98,rug,0,-1 100 | 98,99,logo,0,-1 101 | 99,100,bag,1,1 102 | 100,101,set-of-clothing,0,-1 103 | -------------------------------------------------------------------------------- /autolabel/features/lseg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import clip 3 | from torch.nn import functional as F 4 | from modules.lseg_module import LSegModule 5 | from additional_utils.models import LSeg_MultiEvalModule 6 | from torchvision import transforms 7 | 8 | 9 | class LSegFE: 10 | 11 | def __init__(self, checkpoint): 12 | module = LSegModule.load_from_checkpoint(checkpoint_path=checkpoint, 13 | backbone='clip_vitl16_384', 14 | data_path=None, 15 | num_features=256, 16 | batch_size=1, 17 | base_lr=1e-3, 18 | max_epochs=100, 19 | augment=False, 20 | 
aux=True, 21 | aux_weight=0, 22 | ignore_index=255, 23 | dataset='ade20k', 24 | se_loss=False, 25 | se_weight=0, 26 | arch_option=0, 27 | block_depth=0, 28 | activation='lrelu') 29 | # Skip totensor operation. 30 | self.transform = transforms.Compose(module.val_transform.transforms[1:]) 31 | net = module.net.cuda() 32 | scales = [1.0] 33 | self.evaluator = LSeg_MultiEvalModule(module, scales=scales, 34 | flip=False).half().cuda().eval() 35 | self.text_encoder = module.net.clip_pretrained.to(torch.float32).cuda() 36 | 37 | def shape(self, input_shape): 38 | return (input_shape[0] // 2, input_shape[1] // 2) 39 | 40 | def encode_text(self, text): 41 | """ 42 | text: list of N strings to encode 43 | returns: torch tensor size N x 512 44 | """ 45 | with torch.inference_mode(): 46 | tokenized = clip.tokenize(text).cuda() 47 | features = [] 48 | for item in tokenized: 49 | f = self.text_encoder.encode_text(item[None])[0] 50 | features.append(f) 51 | features = torch.stack(features, dim=0) 52 | return features / torch.norm(features, dim=-1, keepdim=True) 53 | 54 | def __call__(self, x): 55 | x = self.transform(x) 56 | _, _, H, W = x.shape 57 | # Return half size features 58 | H_out, W_out = H // 2, W // 2 59 | out = [] 60 | x = [F.interpolate(image[None], [H_out, W_out]) for image in x] 61 | for image in x: 62 | out.append(self.evaluator.compute_features(image.half())) 63 | 64 | out = torch.cat(out, dim=0) 65 | 66 | return out.permute(0, 2, 3, 1) 67 | -------------------------------------------------------------------------------- /scripts/compute_sam_mask.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["OPENCV_IO_ENABLE_OPENEXR"]="1" 3 | import argparse 4 | import cv2 5 | from pathlib import Path 6 | import numpy as np 7 | from tqdm import tqdm 8 | from segment_anything import sam_model_registry, SamAutomaticMaskGenerator 9 | 10 | 11 | def read_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('scene') 14 | parser.add_argument('--sam-vit-h-checkpoint', type=str, required=True) 15 | parser.add_argument('--prefer-union-mask', action='store_true') 16 | return parser.parse_args() 17 | 18 | def _iou(image_vector1, image_vector2): 19 | intersection = np.logical_and(image_vector1, image_vector2).sum() 20 | union = np.logical_or(image_vector1, image_vector2).sum() 21 | iou = intersection / union 22 | return iou, intersection, union 23 | 24 | def generate_float32_mask(masks, prefer_union_mask=True): 25 | indices = [] 26 | for i in np.random.permutation(list(range(len(masks)))): 27 | if len(indices) >= 32: 28 | break 29 | 30 | overlapped = False 31 | for j, ind in enumerate(indices): 32 | iou, intersection, union = _iou(masks[i]['segmentation'].reshape(-1), masks[ind]['segmentation'].reshape(-1)) 33 | if iou > 0.8: 34 | overlapped = True 35 | break 36 | if prefer_union_mask: 37 | if intersection / masks[ind]['area'] > 0.8: 38 | indices[j] = i 39 | overlapped = True 40 | break 41 | elif intersection / masks[i]['area'] > 0.8: 42 | overlapped = True 43 | break 44 | if not overlapped: 45 | indices.append(i) 46 | 47 | one = 1 48 | uint32_mask = np.zeros_like(masks[0]['segmentation'], dtype=np.uint32) 49 | for i, ind in enumerate(indices): 50 | mask = masks[ind] 51 | number = (one << i) 52 | uint32_mask += (number * mask['segmentation']).astype(np.uint32) 53 | return uint32_mask.view(np.float32) 54 | 55 | def main(): 56 | flags = read_args() 57 | sam_checkpoint = flags.sam_vit_h_checkpoint 58 | model_type = "vit_h" 59 | 
device = "cuda"
60 | 
61 |     scene_folder = Path(flags.scene)
62 | 
63 |     image_folder = scene_folder / "rgb"
64 |     output_folder = scene_folder / "sam_mask"
65 |     output_folder.mkdir(parents=True, exist_ok=True)
66 | 
67 |     sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
68 |     sam.to(device=device)
69 |     mask_generator = SamAutomaticMaskGenerator(sam)
70 | 
71 |     image_files = os.listdir(image_folder)
72 |     image_files.sort()
73 | 
74 |     for image_file in tqdm(image_files):
75 |         image = cv2.imread(
76 |             os.path.join(image_folder, image_file)
77 |         )
78 |         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
79 |         masks = mask_generator.generate(image)
80 |         masks.sort(key=lambda x: x['area'], reverse=True)
81 |         masks = [mask for mask in masks if mask['area'] > 100]
82 | 
83 |         sam_mask = generate_float32_mask(masks, prefer_union_mask=flags.prefer_union_mask)
84 |         cv2.imwrite(
85 |             os.path.join(output_folder, os.path.splitext(image_file)[0] + '.exr'),
86 |             sam_mask
87 |         )
88 | 
89 | if __name__ == "__main__":
90 |     main()
91 | 
--------------------------------------------------------------------------------
/docs/data.md:
--------------------------------------------------------------------------------
1 | 
2 | # Importing data
3 | 
4 | ## Capturing your own datasets
5 | 
6 | If you have a LiDAR-enabled iOS device, you can use the [Stray Scanner](https://apps.apple.com/us/app/stray-scanner/id1557051662) app to record data. The script at `scripts/data/convert_scanner.py` lets you convert a scene recorded with the app into the Autolabel scene format. You can then run the `mapping.py` script to run structure from motion and compute the other outputs.
7 | 
8 | After capturing scenes and moving them over to your computer, convert them to the Autolabel format with:
9 | ```
10 | python scripts/data/convert_scanner.py <path-to-scanner-scene> --out scenes/scene_name/
11 | 
12 | # Compute camera poses.
13 | python scripts/mapping.py scenes/scene_name/
14 | ```
15 | 
16 | ## Importing ARKitScenes scenes
17 | 
18 | Here are the steps required to download and import scenes from the ARKitScenes dataset.
19 | 
20 | ```
21 | # Clone ARKitScenes repository
22 | git clone https://github.com/apple/ARKitScenes.git arkit-scenes
23 | cd arkit-scenes
24 | 
25 | # Create directories for the raw and converted scenes
26 | mkdir -p scenes/raw/ && mkdir -p scenes/converted
27 | 
28 | # Download the required parts of the dataset
29 | # For now, we only download the low resolution RGB images (256x192), but higher
30 | # resolution frames could be used.
31 | python download_data.py raw --split Training \
32 |     --video_id_csv depth_upsampling/upsampling_train_val_splits.csv \
33 |     --download_dir scenes/raw \
34 |     --raw_dataset_assets lowres_wide lowres_depth lowres_wide.traj confidence lowres_wide_intrinsics
35 | 
36 | # Convert the ARKitScenes scenes to the format used by Autolabel
37 | python scripts/data/convert_arkitscenes.py scenes/raw/ --out scenes/converted/
38 | ```
39 | 
40 | ## Importing Replica renders
41 | 
42 | We have written data conversion scripts for different publicly available datasets.
43 | 
44 | Renders from the [Replica](https://github.com/facebookresearch/Replica-Dataset) dataset published by [SemanticNeRF](https://github.com/Harry-Zhi/semantic_nerf/) can be converted using the `scripts/data/convert_replica.py` script.
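The exact flags of `convert_replica.py` are not documented here; assuming it follows the same pattern as the other converters in `scripts/data/` (a positional input directory plus `--out`), an invocation would look roughly like the sketch below. Check `python scripts/data/convert_replica.py --help` for the actual arguments.
```
# Hypothetical example: the input path and flags are assumptions, not taken from the docs above.
python scripts/data/convert_replica.py <replica-render-dir> --out scenes/converted/
```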
45 | 
46 | ## Importing HyperSim scenes
47 | 
48 | Download the scenes from [HyperSim](https://github.com/apple/ml-hypersim) using their official [download script](https://github.com/apple/ml-hypersim/blob/main/code/python/tools/dataset_download_images.py) (you can specify which scenes you want to download in the script).
49 | 
50 | Download the semantic label file [here](https://github.com/apple/ml-hypersim/blob/main/code/cpp/tools/scene_annotation_tool/semantic_label_descs.csv).
51 | 
52 | Download the file of camera parameters [here](https://github.com/apple/ml-hypersim/blob/main/contrib/mikeroberts3000/metadata_camera_parameters.csv).
53 | 
54 | 
55 | Then, run the conversion script as:
56 | ```
57 | python scripts/data/convert_hypersim.py <hypersim-scene-dir> \
58 |     --out <output-directory> \
59 |     --ori-semantic-labels <semantic-label-file> \
60 |     --camera-parameter-file <camera-parameter-file>
61 | ```
62 | 
63 | 
64 | ## Importing ScanNet scenes
65 | 
66 | ScanNet scenes can be imported with the `scripts/data/convert_scannet.py` script.
67 | 
68 | It is used as:
69 | ```
70 | python scripts/data/convert_scannet.py <scannet-directory> --label-map <label-map-file> --out <output-directory> --stride <N>
71 | ```
72 | 
73 | - `<scannet-directory>` is the path to the raw ScanNet dataset, which contains each scene as a subdirectory. Each scene in turn contains the `*-label-filt.zip`, `*.sens` etc. files in the ScanNet format.
74 | - `--out` specifies the output directory. Each scene will be stored as a subdirectory.
75 | - `--stride` is an integer parameter specifying how many frames to keep. Only every Nth frame is kept in the scan.
76 | 
77 | This will additionally create the `mesh.ply` and `mesh_labels.npy` files used by the vision-language evaluation scripts.
78 | 
79 | 
--------------------------------------------------------------------------------
/scripts/data/convert_scanner.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import numpy as np
4 | import json
5 | import cv2
6 | from skvideo import io
7 | from tqdm import tqdm
8 | 
9 | 
10 | def read_args():
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument('scan', type=str, help="Scan directory")
13 |     parser.add_argument('--out', type=str, help="Output directory")
14 |     parser.add_argument("--rotate",
15 |                         action="store_true",
16 |                         help="Rotate frames 90 degrees")
17 | 
18 |     parser.add_argument("--subsample",
19 |                         type=int,
20 |                         default=1,
21 |                         help="Subsample: use every nth frame from the dataset")
22 |     return parser.parse_args()
23 | 
24 | 
25 | def write_frames(scan_dir, rgb_out_dir, rotate=False, subsample=1):
26 |     rgb_video = os.path.join(scan_dir, 'rgb.mp4')
27 |     video = io.vreader(rgb_video)
28 |     img_idx = 0
29 |     for i, frame in tqdm(enumerate(video), desc="Writing RGB"):
30 |         frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
31 |         if i % subsample != 0:
32 |             continue
33 |         if rotate:
34 |             frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
35 | 
36 |         frame_path = os.path.join(rgb_out_dir, f"{img_idx:05}.jpg")
37 |         img_idx += 1
38 |         params = [int(cv2.IMWRITE_JPEG_QUALITY), 90]
39 |         cv2.imwrite(frame_path, frame, params)
40 | 
41 | 
42 | def write_depth(scan_dir, depth_out_dir, rotate=False, subsample=1):
43 |     depth_dir_in = os.path.join(scan_dir, 'depth')
44 |     confidence_dir = os.path.join(scan_dir, 'confidence')
45 |     files = sorted(os.listdir(depth_dir_in))
46 |     img_idx = 0
47 |     for i, filename in tqdm(enumerate(files), desc="Writing Depth"):
48 |         if '.png' not in filename:
49 |             continue
50 |         number, _ = filename.split('.')
51 | 
52 |         if i % subsample != 0:
53 |             continue
54 | 
55 |         depth = 
cv2.imread(os.path.join(depth_dir_in, filename), -1) 56 | 57 | confidence = cv2.imread(os.path.join(confidence_dir, 58 | number + '.png'))[:, :, 0] 59 | if rotate: 60 | depth = cv2.rotate(depth, cv2.ROTATE_90_CLOCKWISE) 61 | confidence = cv2.rotate(confidence, cv2.ROTATE_90_CLOCKWISE) 62 | 63 | depth[confidence < 2] = 0 64 | cv2.imwrite(os.path.join(depth_out_dir, f"{int(img_idx):05}" + '.png'), 65 | depth) 66 | img_idx += 1 67 | return img_idx 68 | 69 | 70 | def write_intrinsics(scan_dir, out_dir, rotate=False): 71 | intrinsics = np.loadtxt(os.path.join(scan_dir, 'camera_matrix.csv'), 72 | delimiter=',') 73 | fx = intrinsics[0, 0] 74 | fy = intrinsics[1, 1] 75 | cx = intrinsics[0, 2] 76 | cy = intrinsics[1, 2] 77 | if rotate: 78 | out_intrinsics = np.array([[fy, 0, cy], [0, fx, cx], [0, 0, 1]]) 79 | else: 80 | out_intrinsics = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]) 81 | np.savetxt(os.path.join(out_dir, 'intrinsics.txt'), out_intrinsics) 82 | 83 | 84 | def main(): 85 | flags = read_args() 86 | rgb_out = os.path.join(flags.out, 'raw_rgb/') 87 | depth_out = os.path.join(flags.out, 'raw_depth/') 88 | os.makedirs(rgb_out, exist_ok=True) 89 | os.makedirs(depth_out, exist_ok=True) 90 | scan_dir = flags.scan 91 | 92 | write_intrinsics(scan_dir, flags.out, rotate=flags.rotate) 93 | write_depth(scan_dir, 94 | depth_out, 95 | rotate=flags.rotate, 96 | subsample=flags.subsample) 97 | write_frames(scan_dir, 98 | rgb_out, 99 | rotate=flags.rotate, 100 | subsample=flags.subsample) 101 | print("Done") 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /autolabel/model_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import glob 4 | import argparse 5 | import pickle 6 | import os 7 | from autolabel.models import ALNetwork 8 | 9 | 10 | def load_checkpoint(model, checkpoint_dir, device='cuda:0'): 11 | checkpoint_list = sorted(glob.glob(f'{checkpoint_dir}/*.pth')) 12 | best = [c for c in checkpoint_list if 'best.pth' in c] 13 | if len(best) != 0: 14 | checkpoint = best[0] 15 | else: 16 | checkpoint = checkpoint_list[-1] 17 | checkpoint_dict = torch.load(checkpoint, map_location=device) 18 | model.load_state_dict(checkpoint_dict['model']) 19 | 20 | instance_centers_list = sorted(glob.glob(f'{checkpoint_dir}/*.npy')) 21 | if len(instance_centers_list) > 0: 22 | model.set_instance_centers( 23 | np.load(instance_centers_list[-1]) 24 | ) 25 | instance_cluster_list = sorted(glob.glob(f'{checkpoint_dir}/*.pkl')) 26 | if len(instance_cluster_list) > 0: 27 | with open(instance_cluster_list[-1], 'rb') as inp: 28 | clust = pickle.load(inp) 29 | model.set_instance_clusterer(clust) 30 | return model 31 | 32 | 33 | def model_flag_parser(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('--lr', type=float, default=5e-3) 36 | parser.add_argument('--geometric-features', '-g', type=int, default=15) 37 | parser.add_argument('--encoding', 38 | default='hg+freq', 39 | choices=['freq', 'hg', 'hg+freq'], 40 | type=str, 41 | help="Network positional encoding to use.") 42 | parser.add_argument('--features', 43 | type=str, 44 | default=None, 45 | choices=[None, 'fcn50', 'dino', 'lseg'], 46 | help="Use semantic feature supervision.") 47 | parser.add_argument('--rgb-weight', default=1.0, type=float) 48 | parser.add_argument('--semantic-weight', default=1.0, type=float) 49 | parser.add_argument('--feature-weight', default=0.5, 
type=float) 50 | parser.add_argument('--depth-weight', default=0.1, type=float) 51 | parser.add_argument('--feature-dim', default=64, type=int) 52 | parser.add_argument('--contrastive-weight', default=0.1, type=float) 53 | parser.add_argument('--contrastive-feat-dim', default=8, type=int) 54 | parser.add_argument('--contrastive-temperature', default=0.1, type=float) 55 | return parser 56 | 57 | 58 | def model_hash(flags): 59 | features = 'plain' 60 | if flags.features is not None: 61 | features = flags.features 62 | string = f"g{flags.geometric_features}_{flags.encoding}_{features}" 63 | string += f"_rgb{flags.rgb_weight}_d{flags.depth_weight}_s{flags.semantic_weight}" 64 | string += f"_f{flags.feature_weight}" 65 | string += f"_c{flags.contrastive_weight}" 66 | return string 67 | 68 | 69 | def model_dir(scene_path, flags): 70 | mhash = model_hash(flags) 71 | if flags.workspace is None: 72 | return os.path.join(scene_path, 'nerf', mhash) 73 | scene_name = os.path.basename(os.path.normpath(flags.scene)) 74 | return os.path.join(flags.workspace, scene_name, mhash) 75 | 76 | 77 | def create_model(min_bounds, max_bounds, n_classes, flags, bound_scale=1.25): 78 | bound = np.max([np.abs(min_bounds), np.abs(max_bounds)], axis=0).max() * bound_scale 79 | return ALNetwork(num_layers=2, 80 | num_layers_color=2, 81 | hidden_dim_color=128, 82 | hidden_dim=128, 83 | geo_feat_dim=flags.geometric_features, 84 | encoding=flags.encoding, 85 | bound=float(bound), 86 | hidden_dim_semantic=flags.feature_dim, 87 | contrastive_feat_dim=flags.contrastive_feat_dim, 88 | cuda_ray=False, 89 | density_scale=1, 90 | semantic_classes=n_classes) 91 | 92 | 93 | def read_params(workspace): 94 | with open(os.path.join(workspace, 'params.pkl'), 'rb') as f: 95 | return pickle.load(f) 96 | 97 | 98 | def write_params(workspace, flags): 99 | os.makedirs(workspace, exist_ok=True) 100 | with open(os.path.join(workspace, 'params.pkl'), 'wb') as f: 101 | pickle.dump(flags, f) 102 | 103 | 104 | def get_nerf_dir(scene, flags): 105 | scene_name = os.path.basename(os.path.normpath(scene)) 106 | if flags.workspace is None: 107 | return os.path.join(scene, 'nerf') 108 | else: 109 | return os.path.join(flags.workspace, scene_name) 110 | -------------------------------------------------------------------------------- /scripts/language/pointcloud.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import json 5 | import pandas 6 | import torch 7 | from tqdm import tqdm 8 | import open3d as o3d 9 | from autolabel.dataset import SceneDataset, LenDataset 10 | from autolabel import utils, model_utils 11 | from autolabel.utils.feature_utils import get_feature_extractor 12 | 13 | 14 | def read_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('scene') 17 | parser.add_argument('--batch-size', default=8182, type=int) 18 | parser.add_argument('--workspace', type=str, default=None) 19 | parser.add_argument('--out', 20 | type=str, 21 | help="Resulting pointcloud path.", 22 | required=True) 23 | parser.add_argument('--feature-checkpoint', '-f', type=str, required=True) 24 | parser.add_argument( 25 | '--stride', 26 | type=int, 27 | default=1, 28 | help="Only evaluate every Nth frame to save time or for debugging.") 29 | parser.add_argument('--vis', action='store_true') 30 | parser.add_argument('--features', type=str, default='lseg') 31 | parser.add_argument('--heatmap', 32 | type=str, 33 | help="Prompt for generating heatmap.") 34 | 
return parser.parse_args() 35 | 36 | 37 | def get_nerf_dir(scene, flags): 38 | scene_name = os.path.basename(os.path.normpath(scene)) 39 | if flags.workspace is None: 40 | return os.path.join(scene, 'nerf') 41 | else: 42 | return os.path.join(flags.workspace, scene_name) 43 | 44 | 45 | def get_model(flags, scene_dir): 46 | nerf_dir = get_nerf_dir(scene_dir, flags) 47 | for model in os.listdir(nerf_dir): 48 | checkpoint_dir = os.path.join(nerf_dir, model, 'checkpoints') 49 | if os.path.exists(checkpoint_dir): 50 | return model 51 | 52 | 53 | def render(model, batch, T_CW, dataset, features): 54 | rays_o = torch.tensor(batch['rays_o']).cuda() 55 | rays_d = torch.tensor(batch['rays_d']).cuda() 56 | direction_norms = torch.tensor(batch['direction_norms']).cuda() 57 | depth = torch.tensor(batch['depth']).cuda() 58 | output = model.render(rays_o, 59 | rays_d, 60 | direction_norms, 61 | staged=True, 62 | perturb=False, 63 | num_steps=512, 64 | upsample_steps=0) 65 | variance = output['depth_variance'].cpu().numpy() 66 | cutoff = np.percentile(variance, 50) 67 | mask = variance < cutoff 68 | cm_C = output['coordinates_map'] 69 | H, W, _ = cm_C.shape 70 | cm_C = cm_C.cpu().numpy()[mask] 71 | rgb = output['image'].cpu().numpy()[mask] 72 | return cm_C[:, :3], rgb 73 | 74 | 75 | def main(flags): 76 | scene_name = os.path.basename(os.path.normpath(flags.scene)) 77 | scene = flags.scene 78 | print(f"Evaluating scene {scene_name}") 79 | nerf_dir = get_nerf_dir(scene, flags) 80 | model = get_model(flags, scene) 81 | model_path = os.path.join(nerf_dir, model) 82 | params = model_utils.read_params(model_path) 83 | dataset = SceneDataset('test', 84 | scene, 85 | factor=4.0, 86 | batch_size=flags.batch_size, 87 | lazy=True) 88 | 89 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 90 | 606, params).cuda() 91 | 92 | checkpoint_dir = os.path.join(model_path, 'checkpoints') 93 | if not os.path.exists(checkpoint_dir) or len( 94 | os.listdir(checkpoint_dir)) == 0: 95 | print("Now checkpoint path") 96 | exit() 97 | 98 | model_utils.load_checkpoint(model, checkpoint_dir) 99 | model = model.eval() 100 | feature_extractor = get_feature_extractor(flags.features, 101 | flags.feature_checkpoint) 102 | 103 | points = [] 104 | colors = [] 105 | for frame_index in tqdm(dataset.indices[::flags.stride]): 106 | batch = dataset._get_test(frame_index) 107 | T_CW = dataset.poses[frame_index] 108 | points_W, rgb = render(model, batch, T_CW, dataset, feature_extractor) 109 | points.append(points_W) 110 | colors.append(rgb) 111 | rgb = np.concatenate(colors, axis=0) 112 | p_W = np.concatenate(points, axis=0) 113 | pc = o3d.geometry.PointCloud(o3d.utility.Vector3dVector(p_W)) 114 | pc.colors = o3d.utility.Vector3dVector(rgb) 115 | o3d.io.write_point_cloud(flags.out, pc) 116 | 117 | 118 | if __name__ == "__main__": 119 | with torch.inference_mode(): 120 | with torch.cuda.amp.autocast(enabled=True): 121 | main(read_args()) 122 | -------------------------------------------------------------------------------- /scripts/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import json 5 | from autolabel.evaluation import Evaluator 6 | from autolabel.dataset import SceneDataset, LenDataset 7 | from autolabel import utils, model_utils 8 | 9 | 10 | def read_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('scenes', nargs='+') 13 | parser.add_argument('--batch-size', default=8182, type=int) 14 | 
parser.add_argument('--vis', action='store_true') 15 | parser.add_argument('--workspace', type=str, default=None) 16 | parser.add_argument('--write-images', type=str, default=None) 17 | parser.add_argument('--out', 18 | default=None, 19 | type=str, 20 | help="Where to write results as json, if anywhere.") 21 | return parser.parse_args() 22 | 23 | 24 | def gather_models(flags): 25 | models = set() 26 | for scene in flags.scenes: 27 | nerf_dir = model_utils.get_nerf_dir(scene, flags) 28 | if not os.path.exists(nerf_dir): 29 | continue 30 | for model in os.listdir(nerf_dir): 31 | checkpoint_dir = os.path.join(nerf_dir, model, 'checkpoints') 32 | if os.path.exists(checkpoint_dir): 33 | models.add(model) 34 | return list(models) 35 | 36 | 37 | def write_results(out, results): 38 | with open(out, 'wt') as f: 39 | f.write(json.dumps(results, indent=2)) 40 | 41 | 42 | def main(flags): 43 | models = gather_models(flags) 44 | classes = ["Background", "Class 1"] 45 | scene_names = [os.path.basename(os.path.normpath(p)) for p in flags.scenes] 46 | scenes = [(s, n) for s, n in zip(flags.scenes, scene_names)] 47 | scenes = sorted(scenes, key=lambda x: x[1]) 48 | ious = np.ones((len(scenes), len(models))) * -1. 49 | results = [] 50 | for scene_index, (scene, scene_name) in enumerate(scenes): 51 | print(f"Evaluating scene {scene_name}") 52 | 53 | nerf_dir = model_utils.get_nerf_dir(scene, flags) 54 | 55 | for model_hash in models: 56 | model_path = os.path.join(nerf_dir, model_hash) 57 | if not os.path.exists(model_path): 58 | continue 59 | params = model_utils.read_params(model_path) 60 | dataset = SceneDataset('test', 61 | scene, 62 | factor=4.0, 63 | batch_size=flags.batch_size, 64 | lazy=True) 65 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 66 | model = model_utils.create_model(dataset.min_bounds, 67 | dataset.max_bounds, n_classes, 68 | params).cuda() 69 | model = model.eval() 70 | 71 | checkpoint_dir = os.path.join(model_path, 'checkpoints') 72 | if not os.path.exists(checkpoint_dir) or len( 73 | os.listdir(checkpoint_dir)) == 0: 74 | continue 75 | 76 | model_utils.load_checkpoint(model, checkpoint_dir) 77 | model = model.eval() 78 | 79 | save_figure_dir = None 80 | if flags.write_images is not None: 81 | save_figure_dir = os.path.join(flags.write_images, scene_name) 82 | evaluator = Evaluator(model, 83 | classes, 84 | name=model_hash, 85 | save_figures=save_figure_dir) 86 | model_index = models.index(model_hash) 87 | assert model_index >= 0 88 | result = evaluator.eval(dataset, flags.vis) 89 | 90 | if len(result.values()) == 0: 91 | continue 92 | miou = np.mean([v for v in result.values()]) 93 | assert ious[scene_index, model_index] < 0.0 94 | ious[scene_index, model_index] = miou 95 | result = dict(vars(params)) 96 | result['scene'] = scene_name 97 | result['iou'] = miou 98 | results.append(result) 99 | 100 | if flags.out is not None: 101 | write_results(flags.out, results) 102 | 103 | from rich.table import Table 104 | from rich.console import Console 105 | table = Table() 106 | table.add_column('Scene') 107 | for model in models: 108 | table.add_column(model) 109 | for scene_name, scene_ious in zip(scene_names, ious): 110 | table.add_row(scene_name, *[f"{v:.03f}" for v in scene_ious]) 111 | total_row = ['Total'] + [f"{v:.03f}" for v in ious.mean(axis=0)] 112 | table.add_row(*total_row, end_section=True) 113 | 114 | console = Console() 115 | console.print(table) 116 | 117 | 118 | if __name__ == "__main__": 119 | main(read_args()) 120 | 
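# Usage sketch (not from the original docs; the flags correspond to read_args() above,
# and the angle-bracketed values are placeholders):
#
#   python scripts/evaluate.py <scene1> <scene2> ... \
#       --workspace <workspace> --write-images <image-output-dir> --out <results.json>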
-------------------------------------------------------------------------------- /scripts/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script will export semantic segmentation maps once you are 3 | done with annotating and fitting a scene. 4 | 5 | usage: python scripts/export.py ... --workspace 6 | 7 | params: 8 | workspace: The workspace to lookup trained models from. 9 | Else uses /nerf/. 10 | 11 | Output frames are saved at /output/semantic/ 12 | """ 13 | import cv2 14 | import numpy as np 15 | import os 16 | from skimage import measure 17 | import torch 18 | from tqdm import tqdm 19 | 20 | from autolabel import model_utils 21 | from autolabel.dataset import SceneDataset 22 | from autolabel.utils import Scene 23 | 24 | MAX_WIDTH = 640 25 | 26 | 27 | def read_args(): 28 | parser = model_utils.model_flag_parser() 29 | parser.add_argument('scenes', nargs='+') 30 | parser.add_argument('--workspace', type=str) 31 | parser.add_argument('--objects', 32 | type=int, 33 | default=None, 34 | help=""" 35 | If specified, find the specified number of largest connected components per class in the 36 | produced semantic maps as a post-processing step, removing noise from the segmentation maps. 37 | """) 38 | return parser.parse_args() 39 | 40 | 41 | def lookup_frame_size(scene): 42 | scene = Scene(scene) 43 | width, height = scene.peak_image_size() 44 | if width > MAX_WIDTH: 45 | scale = MAX_WIDTH / width 46 | width *= scale 47 | height *= scale 48 | return (int(np.round(width)), int(np.round(height))) 49 | 50 | 51 | def find_largest_components(p_semantic, class_id, object_count): 52 | p_semantic = p_semantic.copy() 53 | p_semantic[p_semantic != class_id] = 0 54 | labels = measure.label(p_semantic) 55 | counts = np.bincount(labels.flat)[1:] 56 | largest = [] 57 | sorted_counts = np.argsort(counts)[::-1] 58 | for i in range(object_count): 59 | nth_largest_label = sorted_counts[i] + 1 60 | largest.append(labels == nth_largest_label) 61 | return largest 62 | 63 | 64 | def post_process(flags, p_semantic): 65 | out = np.zeros_like(p_semantic) 66 | class_ids = np.unique(p_semantic) 67 | for class_id in class_ids: 68 | if class_id == 0: 69 | # Skip background class. 70 | continue 71 | components = find_largest_components(p_semantic, class_id, 72 | flags.objects) 73 | for component in components: 74 | out[component] = class_id 75 | return out 76 | 77 | 78 | def render_frame(model, batch): 79 | rays_o = torch.tensor(batch['rays_o']).cuda() 80 | rays_d = torch.tensor(batch['rays_d']).cuda() 81 | direction_norms = torch.tensor(batch['direction_norms']).cuda() 82 | depth = torch.tensor(batch['depth']).cuda() 83 | outputs = model.render(rays_o, 84 | rays_d, 85 | direction_norms, 86 | staged=True, 87 | perturb=False, 88 | num_steps=512, 89 | upsample_steps=0) 90 | return outputs['semantic'].argmax(dim=-1).cpu().numpy() 91 | 92 | 93 | def export_labels(flags, scene): 94 | if scene[-1] == os.path.sep: 95 | scene = scene[:-1] 96 | scene_name = os.path.basename(scene) 97 | if flags.workspace is not None: 98 | model_dir = os.path.join(flags.workspace, scene_name) 99 | else: 100 | model_dir = os.path.join(scene, 'nerf') 101 | models = os.listdir(model_dir) 102 | if len(models) > 1: 103 | print( 104 | f"Warning: scene {scene} has more than 1 model directory. Using {models[0]}." 105 | ) 106 | elif len(models) == 0: 107 | print(f"Warning: scene {scene} has no trained models. 
Skipping.") 108 | return 109 | model_dir = os.path.join(model_dir, models[0]) 110 | model_params = model_utils.read_params(model_dir) 111 | 112 | frame_size = lookup_frame_size(scene) 113 | 114 | dataset = SceneDataset('train', 115 | scene, 116 | size=frame_size, 117 | batch_size=16384, 118 | features=model_params.features, 119 | load_semantic=False) 120 | 121 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 122 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 123 | n_classes, model_params).cuda() 124 | model = model.eval() 125 | model_utils.load_checkpoint(model, os.path.join(model_dir, 'checkpoints')) 126 | 127 | output_path = os.path.join(scene, 'output', 'semantic') 128 | os.makedirs(output_path, exist_ok=True) 129 | 130 | with torch.inference_mode(): 131 | with torch.cuda.amp.autocast(enabled=True): 132 | for frame_index, rgb_path in zip(tqdm(dataset.indices), 133 | dataset.scene.rgb_paths()): 134 | batch = dataset._get_test(frame_index) 135 | frame = render_frame(model, batch) 136 | 137 | if flags.objects is not None: 138 | frame = post_process(flags, frame) 139 | 140 | frame_name = os.path.splitext(os.path.basename(rgb_path))[0] 141 | frame_path = os.path.join(output_path, f"{frame_name}.png") 142 | cv2.imwrite(frame_path, frame) 143 | 144 | 145 | def main(): 146 | flags = read_args() 147 | 148 | for scene in flags.scenes: 149 | export_labels(flags, scene) 150 | 151 | 152 | if __name__ == "__main__": 153 | main() 154 | -------------------------------------------------------------------------------- /scripts/train.py: -------------------------------------------------------------------------------- 1 | from argparse import Namespace 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import optim 6 | 7 | from autolabel import model_utils 8 | from autolabel.dataset import SceneDataset, LenDataset 9 | from autolabel.trainer import SimpleTrainer 10 | 11 | 12 | def read_args(): 13 | parser = model_utils.model_flag_parser() 14 | parser.add_argument('scene') 15 | parser.add_argument('--factor-train', type=float, default=2.0) 16 | parser.add_argument('--factor-test', type=float, default=2.0) 17 | parser.add_argument('--batch-size', '-b', type=int, default=4096) 18 | parser.add_argument('--sample-chunk-size', type=int, default=512) 19 | parser.add_argument('--iters', type=int, default=10000) 20 | parser.add_argument('--workers', '-w', type=int, default=1) 21 | parser.add_argument('--eval', action='store_true') 22 | parser.add_argument('--contrastive', action='store_true') 23 | parser.add_argument('--use-semantic', action='store_true') 24 | parser.add_argument('--sam-sampling', 25 | default='proportional', 26 | choices=['proportional', 'uniform', None], 27 | type=str, 28 | help="SAM sampling method.") 29 | parser.add_argument('--slow-center', action='store_true') 30 | parser.add_argument('--cluster-instance-features', action='store_true') 31 | parser.add_argument( 32 | '--workspace', 33 | type=str, 34 | default=None, 35 | help="Save results in this directory instead of the scene directory.") 36 | return parser.parse_args() 37 | 38 | 39 | def main(): 40 | flags = read_args() 41 | 42 | dataset = SceneDataset('train', 43 | flags.scene, 44 | factor=flags.factor_train, 45 | batch_size=flags.batch_size, 46 | sample_chunk_size=flags.sample_chunk_size, 47 | features=flags.features, 48 | load_semantic=flags.use_semantic, 49 | sam_sampling=flags.sam_sampling) 50 | 51 | n_classes = dataset.n_classes if dataset.n_classes is not None else 
2 52 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 53 | n_classes, flags) 54 | 55 | opt = Namespace(rand_pose=-1, 56 | color_space='srgb', 57 | feature_loss=flags.features is not None, 58 | feature_constrastive_learning=flags.contrastive, 59 | rgb_weight=flags.rgb_weight, 60 | depth_weight=flags.depth_weight, 61 | semantic_weight=flags.semantic_weight, 62 | feature_weight=flags.feature_weight, 63 | contrastive_weight=flags.contrastive_weight, 64 | contrastive_temperature=flags.contrastive_temperature, 65 | sam_sampling=flags.sam_sampling is not None, 66 | slow_center=flags.slow_center) 67 | 68 | optimizer = lambda model: torch.optim.Adam([ 69 | { 70 | 'name': 'encoding', 71 | 'params': model.encoder_parameters() 72 | }, 73 | { 74 | 'name': 'net', 75 | 'params': model.network_parameters(), 76 | 'weight_decay': 1e-6 77 | }, 78 | ], 79 | lr=flags.lr, 80 | betas=(0.9, 0.99), 81 | eps=1e-15) 82 | 83 | train_dataloader = torch.utils.data.DataLoader(LenDataset(dataset, 1000), 84 | batch_size=None, 85 | num_workers=flags.workers) 86 | train_dataloader._data = dataset 87 | 88 | criterion = torch.nn.MSELoss(reduction='none') 89 | gamma = 0.5 90 | steps = math.log(1e-4 / flags.lr, gamma) 91 | step_size = max(flags.iters // steps // 1000, 1) 92 | scheduler = lambda optimizer: optim.lr_scheduler.StepLR( 93 | optimizer, gamma=gamma, step_size=step_size) 94 | 95 | epochs = int(np.ceil(flags.iters / 1000)) 96 | model_dir = model_utils.model_dir(flags.scene, flags) 97 | model_utils.write_params(model_dir, flags) 98 | trainer = SimpleTrainer('ngp', 99 | opt, 100 | model, 101 | device='cuda:0', 102 | workspace=model_dir, 103 | optimizer=optimizer, 104 | criterion=criterion, 105 | fp16=True, 106 | ema_decay=0.95, 107 | lr_scheduler=scheduler, 108 | scheduler_update_every_step=False, 109 | metrics=[], 110 | use_checkpoint='latest') 111 | trainer.train(train_dataloader, epochs) 112 | 113 | trainer.save_checkpoint() 114 | 115 | if flags.cluster_instance_features: 116 | del dataset 117 | dataset = SceneDataset('test', 118 | flags.scene, 119 | factor=4.0, 120 | batch_size=8182, 121 | lazy=True) 122 | trainer.compute_instance_centers(dataset) 123 | trainer.save_instance_centers(save_cluster=True) 124 | 125 | if flags.eval: 126 | testset = SceneDataset('test', 127 | flags.scene, 128 | factor=flags.factor_test, 129 | batch_size=flags.batch_size * 2) 130 | test_dataloader = torch.utils.data.DataLoader(LenDataset( 131 | testset, testset.rotations.shape[0]), 132 | batch_size=None, 133 | num_workers=0) 134 | trainer.evaluate(test_dataloader) 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | -------------------------------------------------------------------------------- /scripts/data/convert_arkitscenes.py: -------------------------------------------------------------------------------- 1 | description = """ 2 | This script converts scenes from the ARKitScenes dataset (https://github.com/apple/ARKitScenes) format to 3 | the format used by autolabel. 4 | 5 | Usage: 6 | python scripts/convert_arkitscenes.py --out 7 | 8 | After running this script, scripts/compute_scene_bounds.py needs to be run to compute the scene bounding box. 9 | 10 | This script uses the lowres_wide, lowres_depth, lowres_wide.traj, confidence, lowres_wide_intrinsics parts of the dataset. 11 | 12 | See Apple's instructions here for details https://github.com/apple/ARKitScenes/blob/main/DATA.md. 
13 | 14 | The script to download the ARKitScenes dataset can be found here https://github.com/apple/ARKitScenes/blob/main/download_data.py. 15 | 16 | To download the required parts use it like this: 17 | python download_data.py raw --split Training --video_id_csv depth_upsampling/upsampling_train_val_splits.csv --download_dir /tmp/arkit_scenes/ --raw_dataset_assets lowres_wide lowres_depth lowres_wide.traj confidence lowres_wide_intrinsics 18 | 19 | """ 20 | import argparse 21 | from argparse import RawTextHelpFormatter 22 | import os 23 | import cv2 24 | import numpy as np 25 | from scipy.spatial.transform import Rotation 26 | 27 | 28 | def read_args(): 29 | parser = argparse.ArgumentParser(description=description, 30 | formatter_class=RawTextHelpFormatter) 31 | parser.add_argument('arkit_scenes') 32 | parser.add_argument('--out') 33 | return parser.parse_args() 34 | 35 | 36 | def read_trajectory(path): 37 | return np.loadtxt(path) 38 | 39 | 40 | def extract_name(filename): 41 | return filename.replace('.png', '') 42 | 43 | 44 | def collect_images(dir_path): 45 | filenames = os.listdir(dir_path) 46 | out = {} 47 | for filename in filenames: 48 | name = extract_name(filename) 49 | out[name] = os.path.join(dir_path, filename) 50 | return out 51 | 52 | 53 | def read_intrinsics(dir_path): 54 | intrinsic_files = os.listdir(dir_path) 55 | intrinsic_path = os.path.join(dir_path, intrinsic_files[0]) 56 | _, _, fx, fy, cx, cy = np.loadtxt(intrinsic_path) 57 | C = np.eye(3) 58 | C[0, 0] = fx 59 | C[1, 1] = fy 60 | C[0, 2] = cx 61 | C[1, 2] = cy 62 | return C 63 | 64 | 65 | def to_ts(filename): 66 | _, ts = filename.split('_') 67 | seconds, ms = [int(v) for v in ts.split('.')] 68 | return seconds + ms * 1e-3 69 | 70 | 71 | def find_pose(trajectory, rgb_name): 72 | timestamp = to_ts(rgb_name) 73 | errors = np.abs(trajectory[:, 0] - timestamp) 74 | closest = errors.argmin() 75 | return trajectory[closest], errors[closest] 76 | 77 | 78 | def to_transform(pose): 79 | rotvec = pose[1:4] 80 | translation = pose[4:] 81 | T_CW = np.eye(4) 82 | R_CW = Rotation.from_rotvec(rotvec) 83 | T_CW[:3, :3] = R_CW.as_matrix() 84 | T_CW[:3, 3] = translation 85 | return T_CW 86 | 87 | 88 | def write_scene(flags, scene_name, trajectory, rgb_images, depth_images, 89 | confidence_images, intrinsics): 90 | eps = 1.0 / 90.0 91 | rgb_out = os.path.join(flags.out, scene_name, 'rgb') 92 | depth_out = os.path.join(flags.out, scene_name, 'depth') 93 | pose_out = os.path.join(flags.out, scene_name, 'pose') 94 | os.makedirs(rgb_out, exist_ok=True) 95 | os.makedirs(depth_out, exist_ok=True) 96 | os.makedirs(pose_out, exist_ok=True) 97 | 98 | images = [(n, p) for n, p in rgb_images.items()] 99 | images.sort(key=lambda x: to_ts(x[0])) 100 | for i, (rgb_name, rgb_path_in) in enumerate(images): 101 | print(f"Writing {rgb_name}", end='\r') 102 | if rgb_name not in depth_images or rgb_name not in confidence_images: 103 | print(f"Skipping image {rgb_name}") 104 | continue 105 | 106 | pose, time_diff = find_pose(trajectory, rgb_name) 107 | if time_diff > eps: 108 | print(f"Skipping {rgb_name} due to time diff {time_diff:.03}", 109 | end='\r') 110 | continue 111 | else: 112 | print(f"Including {rgb_name} time diff {time_diff:.03}", end='\r') 113 | 114 | T_CW = to_transform(pose) 115 | 116 | image_name = f"{i:06}" 117 | pose_path = os.path.join(pose_out, image_name + '.txt') 118 | rgb_path = os.path.join(rgb_out, image_name + '.png') 119 | depth_path = os.path.join(depth_out, image_name + '.png') 120 | 121 | rgb = cv2.imread(rgb_path_in, -1) 
122 |         depth = cv2.imread(depth_images[rgb_name], -1)
123 |         confidence = cv2.imread(confidence_images[rgb_name], -1)
124 |         depth[confidence < 2] = 0
125 |         cv2.imwrite(depth_path, depth)
126 |         cv2.imwrite(rgb_path, rgb)
127 |         np.savetxt(pose_path, T_CW)
128 |     np.savetxt(os.path.join(flags.out, scene_name, 'intrinsics.txt'),
129 |                intrinsics)
130 | 
131 | 
132 | def main():
133 |     flags = read_args()
134 | 
135 |     scenes = os.listdir(flags.arkit_scenes)
136 | 
137 |     for scene in scenes:
138 |         traj_file = os.path.join(flags.arkit_scenes, scene, 'lowres_wide.traj')
139 |         confidence_dir = os.path.join(flags.arkit_scenes, scene, 'confidence')
140 |         depth_dir = os.path.join(flags.arkit_scenes, scene, 'lowres_depth')
141 |         rgb_dir = os.path.join(flags.arkit_scenes, scene, 'lowres_wide')
142 |         intrinsics_dir = os.path.join(flags.arkit_scenes, scene,
143 |                                       'lowres_wide_intrinsics')
144 | 
145 |         if not os.path.exists(traj_file) or not os.path.exists(
146 |                 confidence_dir) or not os.path.exists(
147 |                     rgb_dir) or not os.path.exists(intrinsics_dir):
148 |             print(f"Missing files in {scene}")
149 |             continue
150 | 
151 |         trajectory = read_trajectory(traj_file)
152 | 
153 |         rgb_images = collect_images(rgb_dir)
154 |         depth_images = collect_images(depth_dir)
155 |         confidence_images = collect_images(confidence_dir)
156 |         intrinsics = read_intrinsics(intrinsics_dir)
157 | 
158 |         write_scene(flags, scene, trajectory, rgb_images, depth_images,
159 |                     confidence_images, intrinsics)
160 | 
161 | 
162 | if __name__ == "__main__":
163 |     main()
164 | 
--------------------------------------------------------------------------------
/docs/vision-language.md:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | 
3 | ## Installing LSeg
4 | 
5 | In addition to the regular installation instructions, you also need to install LSeg. This can be done by running the following commands with your Python environment loaded.
6 | ```
7 | git clone https://github.com/kekeblom/lang-seg
8 | cd lang-seg
9 | pip install -e .
10 | ```
11 | 
12 | ## Data conversion
13 | 
14 | Follow the instructions in `docs/data.md` to convert scenes from the original datasets into our format.
15 | 
16 | ---
17 | ---
18 | 
19 | # [Neural Implicit Vision-language Feature Fields](https://arxiv.org/abs/2303.10962)
20 | 
21 | Check out branch `lseg`.
22 | 
23 | ## Running ScanNet experiment
24 | 
25 | Use the following commands to compute vision-language features, fit the scene representation and evaluate against the ground truth:
26 | ```
27 | # Train. This has to be run separately for each scene.
28 | python scripts/compute_feature_maps.py <dataset-dir>/<scene> --features lseg --checkpoint <lseg-weights>
29 | python scripts/train.py --features lseg --feature-dim 512 --iters 25000 <dataset-dir>/<scene>
30 | 
31 | # Once trained on all scenes, evaluate.
32 | # 3D queries evaluated against the 3D pointcloud
33 | python scripts/language/evaluate.py --pc --label-map <label-map> --feature-checkpoint <lseg-weights>
34 | # 2D queries against the ground truth semantic segmentation maps
35 | python scripts/language/evaluate.py --label-map <label-map> --feature-checkpoint <lseg-weights>
36 | ```
37 | 
38 | `dataset-dir` is the path to the converted ScanNet scenes, `scene` is the name of the scene, and `lseg-weights` is the path to the LSeg checkpoint.
39 | 
40 | ## Running the real-time ROS node
41 | 
42 | The `scripts/ros/` directory contains ROS nodes which can be used to integrate with a real-time SLAM system. These have been tested under ROS Noetic.
43 | 44 | `scripts/ros/node.py` is the node which listens to keyframes and integrates the volumetric representation as they come in. It listens to the following topics: 45 | - `/slam/rgb` image messages. 46 | - `/slam/depth` depth frames encoded as uint16 values in millimeters. 47 | - `/slam/keyframe` PoseStamped messages which correspond to camera poses for the rgb and depth messages. 48 | - `/slam/camera_info` CameraInfo message containing the intrinsic parameters. 49 | - `/slam/odometry` (optional) PoseStamped messages. Each time a message comes in, it renders an rgb frame and semantic segmentation map which is published at `/autolabel/image` and `/autolabel/features` respectively. 50 | - `/autolabel/segmentation_classes` segmentation class prompts as a String message published by the `class_input.py` node. 51 | 52 | It can be run with `python scripts/ros/node.py --checkpoint -b `. The bound parameter is optional and defaults to 2.5 meters. It defines the size of the volume, extending `bound` meters from `[-bound, -bound, -bound]` to `[bound, bound, bound]` in the x, y and z directions. 53 | 54 | For an implementation of the SLAM node, you can use the ROS node from the [SpectacularAI SDK examples](https://github.com/SpectacularAI/sdk-examples/blob/main/python/oak/mapping_ros.py), in case you have an OAK-D stereo camera. 55 | 56 | `scripts/ros/class_input.py` presents a graphical user interface which can be used to define the segmentation classes used by the ROS node. It published class at `/autolabel/segmentation_classes`. 57 | 58 | --- 59 | --- 60 | 61 | # [Panoptic Vision-Language Feature Fields](https://arxiv.org/abs/2309.05448) 62 | 63 | checkout to branch `panoptic` 64 | 65 | ## Training 66 | To begin the training process, first run the precomputing steps: 67 | 68 | ``` 69 | # compute the vision-language features 70 | python scripts/compute_feature_maps.py / \ 71 | --features lseg \ 72 | --checkpoint \ 73 | --dim 512 74 | 75 | # compute the instance masks using SAM 76 | python scripts/compute_sam_mask.py / \ 77 | --sam-vit-h-checkpoint 78 | ``` 79 | 80 | `dataset-dir` is the path to the scannet converted scenes, `scene` is the name of the scene. `lseg-weights` is the path to the lseg checkpoint. `sam-weights` is the path to the SAM checkpoint (which can be downloaded [here](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth)). 81 | 82 | Then, fit the scene representation using the same training script with additional flags on: 83 | ``` 84 | python scripts/train.py / \ 85 | --batch-size 2048 \ 86 | --iters 20000 \ 87 | --workspace \ 88 | --feature-dim 512 \ 89 | --features lseg \ 90 | --contrastive \ 91 | --sam-sampling \ 92 | --slow-center \ 93 | --cluster-instance-features 94 | ``` 95 | 96 | 97 | `workspace` is the folder where the model is saved. `--contrastive` is the option to train instance feature field using contrastive learning. `--sam-sampling` denotes the strategy to sample the SAM masks for training. The strategies include `proportional`, `uniform` and `None`, where `proportional` means sampling the masks according to their areas, `uniform` means sampling these masks uniformly, and `None` means not using sampling strategy and training with multiple positive pairs. `--slow-center` denotes whether to use "slow center strategy". `--cluster-instance-features` denotes to run the clustering after the training and save the cluster centers together with the clusterer itself. 
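To make the sampling strategies concrete, here is a minimal, hypothetical sketch (not the code used by `scripts/train.py`) of how a batch of SAM masks could be drawn under the `proportional` and `uniform` options; the mask format and the helper name `sample_mask_indices` are assumptions.

```
import numpy as np


def sample_mask_indices(masks, strategy, n_samples, rng=np.random.default_rng(0)):
    """Choose which SAM masks to draw contrastive samples from.

    masks: list of boolean H x W arrays, one per SAM segment (assumed format).
    strategy: 'proportional' weights each mask by its pixel area,
    'uniform' gives every mask the same probability.
    """
    areas = np.array([m.sum() for m in masks], dtype=np.float64)
    if strategy == 'proportional':
        probabilities = areas / areas.sum()
    elif strategy == 'uniform':
        probabilities = np.full(len(masks), 1.0 / len(masks))
    else:
        raise ValueError(f"Unknown sampling strategy: {strategy}")
    return rng.choice(len(masks), size=n_samples, p=probabilities)
```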
98 | 99 | ## Evaluation 100 | 101 | Scene-level Panoptic Quality and 2D Semantic Segmentation 102 | ``` 103 | python scripts/language/evaluate.py \ 104 | --vis \ # the folder to save the visualization results. 105 | --workspace \ 106 | --out \ # the folder to save the evaluation results. 107 | --label-map \ 108 | --feature-checkpoint \ 109 | --panoptic # the flag to evaluate scene-level PQ and 2D semantic segmentation. 110 | # --debug # whether to save the visualization images. 111 | ``` 112 | 113 | 3D Semantic Segmentation (only for ScanNet) 114 | ``` 115 | python scripts/language/evaluate.py \ 116 | --vis \ # the folder to save the visualization results. 117 | --workspace \ 118 | --out \ # the folder to save the evaluation results. 119 | --label-map \ 120 | --feature-checkpoint \ 121 | --pc # the flag to 3D semantic segmentation. 122 | ``` 123 | 124 | -------------------------------------------------------------------------------- /scripts/compute_feature_maps.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import h5py 3 | import numpy as np 4 | import os 5 | import pickle 6 | import math 7 | import torch 8 | from torch.nn import functional as F 9 | from torchvision.io.image import read_image 10 | from PIL import Image 11 | from autolabel.utils import Scene 12 | from autolabel.utils.feature_utils import get_feature_extractor 13 | from autolabel.models import Autoencoder 14 | from sklearn import decomposition 15 | from tqdm import tqdm 16 | 17 | 18 | def read_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('scene') 21 | parser.add_argument('--vis', action='store_true') 22 | parser.add_argument('--video', 23 | type=str, 24 | help="Create video of maps and write to this path.") 25 | parser.add_argument('--features', 26 | type=str, 27 | choices=['fcn50', 'dino', 'lseg']) 28 | parser.add_argument('--checkpoint', 29 | type=str, 30 | help="Which model weights to use.") 31 | parser.add_argument('--dim', type=int, default=64) 32 | parser.add_argument('--autoencode', action='store_true') 33 | return parser.parse_args() 34 | 35 | 36 | def compress_features(features, dim): 37 | features = np.stack(features) 38 | N, H, W, C = features.shape 39 | coder = Autoencoder(C, dim).cuda() 40 | optimizer = torch.optim.Adam(coder.parameters(), lr=1e-3) 41 | dataset = torch.utils.data.TensorDataset( 42 | torch.tensor(features.reshape(N * H * W, C))) 43 | loader = torch.utils.data.DataLoader(dataset, batch_size=2048, shuffle=True) 44 | for _ in range(5): 45 | bar = tqdm(loader) 46 | for batch in bar: 47 | batch = batch[0].cuda() 48 | reconstructed, code = coder(batch) 49 | loss = F.mse_loss(reconstructed, 50 | batch) + 0.01 * torch.abs(code).mean() 51 | bar.set_description(f"Loss: {loss.item()}") 52 | loss.backward() 53 | optimizer.step() 54 | optimizer.zero_grad() 55 | 56 | with torch.inference_mode(): 57 | features_out = np.zeros((N, H, W, dim), dtype=np.float16) 58 | for i, feature in enumerate(features): 59 | feature = torch.tensor(feature).view(H * W, C).cuda() 60 | _, out = coder(feature.view(H * W, C)) 61 | features_out[i] = out.detach().cpu().numpy().reshape(H, W, dim) 62 | return features_out 63 | 64 | 65 | def compute_size(image_path, feature): 66 | image = read_image(image_path) 67 | _, H, W = image.shape 68 | short_side = min(H, W) 69 | if feature in ['fcn50', 'dino']: 70 | target_size = 720 71 | elif feature == 'lseg': 72 | target_size = 242 73 | scale_factor = target_size / short_side 74 | return int(H * 
scale_factor), int(W * scale_factor) 75 | 76 | 77 | def extract_features(extractor, scene, output_file, flags): 78 | paths = scene.rgb_paths() 79 | H, W = compute_size(paths[0], flags.features) 80 | 81 | shape = extractor.shape((H, W)) 82 | dataset = output_file.create_dataset(flags.features, 83 | (len(paths), *shape, flags.dim), 84 | dtype=np.float16, 85 | compression='lzf') 86 | 87 | extracted = [] 88 | with torch.inference_mode(): 89 | batch_size = 2 90 | for i in tqdm(range(math.ceil(len(paths) / batch_size))): 91 | index = slice(i * batch_size, (i + 1) * batch_size) 92 | batch = paths[index] 93 | image = torch.stack([read_image(p) for p in batch]).cuda() 94 | image = F.interpolate(image, [H, W]) 95 | features = extractor(image / 255.).cpu().numpy() 96 | 97 | if flags.autoencode: 98 | extracted += [f for f in features] 99 | else: 100 | dataset[index] = features[..., :flags.dim] 101 | 102 | if flags.autoencode: 103 | features = compress_features(extracted, flags.dim) 104 | dataset[:] = features 105 | 106 | N, H, W, C = dataset[:].shape 107 | X = dataset[:].reshape(N * H * W, C) 108 | pca = decomposition.PCA(n_components=3) 109 | indices = np.random.randint(0, X.shape[0], size=50000) 110 | subset = X[indices] 111 | transformed = pca.fit_transform(subset) 112 | minimum = transformed.min(axis=0) 113 | maximum = transformed.max(axis=0) 114 | diff = maximum - minimum 115 | 116 | dataset.attrs['pca'] = np.void(pickle.dumps(pca)) 117 | dataset.attrs['min'] = minimum 118 | dataset.attrs['range'] = diff 119 | 120 | 121 | def visualize_features(features): 122 | pca = pickle.loads(features.attrs['pca'].tobytes()) 123 | N, H, W, C = features[:].shape 124 | 125 | from matplotlib import pyplot 126 | feature_maps = features[:] 127 | for fm in feature_maps[::10]: 128 | mapped = pca.transform(fm.reshape(H * W, C)).reshape(H, W, 3) 129 | normalized = np.clip( 130 | (mapped - features.attrs['min']) / features.attrs['range'], 0, 1) 131 | pyplot.imshow(normalized) 132 | pyplot.show() 133 | 134 | 135 | def write_video(features, out): 136 | from skvideo.io.ffmpeg import FFmpegWriter 137 | pca = pickle.loads(features.attrs['pca'].tobytes()) 138 | N, H, W, C = features[:].shape 139 | writer = FFmpegWriter(out, 140 | inputdict={'-framerate': '5'}, 141 | outputdict={ 142 | '-c:v': 'libx264', 143 | '-r': '5', 144 | '-pix_fmt': 'yuv420p' 145 | }) 146 | for feature in tqdm(features, desc="Encoding frames"): 147 | mapped = pca.transform(feature.reshape(H * W, C)).reshape(H, W, 3) 148 | normalized = np.clip( 149 | (mapped - features.attrs['min']) / features.attrs['range'], 0, 1) 150 | frame = (normalized * 255.0).astype(np.uint8) 151 | writer.writeFrame(frame) 152 | 153 | 154 | def main(): 155 | flags = read_args() 156 | np.random.seed(0) 157 | torch.manual_seed(0) 158 | 159 | scene = Scene(flags.scene) 160 | output_file = h5py.File(os.path.join(scene.path, 'features.hdf'), 161 | 'w', 162 | libver='latest') 163 | group = output_file.create_group('features') 164 | 165 | extractor = get_feature_extractor(flags.features, flags.checkpoint) 166 | 167 | extract_features(extractor, scene, group, flags) 168 | if flags.vis: 169 | visualize_features(group[flags.features]) 170 | if flags.video: 171 | write_video(group[flags.features], flags.video) 172 | 173 | 174 | if __name__ == "__main__": 175 | main() 176 | -------------------------------------------------------------------------------- /autolabel/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import json 
3 | import numpy as np 4 | import os 5 | from pathlib import Path 6 | 7 | 8 | class Camera: 9 | 10 | def __init__(self, camera_matrix, size): 11 | self.camera_matrix = camera_matrix 12 | self.size = size 13 | 14 | def scale(self, new_size): 15 | scale_x = new_size[0] / self.size[0] 16 | scale_y = new_size[1] / self.size[1] 17 | camera_matrix = self.camera_matrix.copy() 18 | camera_matrix[0, :] = scale_x * self.camera_matrix[0, :] 19 | camera_matrix[1, :] = scale_y * self.camera_matrix[1, :] 20 | return Camera(camera_matrix, new_size) 21 | 22 | @property 23 | def fx(self): 24 | return self.camera_matrix[0, 0] 25 | 26 | @property 27 | def fy(self): 28 | return self.camera_matrix[1, 1] 29 | 30 | @property 31 | def cx(self): 32 | return self.camera_matrix[0, 2] 33 | 34 | @property 35 | def cy(self): 36 | return self.camera_matrix[1, 2] 37 | 38 | @classmethod 39 | def from_path(self, path, size): 40 | return Camera(np.loadtxt(path), size) 41 | 42 | def write(self, path): 43 | np.savetxt(path, self.camera_matrix) 44 | 45 | 46 | class Scene: 47 | 48 | def __init__(self, scene_path): 49 | self.path = scene_path 50 | self.rgb_path = os.path.join(scene_path, 'rgb') 51 | self.raw_rgb_path = os.path.join(scene_path, 'raw_rgb') 52 | self.depth_path = os.path.join(scene_path, 'depth') 53 | self.raw_depth_path = os.path.join(scene_path, 'raw_depth') 54 | self.pose_path = os.path.join(scene_path, 'pose') 55 | self._read_poses() 56 | intrinsics_path = os.path.join(scene_path, 'intrinsics.txt') 57 | image_size = self.peak_image_size() 58 | if os.path.exists(intrinsics_path): 59 | self.camera = Camera.from_path(intrinsics_path, image_size) 60 | self._n_classes = None 61 | self._metadata = None 62 | 63 | def peak_image_size(self): 64 | if os.path.exists(self.raw_rgb_path): 65 | path = self.raw_rgb_path 66 | elif os.path.exists(self.rgb_path): 67 | path = self.rgb_path 68 | else: 69 | raise ValueError("Doesn't appear to be a valid scene.") 70 | image = cv2.imread(os.path.join(path, os.listdir(path)[0])) 71 | return (image.shape[1], image.shape[0]) 72 | 73 | def _read_poses(self): 74 | if not os.path.exists(self.pose_path): 75 | self.poses = [] 76 | return 77 | pose_files = os.listdir(self.pose_path) 78 | pose_files = sorted([p for p in pose_files if p[0] != '.'], 79 | key=lambda p: int(p.split('.')[0])) 80 | self.poses = [] 81 | for pose_file in pose_files: 82 | T_CW = np.loadtxt(os.path.join(self.pose_path, pose_file)) 83 | self.poses.append(T_CW) 84 | 85 | def __iter__(self): 86 | rgb_frames = self.rgb_paths() 87 | depth_frames = self.depth_paths() 88 | for pose, rgb, depth in zip(self.poses, rgb_frames, depth_frames): 89 | yield (pose, rgb, depth) 90 | 91 | def __len__(self): 92 | return len(self.poses) 93 | 94 | def _get_paths(self, directory): 95 | frames = os.listdir(directory) 96 | frames = sorted(frames, key=lambda x: int(x.split('.')[0])) 97 | return [os.path.join(directory, f) for f in frames] 98 | 99 | def rgb_paths(self): 100 | return self._get_paths(self.rgb_path) 101 | 102 | def depth_paths(self): 103 | return self._get_paths(self.depth_path) 104 | 105 | def semantic_paths(self): 106 | return self._get_paths(os.path.join(self.path, 'semantic')) 107 | 108 | def raw_rgb_paths(self): 109 | return self._get_paths(self.raw_rgb_path) 110 | 111 | def raw_depth_paths(self): 112 | return self._get_paths(self.raw_depth_path) 113 | 114 | def gt_semantic(self): 115 | return self._get_paths(os.path.join(self.path, 'gt_semantic')) 116 | 117 | def gt_instance(self): 118 | return 
self._get_paths(os.path.join(self.path, 'gt_instance')) 119 | 120 | def image_names(self): 121 | """ 122 | Returns the filenames of rgb images without file extensions. 123 | """ 124 | rgb_frames = os.listdir(self.rgb_path) 125 | rgb_frames = sorted(rgb_frames, key=lambda x: int(x.split('.')[0])) 126 | return [f.split('.')[0] for f in rgb_frames] 127 | 128 | def bbox(self): 129 | return np.loadtxt(os.path.join(self.path, 'bbox.txt'))[:6].reshape(2, 3) 130 | 131 | def gt_masks(self, size): 132 | """ 133 | Returns a list of numpy arrays of ground truth segmentation masks, 134 | if available. Returns an empty list if no masks have been annotated. 135 | size: the desired size for the masks. 136 | returns: list of H x W numpy arrays 137 | """ 138 | gt_masks_dir = os.path.join(self.path, 'gt_masks') 139 | if not os.path.exists(gt_masks_dir): 140 | return [] 141 | masks = [] 142 | mask_files = [ 143 | os.path.join(gt_masks_dir, f) for f in os.listdir(gt_masks_dir) 144 | ] 145 | for mask_file in mask_files: 146 | frame_number = int(os.path.basename(mask_file).split('.')[0]) 147 | mask = _read_gt_mask(mask_file, size) 148 | masks.append((frame_number, _read_gt_mask(mask_file, size))) 149 | return sorted(masks, key=lambda x: x[0]) 150 | 151 | def depth_size(self): 152 | """ 153 | Return: the size (width, height) of the depth images. 154 | """ 155 | depth_paths = self.raw_depth_paths() 156 | if len(depth_paths) == 0: 157 | depth_paths = self.depth_paths() 158 | image = cv2.imread(depth_paths[0], -1) 159 | return (image.shape[1], image.shape[0]) 160 | 161 | @property 162 | def metadata(self): 163 | if self._metadata is None: 164 | metadata_path = os.path.join(self.path, 'metadata.json') 165 | if not os.path.exists(metadata_path): 166 | return None 167 | with open(metadata_path) as f: 168 | self._metadata = json.load(f) 169 | return self._metadata 170 | 171 | @property 172 | def n_classes(self): 173 | if self._n_classes is None: 174 | self._n_classes = self.metadata['n_classes'] 175 | return self._n_classes 176 | 177 | 178 | def transform_points(T, points): 179 | R = T[:3, :3] 180 | t = T[:3, 3] 181 | return (R @ points[..., :, None])[..., :, 0] + t 182 | 183 | 184 | def _read_gt_mask(path, size): 185 | image = np.zeros((size[1], size[0]), dtype=np.uint8) 186 | with open(path, 'rt') as f: 187 | data = json.load(f) 188 | scaling_factor = np.array( 189 | [size[0] / data['imageWidth'], size[1] / data['imageHeight']]) 190 | for shape in data['shapes']: 191 | polygon = (np.stack(shape['points']) * scaling_factor).astype(np.int32) 192 | #TODO: handle multiple classes. 193 | image = cv2.fillPoly(image, polygon[None], 1) 194 | return image 195 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Panoptic Vision-Language Feature Fields
2 | 
3 | 
4 | Haoran Chen,
5 | Kenneth Blomqvist,
6 | Francesco Milano, Roland Siegwart
7 | 
8 | 
9 | IEEE RA-L 2024
10 | [Paper](https://arxiv.org/abs/2309.05448) | Video | Project Page
11 | 
12 | 
13 | 
14 | ![Panoptic Vision-Language Feature Fields](assets/teaser.jpg)
15 | 
16 | 

17 | 18 | Recently, methods have been proposed for 3D _open-vocabulary_ semantic segmentation. Such methods are able to segment scenes into arbitrary classes based on text descriptions provided during runtime. In this paper, we propose to the best of our knowledge the first algorithm for _open-vocabulary panoptic_ segmentation in 3D scenes. Our algorithm, Panoptic Vision-Language Feature Fields (PVLFF), learns a semantic feature field of the scene by distilling vision-language features from a pretrained 2D model, and jointly fits an instance feature field through contrastive learning using 2D instance segments on input frames. Despite not being trained on the target classes, our method achieves panoptic segmentation performance similar to the state-of-the-art _closed-set_ 3D systems on the HyperSim, ScanNet and Replica dataset and additionally outperforms current 3D open-vocabulary systems in terms of semantic segmentation. We ablate the components of our method to demonstrate the effectiveness of our model architecture. 19 | 20 | ## Table of Contents 21 | 22 | 1. [Installation](#installation) 23 | 2. [Running experiments](#running-experiments) 24 | 3. [Citation](#citation) 25 | 4. [Acknowledgements](#acknowledgements) 26 | 27 | ## Installation 28 | 29 | The installation instructions were tested for Python 3.8, 3.9 and 3.10. Some dependencies are recommended to be installed through Anaconda and we assume you are using an Anaconda environment for these instructions. 30 | 31 | The software uses CUDA and compiling `tiny-cuda-nn` requires `nvcc`. If you don't have CUDA >= version 11.3, including `nvcc`, installed on your system, you can install it in your anaconda env with: 32 | ``` 33 | conda install -c conda-forge cudatoolkit-dev=11.4 34 | ``` 35 | 36 | To install PyTorch and ffmpeg, run: 37 | ``` 38 | conda install pytorch torchvision cudatoolkit=11.3 -c pytorch 39 | conda install ffmpeg 40 | ``` 41 | 42 | Install into your desired Python environment with the following commands: 43 | ``` 44 | pip install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch 45 | 46 | git submodule update --init --recursive 47 | pushd torch_ngp 48 | git submodule update --init --recursive 49 | pip install -e . 50 | bash scripts/install_ext.sh 51 | popd 52 | 53 | # To use LSeg features for vision-language feature fields. 54 | git clone https://github.com/kekeblom/lang-seg 55 | pushd lang-seg 56 | pip install -e . 57 | popd 58 | 59 | # Finally install Autolabel. 60 | pip install -e . 61 | ``` 62 | 63 | 64 | ## Running experiments 65 | 66 | ### Data conversion 67 | 68 | Follow the instructions in `docs/data.md` to convert the scenes from original datasets into the our format. 69 | 70 | ### Training 71 | To begin the training process, first run the precomputing steps: 72 | 73 | ``` 74 | # Compute the vision-language features. 75 | python scripts/compute_feature_maps.py / \ 76 | --features lseg \ 77 | --checkpoint \ 78 | --dim 512 79 | 80 | # Compute the instance masks using SAM. 81 | python scripts/compute_sam_mask.py / \ 82 | --sam-vit-h-checkpoint 83 | ``` 84 | 85 | where `` is the path to the converted scenes, `` is the name of the scene. `` is the path to the LSeg checkpoint, `` is the path to the SAM checkpoint (which can be downloaded [here](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth)). 
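Both precompute scripts operate on one scene at a time, so it can be convenient to drive them from a small loop. The following is only an illustrative sketch, assuming a directory of converted scenes and checkpoint paths of your own choosing; it simply shells out to the two commands above.

```
import os
import subprocess

# Assumed paths: point these at your converted scenes and downloaded checkpoints.
dataset_dir = "/data/replica_converted"
lseg_checkpoint = "/checkpoints/lseg.ckpt"
sam_checkpoint = "/checkpoints/sam_vit_h_4b8939.pth"

for scene in sorted(os.listdir(dataset_dir)):
    scene_path = os.path.join(dataset_dir, scene)
    if not os.path.isdir(scene_path):
        continue
    # Vision-language feature maps, written to <scene>/features.hdf.
    subprocess.run([
        "python", "scripts/compute_feature_maps.py", scene_path,
        "--features", "lseg", "--checkpoint", lseg_checkpoint, "--dim", "512",
    ], check=True)
    # SAM instance masks used for the contrastive instance feature field.
    subprocess.run([
        "python", "scripts/compute_sam_mask.py", scene_path,
        "--sam-vit-h-checkpoint", sam_checkpoint,
    ], check=True)
```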
86 | 87 | Then, fit the scene representation using the following training script: 88 | ``` 89 | python scripts/train.py / \ 90 | --batch-size 2048 \ 91 | --iters 20000 \ 92 | --workspace \ 93 | --feature-dim 512 \ 94 | --features lseg \ 95 | --contrastive \ 96 | --sam-sampling \ 97 | --slow-center \ 98 | --cluster-instance-features 99 | ``` 100 | 101 | 102 | where `` is the folder where the model is saved. The flag `--contrastive` enables training the instance feature field using contrastive learning. The flag `--sam-sampling` sets the strategy to sample the SAM masks for training; the strategies include `proportional` and `uniform`, where `proportional` means sampling the masks according to their areas, and `uniform` means sampling these masks uniformly. The flag `--slow-center` enables the use of "slow center strategy". The flag `--cluster-instance-features` enables running the clustering after the training and saving the cluster centers together with the object instance of the HDBSCAN clustering class. 103 | 104 | Here we provide some [checkpoints](https://doi.org/10.3929/ethz-b-000656499) trained on Replica scenes. 105 | 106 | ### Inference 107 | Render the scene views after training by running the following script: 108 | ``` 109 | python scripts/render.py / \ 110 | --stride 1 \ 111 | --model-dir \ 112 | --out \ 113 | --checkpoint \ 114 | --label-map 115 | ``` 116 | where `` is the folder where the model checkpoint is saved (e.g. `//g15_hg+freq_lseg_rgb1.0_d0.1_s1.0_f0.5_c0.1`). `` sets the output folder where the rendered results are saved. `` is the label mapping from id to semantic class of the scene (here is an example [label-map](./configs/label_map.csv) file that we used for replica scenes). 117 | 118 | ### Evaluation 119 | 120 | Scene-level Panoptic Quality and 2D Semantic Segmentation 121 | ``` 122 | python scripts/language/evaluate.py \ 123 | --vis \ # the folder to save the visualization results. 124 | --workspace \ 125 | --out \ # the folder to save the evaluation results. 126 | --label-map \ 127 | --feature-checkpoint \ 128 | --panoptic # the flag to evaluate scene-level PQ and 2D semantic segmentation. 129 | # --debug # whether to save the visualization images. 130 | ``` 131 | 132 | 3D Semantic Segmentation (only for ScanNet) 133 | ``` 134 | python scripts/language/evaluate.py \ 135 | --vis \ # the folder to save the visualization results. 136 | --workspace \ 137 | --out \ # the folder to save the evaluation results. 138 | --label-map \ 139 | --feature-checkpoint \ 140 | --pc # the flag to 3D semantic segmentation. 141 | ``` 142 | 143 | ### 3D interactive segmentation 144 | We provide a demo UI script of interactive open-vocabulary segmentation on pointclouds of ScanNet scenes. 145 | ``` 146 | python scripts/demo_ui.py / \ 147 | --workspace \ 148 | --checkpoint 149 | ``` 150 | 151 | https://github.com/ethz-asl/pvlff/assets/33897834/1c31a03a-c7e9-43dc-af83-de1cf471893e 152 | 153 | ## Citation 154 | 155 | If you find our code or paper useful, please cite: 156 | 157 | ```bibtex 158 | @article{Chen2024PVLFF, 159 | author = {Chen, Haoran and Blomqvist, Kenneth and Milano, Francesco and Siegwart, Roland}, 160 | title = {Panoptic Vision-Language Feature Fields}, 161 | journal = {IEEE Robotics and Automation Letters (RA-L)}, 162 | volume = {9}, 163 | number = {3}, 164 | pages = {2144--2151}, 165 | year = {2024} 166 | } 167 | ``` 168 | 169 | ## Acknowledgements 170 | 171 | A large part of the code is based on [Autolabel](https://github.com/ethz-asl/autolabel): 172 | 173 | - K. 
Blomqvist, L. Ott, J. J. Chung, and R. Siegwart, "Baking in the Feature: Accelerating Volumetric Segmentation by Rendering Feature Maps", in IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2023 - [Link](https://keke.dev/baking-in-the-feature) 174 | 175 | - K. Blomqvist, F. Milano, J. J. Chung, L. Ott, and R. Siegwart, "Neural Implicit Vision-Language Feature Fields", in IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2023 - [Link](https://arxiv.org/abs/2303.10962) 176 | 177 | Our code uses our customized version of [`torch-ngp`](https://github.com/ashawkey/torch-ngp) as the underlying NeRF framework. Big thanks to [Jiaxiang Tang](https://me.kiui.moe/) for releasing the initial implementation. 178 | -------------------------------------------------------------------------------- /scripts/render.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import pandas 4 | import cv2 5 | import os 6 | import pickle 7 | from tqdm import tqdm 8 | import torch 9 | 10 | from autolabel.dataset import SceneDataset 11 | from autolabel import model_utils 12 | from autolabel import visualization 13 | from autolabel.utils.feature_utils import get_feature_extractor 14 | from pathlib import Path 15 | from sklearn import decomposition 16 | from sklearn.metrics.pairwise import cosine_similarity 17 | 18 | 19 | def read_args(): 20 | parser = model_utils.model_flag_parser() 21 | parser.add_argument('scene') 22 | parser.add_argument('--fps', type=int, default=5) 23 | parser.add_argument('--stride', type=int, default=1) 24 | parser.add_argument('--model-dir', type=str, required=True) 25 | parser.add_argument( 26 | '--max-depth', 27 | type=float, 28 | default=7.5, 29 | help="The maximum depth used in colormapping the depth frames.") 30 | parser.add_argument('--checkpoint', type=str) 31 | parser.add_argument('--out', 32 | type=str, 33 | required=True, 34 | help="Where to save the video.") 35 | parser.add_argument('--classes', 36 | default=None, 37 | type=str, 38 | nargs='+', 39 | help="Which classes to segment the scene into.") 40 | parser.add_argument('--label-map', 41 | default=None, 42 | type=str, 43 | help="Path to list of labels.") 44 | return parser.parse_args() 45 | 46 | 47 | class FeatureTransformer: 48 | 49 | def __init__(self, scene_path, feature_name, classes, checkpoint=None, without_features=False): 50 | if not without_features: 51 | with h5py.File(os.path.join(scene_path, 'features.hdf'), 'r') as f: 52 | features = f[f'features/{feature_name}'] 53 | blob = features.attrs['pca'].tobytes() 54 | self.pca = pickle.loads(blob) 55 | self.feature_min = features.attrs['min'] 56 | self.feature_range = features.attrs['range'] 57 | self.first_fit = False 58 | else: 59 | self.pca = decomposition.PCA(n_components=3) 60 | self.feature_min = None 61 | self.feature_range = None 62 | self.first_fit = True 63 | 64 | 65 | if feature_name is not None: 66 | extractor = get_feature_extractor(feature_name, checkpoint) 67 | self.text_features = self._encode_text(extractor, classes) 68 | 69 | def _encode_text(self, extractor, text): 70 | return extractor.encode_text(text) 71 | 72 | def __call__(self, p_features): 73 | H, W, C = p_features.shape 74 | if self.first_fit: 75 | features = self.pca.fit_transform(p_features.reshape(H * W, C)) 76 | self.first_fit = False 77 | else: 78 | features = self.pca.transform(p_features.reshape(H * W, C)) 79 | 80 | if (self.feature_min is not None) and 
(self.feature_range is not None): 81 | features = np.clip((features - self.feature_min) / self.feature_range, 82 | 0., 1.) 83 | else: 84 | features = np.clip((features - np.min(features)) / (np.max(features) - np.min(features)), 85 | 0., 1.) 86 | return (features.reshape(H, W, 3) * 255.).astype(np.uint8) 87 | 88 | 89 | def compute_semantics(outputs, classes, feature_transform): 90 | if classes is not None: 91 | features = outputs['semantic_features'] 92 | features = (features / torch.norm(features, dim=-1, keepdim=True)) 93 | text_features = feature_transform.text_features 94 | H, W, D = features.shape 95 | C = text_features.shape[0] 96 | similarities = torch.zeros((H, W, C), dtype=features.dtype) 97 | for i in range(H): 98 | similarities[i, :, :] = (features[i, :, None] * 99 | text_features).sum(dim=-1).cpu() 100 | return similarities.argmax(dim=-1) 101 | else: 102 | return outputs['semantic'].argmax(dim=-1).cpu().numpy() 103 | 104 | def compute_instances(outputs, feature_centers): 105 | instance_feature = outputs['contrastive_features'].cpu().numpy() 106 | image_height, image_width, feature_dim = instance_feature.shape 107 | instance_feature = instance_feature.reshape(-1, feature_dim) 108 | sim_mat = cosine_similarity(instance_feature, feature_centers) 109 | pred_instance = np.argmax(sim_mat, axis=1) 110 | pred_instance = pred_instance.reshape(image_height, image_width) 111 | return pred_instance 112 | 113 | def render(model, 114 | batch, 115 | feature_transform, 116 | semantic_color_map, 117 | instance_color_map, 118 | size=(480, 360), 119 | maxdepth=10.0, 120 | classes=None, 121 | con_feature_transform=None): 122 | rays_o = torch.tensor(batch['rays_o']).cuda() 123 | rays_d = torch.tensor(batch['rays_d']).cuda() 124 | direction_norms = torch.tensor(batch['direction_norms']).cuda() 125 | outputs = model.render(rays_o, 126 | rays_d, 127 | direction_norms, 128 | staged=True, 129 | perturb=False, 130 | num_steps=512, 131 | upsample_steps=0) 132 | p_semantic = compute_semantics(outputs, classes, feature_transform) 133 | p_instance = compute_instances(outputs, model.instance_centers) 134 | frame = np.zeros((2 * size[1], 3 * size[0], 3), dtype=np.uint8) 135 | h_mid = size[1] 136 | w_ot, w_tt = size[0], size[0] * 2 137 | p_rgb = (outputs['image'].cpu().numpy() * 255.0).astype(np.uint8) 138 | p_depth = outputs['depth'] 139 | frame[:h_mid, :w_ot, :] = p_rgb 140 | frame[h_mid:, :w_ot] = visualization.visualize_depth( 141 | p_depth.cpu().numpy(), maxdepth=maxdepth)[:, :, :3] 142 | frame[:h_mid, w_tt:] = semantic_color_map[p_semantic] 143 | frame[h_mid:, w_tt:] = instance_color_map[p_instance] 144 | 145 | if feature_transform is not None: 146 | p_features = feature_transform( 147 | outputs['semantic_features'].cpu().numpy()) 148 | frame[:h_mid, w_ot:w_tt] = p_features 149 | 150 | if con_feature_transform is not None: 151 | p_con_features = con_feature_transform( 152 | outputs['contrastive_features'].cpu().numpy()) 153 | frame[h_mid:, w_ot:w_tt] = p_con_features 154 | 155 | return frame 156 | 157 | 158 | def main(): 159 | flags = read_args() 160 | model_params = model_utils.read_params(flags.model_dir) 161 | 162 | view_size = (480, 360) 163 | dataset = SceneDataset('test', 164 | flags.scene, 165 | size=view_size, 166 | batch_size=16384, 167 | features=model_params.features, 168 | load_semantic=False, 169 | lazy=True) 170 | 171 | classes = flags.classes 172 | if flags.label_map is not None: 173 | label_map = pandas.read_csv(flags.label_map) 174 | classes = label_map['prompt'].values 175 | 
semantic_color_map = (np.random.rand(len(classes), 3) * 255).astype(np.uint8) 176 | 177 | feature_transform = None 178 | if model_params.features is not None: 179 | feature_transform = FeatureTransformer(flags.scene, 180 | model_params.features, classes, 181 | flags.checkpoint) 182 | 183 | con_feature_transform = FeatureTransformer(flags.scene, 184 | None, classes, 185 | without_features=True) 186 | 187 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 188 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 189 | n_classes, model_params).cuda() 190 | model = model.eval() 191 | model_utils.load_checkpoint(model, 192 | os.path.join(flags.model_dir, 'checkpoints')) 193 | 194 | instance_color_map = (np.random.rand(model.instance_centers.shape[0], 3) * 255).astype(np.uint8) 195 | 196 | Path(flags.out).mkdir(exist_ok=True, parents=True) 197 | with torch.inference_mode(): 198 | with torch.cuda.amp.autocast(enabled=True): 199 | for frame_index in tqdm(dataset.indices[::flags.stride]): 200 | batch = dataset._get_test(frame_index) 201 | frame = render(model, 202 | batch, 203 | feature_transform, 204 | semantic_color_map=semantic_color_map, 205 | instance_color_map=instance_color_map, 206 | size=view_size, 207 | maxdepth=flags.max_depth, 208 | classes=classes, 209 | con_feature_transform=con_feature_transform) 210 | cv2.imwrite( 211 | os.path.join(flags.out, f"{frame_index}.png"), 212 | cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 213 | ) 214 | 215 | 216 | if __name__ == "__main__": 217 | main() 218 | -------------------------------------------------------------------------------- /scripts/convert_to_instant_ngp.py: -------------------------------------------------------------------------------- 1 | """Converts .txt world-to-camera poses created by `autolabel` to transforms.json 2 | files that can be used in `instant-ngp`/`torch-ngp`. 3 | 4 | A large part of this code is based on `instant-ngp/scripts/colmap2nerf.py` from 5 | https://github.com/NVlabs/instant-ngp. 6 | """ 7 | import argparse 8 | import cv2 9 | import glob 10 | import json 11 | import math 12 | import numpy as np 13 | import os 14 | 15 | 16 | def variance_of_laplacian(image): 17 | return cv2.Laplacian(image, cv2.CV_64F).var() 18 | 19 | 20 | def sharpness(image_path): 21 | image = cv2.imread(image_path) 22 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 23 | fm = variance_of_laplacian(gray) 24 | return fm 25 | 26 | 27 | def rotmat(a, b): 28 | a, b = a / np.linalg.norm(a), b / np.linalg.norm(b) 29 | v = np.cross(a, b) 30 | c = np.dot(a, b) 31 | s = np.linalg.norm(v) 32 | kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]]) 33 | return np.eye(3) + kmat + kmat.dot(kmat) * ((1 - c) / (s**2 + 1e-10)) 34 | 35 | 36 | def closest_point_2_lines(oa, da, ob, db): 37 | r"""Returns point closest to both rays of form o+t*d, and a weight factor 38 | that goes to 0 if the lines are parallel. 39 | """ 40 | da = da / np.linalg.norm(da) 41 | db = db / np.linalg.norm(db) 42 | c = np.cross(da, db) 43 | denom = np.linalg.norm(c)**2 44 | t = ob - oa 45 | ta = np.linalg.det([t, db, c]) / (denom + 1e-10) 46 | tb = np.linalg.det([t, da, c]) / (denom + 1e-10) 47 | if ta > 0: 48 | ta = 0 49 | if tb > 0: 50 | tb = 0 51 | return (oa + ta * da + ob + tb * db) * 0.5, denom 52 | 53 | 54 | parser = argparse.ArgumentParser() 55 | 56 | parser.add_argument( 57 | '--dataset_folder', 58 | type=str, 59 | required=True, 60 | help= 61 | ("Path to the dataset folder. 
It is expected to contain a `rgb` subfolder " 62 | "with .png images, a `pose` subfolder with world-to-camera poses as .txt " 63 | "files, each corresponding to an image in `rgb`, and an `intrinsics.txt` " 64 | "file. A `transforms.json` file will be created in it.")) 65 | 66 | args = parser.parse_args() 67 | 68 | _aabb_scale = 8 69 | _dataset_folder = args.dataset_folder 70 | _image_folder = os.path.join(_dataset_folder, "rgb") 71 | _pose_folder = os.path.join(_dataset_folder, "pose") 72 | _intrinsics_file_path = os.path.join(_dataset_folder, "intrinsics.txt") 73 | _output_transform_file = os.path.join(_dataset_folder, "transforms.json") 74 | # List of supported image extensions. 75 | _image_extensions = ["png", "jpg", "jpeg"] 76 | 77 | if (not os.path.exists(_image_folder)): 78 | raise (OSError(f"The image folder '{_image_folder}' could not be found.")) 79 | if (not os.path.exists(_pose_folder)): 80 | raise (OSError(f"The pose folder '{_pose_folder}' could not be found.")) 81 | if (not os.path.exists(_intrinsics_file_path)): 82 | raise (OSError(f"The intrinsics file '{_intrinsics_file_path}' could not " 83 | "be found.")) 84 | if (os.path.exists(_output_transform_file)): 85 | raise (OSError( 86 | f"The output transform file '{_output_transform_file}' " 87 | "already exists. Please remove it or rename to avoid overriding it.")) 88 | 89 | # Find the actual extension of the input images and verify that there is exactly 90 | # one pose for each image. 91 | curr_image_extension_idx = 0 92 | image_list = [] 93 | while len( 94 | image_list) == 0 and curr_image_extension_idx < len(_image_extensions): 95 | image_extension = _image_extensions[curr_image_extension_idx] 96 | image_list = sorted( 97 | glob.glob(os.path.join(_image_folder, f"*.{image_extension}"))) 98 | curr_image_extension_idx += 1 99 | assert (len(image_list) > 0), f"Found no images in '{_image_folder}'." 100 | pose_list = sorted(glob.glob(os.path.join(_pose_folder, "*.txt"))) 101 | assert ( 102 | [os.path.basename(f).split(f'.{image_extension}')[0] for f in image_list 103 | ] == [os.path.basename(f).split('.txt')[0] for f in pose_list] 104 | ), f"Found non-matching images-poses in '{_image_folder}' and '{_pose_folder}'." 105 | 106 | # Read an example image to find the image dimensions. 107 | example_image = cv2.imread(image_list[0]) 108 | H, W = example_image.shape[:2] 109 | 110 | # Read the camera intrinsics. NOTE: A pinhole camera is assumed. 111 | K = np.loadtxt(_intrinsics_file_path) 112 | f_x = K[0, 0] 113 | f_y = K[1, 1] 114 | c_x = K[0, 2] 115 | c_y = K[1, 2] 116 | 117 | angle_x = math.atan(W / (f_x * 2)) * 2 118 | angle_y = math.atan(H / (f_y * 2)) * 2 119 | 120 | # Bottom and up vectors. 121 | bottom = np.array([0.0, 0.0, 0.0, 1.0]).reshape([1, 4]) 122 | up = np.zeros(3) 123 | out = { 124 | "camera_angle_x": angle_x, 125 | "camera_angle_y": angle_y, 126 | "f_x": f_x, 127 | "f_y": f_y, 128 | "k1": 0.0, 129 | "k2": 0.0, 130 | "p1": 0.0, 131 | "p2": 0.0, 132 | "cx": c_x, 133 | "cy": c_y, 134 | "w": W, 135 | "h": H, 136 | "aabb_scale": _aabb_scale, 137 | "frames": [], 138 | } 139 | 140 | print( 141 | f"\033[94mCreating output transform file '{_output_transform_file}'.\033[0m" 142 | ) 143 | 144 | for image_file_path, pose_file_path in zip(image_list, pose_list): 145 | image_rel = os.path.relpath(_image_folder) 146 | relative_image_file_path = f"./rgb/{os.path.basename(image_file_path)}" 147 | sharpness_value = sharpness(image_file_path) 148 | # Read world-to-camera pose. 
149 | T_CW = np.loadtxt(pose_file_path).reshape(4, 4) 150 | T_WC = np.linalg.inv(T_CW) 151 | # Apply transformations required by the NeRF convention. 152 | # - Flip the y and z axes. 153 | T_WC[0:3, 2] *= -1 154 | T_WC[0:3, 1] *= -1 155 | # - Swap y and z. 156 | T_WC = T_WC[[1, 0, 2, 3], :] 157 | # - Flip the whole world upside down. 158 | T_WC[2, :] *= -1 159 | 160 | # Update the up vector using the original z axis. 161 | up += T_WC[0:3, 1] 162 | 163 | frame = { 164 | "file_path": relative_image_file_path, 165 | "sharpness": sharpness_value, 166 | "transform_matrix": T_WC 167 | } 168 | out["frames"].append(frame) 169 | num_frames = len(out["frames"]) 170 | up = up / np.linalg.norm(up) 171 | print(f"Found up vector {up}") 172 | 173 | # Rotate up vector to [0, 0, 1]. 174 | R = rotmat(up, [0, 0, 1]) 175 | R = np.pad(R, [0, 1]) 176 | R[-1, -1] = 1 177 | 178 | # Rotate the transforms so that the up vector is the z axis. 179 | for f in out["frames"]: 180 | f["transform_matrix"] = np.matmul(R, f["transform_matrix"]) 181 | 182 | # Find a central point all cameras are looking at. 183 | print("Computing center of attention...") 184 | total_weight = 0.0 185 | center_point = np.array([0.0, 0.0, 0.0]) 186 | for f in out["frames"]: 187 | mf = f["transform_matrix"][0:3, :] 188 | for g in out["frames"]: 189 | mg = g["transform_matrix"][0:3, :] 190 | p, W = closest_point_2_lines(mf[:, 3], mf[:, 2], mg[:, 3], mg[:, 2]) 191 | if W > 0.01: 192 | center_point += p * W 193 | total_weight += W 194 | center_point /= total_weight 195 | # Translate the cameras so that the world origin coincides with the central 196 | # point computed above. 197 | for f in out["frames"]: 198 | f["transform_matrix"][0:3, 3] -= center_point 199 | 200 | # Scale the world coordinate frame (i.e., scale the translation part of the 201 | # camera-to-world transforms) so that the scene fits within a "standard NeRF" 202 | # size. 203 | # In practice: 204 | # - `scale` is a value that gets multiplied to the translation part of the 205 | # poses when training, and scales the scene to a "standard NeRF size". 206 | # - Denoting as UOM the unit of measure of the training coordinates 207 | # resulting from the above scaling, the equivalent in meters of 1 UOM is 208 | # given by the value of one_uom_scene_to_one_m. 209 | # - During training, the pipeline will assume the scene to be bounded within 210 | # a cube [-bound, bound]^3 centered at the scene center (e.g., the object 211 | # center for object-centric scenes), where `bound` is a parameter that can 212 | # be set. This means that the scene will be assumed to be contained within 213 | # a (L1) distance of: 214 | # 215 | # bound [UOM] 216 | # = (bound * one_uom_scene_to_one_m) [m] 217 | # = (bound * 1 / scale) [m] 218 | # = bound * 1 / (1. / (avg_len[m])) 219 | # = (bound * avg_len) [m], 220 | # 221 | # where `avg_len[m]` is the average distance of the camera origins from 222 | # the scene center in meters. As an example, for an average distance of 80 cm, 223 | # the size of the cube containing the scene would be bound * 80 [cm]. 224 | # Setting `scale` to be equal to 1.0 / avg_len is an arbitrary decision to 225 | # fit the scene properly. Originally, it was 4.0 / avg_len in `instant-ngp`, 226 | # where it was described as scaling the scene to be "NeRF-sized". 227 | # The `bound` parameter can be set in the training pipeline. 228 | avg_len = 0. 
229 | for f in out["frames"]: 230 | avg_len += np.linalg.norm(f["transform_matrix"][0:3, 3]) 231 | avg_len /= num_frames 232 | 233 | scale = 1.0 / avg_len 234 | one_uom_scene_to_one_m = 1.0 / scale 235 | print(f"\033[94mAverage camera distance from origin = {avg_len} m (NOTE: " 236 | "Assuming the input UOM of the transforms was meters, which is the case " 237 | "when using `autolabel` to extract the poses).\033[0m") 238 | 239 | # Write the transforms to file. 240 | for f in out["frames"]: 241 | f["transform_matrix"] = f["transform_matrix"].tolist() 242 | 243 | out["scale"] = scale 244 | out["one_uom_scene_to_one_m"] = one_uom_scene_to_one_m 245 | 246 | with open(_output_transform_file, "w") as outfile: 247 | json.dump(out, outfile, indent=4) -------------------------------------------------------------------------------- /scripts/data/convert_replica.py: -------------------------------------------------------------------------------- 1 | """ 2 | Converts rendered replica scenes from https://github.com/Harry-Zhi/semantic_nerf 3 | to the autolabel scene format. 4 | 5 | usage: 6 | python scripts/data/convert_replica.py --out 7 | """ 8 | import pandas 9 | import argparse 10 | import cv2 11 | import json 12 | import tempfile 13 | import math 14 | import numpy as np 15 | import open3d as o3d 16 | import os 17 | import shutil 18 | import subprocess 19 | from tqdm import tqdm 20 | 21 | from autolabel.utils import Scene, transform_points 22 | 23 | 24 | def read_args(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("dataset") 27 | parser.add_argument("--out", type=str, required=True) 28 | return parser.parse_args() 29 | 30 | 31 | class SceneConverter: 32 | 33 | def __init__(self, scene, out_scene, metadata): 34 | self.out_scene = out_scene 35 | self.in_scene = scene 36 | self.metadata = metadata 37 | self._collect_paths() 38 | 39 | def _collect_paths(self): 40 | rgb_path = os.path.join(self.in_scene, 'rgb') 41 | depth_path = os.path.join(self.in_scene, 'depth') 42 | semantic_path = os.path.join(self.in_scene, 'semantic_class') 43 | instance_path = os.path.join(self.in_scene, 'instance') 44 | rgb_frames = [f for f in os.listdir(rgb_path) if f[0] != '.'] 45 | depth_frames = [f for f in os.listdir(depth_path) if f[0] != '.'] 46 | semantic_frames = [ 47 | f for f in os.listdir(semantic_path) 48 | if f[0] != '.' and 'semantic' in f 49 | ] 50 | instance_frames = [ 51 | f for f in os.listdir(instance_path) 52 | if f[0] != '.' 
and 'semantic_instance' in f 53 | ] 54 | rgb_frames = sorted(rgb_frames, 55 | key=lambda x: int(x.split('_')[-1].split('.')[0])) 56 | depth_frames = sorted(depth_frames, 57 | key=lambda x: int(x.split('_')[-1].split('.')[0])) 58 | semantic_frames = sorted( 59 | semantic_frames, key=lambda x: int(x.split('_')[-1].split('.')[0])) 60 | instance_frames = sorted( 61 | instance_frames, key=lambda x: int(x.split('_')[-1].split('.')[0])) 62 | self.rgb_frames = [] 63 | self.depth_frames = [] 64 | self.semantic_frames = [] 65 | self.instance_frames = [] 66 | for rgb, depth, semantic, instance in zip(rgb_frames, depth_frames, 67 | semantic_frames, instance_frames): 68 | self.rgb_frames.append(os.path.join(rgb_path, rgb)) 69 | self.depth_frames.append(os.path.join(depth_path, depth)) 70 | self.semantic_frames.append(os.path.join(semantic_path, semantic)) 71 | self.instance_frames.append(os.path.join(instance_path, instance)) 72 | 73 | def _copy_frames(self): 74 | self.rgb_out = os.path.join(self.out_scene, 'rgb') 75 | self.depth_out = os.path.join(self.out_scene, 'depth') 76 | self.semantic_out = os.path.join(self.out_scene, 'gt_semantic') 77 | self.instance_out = os.path.join(self.out_scene, 'gt_instance') 78 | os.makedirs(self.rgb_out, exist_ok=True) 79 | os.makedirs(self.depth_out, exist_ok=True) 80 | os.makedirs(self.semantic_out, exist_ok=True) 81 | os.makedirs(self.instance_out, exist_ok=True) 82 | 83 | semantic_frames = [] 84 | for i, (rgb, depth, semantic, instance) in enumerate( 85 | zip(tqdm(self.rgb_frames, desc="Copying frames"), 86 | self.depth_frames, self.semantic_frames, self.instance_frames)): 87 | rgb_out_path = os.path.join(self.rgb_out, f"{i:06}.png") 88 | depth_out_path = os.path.join(self.depth_out, f"{i:06}.png") 89 | semantic_out = os.path.join(self.semantic_out, f"{i:06}.png") 90 | instance_out_path = os.path.join(self.instance_out, f"{i:06}.png") 91 | shutil.copy(rgb, rgb_out_path) 92 | shutil.copy(depth, depth_out_path) 93 | shutil.copy(semantic, self.semantic_out) 94 | shutil.copy(instance, instance_out_path) 95 | 96 | metadata = { 'n_classes': int(self.metadata['id'].max() + 1) } 97 | metadata_path = os.path.join(self.out_scene, 'metadata.json') 98 | with open(metadata_path, 'w') as f: 99 | f.write(json.dumps(metadata, indent=2)) 100 | 101 | def _copy_trajectory(self): 102 | pose_dir = os.path.join(self.out_scene, 'pose') 103 | os.makedirs(pose_dir, exist_ok=True) 104 | trajectory = np.loadtxt(os.path.join(self.in_scene, 'traj_w_c.txt'), 105 | delimiter=' ').reshape(-1, 4, 4) 106 | for i, T_CW in enumerate(trajectory): 107 | pose_out = os.path.join(pose_dir, f"{i:06}.txt") 108 | np.savetxt(pose_out, np.linalg.inv(T_CW)) 109 | 110 | def _copy_intrinsics(self): 111 | width = 640 112 | height = 480 113 | hfov = 90.0 114 | fx = width / 2.0 / math.tan(math.radians(hfov / 2.0)) 115 | cx = (width - 1.0) / 2.0 116 | cy = (height - 1.0) / 2.0 117 | camera_matrix = np.eye(3) 118 | camera_matrix[0, 0] = fx 119 | camera_matrix[1, 1] = fx 120 | camera_matrix[0, 2] = cx 121 | camera_matrix[1, 2] = cy 122 | np.savetxt(os.path.join(self.out_scene, 'intrinsics.txt'), 123 | camera_matrix) 124 | 125 | def _compute_bounds(self): 126 | scene = Scene(self.out_scene) 127 | depth_frame = o3d.io.read_image(scene.depth_paths()[0]) 128 | depth_size = np.asarray(depth_frame).shape[::-1] 129 | K = scene.camera.scale(depth_size).camera_matrix 130 | intrinsics = o3d.camera.PinholeCameraIntrinsic(int(depth_size[0]), 131 | int(depth_size[1]), 132 | K[0, 0], K[1, 1], 133 | K[0, 2], K[1, 2]) 134 | pc = 
o3d.geometry.PointCloud() 135 | 136 | poses = scene.poses[::10] 137 | depths = scene.depth_paths()[::10] 138 | for T_CW, depth in zip(poses, tqdm(depths, desc="Computing bounds")): 139 | T_WC = np.linalg.inv(T_CW) 140 | depth = o3d.io.read_image(depth) 141 | 142 | pc_C = o3d.geometry.PointCloud.create_from_depth_image( 143 | depth, depth_scale=1000.0, intrinsic=intrinsics) 144 | pc_C = np.asarray(pc_C.points) 145 | pc_W = transform_points(T_WC, pc_C) 146 | 147 | pc += o3d.geometry.PointCloud( 148 | o3d.utility.Vector3dVector(pc_W)).uniform_down_sample(50) 149 | filtered, _ = pc.remove_statistical_outlier(nb_neighbors=20, 150 | std_ratio=2.0) 151 | aabb = filtered.get_axis_aligned_bounding_box() 152 | with open(os.path.join(scene.path, 'bbox.txt'), 'wt') as f: 153 | min_str = " ".join([str(x) for x in aabb.get_min_bound()]) 154 | max_str = " ".join([str(x) for x in aabb.get_max_bound()]) 155 | f.write(f"{min_str} {max_str} 0.01") 156 | 157 | def run(self): 158 | self._copy_frames() 159 | self._copy_trajectory() 160 | self._copy_intrinsics() 161 | self._compute_bounds() 162 | 163 | def create_labelmap(semantic_info_dir, out): 164 | metadata = os.path.join(semantic_info_dir, 'room_0', 'info_semantic.json') 165 | with open(metadata, 'r') as f: 166 | metadata = json.load(f) 167 | ids = [] 168 | prompts = [] 169 | for class_info in metadata['classes']: 170 | ids.append(class_info['id']) 171 | prompts.append(class_info['name']) 172 | data = pandas.DataFrame({'id': ids, 'name': prompts}) 173 | data.to_csv(out, index=False) 174 | return data 175 | 176 | 177 | def main(): 178 | flags = read_args() 179 | 180 | zip_files = [f for f in os.listdir(flags.dataset) if '.zip' in f] 181 | instance_zip = [f for f in zip_files if 'Instance' in f][0] 182 | 183 | tmpdir = tempfile.mkdtemp() 184 | try: 185 | success = subprocess.run(['unzip', os.path.join(flags.dataset, instance_zip), '-d', tmpdir]) 186 | if success.returncode != 0: 187 | raise RuntimeError("Failed to extract instance segmentation") 188 | success = subprocess.run(['unzip', os.path.join(flags.dataset, 'semantic_info.zip'), '-d', tmpdir]) 189 | if success.returncode != 0: 190 | raise RuntimeError("Failed to extract segmentation metadata") 191 | metadata = create_labelmap(os.path.join(tmpdir, 'semantic_info'), os.path.join(flags.out, 'label_map.csv')) 192 | 193 | for file in zip_files: 194 | if 'semantic_info' in file or 'Instance' in file or 'replica' in file: 195 | continue 196 | print("Extracting", file) 197 | scene_name = file.split('.')[0] 198 | tmp_scene_dir = os.path.join(tmpdir, scene_name) 199 | success = subprocess.run(['unzip', os.path.join(flags.dataset, file), '-d', tmpdir]) 200 | if success.returncode != 0: 201 | raise RuntimeError("Failed to extract scene") 202 | out_scene = os.path.join(flags.out, scene_name) 203 | os.makedirs(out_scene, exist_ok=True) 204 | in_scene = os.path.join(tmp_scene_dir, 'Sequence_1') 205 | scene_instance_zip = os.path.join(flags.dataset, 'Replica_Instance_Segmentation', scene_name, 'Sequence_1', 'semantic_instance.zip') 206 | success = subprocess.run(['unzip', scene_instance_zip, '-d', tmp_scene_dir]) 207 | if success.returncode != 0: 208 | raise RuntimeError("Failed to extract scene") 209 | success = subprocess.run(['mv', os.path.join(tmp_scene_dir, 'semantic_instance'), os.path.join(tmp_scene_dir, 'Sequence_1', 'instance')]) 210 | if success.returncode != 0: 211 | raise RuntimeError("Failed to move instance folder") 212 | converter = SceneConverter(in_scene, out_scene, metadata) 213 | converter.run() 214 
| shutil.rmtree(tmp_scene_dir) 215 | finally: 216 | shutil.rmtree(tmpdir) 217 | 218 | # Exporter(flags).run() 219 | 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /scripts/data/convert_hypersim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Converts hypersim scenes from https://github.com/apple/ml-hypersim 3 | to the autolabel scene format. 4 | 5 | usage: 6 | python scripts/data/convert_hypersim.py \ 7 | --out \ 8 | --ori-semantic-labels \ 9 | --camera-parameter-file 10 | """ 11 | import pandas as pd 12 | import argparse 13 | import cv2 14 | import json 15 | import math 16 | import numpy as np 17 | import open3d as o3d 18 | import os 19 | import glob 20 | from natsort import os_sorted 21 | import shutil 22 | from tqdm import tqdm 23 | import h5py 24 | 25 | 26 | from autolabel.utils import Scene, transform_points 27 | 28 | 29 | def read_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("dataset") 32 | parser.add_argument("--out", type=str, required=True) 33 | parser.add_argument("--ori-semantic-labels", type=str, required=True) 34 | parser.add_argument("--camera-parameter-file", type=str, required=True) 35 | return parser.parse_args() 36 | 37 | def load_distance_meters_to_depth(hdf_file, width=1024, height=768, focal=886.81): 38 | with h5py.File(hdf_file, "r") as f: 39 | depth_meters = f["dataset"][:].astype(np.float32) 40 | 41 | npyImageplaneX = np.linspace((-0.5 * width) + 0.5, (0.5 * width) - 0.5, width).reshape(1, width).repeat(height, 0).astype(np.float32)[:, :, None] 42 | npyImageplaneY = np.linspace((-0.5 * height) + 0.5, (0.5 * height) - 0.5, height).reshape(height, 1).repeat(width, 1).astype(np.float32)[:, :, None] 43 | npyImageplaneZ = np.full([height, width, 1], focal, np.float32) 44 | npyImageplane = np.concatenate([npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2) 45 | 46 | npyDepth = depth_meters / np.linalg.norm(npyImageplane, 2, 2) * focal 47 | return npyDepth 48 | 49 | def load_camera_poses(hdf_orientation, hdf_position, scale): 50 | 51 | with h5py.File(hdf_orientation, "r") as f: 52 | orientations = f["dataset"][:].astype(np.float32) 53 | 54 | with h5py.File(hdf_position, "r") as f: 55 | positions = f["dataset"][:].astype(np.float32) 56 | 57 | positions *= scale 58 | 59 | poses = [] 60 | trans = np.eye(3) 61 | trans[1, 1] = -1 62 | trans[2, 2] = -1 63 | 64 | for orientation, position in zip(orientations, positions): 65 | T_WC = np.eye(4) 66 | T_WC[:3, :3] = orientation @ trans 67 | T_WC[:3, 3] = position 68 | poses.append(T_WC) 69 | return poses 70 | 71 | 72 | class SceneConverter: 73 | 74 | def __init__(self, scene, out_scene, camera_settings, semantic_label_mapping): 75 | self.out_scene = out_scene 76 | self.in_scene = scene 77 | self._load_camera(camera_settings) 78 | self.semantic_label_mapping = pd.read_csv(semantic_label_mapping) # NYU 40 classes 79 | 80 | self._load_meta_data() 81 | 82 | def _load_camera(self, camera_settings): 83 | height = camera_settings['settings_output_img_height'] 84 | width = camera_settings['settings_output_img_width'] 85 | rate_unit_to_meter = camera_settings['settings_units_info_meters_scale'] 86 | fov_x = camera_settings['settings_camera_fov'] 87 | fx = width / 2.0 / math.tan(fov_x / 2) 88 | 89 | cx = (width - 1.0) / 2.0 90 | cy = (height - 1.0) / 2.0 91 | intrinsic = np.eye(3) 92 | intrinsic[0, 0] = fx 93 | intrinsic[1, 1] = fx 94 | intrinsic[0, 2] = cx 95 | intrinsic[1, 2] = cy 
96 | self.camera = { 97 | 'height': int(height), 'width': int(width), 98 | 'rate_unit_to_meter': rate_unit_to_meter, 99 | 'focal_length': fx, 100 | 'intrinsic': intrinsic 101 | } 102 | 103 | def _load_meta_data(self): 104 | cam_list = pd.read_csv(os.path.join(self.in_scene, '_detail', 'metadata_cameras.csv')) 105 | cam_list = cam_list['camera_name'].values.tolist() 106 | 107 | self.meta_data = { 108 | 'cam_list': cam_list 109 | } 110 | 111 | def _save_scene_metadata(self): 112 | metadata = { 'n_classes': int(self.semantic_label_mapping['id'].max()) } 113 | metadata_path = os.path.join(self.out_scene, 'metadata.json') 114 | with open(metadata_path, 'w') as f: 115 | f.write(json.dumps(metadata, indent=2)) 116 | 117 | def _collect_paths(self, cam): 118 | rgb_path = os.path.join(self.in_scene, 'images', f'scene_{cam}_final_preview', 'frame.*.color.jpg') 119 | depth_path = os.path.join(self.in_scene, 'images', f'scene_{cam}_geometry_hdf5', 'frame.*.depth_meters.hdf5') 120 | semantic_path = os.path.join(self.in_scene, 'images', f'scene_{cam}_geometry_hdf5', 'frame.*.semantic.hdf5') 121 | instance_path = os.path.join(self.in_scene, 'images', f'scene_{cam}_geometry_hdf5', 'frame.*.semantic_instance.hdf5') 122 | 123 | rgb_frames = glob.glob(rgb_path) 124 | depth_frames = glob.glob(depth_path) 125 | semantic_frames = glob.glob(semantic_path) 126 | instance_frames = glob.glob(instance_path) 127 | 128 | rgb_frames = os_sorted(rgb_frames) 129 | depth_frames = os_sorted(depth_frames) 130 | semantic_frames = os_sorted(semantic_frames) 131 | instance_frames = os_sorted(instance_frames) 132 | 133 | poses_T_WC = load_camera_poses( 134 | hdf_orientation=os.path.join(self.in_scene, '_detail', f'{cam}', 'camera_keyframe_orientations.hdf5'), 135 | hdf_position=os.path.join(self.in_scene, '_detail', f'{cam}', 'camera_keyframe_positions.hdf5'), 136 | scale=self.camera['rate_unit_to_meter'] 137 | ) 138 | return rgb_frames, depth_frames, semantic_frames, instance_frames, poses_T_WC 139 | 140 | def _copy_frames_and_trajectory(self, cam, rgb_frames, depth_frames, semantic_frames, instance_frames, poses_T_WC): 141 | rgb_out = os.path.join(self.out_scene, 'rgb') 142 | depth_out = os.path.join(self.out_scene, 'depth') 143 | semantic_out = os.path.join(self.out_scene, 'gt_semantic') 144 | instance_out = os.path.join(self.out_scene, 'gt_instance') 145 | os.makedirs(rgb_out, exist_ok=True) 146 | os.makedirs(depth_out, exist_ok=True) 147 | os.makedirs(semantic_out, exist_ok=True) 148 | os.makedirs(instance_out, exist_ok=True) 149 | 150 | pose_dir = os.path.join(self.out_scene, 'pose') 151 | os.makedirs(pose_dir, exist_ok=True) 152 | 153 | for (rgb, depth, semantic, instance, pose_T_WC) in zip(tqdm(rgb_frames, desc=f"Copying {cam} frames"), 154 | depth_frames, semantic_frames, instance_frames, poses_T_WC): 155 | rgb_out_path = os.path.join(rgb_out, f"{self.frame_index:06}.jpg") 156 | depth_out_path = os.path.join(depth_out, f"{self.frame_index:06}.png") 157 | semantic_out_path = os.path.join(semantic_out, f"{self.frame_index:06}.png") 158 | instance_out_path = os.path.join(instance_out, f"{self.frame_index:06}.png") 159 | 160 | shutil.copy(rgb, rgb_out_path) 161 | 162 | depth_img = load_distance_meters_to_depth( 163 | depth, self.camera['width'], self.camera['height'], self.camera['focal_length']) 164 | depth_img = (depth_img * 1000).astype(np.uint16) 165 | cv2.imwrite(depth_out_path, depth_img) 166 | 167 | with h5py.File(semantic, "r") as f: 168 | semantic_img = f["dataset"][:].astype(np.int16) 169 | semantic_img = 
(semantic_img + 1).astype(np.uint16) 170 | cv2.imwrite(semantic_out_path, semantic_img) 171 | 172 | with h5py.File(instance, "r") as f: 173 | instance_img = f["dataset"][:].astype(np.int16) 174 | instance_img = (instance_img + 1).astype(np.uint16) 175 | cv2.imwrite(instance_out_path, instance_img) 176 | 177 | pose_out_path = os.path.join(pose_dir, f"{self.frame_index:06}.txt") 178 | np.savetxt(pose_out_path, np.linalg.inv(pose_T_WC)) 179 | 180 | self.frame_index += 1 181 | 182 | def _copy_intrinsics(self): 183 | np.savetxt(os.path.join(self.out_scene, 'intrinsics.txt'), self.camera['intrinsic']) 184 | 185 | def _compute_bounds(self): 186 | scene = Scene(self.out_scene) 187 | depth_frame = o3d.io.read_image(scene.depth_paths()[0]) 188 | depth_size = np.asarray(depth_frame).shape[::-1] 189 | K = scene.camera.scale(depth_size).camera_matrix 190 | intrinsics = o3d.camera.PinholeCameraIntrinsic(int(depth_size[0]), 191 | int(depth_size[1]), 192 | K[0, 0], K[1, 1], 193 | K[0, 2], K[1, 2]) 194 | pc = o3d.geometry.PointCloud() 195 | 196 | poses = scene.poses#[::10] 197 | depths = scene.depth_paths()#[::10] 198 | for T_CW, depth in zip(poses, tqdm(depths, desc="Computing bounds")): 199 | T_WC = np.linalg.inv(T_CW) 200 | depth = o3d.io.read_image(depth) 201 | 202 | pc_C = o3d.geometry.PointCloud.create_from_depth_image( 203 | depth, depth_scale=1000.0, intrinsic=intrinsics) 204 | pc_C = np.asarray(pc_C.points) 205 | pc_W = transform_points(T_WC, pc_C) 206 | 207 | pc += o3d.geometry.PointCloud( 208 | o3d.utility.Vector3dVector(pc_W)).uniform_down_sample(50) 209 | filtered, _ = pc.remove_statistical_outlier(nb_neighbors=20, 210 | std_ratio=2.0) 211 | aabb = filtered.get_axis_aligned_bounding_box() 212 | with open(os.path.join(scene.path, 'bbox.txt'), 'wt') as f: 213 | min_str = " ".join([str(x) for x in aabb.get_min_bound()]) 214 | max_str = " ".join([str(x) for x in aabb.get_max_bound()]) 215 | f.write(f"{min_str} {max_str} 0.01") 216 | 217 | def run(self): 218 | self._save_scene_metadata() 219 | self._copy_intrinsics() 220 | 221 | self.frame_index = 0 222 | for cam in self.meta_data['cam_list']: 223 | rgb_frames, depth_frames, semantic_frames, instance_frames, poses_T_WC = self._collect_paths(cam) 224 | self._copy_frames_and_trajectory(cam, rgb_frames, depth_frames, semantic_frames, instance_frames, poses_T_WC) 225 | 226 | self._compute_bounds() 227 | 228 | 229 | def create_labelmap(semantic_labels, out): 230 | semantic_labels = pd.read_csv(semantic_labels) 231 | ids = [] 232 | prompts = [] 233 | for semantic_id, semantic_name in zip(semantic_labels['semantic_id '], semantic_labels[' semantic_name ']): 234 | ids.append(semantic_id + 1) 235 | prompts.append(semantic_name) 236 | data = pd.DataFrame({'id': ids, 'name': prompts}) 237 | data.to_csv(out, index=False) 238 | return data 239 | 240 | def main(): 241 | flags = read_args() 242 | 243 | os.makedirs(flags.out, exist_ok=True) 244 | 245 | label_map = create_labelmap( 246 | flags.ori_semantic_labels, 247 | os.path.join(flags.out, 'label_map.csv') 248 | ) 249 | 250 | all_camera_settings = pd.read_csv(flags.camera_parameter_file, 251 | index_col="scene_name") 252 | 253 | scene_names = os.listdir(flags.dataset) 254 | 255 | for scene_name in scene_names: 256 | print(f"Converting scene [{scene_name}] ...") 257 | in_scene = os.path.join(flags.dataset, scene_name) 258 | out_scene = os.path.join(flags.out, scene_name) 259 | os.makedirs(out_scene, exist_ok=True) 260 | 261 | converter = SceneConverter( 262 | scene=in_scene, out_scene=out_scene, 263 | 
camera_settings=all_camera_settings.loc[scene_name], 264 | semantic_label_mapping=os.path.join(flags.out, 'label_map.csv')) 265 | converter.run() 266 | 267 | 268 | if __name__ == "__main__": 269 | main() -------------------------------------------------------------------------------- /autolabel/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from torch_ngp.gridencoder import GridEncoder 7 | from torch_ngp.encoding import get_encoder 8 | from torch_ngp.activation import trunc_exp 9 | from torch_ngp.ffmlp import FFMLP 10 | import tinycudann as tcnn 11 | 12 | from torch_ngp.nerf.renderer import NeRFRenderer 13 | 14 | 15 | class FreqEncoder(nn.Module): 16 | 17 | def __init__(self, input_dim): 18 | super().__init__() 19 | self.encoder = tcnn.Encoding(input_dim, { 20 | "otype": "Frequency", 21 | "n_frequencies": 10 22 | }) 23 | self.n_output_dims = self.encoder.n_output_dims 24 | 25 | def forward(self, x, bound): 26 | normalized = (x + bound) / (2.0 * bound) 27 | return self.encoder(normalized) 28 | 29 | 30 | class HGFreqEncoder(nn.Module): 31 | 32 | def __init__(self, input_dim): 33 | super().__init__() 34 | self.encoder = tcnn.Encoding(input_dim, { 35 | "otype": "Frequency", 36 | "n_frequencies": 2 37 | }) 38 | self.grid_encoding = tcnn.Encoding( 39 | input_dim, { 40 | "otype": "Grid", 41 | "type": "Hash", 42 | "n_levels": 16, 43 | "n_features_per_level": 2, 44 | "log2_hashmap_size": 19, 45 | "base_resolution": 16, 46 | "per_level_scale": 2.0, 47 | "interpolation": "Linear" 48 | }) 49 | self.n_output_dims = self.encoder.n_output_dims + self.grid_encoding.n_output_dims 50 | 51 | def forward(self, x, bound): 52 | freq = self.encoder(x) 53 | normalized = (x + bound) / (2.0 * bound) 54 | # Sometimes samples might leak a bit outside the bounds. 55 | # This produces NaNs in the grid encoding, so we simply clip those points 56 | # assuming there aren't many of these. 
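        # (No clamp is applied in this snippet even though the comment above
        # mentions clipping; one way to realize it, offered here only as an
        # assumption and not taken from the original code, would be
        # `normalized = normalized.clamp(0.0, 1.0)`.)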
57 | grid = self.grid_encoding(normalized) 58 | return torch.cat([freq, grid], dim=-1) 59 | 60 | 61 | class ALNetwork(NeRFRenderer): 62 | 63 | def __init__(self, 64 | encoding='hg', 65 | num_layers=2, 66 | hidden_dim=64, 67 | geo_feat_dim=15, 68 | num_layers_color=3, 69 | hidden_dim_color=64, 70 | hidden_dim_semantic=64, 71 | contrastive_feat_dim=8, 72 | semantic_classes=2, 73 | bound=1, 74 | **kwargs): 75 | super().__init__(bound, **kwargs) 76 | 77 | # sigma network 78 | self.num_layers = num_layers 79 | self.hidden_dim = hidden_dim 80 | self.geo_feat_dim = geo_feat_dim 81 | 82 | # instance centers and clusterer 83 | self.instance_centers = None 84 | self.instance_clusterer = None 85 | 86 | self.encoder, self.in_dim = self._get_encoder(encoding) 87 | 88 | self.sigma_net = tcnn.Network(n_input_dims=self.in_dim, 89 | n_output_dims=1 + self.geo_feat_dim, 90 | network_config={ 91 | "otype": "FullyFusedMLP", 92 | "activation": "ReLU", 93 | "output_activation": "None", 94 | "n_neurons": self.hidden_dim, 95 | "n_hidden_layers": self.num_layers 96 | }) 97 | 98 | # color network 99 | self.num_layers_color = num_layers_color 100 | self.hidden_dim_color = hidden_dim_color 101 | self.encoder_dir = tcnn.Encoding(n_input_dims=3, 102 | encoding_config={ 103 | "otype": "SphericalHarmonics", 104 | "degree": 4 105 | }) 106 | self.color_features = self.encoder_dir.n_output_dims + self.geo_feat_dim 107 | 108 | self.color_net = tcnn.Network( 109 | n_input_dims=self.color_features, 110 | n_output_dims=3, 111 | network_config={ 112 | "otype": "FullyFusedMLP", 113 | "activation": "ReLU", 114 | "output_activation": "None", 115 | "n_neurons": self.hidden_dim_color, 116 | "n_hidden_layers": self.num_layers_color 117 | }) 118 | 119 | # hash encoding for features 120 | self.feature_encoder, self.feature_in_dim = self._get_encoder(encoding) 121 | 122 | # semantic features 123 | self.hidden_dim_semantic = hidden_dim_semantic 124 | self.semantic_classes = semantic_classes 125 | self.semantic_features = tcnn.Network( 126 | n_input_dims=self.geo_feat_dim, 127 | n_output_dims=self.hidden_dim_semantic, 128 | network_config={ 129 | "otype": "CutlassMLP", 130 | "activation": "ReLU", 131 | "output_activation": "None", 132 | "n_neurons": self.hidden_dim_semantic, 133 | "n_hidden_layers": 2 134 | }) 135 | 136 | # contrastive features 137 | self.contrastive_feat_dim = contrastive_feat_dim 138 | self.contrastive_features = tcnn.Network( 139 | n_input_dims=self.feature_in_dim, 140 | n_output_dims=self.contrastive_feat_dim, 141 | network_config={ 142 | "otype": "CutlassMLP", 143 | "activation": "ReLU", 144 | "output_activation": "None", 145 | "n_neurons": self.hidden_dim_semantic, 146 | "n_hidden_layers": 2 147 | }) 148 | 149 | def _get_encoder(self, encoding): 150 | if encoding == 'freq': 151 | encoder = FreqEncoder(3) 152 | return encoder, encoder.n_output_dims 153 | elif encoding == 'hg': 154 | return get_encoder('hashgrid', desired_resolution=2**18) 155 | elif encoding == 'hg+freq': 156 | encoder = HGFreqEncoder(3) 157 | return encoder, encoder.n_output_dims 158 | else: 159 | raise NotImplementedError(f"Unknown input encoding {encoding}") 160 | 161 | def forward(self, x, d): 162 | """ 163 | x: [N, 3], in [-bound, bound] points 164 | d: [N, 3], normalized to [-1, 1] viewing directions 165 | """ 166 | x_enc = self.encoder(x, bound=self.bound) 167 | h = self.sigma_net(x_enc) 168 | 169 | sigma = trunc_exp(h[..., 0]) 170 | geo_feat = F.relu(h[..., 1:]) 171 | 172 | d = self.encoder_dir(d) 173 | 174 | h = torch.cat([d, geo_feat], 
dim=-1) 175 | h = self.color_net(h) 176 | 177 | rgb = torch.sigmoid(h) 178 | 179 | x_feat = self.feature_encoder(x, bound=self.bound) 180 | semantic_features = self.semantic_features(geo_feat) 181 | 182 | contrastive_features = self.contrastive_features(x_feat) 183 | contrastive_features = F.normalize(contrastive_features) 184 | 185 | return sigma, rgb, semantic_features, contrastive_features 186 | 187 | def density(self, x): 188 | """ 189 | x: [N, 3] points in [-bound, bound] 190 | """ 191 | x = self.encoder(x, bound=self.bound) 192 | h = self.sigma_net(x) 193 | 194 | sigma = trunc_exp(h[..., 0]) 195 | geo_feat = h[..., 1:] 196 | 197 | return { 198 | 'sigma': sigma, 199 | 'geo_feat': geo_feat, 200 | } 201 | 202 | def color(self, x, d, mask=None, geo_feat=None, **kwargs): 203 | """ 204 | x: [N, 3] in [-bound, bound] 205 | mask: [N,], bool, indicates where we actually needs to compute rgb. 206 | """ 207 | if mask is not None: 208 | rgbs = torch.zeros(mask.shape[0], 3, dtype=x.dtype, 209 | device=x.device) # [N, 3] 210 | # in case of empty mask 211 | if not mask.any(): 212 | return rgbs 213 | x = x[mask] 214 | d = d[mask] 215 | geo_feat = geo_feat[mask] 216 | 217 | # TinyCudaNN SH encoding requires inputs to be in [0, 1]. 218 | d = (d + 1) / 2 219 | d = self.encoder_dir(d) 220 | 221 | h = torch.cat([d, geo_feat], dim=-1) 222 | 223 | h = self.color_net(h) 224 | 225 | h = torch.sigmoid(h) 226 | 227 | if mask is not None: 228 | rgbs[mask] = h.to(rgbs.dtype) 229 | else: 230 | rgbs = h 231 | 232 | return rgbs 233 | 234 | def get_params(self, lr): 235 | params = [{ 236 | 'params': self.encoder.parameters(), 237 | 'lr': lr 238 | }, { 239 | 'params': self.sigma_net.parameters(), 240 | 'lr': lr 241 | }, { 242 | 'params': self.encoder_dir.parameters(), 243 | 'lr': lr 244 | }, { 245 | 'params': self.color_net.parameters(), 246 | 'lr': lr 247 | }, { 248 | 'params': self.semantic_features.parameters(), 249 | 'lr': lr 250 | }, { 251 | 'params': self.feature_encoder.parameters(), 252 | 'lr': lr 253 | }, { 254 | 'params': self.contrastive_features.parameters(), 255 | 'lr': lr 256 | }] 257 | if self.bg_radius > 0: 258 | params.append({'params': self.encoder_bg.parameters(), 'lr': lr}) 259 | params.append({'params': self.bg_net.parameters(), 'lr': lr}) 260 | 261 | return params 262 | 263 | def semantic(self, geo_features): 264 | """ 265 | features: [N, D] geometric features 266 | sigma: [N, 1] density outputs 267 | returns: [N, C] semantic head outputs 268 | """ 269 | sem_features = self.semantic_features(geo_features) 270 | return sem_features 271 | 272 | def contrastive(self, x_feature_encoding, contrastive_ema=None): 273 | """ 274 | x: [N, 3] points in [-bound, bound] 275 | returns: [N, C] contrastive features 276 | """ 277 | if contrastive_ema is not None: 278 | with contrastive_ema.average_parameters(): 279 | con_features = self.contrastive_features(x_feature_encoding) 280 | else: 281 | con_features = self.contrastive_features(x_feature_encoding) 282 | # con_features = F.normalize(con_features) 283 | return con_features 284 | 285 | def network_parameters(self): 286 | """ 287 | return: list of parameters in the neural networks, excluding encoder parameters 288 | """ 289 | return (list(self.sigma_net.parameters()) + 290 | list(self.color_net.parameters()) + 291 | list(self.semantic_features.parameters()) + 292 | list(self.contrastive_features.parameters())) 293 | 294 | def encoder_parameters(self): 295 | """ 296 | return: list of parameters in the encoders 297 | """ 298 | return 
(list(self.encoder.parameters()) + 299 | list(self.feature_encoder.parameters())) 300 | 301 | def set_instance_centers(self, instance_centers): 302 | self.instance_centers = instance_centers 303 | 304 | def set_instance_clusterer(self, clusterer): 305 | self.instance_clusterer = clusterer 306 | 307 | 308 | class Autoencoder(nn.Module): 309 | 310 | def __init__(self, in_features, bottleneck): 311 | super().__init__() 312 | self.encoder = tcnn.Network(n_input_dims=in_features, 313 | n_output_dims=bottleneck, 314 | network_config={ 315 | "otype": "CutlassMLP", 316 | "activation": "ReLU", 317 | "output_activation": "ReLU", 318 | "n_neurons": 128, 319 | "n_hidden_layers": 1 320 | }) 321 | self.decoder = tcnn.Network(n_input_dims=bottleneck, 322 | n_output_dims=in_features, 323 | network_config={ 324 | "otype": "CutlassMLP", 325 | "activation": "ReLU", 326 | "output_activation": "None", 327 | "n_neurons": 128, 328 | "n_hidden_layers": 1 329 | }) 330 | 331 | def forward(self, x, p=0.1): 332 | code = self.encoder(x) 333 | out = self.decoder(F.dropout(code, 0.1)) 334 | return out, code 335 | -------------------------------------------------------------------------------- /scripts/demo_ui.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | import open3d as o3d 5 | from plyfile import PlyData 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | import threading 10 | import multiprocessing as mp 11 | from PyQt6 import QtWidgets 12 | from PyQt6 import QtCore 13 | from sklearn.metrics.pairwise import cosine_similarity 14 | from autolabel.constants import COLORS 15 | from autolabel.utils.feature_utils import get_feature_extractor 16 | from autolabel.dataset import SceneDataset 17 | from autolabel import utils, model_utils 18 | 19 | 20 | class PointCloudVisualizer: 21 | 22 | def __init__(self, flags, queue): 23 | self.flags = flags 24 | self.queue = queue 25 | self.visualizer = o3d.visualization.Visualizer() 26 | self.label_mapping = dict() 27 | self._load_scene_model() 28 | self._load_pointcloud() 29 | self._load_text_model() 30 | self._load_point_features() 31 | self._load_point_instance_ids() 32 | 33 | def _load_pointcloud(self): 34 | mesh_path = os.path.join(self.flags.scene, "mesh.ply") 35 | if not os.path.exists(mesh_path): 36 | raise ValueError(f"Mesh file {mesh_path} does not exist.") 37 | plydata = PlyData.read(mesh_path) 38 | points = np.hstack([ 39 | plydata['vertex']['x'].reshape(-1, 1), 40 | plydata['vertex']['y'].reshape(-1, 1), 41 | plydata['vertex']['z'].reshape(-1, 1) 42 | ]) 43 | points_rgb = np.hstack([ 44 | plydata['vertex']['red'].reshape(-1, 1), 45 | plydata['vertex']['green'].reshape(-1, 1), 46 | plydata['vertex']['blue'].reshape(-1, 1) 47 | ]) 48 | points_rgb = points_rgb.astype(np.float32) / 255.0 49 | aabb = np.loadtxt( 50 | os.path.join(self.flags.scene, 'bbox.txt') 51 | )[:6].reshape(2, 3) 52 | scene_center = (aabb[0] + aabb[1]) / 2 53 | points = points - scene_center 54 | fixed = np.zeros_like(points) 55 | fixed[:, 0] = points[:, 1] 56 | fixed[:, 1] = points[:, 2] 57 | fixed[:, 2] = points[:, 0] 58 | self.points = torch.tensor(fixed, dtype=torch.float16) 59 | self.point_infos = {'ori_rgb': points_rgb} 60 | self.pc = o3d.geometry.PointCloud() 61 | self.pc.points = o3d.utility.Vector3dVector(fixed) 62 | self.pc.colors = o3d.utility.Vector3dVector(points_rgb) 63 | # self.pc.paint_uniform_color([0.5, 0.5, 0.5]) 64 | self.visualizer.create_window() 65 | 
self.visualizer.add_geometry(self.pc) 66 | 67 | def _load_scene_model(self): 68 | models = list() 69 | nerf_dir = model_utils.get_nerf_dir(self.flags.scene, self.flags) 70 | if not os.path.exists(nerf_dir): 71 | raise ValueError(f"Model directory {nerf_dir} does not exist.") 72 | for model in os.listdir(nerf_dir): 73 | checkpoint_dir = os.path.join(nerf_dir, model, 'checkpoints') 74 | if os.path.exists(checkpoint_dir): 75 | models.append(model) 76 | model_path = os.path.join(nerf_dir, models[0]) 77 | print("Loading models: ", model_path) 78 | params = model_utils.read_params(model_path) 79 | dataset = SceneDataset('test', 80 | self.flags.scene, 81 | factor=4.0, 82 | batch_size=self.flags.batch_size, 83 | lazy=True) 84 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 85 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 86 | n_classes, params).cuda() 87 | checkpoint_dir = os.path.join(model_path, 'checkpoints') 88 | model_utils.load_checkpoint(model, checkpoint_dir) 89 | self.model = model.eval() 90 | 91 | def _load_text_model(self): 92 | self.extractor = get_feature_extractor('lseg', self.flags.checkpoint) 93 | 94 | def _load_point_features(self): 95 | semantic_features = self._point_features(points=self.points) 96 | self.point_infos['semantic'] = semantic_features 97 | 98 | def _load_point_instance_ids(self): 99 | instance_ids = self._point_instance_ids(points=self.points) 100 | self.point_infos['instance_id'] = instance_ids 101 | instance_colors = np.zeros((len(instance_ids), 3)) 102 | ins_ids = np.unique(instance_ids) 103 | for ins_id in ins_ids: 104 | if ins_id == 0: 105 | continue 106 | instance_colors[instance_ids == ins_id] = np.random.rand(3, ) 107 | self.point_infos['instance_colors'] = instance_colors 108 | 109 | def _denoise_semantic(self, pred_semantic_labels, pred_instance_labels): 110 | pred_semantic_denoised = np.copy(pred_semantic_labels) 111 | instance_ids = np.unique(pred_instance_labels) 112 | for ins_id in instance_ids: 113 | if ins_id == 0: 114 | continue 115 | 116 | semantic_ids = pred_semantic_labels[pred_instance_labels == ins_id] 117 | ids, cnts = np.unique(semantic_ids, return_counts=True) 118 | pred_semantic_denoised[pred_instance_labels == ins_id] = ids[np.argmax(cnts)] 119 | return pred_semantic_denoised 120 | 121 | def _update_colors(self, msg): 122 | print(msg) 123 | if isinstance(msg, list): 124 | prompts = msg 125 | if len(prompts) > 0: 126 | # prompts.append("others") 127 | text_features = self.extractor.encode_text(prompts) 128 | semantic_features = self._point_features() 129 | pred_instance_labels = self._point_instance_ids() 130 | similarities = torch.zeros( 131 | (semantic_features.shape[0], text_features.shape[0]), 132 | dtype=torch.float32, 133 | device=semantic_features.device) 134 | batch_size = 50000 135 | for i in range(0, semantic_features.shape[0], batch_size): 136 | batch = semantic_features[i:i + batch_size] 137 | for prompt_index in range(text_features.shape[0]): 138 | similarities[i:i + batch_size, prompt_index] = ( 139 | batch * text_features[prompt_index][None]).sum(dim=-1) 140 | 141 | update_mask, _ = similarities.max(dim=-1) 142 | update_mask = update_mask.cpu().numpy() > 0.85 143 | closest_prompt = similarities.argmax(dim=-1).cpu().numpy() 144 | denoised_closest_prompt = self._denoise_semantic(closest_prompt, pred_instance_labels) 145 | 146 | colors = np.asarray(self.pc.colors) 147 | colors[update_mask] = COLORS[denoised_closest_prompt[update_mask] % COLORS.shape[0]] / 255. 
148 | else: 149 | colors = self.point_infos['ori_rgb'] 150 | elif isinstance(msg, str): 151 | if msg == "show_instance": 152 | colors = self.point_infos['instance_colors'] 153 | else: 154 | raise ValueError("Not support msg type {}".format(type(msg))) 155 | self.pc.colors = o3d.utility.Vector3dVector(colors) 156 | self.visualizer.update_geometry(self.pc) 157 | 158 | def _point_features(self, points=None): 159 | if points is not None: 160 | out = [] 161 | for i in range(0, len(points), self.flags.batch_size): 162 | batch = points[i:i + self.flags.batch_size] 163 | batch = batch.cuda() 164 | with torch.no_grad(): 165 | density = self.model.density(batch) 166 | features = self.model.semantic(density['geo_feat']) 167 | features = features / torch.norm(features, dim=-1, keepdim=True) 168 | features = features.to(torch.float32) 169 | out.append(features) 170 | semantic_features = torch.cat(out, dim=0) 171 | else: 172 | semantic_features = self.point_infos['semantic'] 173 | return semantic_features 174 | 175 | def _point_instance_ids(self, points=None): 176 | if points is not None: 177 | pred_instances = [] 178 | for i in range(0, len(points), self.flags.batch_size): 179 | batch = points[i:i + self.flags.batch_size] 180 | batch = batch.cuda() 181 | with torch.no_grad(): 182 | xyz_feature_encoding = self.model.feature_encoder(batch, bound=self.model.bound) 183 | instance_feature = self.model.contrastive(xyz_feature_encoding, None) 184 | # instance_feature = instance_feature.reshape(-1, feature_dim) 185 | instance_feature = instance_feature.cpu().numpy() 186 | sim_mat = cosine_similarity(instance_feature, self.model.instance_centers) 187 | pred_instance = np.argmax(sim_mat, axis=1) + 1 # start from 1, 0 means noise 188 | pred_instances.append(pred_instance) 189 | instance_ids = np.concatenate(pred_instances, axis=0) 190 | else: 191 | instance_ids = self.point_infos['instance_id'] 192 | return instance_ids 193 | 194 | def run(self): 195 | while True: 196 | if not self.queue.empty(): 197 | msg = self.queue.get(False) 198 | self._update_colors(msg) 199 | self.visualizer.update_geometry(self.pc) 200 | if not self.visualizer.poll_events(): 201 | return 202 | self.visualizer.update_renderer() 203 | 204 | 205 | def run_visualizer(flags, queue): 206 | visualizer = PointCloudVisualizer(flags, queue) 207 | visualizer.run() 208 | 209 | 210 | class ListView(QtWidgets.QWidget): 211 | 212 | def __init__(self, parent=None): 213 | super().__init__(parent) 214 | self.layout = QtWidgets.QVBoxLayout() 215 | self.setLayout(self.layout) 216 | self.items = [] 217 | 218 | def add_item(self, item): 219 | index = len(self.items) 220 | color = COLORS[index % len(COLORS)] 221 | self.items.append(item) 222 | label = QtWidgets.QLabel(item) 223 | label.setMargin(20) 224 | label.setStyleSheet( 225 | f"background-color: rgb({color[0]}, {color[1]}, {color[2]});") 226 | self.layout.addWidget(label) 227 | self.update() 228 | 229 | def get_items(self): 230 | return self.items 231 | 232 | def reset(self): 233 | self.items = [] 234 | for i in reversed(range(self.layout.count())): 235 | self.layout.itemAt(i).widget().setParent(None) 236 | 237 | 238 | class SegmentingApplication(QtWidgets.QMainWindow): 239 | 240 | def __init__(self, queue): 241 | super().__init__() 242 | self.classes = [] 243 | self.setWindowTitle("Segmentation Classes") 244 | self.input_button = QtWidgets.QPushButton("Add") 245 | self.input_button.clicked.connect(self._add_class) 246 | self.reset_button = QtWidgets.QPushButton("Reset") 247 | 
self.reset_button.clicked.connect(self._reset_classes) 248 | self.show_instance_button = QtWidgets.QPushButton("Show all instances") 249 | self.show_instance_button.clicked.connect(self._show_all_instances) 250 | self.list_view = ListView() 251 | input_line = self._create_input_line() 252 | layout = QtWidgets.QVBoxLayout() 253 | layout.addWidget(self.list_view) 254 | layout.addWidget(input_line) 255 | main_widget = QtWidgets.QWidget() 256 | main_widget.setLayout(layout) 257 | self.setCentralWidget(main_widget) 258 | self.class_queue = queue 259 | 260 | def _create_input_line(self): 261 | layout = QtWidgets.QHBoxLayout() 262 | self.line_edit = QtWidgets.QLineEdit() 263 | self.line_edit.setPlaceholderText("Class description prompt") 264 | self.line_edit.returnPressed.connect(self._add_class) 265 | layout.addWidget(self.line_edit) 266 | layout.addWidget(self.input_button) 267 | layout.addWidget(self.reset_button) 268 | layout.addWidget(self.show_instance_button) 269 | widget = QtWidgets.QWidget() 270 | widget.setLayout(layout) 271 | return widget 272 | 273 | def keyPressEvent(self, event): 274 | if event.key() == QtCore.Qt.Key.Key_Escape: 275 | self.close() 276 | 277 | def _add_class(self): 278 | self.list_view.add_item(self.line_edit.text()) 279 | self.line_edit.clear() 280 | self._publish_classes() 281 | 282 | def _reset_classes(self): 283 | self.list_view.reset() 284 | self._publish_classes() 285 | 286 | def _show_all_instances(self): 287 | self.class_queue.put("show_instance") 288 | 289 | def _publish_classes(self): 290 | self.class_queue.put(self.list_view.get_items()) 291 | 292 | 293 | def main(): 294 | parser = argparse.ArgumentParser() 295 | parser.add_argument("scene", type=str) 296 | parser.add_argument('--workspace', default=None) 297 | parser.add_argument('--checkpoint', 298 | type=str, 299 | required=True, 300 | help='path to feature model checkpoint') 301 | # parser.add_argument('--model', type=str, default='model.pth') 302 | parser.add_argument('--batch-size', type=int, default=1024) 303 | flags = parser.parse_args() 304 | 305 | app = QtWidgets.QApplication(sys.argv) 306 | 307 | queue = mp.Queue() 308 | window = SegmentingApplication(queue) 309 | window.show() 310 | 311 | thread = threading.Thread(target=run_visualizer, args=(flags, queue)) 312 | thread.start() 313 | app.exec() 314 | thread.join() 315 | 316 | 317 | if __name__ == "__main__": 318 | main() -------------------------------------------------------------------------------- /scripts/data/convert_scannet.py: -------------------------------------------------------------------------------- 1 | description = """ 2 | """ 3 | import subprocess 4 | import math 5 | import argparse 6 | import shutil 7 | import json 8 | import pandas 9 | import zlib 10 | import imageio 11 | from argparse import RawTextHelpFormatter 12 | import os, struct 13 | import cv2 14 | import numpy as np 15 | import trimesh 16 | from scipy.spatial.transform import Rotation 17 | import open3d as o3d 18 | 19 | SCANNET20_IDS = [ 20 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39 21 | ] 22 | 23 | 24 | def read_args(): 25 | parser = argparse.ArgumentParser(description=description, 26 | formatter_class=RawTextHelpFormatter) 27 | parser.add_argument('scannet_scan_dir') 28 | parser.add_argument( 29 | '--label-map', 30 | required=True, 31 | help="Path to label map .tsv file with semantic label names and ids.") 32 | parser.add_argument('--out', required=True) 33 | parser.add_argument('--max-frames', 34 | type=int, 35 | default=750, 36 | 
help="Maximum number of frames to keep.") 37 | parser.add_argument('--stride', 38 | type=int, 39 | default=5, 40 | help="Use only every s-th frame.") 41 | parser.add_argument('--nyu40', 42 | action='store_true', 43 | help="Use NYU40 label map.") 44 | parser.add_argument('--scannet20', 45 | action='store_true', 46 | help="Use ScanNet20 evaluation label map.") 47 | return parser.parse_args() 48 | 49 | 50 | class LabelHelper: 51 | 52 | def __init__(self, label_path, flags): 53 | self.remapping = {} 54 | self.prompt_remap = {} 55 | label_map = pandas.read_csv(label_path, sep='\t') 56 | mapping = np.zeros(label_map['id'].values.max() + 1, np.uint16) 57 | if flags.nyu40 or flags.scannet20: 58 | ids = np.arange(1, 41) 59 | texts = [] 60 | for i in ids: 61 | text = label_map['nyu40class'][label_map['nyu40id'] == 62 | i].values[0] 63 | texts.append(text) 64 | for i, num in zip(label_map['id'].values, 65 | label_map['nyu40id'].values): 66 | mapping[i] = num 67 | else: 68 | texts = label_map['raw_category'].values.tolist() 69 | ids = np.arange(1, len(texts) + 1) 70 | for i, num in zip(label_map['id'].values, ids): 71 | mapping[i] = num 72 | 73 | if flags.scannet20: 74 | mapping[np.isin(mapping, SCANNET20_IDS) == False] = 0 75 | texts = [text for text, i in zip(texts, ids) if i in SCANNET20_IDS] 76 | ids = ids[np.isin(ids, SCANNET20_IDS)] 77 | 78 | self.label_text_to_id = {} 79 | for num, text in zip(label_map['id'], label_map['raw_category']): 80 | self.label_text_to_id[text] = num 81 | self.mapping = mapping 82 | 83 | self.label_map = pandas.DataFrame({'id': ids, 'prompt': texts}) 84 | self.classes_in_scene = set() 85 | 86 | def reset(self): 87 | self.classes_in_scene = set() 88 | 89 | def _read_config(self, path): 90 | with open(path, 'rt') as f: 91 | return json.load(f) 92 | 93 | def write_labelmap(self, out): 94 | label_map_out = os.path.join(out, 'label_map.csv') 95 | self.label_map.to_csv(label_map_out, index=False) 96 | 97 | def map_semantics(self, semantic_frame): 98 | return self.mapping[semantic_frame] 99 | 100 | def register_frame(self, frame): 101 | for i in np.unique(frame): 102 | self.classes_in_scene.add(int(i)) 103 | 104 | def label_ids(self): 105 | return self.label_map['id'].values 106 | 107 | def label_to_id(self, label_name): 108 | scannet_id = self.label_text_to_id[label_name] 109 | return self.mapping[scannet_id] 110 | 111 | 112 | def write_intrinsics(out, sensor_reader): 113 | intrinsics = sensor_reader.intrinsic_color 114 | intrinsics_path = os.path.join(out, "intrinsics.txt") 115 | np.savetxt(intrinsics_path, intrinsics) 116 | 117 | 118 | def write_metadata(out, label_helper): 119 | metadata_path = os.path.join(out, "metadata.json") 120 | metadata = { 121 | "n_classes": int(label_helper.label_ids().max()), 122 | 'classes': list(sorted(label_helper.classes_in_scene)) 123 | } 124 | with open(metadata_path, 'w') as f: 125 | f.write(json.dumps(metadata, indent=2)) 126 | 127 | 128 | def read_aggregation(filename): 129 | """From https://github.com/ScanNet/ScanNet""" 130 | assert os.path.isfile(filename) 131 | object_id_to_segs = {} 132 | label_to_segs = {} 133 | with open(filename) as f: 134 | data = json.load(f) 135 | num_objects = len(data['segGroups']) 136 | for i in range(num_objects): 137 | object_id = data['segGroups'][i][ 138 | 'objectId'] + 1 # instance ids should be 1-indexed 139 | label = data['segGroups'][i]['label'] 140 | segs = data['segGroups'][i]['segments'] 141 | object_id_to_segs[object_id] = segs 142 | if label in label_to_segs: 143 | 
label_to_segs[label].extend(segs) 144 | else: 145 | label_to_segs[label] = segs 146 | return object_id_to_segs, label_to_segs 147 | 148 | 149 | def read_segmentation(filename): 150 | """From https://github.com/ScanNet/ScanNet""" 151 | assert os.path.isfile(filename) 152 | seg_to_verts = {} 153 | with open(filename) as f: 154 | data = json.load(f) 155 | num_verts = len(data['segIndices']) 156 | for i in range(num_verts): 157 | seg_id = data['segIndices'][i] 158 | if seg_id in seg_to_verts: 159 | seg_to_verts[seg_id].append(i) 160 | else: 161 | seg_to_verts[seg_id] = [i] 162 | return seg_to_verts, num_verts 163 | 164 | 165 | def copy_3d_semantics(scene_in, scene, scene_out, label_helper): 166 | mesh_path = os.path.join(scene_in, f"{scene}_vh_clean_2.ply") 167 | aggregation = os.path.join(scene_in, f"{scene}.aggregation.json") 168 | segments = os.path.join(scene_in, f"{scene}_vh_clean_2.0.010000.segs.json") 169 | mesh = trimesh.load(mesh_path) 170 | label_ids = np.zeros((mesh.vertices.shape[0],), dtype=np.uint16) 171 | object_id_to_seg, label_to_segs = read_aggregation(aggregation) 172 | seg_to_vertex, num_vertices = read_segmentation(segments) 173 | for label, segs in label_to_segs.items(): 174 | label_id = label_helper.label_to_id(label) 175 | for seg in segs: 176 | verts = seg_to_vertex[seg] 177 | for vertex in verts: 178 | try: 179 | label_ids[vertex] = label_id 180 | except IndexError: 181 | print( 182 | f"Index error for {scene} vertex {vertex} and seg: {seg}" 183 | ) 184 | 185 | out_mesh = os.path.join(scene_out, 'mesh.ply') 186 | mesh.export(out_mesh) 187 | out_mesh_semantics = os.path.join(scene_out, 'mesh_labels.npy') 188 | np.save(out_mesh_semantics, label_ids) 189 | 190 | 191 | class RGBDFrame(): 192 | 193 | def load(self, file_handle): 194 | self.camera_to_world = np.asarray(struct.unpack( 195 | 'f' * 16, file_handle.read(16 * 4)), 196 | dtype=np.float32).reshape(4, 4) 197 | self.timestamp_color = struct.unpack('Q', file_handle.read(8))[0] 198 | self.timestamp_depth = struct.unpack('Q', file_handle.read(8))[0] 199 | self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0] 200 | self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0] 201 | self.color_data = b''.join( 202 | struct.unpack('c' * self.color_size_bytes, 203 | file_handle.read(self.color_size_bytes))) 204 | self.depth_data = b''.join( 205 | struct.unpack('c' * self.depth_size_bytes, 206 | file_handle.read(self.depth_size_bytes))) 207 | 208 | 209 | class SensReader: 210 | 211 | def __init__(self, sens_file): 212 | self.file = sens_file 213 | self.file_handle = None 214 | self.num_frames = None 215 | self.rgb_size = None 216 | self.depth_size = None 217 | 218 | def __enter__(self): 219 | self.file_handle = open(self.file, 'rb') 220 | f = self.file_handle 221 | version = struct.unpack('I', f.read(4))[0] 222 | assert version == 4 223 | strlen = struct.unpack('Q', f.read(8))[0] 224 | self.sensor_name = ''.join([ 225 | c.decode('utf-8') 226 | for c in struct.unpack('c' * strlen, f.read(strlen)) 227 | ]) 228 | self.intrinsic_color = np.asarray(struct.unpack('f' * 16, 229 | f.read(16 * 4)), 230 | dtype=np.float32).reshape(4, 4) 231 | self.extrinsic_color = np.asarray(struct.unpack('f' * 16, 232 | f.read(16 * 4)), 233 | dtype=np.float32).reshape(4, 4) 234 | self.intrinsic_depth = np.asarray(struct.unpack('f' * 16, 235 | f.read(16 * 4)), 236 | dtype=np.float32).reshape(4, 4) 237 | self.extrinsic_depth = np.asarray(struct.unpack('f' * 16, 238 | f.read(16 * 4)), 239 | dtype=np.float32).reshape(4, 4) 240 | 
color_compression_type = struct.unpack('i', f.read(4))[0] 241 | depth_compression_type = struct.unpack('i', f.read(4))[0] 242 | color_width = struct.unpack('I', f.read(4))[0] 243 | color_height = struct.unpack('I', f.read(4))[0] 244 | self.rgb_size = (color_width, color_height) 245 | depth_width = struct.unpack('I', f.read(4))[0] 246 | depth_height = struct.unpack('I', f.read(4))[0] 247 | self.depth_size = (depth_width, depth_height) 248 | depth_shift = struct.unpack('f', f.read(4))[0] 249 | self.num_frames = struct.unpack('Q', f.read(8))[0] 250 | return self 251 | 252 | def __exit__(self, *args): 253 | self.file_handle.close() 254 | 255 | def read(self): 256 | for i in range(self.num_frames): 257 | frame = RGBDFrame() 258 | frame.load(self.file_handle) 259 | rgb_frame = imageio.v3.imread(frame.color_data) 260 | depth_frame = zlib.decompress(frame.depth_data) 261 | depth_frame = np.frombuffer(depth_frame, dtype=np.uint16).reshape( 262 | self.depth_size[1], self.depth_size[0]) 263 | yield frame.camera_to_world, rgb_frame, depth_frame 264 | 265 | 266 | def main(): 267 | flags = read_args() 268 | 269 | os.makedirs(flags.out, exist_ok=True) 270 | 271 | label_helper = LabelHelper(flags.label_map, flags) 272 | label_helper.write_labelmap(flags.out) 273 | 274 | scenes = os.listdir(flags.scannet_scan_dir) 275 | 276 | for scene in scenes: 277 | # Reset classes in scene. 278 | label_helper.reset() 279 | scene_dir_in = os.path.join(flags.scannet_scan_dir, scene) 280 | sensor_file = os.path.join(flags.scannet_scan_dir, scene, 281 | f"{scene}.sens") 282 | semantic_dir_in = os.path.join(flags.scannet_scan_dir, scene, 283 | "label-filt") 284 | if not os.path.exists(semantic_dir_in): 285 | label_filt_zip = os.path.join(flags.scannet_scan_dir, scene, 286 | f"{scene}_2d-label-filt.zip") 287 | subprocess.call(['unzip', label_filt_zip, '-d', scene_dir_in]) 288 | 289 | instance_dir_in = os.path.join(flags.scannet_scan_dir, scene, 290 | "instance-filt") 291 | if not os.path.exists(instance_dir_in): 292 | instance_filt_zip = os.path.join(flags.scannet_scan_dir, scene, 293 | f"{scene}_2d-instance-filt.zip") 294 | subprocess.call(['unzip', instance_filt_zip, '-d', scene_dir_in]) 295 | 296 | rgb_dir = os.path.join(flags.out, scene, "rgb") 297 | depth_dir = os.path.join(flags.out, scene, "depth") 298 | pose_dir = os.path.join(flags.out, scene, "pose") 299 | semantic_dir = os.path.join(flags.out, scene, "gt_semantic") 300 | instance_dir = os.path.join(flags.out, scene, "gt_instance") 301 | os.makedirs(rgb_dir, exist_ok=True) 302 | os.makedirs(depth_dir, exist_ok=True) 303 | os.makedirs(pose_dir, exist_ok=True) 304 | os.makedirs(semantic_dir, exist_ok=True) 305 | os.makedirs(instance_dir, exist_ok=True) 306 | 307 | copy_3d_semantics(os.path.join(flags.scannet_scan_dir, scene), scene, 308 | os.path.join(flags.out, scene), label_helper) 309 | 310 | semantic_files = os.listdir(semantic_dir_in) 311 | semantic_files = sorted(semantic_files, 312 | key=lambda x: int(x.split('.')[0])) 313 | 314 | instance_files = os.listdir(instance_dir_in) 315 | instance_files = sorted(instance_files, 316 | key=lambda x: int(x.split('.')[0])) 317 | 318 | scene_out = os.path.join(flags.out, scene) 319 | max_frames = 750 320 | with SensReader(sensor_file) as reader: 321 | 322 | write_intrinsics(scene_out, reader) 323 | stride = max(math.ceil(reader.num_frames / max_frames), 324 | flags.stride) 325 | for i, ((T_WC, rgb, depth), semantic_file, 326 | instance_file) in enumerate( 327 | zip(reader.read(), semantic_files, instance_files)): 328 | if 
i % flags.stride != 0: 329 | continue 330 | print("Processing frame %d" % i, end='\r') 331 | if np.isnan(T_WC).any() or np.isinf(T_WC).any(): 332 | print("Skipping frame %d" % i, "because of nan or inf.") 333 | continue 334 | T_CW = np.linalg.inv(T_WC) 335 | number = f"{i:06}" 336 | rgb_path = os.path.join(rgb_dir, f"{number}.jpg") 337 | depth_path = os.path.join(depth_dir, f"{number}.png") 338 | pose_path = os.path.join(pose_dir, f"{number}.txt") 339 | imageio.imwrite(rgb_path, rgb) 340 | cv2.imwrite(depth_path, depth) 341 | np.savetxt(pose_path, T_CW) 342 | 343 | semantic_path = os.path.join(semantic_dir, f"{number}.png") 344 | semantic_frame = cv2.imread( 345 | os.path.join(semantic_dir_in, semantic_file), -1) 346 | out_semantic = label_helper.map_semantics(semantic_frame) 347 | label_helper.register_frame(out_semantic) 348 | cv2.imwrite(semantic_path, out_semantic) 349 | 350 | instance_out = os.path.join(instance_dir, f"{number}.png") 351 | instance_path = os.path.join(instance_dir_in, instance_file) 352 | instance_frame = cv2.imread(instance_path, -1) 353 | # Remove instances which belong to classes which are not in the label set. 354 | # 0 means undefined, and shot not be evaluated on. 355 | # An object not in the labelset could easily occlude a labelset object. 356 | instance_frame[out_semantic <= 0] = 0 357 | cv2.imwrite(instance_out, instance_frame) 358 | 359 | write_metadata(scene_out, label_helper) 360 | subprocess.call([ 361 | 'python', 'scripts/compute_scene_bounds.py', 362 | os.path.join(flags.out, scene) 363 | ]) 364 | 365 | shutil.rmtree(os.path.join(semantic_dir_in)) 366 | shutil.rmtree(os.path.join(instance_dir_in)) 367 | 368 | 369 | if __name__ == "__main__": 370 | main() 371 | -------------------------------------------------------------------------------- /scripts/mapping.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import numpy as np 5 | import pycolmap 6 | import tempfile 7 | import cv2 8 | import open3d as o3d 9 | from pathlib import Path 10 | from autolabel.utils import Scene, transform_points, Camera 11 | from autolabel.undistort import ImageUndistorter 12 | from hloc import (extract_features, match_features, reconstruction, 13 | pairs_from_exhaustive, pairs_from_retrieval) 14 | from hloc.utils import viz_3d 15 | 16 | 17 | def read_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('scene', help="Scene to infer poses for.") 20 | parser.add_argument('--debug', action='store_true') 21 | parser.add_argument('--vis', action='store_true') 22 | return parser.parse_args() 23 | 24 | 25 | class HLoc: 26 | 27 | def __init__(self, tmp_dir, scene, flags): 28 | self.flags = flags 29 | self.scene = scene 30 | self.scene_path = Path(self.scene.path) 31 | self.exhaustive = len((self.scene.raw_rgb_paths())) < 250 32 | 33 | self.tmp_dir = Path(tmp_dir) 34 | self.sfm_pairs = self.tmp_dir / 'sfm-pairs.txt' 35 | self.loc_pairs = self.tmp_dir / 'sfm-pairs-loc.txt' 36 | self.features = self.tmp_dir / 'features.h5' 37 | self.matches = self.tmp_dir / 'matches.h5' 38 | self.feature_conf = extract_features.confs['superpoint_aachen'] 39 | self.retrieval_conf = extract_features.confs['netvlad'] 40 | self.matcher_conf = match_features.confs['superglue'] 41 | 42 | def _run_sfm(self): 43 | image_dir = Path(self.scene.path) / 'raw_rgb' 44 | image_list = [] 45 | image_paths = self.scene.raw_rgb_paths() 46 | image_list_path = [] 47 | indices = np.arange(len(image_paths)) 48 | for 
index in indices: 49 | image_list.append(image_paths[index]) 50 | image_list_path.append( 51 | str(Path(image_paths[index]).relative_to(image_dir))) 52 | if self.exhaustive: 53 | extract_features.main(self.feature_conf, 54 | image_dir, 55 | feature_path=self.features, 56 | image_list=image_list_path) 57 | pairs_from_exhaustive.main(self.sfm_pairs, 58 | image_list=image_list_path) 59 | match_features.main(self.matcher_conf, 60 | self.sfm_pairs, 61 | features=self.features, 62 | matches=self.matches) 63 | model = reconstruction.main( 64 | self.tmp_dir, 65 | image_dir, 66 | self.sfm_pairs, 67 | self.features, 68 | self.matches, 69 | image_list=image_list_path, 70 | camera_mode=pycolmap.CameraMode.SINGLE, 71 | image_options={'camera_model': "OPENCV"}, 72 | mapper_options={ 73 | 'ba_refine_principal_point': True, 74 | 'ba_refine_extra_params': True, 75 | 'ba_refine_focal_length': True 76 | }) 77 | else: 78 | retrieval_path = extract_features.main(self.retrieval_conf, 79 | image_dir, 80 | self.tmp_dir, 81 | image_list=image_list_path) 82 | pairs_from_retrieval.main(retrieval_path, 83 | self.sfm_pairs, 84 | num_matched=50) 85 | feature_path = extract_features.main(self.feature_conf, 86 | image_dir, 87 | self.tmp_dir, 88 | image_list=image_list_path) 89 | match_path = match_features.main(self.matcher_conf, 90 | self.sfm_pairs, 91 | self.feature_conf['output'], 92 | self.tmp_dir, 93 | matches=self.matches) 94 | model = reconstruction.main( 95 | self.tmp_dir, 96 | image_dir, 97 | self.sfm_pairs, 98 | feature_path, 99 | match_path, 100 | image_list=image_list_path, 101 | camera_mode=pycolmap.CameraMode.SINGLE, 102 | image_options={'camera_model': "OPENCV"}, 103 | mapper_options={ 104 | 'ba_refine_principal_point': True, 105 | 'ba_refine_extra_params': True, 106 | 'ba_refine_focal_length': True 107 | }) 108 | 109 | if self.flags.vis: 110 | fig = viz_3d.init_figure() 111 | viz_3d.plot_reconstruction(fig, 112 | model, 113 | color='rgba(255,0,0,0.5)', 114 | name="mapping") 115 | fig.show() 116 | 117 | if self.flags.debug: 118 | # Save mapping metadata if running in debug mode. 119 | colmap_output_dir = os.path.join(self.scene.path, 'colmap_output') 120 | os.makedirs(colmap_output_dir, exist_ok=True) 121 | model.write_text(colmap_output_dir) 122 | 123 | # Save the intrinsics matrix and the distortion parameters. 
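        # COLMAP's OPENCV camera model stores its parameters in the order
        # (fx, fy, cx, cy, k1, k2, p1, p2); they are unpacked below into the
        # 3x3 camera matrix and the distortion coefficient vector.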
124 | assert (len(model.cameras) == 1 and 1 in model.cameras) 125 | (focal_length_x, focal_length_y, c_x, c_y, k_1, k_2, p_1, 126 | p_2) = model.cameras[1].params 127 | self.colmap_K = np.eye(3) 128 | self.colmap_K[0, 0] = focal_length_x 129 | self.colmap_K[1, 1] = focal_length_y 130 | self.colmap_K[0, 2] = c_x 131 | self.colmap_K[1, 2] = c_y 132 | self.colmap_distortion_params = np.array([k_1, k_2, p_1, p_2]) 133 | np.savetxt(fname=os.path.join(self.scene.path, 'intrinsics.txt'), 134 | X=self.colmap_K) 135 | np.savetxt(fname=os.path.join(self.scene.path, 136 | 'distortion_parameters.txt'), 137 | X=self.colmap_distortion_params) 138 | 139 | def _undistort_images(self): 140 | print("Undistorting images according to the estimated intrinsics...") 141 | undistorted_image_folder = os.path.join(self.scene.path, "rgb") 142 | undistorted_depth_folder = os.path.join(self.scene.path, "depth") 143 | os.makedirs(undistorted_image_folder, exist_ok=True) 144 | os.makedirs(undistorted_depth_folder, exist_ok=True) 145 | 146 | color_undistorter = ImageUndistorter(K=self.colmap_K, 147 | D=self.colmap_distortion_params, 148 | H=self.scene.camera.size[1], 149 | W=self.scene.camera.size[0]) 150 | 151 | depth_camera = Camera(self.colmap_K, self.scene.camera.size).scale( 152 | self.scene.depth_size()) 153 | depth_undistorter = ImageUndistorter(K=depth_camera.camera_matrix, 154 | D=self.colmap_distortion_params, 155 | H=depth_camera.size[1], 156 | W=depth_camera.size[0]) 157 | 158 | # Undistort all the images and save the undistorted versions. 159 | image_paths = self.scene.raw_rgb_paths() 160 | for image_path in image_paths: 161 | image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) 162 | 163 | undistorted_image = color_undistorter.undistort_image(image=image) 164 | cv2.imwrite(img=undistorted_image, 165 | filename=os.path.join(undistorted_image_folder, 166 | os.path.basename(image_path))) 167 | 168 | depth_paths = self.scene.raw_depth_paths() 169 | for depth_path in depth_paths: 170 | depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED) 171 | 172 | undistorted_depth = depth_undistorter.undistort_image(image=depth) 173 | cv2.imwrite(img=undistorted_depth, 174 | filename=os.path.join(undistorted_depth_folder, 175 | os.path.basename(depth_path))) 176 | 177 | def run(self): 178 | self._run_sfm() 179 | self._undistort_images() 180 | 181 | 182 | class ScaleEstimation: 183 | min_depth = 0.05 184 | 185 | def __init__(self, scene, colmap_dir): 186 | self.scene = scene 187 | self.colmap_dir = colmap_dir 188 | self.reconstruction = pycolmap.Reconstruction(colmap_dir) 189 | self._read_trajectory() 190 | self._read_depth_maps() 191 | 192 | def _read_depth_maps(self): 193 | self.depth_maps = {} 194 | for path in self.scene.depth_paths(): 195 | frame_name = os.path.basename(path).split('.')[0] 196 | self.depth_maps[frame_name] = cv2.imread(path, -1) / 1000.0 197 | depth_shape = next(iter(self.depth_maps.values())).shape 198 | depth_size = np.array([depth_shape[1], depth_shape[0]], 199 | dtype=np.float64) 200 | self.depth_to_color_ratio = depth_size / np.array( 201 | self.scene.camera.size, dtype=np.float64) 202 | 203 | def _read_trajectory(self): 204 | poses = [] 205 | for image in self.reconstruction.images.values(): 206 | T_CW = np.eye(4) 207 | T_CW[:3, :3] = image.rotmat() 208 | T_CW[:3, 3] = image.tvec 209 | frame_name = image.name.split('.')[0] 210 | poses.append((frame_name, T_CW)) 211 | self.poses = dict(poses) 212 | 213 | def _lookup_depth(self, frame, xy): 214 | xy_depth = np.floor(self.depth_to_color_ratio * 
xy).astype(int) 215 | return self.depth_maps[frame][xy_depth[1], xy_depth[0]] 216 | 217 | def _estimate_scale(self): 218 | images = self.reconstruction.images 219 | point_depths = [] 220 | measured_depths = [] 221 | for image in images.values(): 222 | frame_name = image.name.split('.')[0] 223 | points = image.get_valid_points2D() 224 | points3D = self.reconstruction.points3D 225 | for point in points: 226 | depth_map_value = self._lookup_depth(frame_name, point.xy) 227 | 228 | if depth_map_value < self.min_depth: 229 | continue 230 | 231 | T_CW = self.poses[frame_name] 232 | point3D = points3D[point.point3D_id] 233 | 234 | p_C = transform_points(T_CW, point3D.xyz) 235 | measured_depths.append(depth_map_value) 236 | point_depths.append(p_C[2]) 237 | 238 | point_depths = np.stack(point_depths) 239 | measured_depths = np.stack(measured_depths) 240 | scales = measured_depths / point_depths 241 | return self._ransac(scales) 242 | 243 | def _ransac(self, scales): 244 | best_set = None 245 | best_inlier_count = 0 246 | indices = np.arange(0, scales.shape[0]) 247 | inlier_threshold = np.median(scales) * 1e-2 248 | for i in range(10000): 249 | selected = np.random.choice(indices) 250 | estimate = scales[selected] 251 | inliers = np.abs(scales - estimate) < inlier_threshold 252 | inlier_count = inliers.sum() 253 | if inlier_count > best_inlier_count: 254 | best_set = scales[inliers] 255 | best_inlier_count = inlier_count 256 | print( 257 | f"Scale estimation inlier count: {best_inlier_count} / {scales.size}" 258 | ) 259 | return np.mean(best_set) 260 | 261 | def _scale_poses(self, ratio): 262 | scaled_poses = {} 263 | for key, pose in self.poses.items(): 264 | new_pose = pose.copy() 265 | new_pose[:3, 3] *= ratio 266 | scaled_poses[key] = new_pose 267 | return scaled_poses 268 | 269 | def run(self): 270 | scale_ratio = self._estimate_scale() 271 | return self._scale_poses(scale_ratio) 272 | 273 | 274 | class PoseSaver: 275 | 276 | def __init__(self, scene, scaled_poses): 277 | self.scene = scene 278 | self.poses = scaled_poses 279 | 280 | def compute_bbox(self, poses): 281 | """ 282 | poses: Metrically scaled transforms from camera to world frame. 283 | """ 284 | # Compute axis-aligned bounding box of the depth values in world frame. 285 | # Then get the center. 
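        # In outline: back-project a subsampled set of depth frames into world
        # coordinates, accumulate them into one point cloud, drop statistical
        # outliers, fit an oriented bounding box, and re-express the bounds
        # relative to the box center so the scene is roughly origin-centered.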
286 | min_bounds = np.zeros(3) 287 | max_bounds = np.zeros(3) 288 | depth_frame = o3d.io.read_image(self.scene.depth_paths()[0]) 289 | depth_size = np.asarray(depth_frame).shape[::-1] 290 | K = self.scene.camera.scale(depth_size).camera_matrix 291 | intrinsics = o3d.camera.PinholeCameraIntrinsic(int(depth_size[0]), 292 | int(depth_size[1]), 293 | K[0, 0], K[1, 1], 294 | K[0, 2], K[1, 2]) 295 | pc = o3d.geometry.PointCloud() 296 | depth_frames = dict([(os.path.basename(p).split('.')[0], p) 297 | for p in self.scene.depth_paths()]) 298 | items = [item for item in poses.items()] 299 | stride = max(len(self.scene.depth_paths()) // 100, 1) 300 | for key, T_WC in items[::stride]: 301 | if key not in depth_frames: 302 | print("WARNING: Can't find depth image {key}.png") 303 | continue 304 | depth = o3d.io.read_image(f"{depth_frames[key]}") 305 | 306 | pc_C = o3d.geometry.PointCloud.create_from_depth_image( 307 | depth, depth_scale=1000.0, intrinsic=intrinsics) 308 | pc_C = np.asarray(pc_C.points) 309 | pc_W = transform_points(T_WC, pc_C) 310 | 311 | min_bounds = np.minimum(min_bounds, pc_W.min(axis=0)) 312 | max_bounds = np.maximum(max_bounds, pc_W.max(axis=0)) 313 | pc += o3d.geometry.PointCloud( 314 | o3d.utility.Vector3dVector(pc_W)).uniform_down_sample(50) 315 | 316 | filtered, _ = pc.remove_statistical_outlier(nb_neighbors=20, 317 | std_ratio=2.0) 318 | bbox = filtered.get_oriented_bounding_box(robust=True) 319 | T = np.eye(4) 320 | T[:3, :3] = bbox.R.T 321 | o3d_aabb = o3d.geometry.PointCloud(filtered).transform( 322 | T).get_axis_aligned_bounding_box() 323 | center = o3d_aabb.get_center() 324 | T[:3, 3] = -center 325 | aabb = np.zeros((2, 3)) 326 | aabb[0, :] = o3d_aabb.get_min_bound() - center 327 | aabb[1, :] = o3d_aabb.get_max_bound() - center 328 | return T, aabb, filtered 329 | 330 | def _write_poses(self, poses): 331 | pose_dir = os.path.join(self.scene.path, 'pose') 332 | os.makedirs(pose_dir, exist_ok=True) 333 | for key, T_CW in poses.items(): 334 | pose_file = os.path.join(pose_dir, f'{key}.txt') 335 | np.savetxt(pose_file, T_CW) 336 | 337 | def _write_bounds(self, bounds): 338 | with open(os.path.join(self.scene.path, 'bbox.txt'), 'wt') as f: 339 | min_str = " ".join([str(x) for x in bounds[0]]) 340 | max_str = " ".join([str(x) for x in bounds[1]]) 341 | f.write(f"{min_str} {max_str} 0.01") 342 | 343 | def run(self): 344 | T_WCs = {} 345 | for key, T_CW in self.poses.items(): 346 | T_WCs[key] = np.linalg.inv(T_CW) 347 | T, aabb, point_cloud = self.compute_bbox(T_WCs) 348 | 349 | T_CWs = {} 350 | for key, T_WC in T_WCs.items(): 351 | T_CWs[key] = np.linalg.inv(T @ T_WC) 352 | self._write_poses(T_CWs) 353 | self._write_bounds(aabb) 354 | 355 | 356 | class Pipeline: 357 | 358 | def __init__(self, flags): 359 | self.tmp_dir = tempfile.mkdtemp() 360 | self.flags = flags 361 | self.scene = Scene(flags.scene) 362 | 363 | def run(self): 364 | hloc = HLoc(self.tmp_dir, self.scene, self.flags) 365 | hloc.run() 366 | 367 | # Camera intrinsics might have changed so reload the scene. 
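        # (The SfM step refines focal length, principal point and distortion
        # through the ba_refine_* mapper options and writes a new
        # intrinsics.txt, so the Scene is rebuilt here to pick up those
        # values.)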
368 | self.scene = Scene(self.scene.path) 369 | 370 | scale_estimation = ScaleEstimation(self.scene, self.tmp_dir) 371 | scaled_poses = scale_estimation.run() 372 | pose_saver = PoseSaver(self.scene, scaled_poses) 373 | pose_saver.run() 374 | 375 | if self.flags.debug: 376 | shutil.move(str(self.tmp_dir), "/tmp/sfm_debug") 377 | else: 378 | shutil.rmtree(self.tmp_dir) 379 | 380 | 381 | if __name__ == "__main__": 382 | Pipeline(read_args()).run() 383 | -------------------------------------------------------------------------------- /scripts/language/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | import numpy as np 5 | import json 6 | import pickle 7 | import pandas 8 | from rich.table import Table 9 | from rich.console import Console 10 | from rich import print as rprint 11 | from autolabel.evaluation import OpenVocabEvaluator2D, OpenVocabEvaluator3D 12 | from autolabel.evaluation import OpenVocabInstancePQEvaluator, PanopticStat 13 | from autolabel.dataset import SceneDataset, LenDataset 14 | from autolabel import utils, model_utils 15 | 16 | 17 | def read_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('scenes', nargs='+') 20 | parser.add_argument('--batch-size', default=8182, type=int) 21 | parser.add_argument('--vis', default=None, type=str) 22 | parser.add_argument('--workspace', type=str, default=None) 23 | parser.add_argument('--out', 24 | default=None, 25 | type=str, 26 | help="Where to write results as json, if anywhere.") 27 | parser.add_argument('--label-map', type=str, required=True) 28 | parser.add_argument('--feature-checkpoint', '-f', type=str, required=True) 29 | parser.add_argument( 30 | '--stride', 31 | type=int, 32 | default=1, 33 | help="Only evaluate every Nth frame to save time or for debugging.") 34 | parser.add_argument( 35 | '--pc', 36 | action='store_true', 37 | help= 38 | "Evaluate point cloud segmentation accuracy instead of 2D segmentation maps." 
39 | ) 40 | parser.add_argument( 41 | '--panoptic', 42 | action='store_true', 43 | help='Evaluate panoptic segmenation.') 44 | parser.add_argument('--print-verbose', action='store_true') 45 | parser.add_argument('--debug', action='store_true') 46 | parser.add_argument('--only-scene-classes', action='store_true') 47 | parser.add_argument('--random', 48 | action='store_true', 49 | help="Randomize the order of the scenes.") 50 | parser.add_argument('--time', action='store_true') 51 | parser.add_argument('--denoise-method', 52 | type=str, 53 | default='average_similarity', 54 | choices=['majority_voting', 'average_similarity', 'average_feature'], 55 | help="The denoise method for semantics.") 56 | return parser.parse_args() 57 | 58 | 59 | def gather_models(flags, scene_dirs): 60 | models = set() 61 | for scene in scene_dirs: 62 | nerf_dir = model_utils.get_nerf_dir(scene, flags) 63 | if not os.path.exists(nerf_dir): 64 | continue 65 | for model in os.listdir(nerf_dir): 66 | checkpoint_dir = os.path.join(nerf_dir, model, 'checkpoints') 67 | if os.path.exists(checkpoint_dir): 68 | models.add(model) 69 | return list(models) 70 | 71 | 72 | def read_label_map(path): 73 | return pandas.read_csv(path) 74 | 75 | 76 | class NumpyEncoder(json.JSONEncoder): 77 | """ Special json encoder for numpy types """ 78 | def default(self, obj): 79 | if isinstance(obj, np.integer): 80 | return int(obj) 81 | elif isinstance(obj, np.floating): 82 | return float(obj) 83 | elif isinstance(obj, np.ndarray): 84 | return obj.tolist() 85 | return json.JSONEncoder.default(self, obj) 86 | 87 | def write_results(out, tables, json_result, panoptic_stat=None): 88 | out = Path(out) 89 | out.mkdir(parents=True, exist_ok=True) 90 | dumped = json.dumps(json_result, cls=NumpyEncoder, indent=2) 91 | with open(out / 'results.json', 'w') as f: 92 | f.write(dumped) 93 | 94 | with open(out / 'table.txt', 'w') as f: 95 | for table in tables: 96 | rprint(table, file=f) 97 | rprint('\n\n', file=f) 98 | 99 | if panoptic_stat is not None: 100 | with open(out / 'panoptic_stat.pkl', 'wb') as outp: 101 | pickle.dump(panoptic_stat, outp, pickle.HIGHEST_PROTOCOL) 102 | 103 | 104 | def main(flags): 105 | if len(flags.scenes) == 1 and not os.path.exists( 106 | os.path.join(flags.scenes[0], 'rgb')): 107 | # We are dealing with a directory full of scenes and not a list of scenes 108 | scene_dir = flags.scenes[0] 109 | scene_dirs = [ 110 | os.path.join(scene_dir, scene) 111 | for scene in os.listdir(scene_dir) 112 | if os.path.exists(os.path.join(scene_dir, scene, 'rgb')) 113 | ] 114 | else: 115 | scene_dirs = flags.scenes 116 | 117 | original_labels = read_label_map(flags.label_map) 118 | n_classes = len(original_labels) 119 | 120 | scene_names = [os.path.basename(os.path.normpath(p)) for p in scene_dirs] 121 | scenes = [(s, n) for s, n in zip(scene_dirs, scene_names)] 122 | if flags.random: 123 | import random 124 | random.shuffle(scenes) 125 | else: 126 | scenes = sorted(scenes, key=lambda x: x[1]) 127 | 128 | if flags.panoptic: 129 | panoptic_stats = PanopticStat() 130 | else: 131 | ious = [] 132 | accs = [] 133 | ious_d = [] 134 | accs_d = [] 135 | evaluator = None 136 | 137 | for scene_index, (scene, scene_name) in enumerate(scenes): 138 | model = gather_models(flags, [scene]) 139 | if len(model) == 0: 140 | print(f"Skipping scene {scene_name} because no models were found.") 141 | continue 142 | else: 143 | model = model[0] 144 | print(f"Using model {model}") 145 | 146 | print(f"Evaluating scene {scene_name}") 147 | 148 | nerf_dir = 
model_utils.get_nerf_dir(scene, flags) 149 | model_path = os.path.join(nerf_dir, model) 150 | if not os.path.exists(model_path): 151 | print(f"Skipping scene {scene_name} because no models were found.") 152 | continue 153 | params = model_utils.read_params(model_path) 154 | dataset = SceneDataset('test', 155 | scene, 156 | factor=4.0, 157 | batch_size=flags.batch_size, 158 | lazy=True) 159 | if flags.only_scene_classes: 160 | classes_in_scene = dataset.scene.metadata.get('classes', None) 161 | if classes_in_scene is None: 162 | label_map = original_labels 163 | else: 164 | mask = original_labels['id'].isin(classes_in_scene) 165 | label_map = original_labels[mask] 166 | else: 167 | label_map = original_labels 168 | 169 | n_classes = dataset.n_classes if dataset.n_classes is not None else 2 170 | model = model_utils.create_model(dataset.min_bounds, dataset.max_bounds, 171 | n_classes, params).cuda() 172 | 173 | checkpoint_dir = os.path.join(model_path, 'checkpoints') 174 | if not os.path.exists(checkpoint_dir) or len( 175 | os.listdir(checkpoint_dir)) == 0: 176 | continue 177 | 178 | model_utils.load_checkpoint(model, checkpoint_dir) 179 | model = model.eval() 180 | if flags.vis is not None: 181 | vis_path = os.path.join(flags.vis, scene_name) 182 | else: 183 | vis_path = None 184 | 185 | if evaluator is None: 186 | if flags.panoptic: 187 | evaluator = OpenVocabInstancePQEvaluator( 188 | features=params.features, 189 | name=scene_name, 190 | checkpoint=flags.feature_checkpoint, 191 | debug=flags.debug, 192 | stride=flags.stride, 193 | save_figures=vis_path, 194 | time=flags.time, 195 | denoise_method=flags.denoise_method 196 | ) 197 | else: 198 | if flags.pc: 199 | evaluator = OpenVocabEvaluator3D( 200 | features=params.features, 201 | name=scene_name, 202 | checkpoint=flags.feature_checkpoint, 203 | stride=flags.stride, 204 | debug=flags.debug, 205 | time=flags.time) 206 | else: 207 | evaluator = OpenVocabEvaluator2D( 208 | features=params.features, 209 | name=scene_name, 210 | checkpoint=flags.feature_checkpoint, 211 | debug=flags.debug, 212 | stride=flags.stride, 213 | save_figures=vis_path, 214 | time=flags.time) 215 | assert evaluator.features == params.features 216 | evaluator.reset(model, label_map, vis_path) 217 | if flags.panoptic: 218 | panoptic_stat = evaluator.eval(dataset) 219 | panoptic_stats += panoptic_stat 220 | tables, json_result = print_panoptic_results(panoptic_stat, 221 | categories=evaluator.evaluated_labels, 222 | label_mapping=evaluator.label_mapping, 223 | label_type_mapping=evaluator.label_type_mapping, 224 | verbose=flags.print_verbose) 225 | if flags.out: 226 | write_results( 227 | os.path.join(flags.out, scene_name), tables, json_result, panoptic_stat) 228 | else: 229 | iou, acc, iou_d, acc_d = evaluator.eval(dataset) 230 | ious.append(iou) 231 | accs.append(acc) 232 | ious_d.append(iou_d) 233 | accs_d.append(acc_d) 234 | table = print_iou_acc_results([iou], [acc]) 235 | table_d = print_iou_acc_results([iou_d], [acc_d], table_title="Denoised") 236 | if flags.out: 237 | write_results( 238 | os.path.join(flags.out, scene_name), 239 | [table, table_d], 240 | {'iou': iou, 'acc': acc, 'iou_d': iou_d, 'acc_d': acc_d}) 241 | del model 242 | if flags.panoptic: 243 | final_tables, final_json_result = print_panoptic_results(panoptic_stats, 244 | categories=evaluator.evaluated_labels, 245 | label_mapping=evaluator.label_mapping, 246 | label_type_mapping=evaluator.label_type_mapping, 247 | verbose=flags.print_verbose) 248 | if flags.out: 249 | write_results( 250 | 
os.path.join(flags.out, 'final'), final_tables, final_json_result, panoptic_stats) 251 | else: 252 | table = print_iou_acc_results(ious, accs) 253 | table_d = print_iou_acc_results(ious_d, accs_d, table_title="Denoised") 254 | if flags.out: 255 | write_results( 256 | os.path.join(flags.out, 'final'), 257 | [table, table_d], 258 | {'ious': ious, 'accs': accs, 'ious_d': ious_d, 'accs_d': accs_d}) 259 | 260 | 261 | def print_panoptic_results(panoptic_stat, categories, label_mapping, label_type_mapping, verbose=False): 262 | 263 | json_result = {} 264 | print_tables = [] 265 | 266 | def percentage_to_string(num): 267 | if num is None: 268 | return "N/A" 269 | else: 270 | v = num * 100 271 | return f"{v:.1f}" 272 | 273 | console = Console() 274 | # panoptic segmentation 275 | pq_total_result, pq_per_class_result = panoptic_stat.pq_average(categories, label_type_mapping, verbose=verbose) 276 | table = Table(show_lines=True, caption_justify='left') 277 | table.add_column('Class') 278 | table.add_column('PQ') 279 | table.add_column('SQ') 280 | table.add_column('RQ') 281 | if verbose: 282 | table.add_column('tp') 283 | table.add_column('fp') 284 | table.add_column('fn') 285 | 286 | table.title = "Panoptic Evaluation" 287 | json_result['panoptic'] = {} 288 | per_class_result = {} 289 | for category_id in categories: 290 | pq_info = pq_per_class_result[category_id] 291 | if pq_info['valid']: 292 | if verbose: 293 | table.add_row(label_mapping[category_id], 294 | percentage_to_string(pq_info['pq']), 295 | percentage_to_string(pq_info['sq']), 296 | percentage_to_string(pq_info['rq']), 297 | str(pq_info['tp']), 298 | str(pq_info['fp']), 299 | str(pq_info['fn'])) 300 | per_class_result[label_mapping[category_id]] = { 301 | 'PQ': pq_info['pq'] * 100, 'SQ': pq_info['sq'] * 100, 'RQ': pq_info['rq'] * 100, 302 | 'tp': pq_info['tp'], 'fp': pq_info['fp'], 'fn': pq_info['fn'] 303 | } 304 | 305 | else: 306 | table.add_row(label_mapping[category_id], 307 | percentage_to_string(pq_info['pq']), 308 | percentage_to_string(pq_info['sq']), 309 | percentage_to_string(pq_info['rq'])) 310 | per_class_result[label_mapping[category_id]] = { 311 | 'PQ': pq_info['pq'] * 100, 'SQ': pq_info['sq'] * 100, 'RQ': pq_info['rq'] * 100 312 | } 313 | json_result['panoptic']['per_class_result'] = per_class_result 314 | if verbose: 315 | table.add_row('Total:\n{} valid panoptic categories.'.format( 316 | pq_total_result['n']), 317 | percentage_to_string(pq_total_result['pq']), 318 | percentage_to_string(pq_total_result['sq']), 319 | percentage_to_string(pq_total_result['rq']), 320 | '{:.1f}'.format(pq_total_result['tp']), 321 | '{:.1f}'.format(pq_total_result['fp']), 322 | '{:.1f}'.format(pq_total_result['fn'])) 323 | json_result['panoptic']['total'] = { 324 | 'PQ': pq_total_result['pq'] * 100, 'SQ': pq_total_result['sq'] * 100, 'RQ': pq_total_result['rq'] * 100, 325 | 'tp': pq_total_result['tp'], 'fp': pq_total_result['fp'], 'fn': pq_total_result['fn'] 326 | } 327 | else: 328 | table.add_row('Total:\n{} valid panoptic categories.'.format( 329 | pq_total_result['n']), 330 | percentage_to_string(pq_total_result['pq']), 331 | percentage_to_string(pq_total_result['sq']), 332 | percentage_to_string(pq_total_result['rq'])) 333 | json_result['panoptic']['total'] = { 334 | 'PQ': pq_total_result['pq'] * 100, 'SQ': pq_total_result['sq'] * 100, 'RQ': pq_total_result['rq'] * 100 335 | } 336 | console.print(table) 337 | print_tables.append(table) 338 | 339 | # semantic segmentation 340 | semantic_total_result, semantic_per_class_result = 
panoptic_stat.semantic_average(categories) 341 | table = Table(show_lines=True, caption_justify='left') 342 | table.add_column('Class') 343 | table.add_column('S_iou') 344 | table.add_column('S_acc') 345 | table.add_column('S_iou_d') 346 | table.add_column('S_acc_d') 347 | 348 | table.title = "Semantic Evaluation" 349 | 350 | json_result['semantic'] = {} 351 | per_class_result = {} 352 | for category_id in categories: 353 | semantic = semantic_per_class_result[category_id] 354 | if semantic['valid']: 355 | table.add_row(label_mapping[category_id], 356 | percentage_to_string(semantic['iou']), 357 | percentage_to_string(semantic['acc']), 358 | percentage_to_string(semantic['iou_d']), 359 | percentage_to_string(semantic['acc_d'])) 360 | per_class_result[label_mapping[category_id]] = { 361 | 'S_iou': semantic['iou'] * 100, 'S_acc': semantic['acc'] * 100, 'S_iou_d': semantic['iou_d'] * 100, 'S_acc_d': semantic['acc_d'] * 100 362 | } 363 | json_result['semantic']['per_class_result'] = per_class_result 364 | 365 | table.add_row('Total:\n{} valid semantic categories'.format( 366 | semantic_total_result['n']), 367 | percentage_to_string(semantic_total_result['iou']), 368 | percentage_to_string(semantic_total_result['acc']), 369 | percentage_to_string(semantic_total_result['iou_d']), 370 | percentage_to_string(semantic_total_result['acc_d'])) 371 | json_result['semantic']['total'] = { 372 | 'S_iou': semantic_total_result['iou'] * 100, 'S_acc': semantic_total_result['acc'] * 100, 373 | 'S_iou_d': semantic_total_result['iou_d'] * 100, 'S_acc_d': semantic_total_result['acc_d'] * 100 374 | } 375 | console.print(table) 376 | print_tables.append(table) 377 | 378 | # instance segmentation 379 | instance_result = panoptic_stat.instance_average() 380 | table = Table(show_lines=True, caption_justify='left') 381 | table.add_column('mCov') 382 | table.add_column('mWCov') 383 | table.add_column('mPrec') 384 | table.add_column('mRec') 385 | 386 | table.title = "Instance Evaluation" 387 | table.add_row( 388 | percentage_to_string(instance_result['mCov']), 389 | percentage_to_string(instance_result['mWCov']), 390 | percentage_to_string(instance_result['mPrec']), 391 | percentage_to_string(instance_result['mRec'])) 392 | json_result['instance'] = { 393 | 'mCov': instance_result['mCov'] * 100, 'mWCov': instance_result['mWCov'] * 100, 394 | 'mPrec': instance_result['mPrec'] * 100, 'mRec': instance_result['mRec'] * 100 395 | } 396 | console.print(table) 397 | print_tables.append(table) 398 | return print_tables, json_result 399 | 400 | 401 | def print_iou_acc_results(ious, accs, table_title="Direct"): 402 | table = Table() 403 | table.add_column('Class') 404 | table.add_column('mIoU') 405 | table.add_column('mAcc') 406 | table.title = table_title 407 | 408 | def percentage_to_string(iou): 409 | if iou is None: 410 | return "N/A" 411 | else: 412 | v = iou * 100 413 | return f"{v:.1f}" 414 | 415 | reduced_iou = {} 416 | for iou in ious: 417 | for key, value in iou.items(): 418 | if key not in reduced_iou: 419 | reduced_iou[key] = [] 420 | if value is None: 421 | continue 422 | reduced_iou[key].append(value) 423 | reduced_acc = {} 424 | for acc in accs: 425 | for key, value in acc.items(): 426 | if key not in reduced_acc: 427 | reduced_acc[key] = [] 428 | if value is None: 429 | continue 430 | reduced_acc[key].append(value) 431 | for key, values in reduced_iou.items(): 432 | if key == 'total': 433 | continue 434 | mIoU = np.mean(values) 435 | mAcc = np.mean(reduced_acc[key]) 436 | table.add_row(key, 
percentage_to_string(mIoU), 437 | percentage_to_string(mAcc)) 438 | 439 | scene_total = percentage_to_string( 440 | np.mean([r['total'] for r in ious if 'total' in r])) 441 | scene_total_acc = percentage_to_string( 442 | np.mean([r['total'] for r in accs if 'total' in r])) 443 | table.add_row('Total', scene_total, scene_total_acc) 444 | 445 | console = Console() 446 | console.print(table) 447 | return table 448 | 449 | 450 | if __name__ == "__main__": 451 | main(read_args()) 452 | -------------------------------------------------------------------------------- /autolabel/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import hdbscan 4 | import pickle 5 | import tensorboardX 6 | import torch 7 | from torch.nn import functional as F 8 | from torch import optim 9 | import tqdm 10 | from tqdm import tqdm 11 | from torch_ema import ExponentialMovingAverage 12 | 13 | from torch_ngp.nerf.utils import Trainer 14 | from autolabel.dataset import SAM_BIT_LEN 15 | 16 | DEPTH_EPSILON = 0.01 17 | 18 | def sim_matrix(a, b, eps=1e-8): 19 | """ 20 | added eps for numerical stability 21 | """ 22 | a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None] 23 | a_norm = a / torch.clamp(a_n, min=eps) 24 | b_norm = b / torch.clamp(b_n, min=eps) 25 | sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1)) 26 | return sim_mt 27 | 28 | 29 | class SimpleTrainer(Trainer): 30 | 31 | def train(self, dataloader, epochs): 32 | if self.use_tensorboardX and self.local_rank == 0: 33 | self.writer = tensorboardX.SummaryWriter( 34 | os.path.join(self.workspace, "run", self.name)) 35 | 36 | if self.model.cuda_ray: 37 | self.model.mark_untrained_grid(dataloader._data.poses, 38 | dataloader._data.intrinsics) 39 | 40 | if not hasattr(self, 'con_ema'): 41 | self.con_ema = ExponentialMovingAverage(self.model.contrastive_features.parameters(), 42 | decay=self.ema_decay if self.ema_decay is not None else 0.95) 43 | 44 | for i in range(0, epochs): 45 | self.train_iterations(dataloader, 1000, epoch=i+1) 46 | if self.opt.slow_center: 47 | self.update_sam_centers(dataloader) 48 | self.epoch += 1 49 | 50 | if self.use_tensorboardX and self.local_rank == 0: 51 | self.writer.close() 52 | 53 | def update_sam_centers(self, dataloader): 54 | dataset = dataloader._data 55 | self.model.eval() 56 | 57 | bar = tqdm(dataset.indices, desc="Updating SAM centers") 58 | with torch.inference_mode(): 59 | for image_index in bar: 60 | data = dataset._next_update(image_index) 61 | rays_o = torch.tensor(data['rays_o']).to(self.device) # [B, 3] 62 | rays_d = torch.tensor(data['rays_d']).to(self.device) # [B, 3] 63 | direction_norms = torch.tensor(data['direction_norms']).to(self.device) # [B, 1] 64 | num_masks = data['num_masks'] 65 | sample_mask_size = data['sample_mask_size'] 66 | 67 | outputs = self.model.render(rays_o, 68 | rays_d, 69 | direction_norms, 70 | staged=False, 71 | bg_color=None, 72 | perturb=True, 73 | contrastive_ema=self.con_ema, 74 | **vars(self.opt)) 75 | contrastive_features = outputs['contrastive_features'] 76 | contrastive_features = contrastive_features.reshape(num_masks, sample_mask_size, -1) 77 | sam_center = torch.mean(contrastive_features, dim=1) 78 | dataset.update_sam_centers(image_index, sam_center.cpu().numpy()) 79 | 80 | def train_iterations(self, dataloader, iterations, epoch): 81 | self.model.train() 82 | if self.model.cuda_ray: 83 | self.model.mark_untrained_grid(dataloader._data.poses, 84 | dataloader._data.intrinsics) 85 | iterator = 
iter(dataloader) 86 | bar = tqdm(range(iterations), desc=f"[Epoch {epoch}] Loss: N/A") 87 | for _ in bar: 88 | data = next(iterator) 89 | self.global_step += 1 90 | for opt in self.optimizers: 91 | opt.zero_grad() 92 | with torch.cuda.amp.autocast(enabled=self.fp16): 93 | _, _, loss = self.train_step(data) 94 | if self.use_tensorboardX: 95 | self.writer.add_scalar("train/loss", loss.item(), 96 | self.global_step) 97 | self.scaler.scale(loss).backward() 98 | for opt in self.optimizers: 99 | self.scaler.step(opt) 100 | self.scaler.update() 101 | self.con_ema.update() 102 | bar.set_description(f"[Epoch {epoch}] Loss: {loss:.04f}") 103 | if self.ema is not None: 104 | self.ema.update() 105 | self._step_scheduler(loss) 106 | 107 | def compute_contrastive_loss(self, 108 | features, 109 | sam_sampling=True, 110 | anchor_indices=None, 111 | positive_indices=None, 112 | negative_indices=None, 113 | sam_centers=None, 114 | sam_labels=None, 115 | batch_size=None, 116 | chunk_size=None): 117 | if sam_sampling: 118 | assert anchor_indices is not None 119 | assert positive_indices is not None 120 | assert negative_indices is not None 121 | 122 | if not hasattr(self, 'con_loss_fn'): 123 | self.con_loss_fn = torch.nn.CrossEntropyLoss() 124 | 125 | # loss_all = 0 126 | anchor_features = features[anchor_indices] 127 | positive_features = features[positive_indices].detach() 128 | negative_features = features[negative_indices].detach() 129 | 130 | logits_pos = F.cosine_similarity(anchor_features, positive_features, dim=-1) 131 | logits_neg = F.cosine_similarity(anchor_features[:, None, :], negative_features, dim=-1) 132 | logits = torch.cat((logits_pos[:, None], logits_neg), dim=1) 133 | 134 | labels = torch.zeros(anchor_features.shape[0], dtype=torch.int64).to(self.device) 135 | loss_all = self.con_loss_fn(logits/self.opt.contrastive_temperature, labels) 136 | 137 | if sam_centers is not None: 138 | loss_center = F.l1_loss(anchor_features, sam_centers) 139 | loss_center += (1 - F.cosine_similarity(anchor_features, sam_centers, dim=-1)).mean() 140 | loss_all += 0.5 * loss_center 141 | 142 | else: 143 | assert sam_labels is not None 144 | assert batch_size is not None 145 | assert chunk_size is not None 146 | assert features.shape[0] == sam_labels.shape[0] 147 | 148 | loss_all = 0 149 | chunks = batch_size // chunk_size 150 | for chunk in range(chunks): 151 | start = chunk * chunk_size 152 | end = (chunk + 1) * chunk_size 153 | 154 | contrastive_features = features[start: end] 155 | labels = sam_labels[start: end] 156 | 157 | num_features = chunk_size 158 | con_feature_sim_mat = sim_matrix(contrastive_features, contrastive_features) 159 | 160 | loss_contrastive = 0 161 | ONE = torch.tensor(1, device=con_feature_sim_mat.device) 162 | for i in range(SAM_BIT_LEN): 163 | label = torch.bitwise_and(ONE << i, labels).bool() 164 | label_mask = (label.expand(num_features, num_features).transpose(1, 0) 165 | == label.expand(num_features, num_features)) 166 | 167 | self_mask = torch.eye( 168 | con_feature_sim_mat.shape[0], dtype=torch.bool, device=con_feature_sim_mat.device) 169 | 170 | zero_mask = torch.logical_or( 171 | torch.sum(label_mask, dim=1) <= 1, 172 | torch.sum(label_mask, dim=1) >= num_features - 1 173 | ) 174 | zero_mask = zero_mask.expand(num_features, num_features).transpose(1, 0) 175 | 176 | label_mask = label_mask / self.opt.contrastive_temperature 177 | loss = - torch.logsumexp( 178 | con_feature_sim_mat.masked_fill(torch.logical_not( 179 | label_mask), -6e4).masked_fill(self_mask, 
-6e4).masked_fill(zero_mask, 0), 180 | dim=-1 181 | ) + torch.logsumexp( 182 | con_feature_sim_mat.masked_fill(zero_mask, 0), 183 | dim=-1 184 | ) 185 | loss_contrastive += loss.mean() 186 | loss_contrastive = loss_contrastive / SAM_BIT_LEN 187 | loss_all += loss_contrastive 188 | loss_all = loss_all / chunks 189 | return loss_all 190 | 191 | def train_step(self, data): 192 | rays_o = data['rays_o'].to(self.device) # [B, 3] 193 | rays_d = data['rays_d'].to(self.device) # [B, 3] 194 | direction_norms = data['direction_norms'].to(self.device) # [B, 1] 195 | gt_rgb = data['pixels'].to(self.device) # [B, 3] 196 | gt_depth = data['depth'].to(self.device) # [B, 3] 197 | 198 | outputs = self.model.render(rays_o, 199 | rays_d, 200 | direction_norms, 201 | staged=False, 202 | bg_color=None, 203 | perturb=True, 204 | **vars(self.opt)) 205 | 206 | pred_rgb = outputs['image'] 207 | 208 | loss = self.opt.rgb_weight * self.criterion(pred_rgb, gt_rgb).mean() 209 | 210 | pred_depth = outputs['depth'] 211 | has_depth = (gt_depth > DEPTH_EPSILON) 212 | depth_loss = torch.abs(pred_depth[has_depth] - gt_depth[has_depth]) 213 | 214 | loss = loss + self.opt.depth_weight * depth_loss.mean() 215 | 216 | if self.opt.feature_loss: 217 | gt_features = data['features'].to(self.device) 218 | p_features = outputs['semantic_features'] 219 | loss_feature = F.l1_loss( 220 | p_features[:, :gt_features.shape[1]], gt_features) 221 | loss += self.opt.feature_weight * loss_feature 222 | if self.use_tensorboardX: 223 | self.writer.add_scalar("train/loss_feature", loss_feature.item(), 224 | self.global_step) 225 | 226 | if self.opt.feature_constrastive_learning: 227 | if self.opt.sam_sampling: 228 | anchor_indices = data['anchor_indices'].to(self.device) 229 | positive_indices = data['positive_indices'].to(self.device) 230 | negative_indices = data['negative_indices'].to(self.device) 231 | sam_centers = data['sam_centers'] 232 | if sam_centers is not None: 233 | sam_centers = sam_centers.to(self.device) 234 | loss_contrastive = self.compute_contrastive_loss( 235 | features=outputs['contrastive_features'], 236 | sam_sampling=self.opt.sam_sampling, 237 | anchor_indices=anchor_indices, 238 | positive_indices=positive_indices, 239 | negative_indices=negative_indices, 240 | sam_centers=sam_centers 241 | ) 242 | else: 243 | sam = data['sam'].to(self.device) 244 | chunk_size = data['chunk_size'] 245 | batch_size = len(sam) 246 | loss_contrastive = self.compute_contrastive_loss( 247 | features=outputs['contrastive_features'], 248 | sam_sampling=self.opt.sam_sampling, 249 | sam_labels=sam, 250 | batch_size=batch_size, 251 | chunk_size=chunk_size 252 | ) 253 | 254 | loss += self.opt.contrastive_weight * loss_contrastive 255 | 256 | if self.use_tensorboardX: 257 | self.writer.add_scalar("train/loss_contrastive", loss_contrastive.item(), 258 | self.global_step) 259 | 260 | return pred_rgb, gt_rgb, loss 261 | 262 | def compute_instance_centers(self, dataset): 263 | self.log("Start computing instance centers ...") 264 | with torch.inference_mode(): 265 | with torch.cuda.amp.autocast(enabled=True): 266 | self.model.eval() 267 | instance_features = [] 268 | for i in tqdm(dataset.indices, desc="Processing contrastive features"): 269 | batch = dataset._get_test(i) 270 | # get instance and semantic features 271 | rays_o = torch.tensor(batch['rays_o']).to(self.device) 272 | rays_d = torch.tensor(batch['rays_d']).to(self.device) 273 | direction_norms = torch.tensor(batch['direction_norms']).to(self.device) 274 | outputs = self.model.render(rays_o, 
275 | rays_d, 276 | direction_norms, 277 | staged=True, 278 | perturb=False) 279 | instance_feature = outputs['contrastive_features'].cpu().numpy() 280 | instance_features.append(instance_feature) 281 | instance_features = np.stack(instance_features, axis=0) 282 | 283 | # feature clustering 284 | self.log("Clustering features ...") 285 | num_image, image_height, image_width, feature_dim = instance_features.shape 286 | instance_features = instance_features.reshape(-1, feature_dim) 287 | clust = hdbscan.HDBSCAN(min_cluster_size=100, gen_min_span_tree=True) # cluster size depends on the image size 288 | sample_indices = np.random.permutation(instance_features.shape[0])[:200000] 289 | clust.fit(instance_features[sample_indices, :]) 290 | 291 | exemplar = [np.mean(exemplars, axis=0) for exemplars in clust.exemplars_] 292 | exemplar = np.vstack(exemplar) 293 | self.log(f"Total {len(clust.exemplars_)} instance centers.") 294 | 295 | self.model.set_instance_centers(exemplar) 296 | self.model.set_instance_clusterer(clust) 297 | 298 | def save_instance_centers(self, save_cluster=True): 299 | name = f'{self.name}_ep{self.epoch:04d}_instance_centers' 300 | file_path = f"{self.ckpt_path}/{name}.npy" 301 | np.save(file_path, self.model.instance_centers) 302 | 303 | if save_cluster: 304 | name = f'{self.name}_ep{self.epoch:04d}_cluster' 305 | file_path = f"{self.ckpt_path}/{name}.pkl" 306 | with open(file_path, 'wb') as outp: 307 | pickle.dump(self.model.instance_clusterer, outp, pickle.HIGHEST_PROTOCOL) 308 | 309 | def test_step(self, data): 310 | rays_o = data['rays_o'] # [B, N, 3] 311 | rays_d = data['rays_d'] # [B, N, 3] 312 | direction_norms = data['direction_norms'] # [B, N, 1] 313 | H, W = data['H'], data['W'] 314 | 315 | outputs = self.model.render(rays_o, 316 | rays_d, 317 | direction_norms, 318 | staged=True, 319 | perturb=False, 320 | **vars(self.opt)) 321 | 322 | pred_rgb = outputs['image'].reshape(-1, H, W, 3) 323 | pred_depth = outputs['depth'].reshape(-1, H, W) 324 | pred_semantic = outputs['semantic'] 325 | pred_features = outputs['semantic_features'] 326 | _, _, C = pred_semantic.shape 327 | pred_semantic = pred_semantic.reshape(-1, H, W, C) 328 | 329 | return pred_rgb, pred_depth, pred_semantic, pred_features 330 | 331 | def eval_step(self, data): 332 | rays_o = data['rays_o'].to(self.device) # [B, 3] 333 | rays_d = data['rays_d'].to(self.device) # [B, 3] 334 | direction_norms = data['direction_norms'].to(self.device) # [B, 1] 335 | gt_rgb = data['pixels'].to(self.device) # [B, H, W, 3] 336 | gt_depth = data['depth'].to(self.device) # [B, H, W] 337 | gt_semantic = data['semantic'].to(self.device) # [B, H, W] 338 | H, W, _ = gt_rgb.shape 339 | 340 | outputs = self.model.render(rays_o, 341 | rays_d, 342 | direction_norms, 343 | staged=True, 344 | bg_color=None, 345 | perturb=False, 346 | **vars(self.opt)) 347 | 348 | pred_rgb = outputs['image'].reshape(H, W, 3) 349 | pred_depth = outputs['depth'].reshape(H, W) 350 | pred_semantic = outputs['semantic'] 351 | 352 | loss = self.criterion(pred_rgb, gt_rgb).mean() 353 | has_depth = gt_depth > DEPTH_EPSILON 354 | loss += self.opt.depth_weight * torch.abs(pred_depth[has_depth] - 355 | gt_depth[has_depth]).mean() 356 | 357 | has_semantic = gt_semantic >= 0 358 | if has_semantic.sum().item() > 0: 359 | semantic_loss = F.cross_entropy(pred_semantic[has_semantic, :], 360 | gt_semantic[has_semantic]) 361 | loss += self.opt.semantic_weight * semantic_loss 362 | 363 | pred_semantic = pred_semantic.reshape(H, W, pred_semantic.shape[-1]) 364 | 365 | 
return pred_rgb[None], pred_depth[None], pred_semantic[None], gt_rgb[ 366 | None], loss 367 | 368 | def _step_scheduler(self, loss): 369 | if isinstance(self.lr_schedulers[0], 370 | optim.lr_scheduler.ReduceLROnPlateau): 371 | [s.step(loss) for s in self.lr_schedulers] 372 | else: 373 | [s.step() for s in self.lr_schedulers] 374 | 375 | 376 | class InteractiveTrainer(SimpleTrainer): 377 | 378 | def __init__(self, *args, **kwargs): 379 | lr_scheduler = kwargs['lr_scheduler'] 380 | kwargs['lr_scheduler'] = None 381 | super().__init__(*args, **kwargs) 382 | self.loader = None 383 | self.lr_scheduler = lr_scheduler(self.optimizer) 384 | 385 | def init(self, loader): 386 | self.model.train() 387 | self.iterator = iter(loader) 388 | self.step = 0 389 | self.model.mark_untrained_grid(loader._data.poses, 390 | loader._data.intrinsics) 391 | 392 | def train(self, loader): 393 | while True: 394 | self.model.train() 395 | self.train_one_epoch(loader) 396 | 397 | def train_one_epoch(self, loader): 398 | iterator = iter(loader) 399 | bar = tqdm(range(1000), desc="Loss: N/A") 400 | for _ in bar: 401 | data = next(iterator) 402 | self.optimizer.zero_grad() 403 | with torch.cuda.amp.autocast(enabled=self.fp16): 404 | _, _, loss = self.train_step(data) 405 | self.scaler.scale(loss).backward() 406 | self.scaler.step(self.optimizer) 407 | self.scaler.update() 408 | bar.set_description(f"Loss: {loss:.04f}") 409 | if self.ema is not None: 410 | self.ema.update() 411 | self._step_scheduler(loss) 412 | 413 | def take_step(self): 414 | data = next(self.iterator) 415 | self.optimizer.zero_grad() 416 | 417 | with torch.cuda.amp.autocast(enabled=self.fp16): 418 | _, _, loss = self.train_step(data) 419 | 420 | self.scaler.scale(loss).backward() 421 | self.scaler.step(self.optimizer) 422 | self.scaler.update() 423 | 424 | self.step += 1 425 | if self.step % 100 == 0: 426 | self.ema.update() 427 | self._step_scheduler(loss) 428 | return loss 429 | 430 | def dataset_updated(self, loader): 431 | self.loader = loader 432 | --------------------------------------------------------------------------------
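
A short, self-contained sketch (not a file from this repository) of the anchor/positive/negative contrastive objective that SimpleTrainer.compute_contrastive_loss applies when SAM-based sampling is enabled: cosine similarities between each anchor and its positive and negative features are stacked into logits, scaled by a temperature, and scored with cross-entropy against class 0, the positive. The trainer additionally detaches the positive and negative features before computing the similarities. Tensor shapes, the temperature value, and the function name sam_contrastive_loss are illustrative assumptions.

import torch
import torch.nn.functional as F

def sam_contrastive_loss(anchor, positive, negatives, temperature=0.1):
    # anchor, positive: [N, D]; negatives: [N, K, D] (shapes assumed for illustration).
    logits_pos = F.cosine_similarity(anchor, positive, dim=-1)               # [N]
    logits_neg = F.cosine_similarity(anchor[:, None, :], negatives, dim=-1)  # [N, K]
    logits = torch.cat((logits_pos[:, None], logits_neg), dim=1)             # [N, 1 + K]
    # The positive similarity sits in column 0, so the target class is 0 for every row.
    labels = torch.zeros(anchor.shape[0], dtype=torch.int64, device=anchor.device)
    return F.cross_entropy(logits / temperature, labels)

# Example with 8 anchors, 16 negatives each, and 32-dimensional features.
anchor = torch.randn(8, 32)
positive = anchor + 0.01 * torch.randn(8, 32)
negatives = torch.randn(8, 16, 32)
print(sam_contrastive_loss(anchor, positive, negatives).item())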
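
A similarly self-contained sketch of the instance-center extraction performed by SimpleTrainer.compute_instance_centers: rendered contrastive features are subsampled, clustered with HDBSCAN, and the mean of each cluster's exemplars is kept as one instance center. The synthetic feature data, dimensions, and cluster sizes below are illustrative assumptions; in the trainer the features come from rendering every training view.

import hdbscan
import numpy as np

rng = np.random.default_rng(0)
# Synthetic stand-in for per-pixel contrastive features: five well-separated clusters.
modes = 5.0 * rng.normal(size=(5, 8))
features = np.concatenate(
    [m + 0.1 * rng.normal(size=(2000, 8)) for m in modes]).astype(np.float32)

# Subsample before clustering, as the trainer does for the full feature maps.
sample = features[rng.permutation(len(features))[:5000]]
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, gen_min_span_tree=True)
clusterer.fit(sample)

# One center per cluster: the mean of that cluster's exemplar points.
instance_centers = np.vstack(
    [np.mean(exemplars, axis=0) for exemplars in clusterer.exemplars_])
print(f"{len(clusterer.exemplars_)} instance centers of dimension {instance_centers.shape[1]}")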