├── .gitignore ├── LICENSE ├── README.md ├── configs ├── Base.yaml ├── Base_Omni3D.yaml ├── Base_Omni3D_ARKitScenes.yaml ├── Base_Omni3D_KITTI.yaml ├── Base_Omni3D_SUNRGBD.yaml ├── Base_Omni3D_nuScenes.yaml ├── cubercnn_DLA34_FPN.yaml ├── cubercnn_ResNet34_FPN.yaml ├── cubercnn_densenet_FPN.yaml ├── cubercnn_mnasnet_FPN.yaml └── cubercnn_shufflenet_FPN.yaml ├── cubercnn ├── config │ ├── __init__.py │ └── config.py ├── data │ ├── __init__.py │ ├── build.py │ ├── builtin.py │ ├── dataset_mapper.py │ └── datasets.py ├── evaluation │ ├── __init__.py │ └── omni3d_evaluation.py ├── generate_label │ ├── __init__.py │ ├── priors.py │ ├── process_indoor.py │ ├── process_outdoor.py │ ├── raytrace.py │ └── util.py ├── modeling │ ├── backbone │ │ ├── __init__.py │ │ ├── densenet.py │ │ ├── dla.py │ │ ├── mnasnet.py │ │ ├── resnet.py │ │ └── shufflenet.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── rcnn3d.py │ │ └── rcnn3d_text.py │ ├── proposal_generator │ │ ├── __init__.py │ │ └── rpn.py │ └── roi_heads │ │ ├── __init__.py │ │ ├── cube_head.py │ │ ├── fast_rcnn.py │ │ ├── fast_rcnn_text.py │ │ ├── roi_heads.py │ │ └── roi_heads_text.py ├── solver │ ├── __init__.py │ ├── build.py │ └── checkpoint.py ├── util │ ├── __init__.py │ ├── math_util.py │ ├── model_zoo.py │ └── util.py └── vis │ ├── __init__.py │ ├── logperf.py │ └── vis.py ├── datasets └── Omni3D │ ├── download_omni3d_json.sh │ └── stats.json ├── docs └── teaser.png ├── scripts ├── generate_pseudo_label.sh ├── test.sh ├── train.sh └── train_KITTI.sh ├── third_party ├── Grounded-Segment-Anything │ ├── .gitignore │ ├── .gitmodules │ ├── CITATION.cff │ ├── Dockerfile │ ├── EfficientSAM │ │ ├── EdgeSAM │ │ │ ├── common.py │ │ │ ├── rep_vit.py │ │ │ └── setup_edge_sam.py │ │ ├── FastSAM │ │ │ └── tools.py │ │ ├── LightHQSAM │ │ │ ├── example_light_hqsam.png │ │ │ ├── grounded_light_hqsam_annotated_image.jpg │ │ │ ├── setup_light_hqsam.py │ │ │ └── tiny_vit_sam.py │ │ ├── MobileSAM │ │ │ ├── setup_mobile_sam.py │ │ │ └── tiny_vit_sam.py │ │ ├── README.md │ │ ├── RepViTSAM │ │ │ ├── repvit.py │ │ │ └── setup_repvit_sam.py │ │ ├── grounded_edge_sam.py │ │ ├── grounded_efficient_sam.py │ │ ├── grounded_fast_sam.py │ │ ├── grounded_light_hqsam.py │ │ ├── grounded_mobile_sam.py │ │ └── grounded_repvit_sam.py │ ├── GroundingDINO │ │ ├── .asset │ │ │ ├── COCO.png │ │ │ ├── GD_GLIGEN.png │ │ │ ├── GD_SD.png │ │ │ ├── ODinW.png │ │ │ ├── arch.png │ │ │ ├── cats.png │ │ │ └── hero_figure.png │ │ ├── LICENSE │ │ ├── README.md │ │ ├── demo │ │ │ ├── gradio_app.py │ │ │ └── inference_on_a_image.py │ │ ├── groundingdino │ │ │ ├── __init__.py │ │ │ ├── config │ │ │ │ ├── GroundingDINO_SwinB.py │ │ │ │ └── GroundingDINO_SwinT_OGC.py │ │ │ ├── datasets │ │ │ │ ├── __init__.py │ │ │ │ └── transforms.py │ │ │ ├── models │ │ │ │ ├── GroundingDINO │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── backbone │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── backbone.py │ │ │ │ │ │ ├── position_encoding.py │ │ │ │ │ │ └── swin_transformer.py │ │ │ │ │ ├── bertwarper.py │ │ │ │ │ ├── csrc │ │ │ │ │ │ ├── MsDeformAttn │ │ │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ │ ├── cuda_version.cu │ │ │ │ │ │ └── vision.cpp │ │ │ │ │ ├── fuse_modules.py │ │ │ │ │ ├── groundingdino.py │ │ │ │ │ ├── ms_deform_attn.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ ├── transformer_vanilla.py │ │ │ │ │ └── utils.py 
│ │ │ │ ├── __init__.py │ │ │ │ └── registry.py │ │ │ ├── util │ │ │ │ ├── __init__.py │ │ │ │ ├── box_ops.py │ │ │ │ ├── get_tokenlizer.py │ │ │ │ ├── inference.py │ │ │ │ ├── logger.py │ │ │ │ ├── misc.py │ │ │ │ ├── slconfig.py │ │ │ │ ├── slio.py │ │ │ │ ├── time_counter.py │ │ │ │ ├── utils.py │ │ │ │ ├── visualizer.py │ │ │ │ └── vl_utils.py │ │ │ └── version.py │ │ ├── pyproject.toml │ │ ├── requirements.txt │ │ └── setup.py │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── automatic_label_demo.py │ ├── automatic_label_ram_demo.py │ ├── automatic_label_simple_demo.py │ ├── automatic_label_tag2text_demo.py │ ├── chatbot.py │ ├── cog.yaml │ ├── grounded_sam_detect.py │ ├── grounded_sam_detect_ground.py │ ├── playground │ │ ├── DeepFloyd │ │ │ ├── README.md │ │ │ ├── dream.py │ │ │ ├── inpaint.py │ │ │ └── style_transfer.py │ │ ├── ImageBind_SAM │ │ │ ├── .assets │ │ │ │ ├── bird_audio.wav │ │ │ │ ├── bird_image.jpg │ │ │ │ ├── car_audio.wav │ │ │ │ ├── car_image.jpg │ │ │ │ ├── dog_audio.wav │ │ │ │ └── dog_image.jpg │ │ │ ├── README.md │ │ │ ├── audio_referring_seg_demo.py │ │ │ ├── bpe │ │ │ │ └── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── data.py │ │ │ ├── demo.py │ │ │ ├── image_referring_seg_demo.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── helpers.py │ │ │ │ ├── imagebind_model.py │ │ │ │ ├── multimodal_preprocessors.py │ │ │ │ └── transformer.py │ │ │ ├── text_referring_seg_demo.py │ │ │ └── utils.py │ │ ├── LaMa │ │ │ ├── README.md │ │ │ ├── lama_inpaint_demo.py │ │ │ └── sam_lama.py │ │ ├── PaintByExample │ │ │ ├── README.md │ │ │ ├── paint_by_example.py │ │ │ └── sam_paint_by_example.py │ │ ├── README.md │ │ └── RePaint │ │ │ ├── README.md │ │ │ └── repaint.py │ ├── predict.py │ ├── requirements.txt │ └── segment_anything │ │ ├── .flake8 │ │ ├── CODE_OF_CONDUCT.md │ │ ├── CONTRIBUTING.md │ │ ├── LICENSE │ │ ├── README.md │ │ ├── assets │ │ ├── masks1.png │ │ ├── masks2.jpg │ │ ├── model_diagram.png │ │ ├── notebook1.png │ │ └── notebook2.png │ │ ├── linter.sh │ │ ├── notebooks │ │ └── images │ │ │ ├── dog.jpg │ │ │ ├── groceries.jpg │ │ │ └── truck.jpg │ │ ├── scripts │ │ ├── amg.py │ │ └── export_onnx_model.py │ │ ├── segment_anything │ │ ├── __init__.py │ │ ├── automatic_mask_generator.py │ │ ├── build_sam.py │ │ ├── build_sam_hq.py │ │ ├── modeling │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── image_encoder.py │ │ │ ├── mask_decoder.py │ │ │ ├── mask_decoder_hq.py │ │ │ ├── prompt_encoder.py │ │ │ ├── sam.py │ │ │ └── transformer.py │ │ ├── predictor.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── amg.py │ │ │ ├── onnx.py │ │ │ └── transforms.py │ │ ├── setup.cfg │ │ └── setup.py └── UniDepth │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── assets │ ├── demo │ │ ├── depth.png │ │ ├── intrinsics.npy │ │ ├── output.png │ │ └── rgb.png │ └── docs │ │ ├── V2_README.md │ │ ├── nuscenes_surround.gif │ │ ├── theoffice.gif │ │ └── unidepth-banner.png │ ├── configs │ ├── config_v1_cnvnxtl.json │ ├── config_v1_vitl14.json │ ├── config_v2_vitl14.json │ └── config_v2_vits14.json │ ├── hubconf.py │ ├── pyproject.toml │ ├── requirements.txt │ ├── run_unidepth.py │ ├── scripts │ └── demo.py │ └── unidepth │ ├── layers │ ├── __init__.py │ ├── activation.py │ ├── attention.py │ ├── convnext.py │ ├── drop_path.py │ ├── layer_scale.py │ ├── mlp.py │ ├── nystrom_attention.py │ ├── positional_encoding.py │ └── upsample.py │ ├── models │ ├── __init__.py │ ├── backbones │ │ ├── __init__.py │ │ ├── convnext.py │ │ ├── convnext2.py │ │ ├── dinov2.py │ │ └── metadinov2 │ │ │ ├── 
__init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── dino_head.py │ │ │ ├── drop_path.py │ │ │ ├── layer_scale.py │ │ │ ├── mlp.py │ │ │ ├── patch_embed.py │ │ │ └── swiglu_ffn.py │ ├── encoder.py │ ├── unidepthv1 │ │ ├── __init__.py │ │ ├── decoder.py │ │ └── unidepthv1.py │ └── unidepthv2 │ │ ├── __init__.py │ │ ├── decoder.py │ │ ├── decoder_old.py │ │ ├── export.py │ │ └── unidepthv2.py │ ├── ops │ ├── __init__.py │ ├── losses.py │ └── scheduler.py │ └── utils │ ├── __init__.py │ ├── constants.py │ ├── distributed.py │ ├── ema_torch.py │ ├── evaluation_depth.py │ ├── geometric.py │ ├── misc.py │ ├── positional_embedding.py │ ├── sht.py │ └── visualization.py └── tools ├── __init__.py ├── generate_pseudo_bbox.py ├── train_net.py └── transform_to_coco.py /.gitignore: -------------------------------------------------------------------------------- 1 | # folders or files 2 | datasets/* 3 | !datasets/Omni3D 4 | .vscode/ 5 | .ipynb_checkpoints/ 6 | .idea/ 7 | output/* 8 | checkpoints/* 9 | pseudo_label/* 10 | third_party/detectron2 11 | 12 | cubercnn/external/ 13 | 14 | # filetypes 15 | *.pyc 16 | *.mexa64 17 | */output/* 18 | */output*/* 19 | *~ 20 | *.so 21 | *.ipynb 22 | -------------------------------------------------------------------------------- /configs/Base.yaml: -------------------------------------------------------------------------------- 1 | SOLVER: 2 | TYPE: "sgd" 3 | IMS_PER_BATCH: 32 4 | BASE_LR: 0.02 5 | STEPS: (19200, 25600) 6 | MAX_ITER: 32000 7 | WEIGHT_DECAY: 0.0001 8 | LR_SCHEDULER_NAME: "WarmupMultiStepLR" 9 | INPUT: 10 | MIN_SIZE_TRAIN: (256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640,) 11 | MIN_SIZE_TEST: 512 12 | MAX_SIZE_TRAIN: 4096 13 | MAX_SIZE_TEST: 4096 14 | TEST: 15 | VISIBILITY_THRES: 0.33333333 16 | TRUNCATION_THRES: 0.33333333 17 | EVAL_PERIOD: 16000 18 | DATASETS: 19 | TRAIN: ('KITTI_train', 'KITTI_val') 20 | TEST: ('KITTI_test',) 21 | CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck', 'tram', 'person') 22 | IGNORE_NAMES: "['dontcare', 'ignore', 'void']" 23 | MIN_HEIGHT_THRES: 0.05 24 | TRUNCATION_THRES: 0.75 25 | VISIBILITY_THRES: 0.25 26 | TRUNC_2D_BOXES: True 27 | VIS_PERIOD: 640 28 | DATALOADER: 29 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 30 | REPEAT_THRESHOLD: 0.1 31 | MODEL: 32 | PIXEL_MEAN: [103.530, 116.280, 123.675] 33 | PIXEL_STD: [57.375, 57.120, 58.395] 34 | META_ARCHITECTURE: "RCNN3D" 35 | MASK_ON: False 36 | STABILIZE: 0.02 37 | USE_BN: True 38 | BACKBONE: 39 | FREEZE_AT: 0 40 | NAME: 'build_dla_from_vision_fpn_backbone' 41 | DLA: 42 | TYPE: 'dla34' 43 | FPN: 44 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] 45 | ANCHOR_GENERATOR: 46 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 47 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 48 | RPN: 49 | HEAD_NAME: "StandardRPNHead" 50 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] 51 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 52 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 53 | POST_NMS_TOPK_TRAIN: 1000 54 | POST_NMS_TOPK_TEST: 1000 55 | BOUNDARY_THRESH: -1 56 | OBJECTNESS_UNCERTAINTY: "IoUness" 57 | IOU_THRESHOLDS: [0.05, 0.05] 58 | POSITIVE_FRACTION: 1.0 59 | PROPOSAL_GENERATOR: 60 | NAME: "RPNWithIgnore" 61 | ROI_HEADS: 62 | NAME: "ROIHeads3D" 63 | IN_FEATURES: ["p2", "p3", "p4", "p5", 'p6'] 64 | BATCH_SIZE_PER_IMAGE: 512 65 | SCORE_THRESH_TEST: 0.01 66 | NUM_CLASSES: 43 67 | ROI_BOX_HEAD: 68 | NAME: "FastRCNNConvFCHead" 69 | 
NUM_FC: 2 70 | POOLER_RESOLUTION: 7 71 | ROI_CUBE_HEAD: 72 | NAME: 'CubeHead' 73 | Z_TYPE: 'direct' 74 | POSE_TYPE: '6d' 75 | NUM_FC: 2 76 | SHARED_FC: True 77 | USE_CONFIDENCE: 1.0 78 | LOSS_W_3D: 1.0 79 | POOLER_TYPE: 'ROIAlignV2' 80 | POOLER_RESOLUTION: 7 81 | DIMS_PRIORS_ENABLED: True 82 | DISENTANGLED_LOSS: True 83 | ALLOCENTRIC_POSE: True 84 | VIRTUAL_FOCAL: 512.0 85 | VIRTUAL_DEPTH: True 86 | CHAMFER_POSE: True 87 | VERSION: 2 -------------------------------------------------------------------------------- /configs/Base_Omni3D.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | TEST: 10 | EVAL_PERIOD: 29000 11 | VIS_PERIOD: 2320 12 | DATASETS: 13 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 14 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 15 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 16 | MODEL: 17 | ROI_HEADS: 18 | NUM_CLASSES: 50 -------------------------------------------------------------------------------- /configs/Base_Omni3D_ARKitScenes.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 32 5 | BASE_LR: 0.02 6 | STEPS: (17400, 23200) 7 | MAX_ITER: 29000 8 | WARMUP_ITERS: 906 9 | TEST: 10 | EVAL_PERIOD: 7250 11 | VIS_PERIOD: 580 12 | DATASETS: 13 | TRAIN: ('ARKitScenes_train', 'ARKitScenes_val') 14 | TEST: ('ARKitScenes_test',) 15 | CATEGORY_NAMES: ('bed', 'table', 'chair', 'fireplace', 'machine', 'cabinet', 'oven', 'shelves', 'sink', 'stove', 'bathtub', 'toilet', 'sofa', 'television', 'refrigerator') 16 | FOLDER_NAME: 'Omni3D_pl' 17 | MODEL: 18 | META_ARCHITECTURE: RCNN3D_text 19 | ROI_HEADS: 20 | NAME : ROIHeads3D_Text 21 | NUM_CLASSES: 15 22 | STABILIZE: 0.5 23 | 24 | -------------------------------------------------------------------------------- /configs/Base_Omni3D_KITTI.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 16 5 | BASE_LR: 0.01 6 | STEPS: (17400, 23200) 7 | MAX_ITER: 29000 8 | WARMUP_ITERS: 906 9 | TEST: 10 | EVAL_PERIOD: 7250 11 | VIS_PERIOD: 580 12 | DATASETS: 13 | TRAIN: ('KITTI_train', 'KITTI_val') 14 | TEST: ('KITTI_test',) 15 | CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck') 16 | FOLDER_NAME: 'Omni3D_pl' 17 | MODEL: 18 | META_ARCHITECTURE: RCNN3D_text 19 | ROI_HEADS: 20 | NAME : ROIHeads3D_Text 21 | NUM_CLASSES: 5 22 | STABILIZE: 0.5 -------------------------------------------------------------------------------- /configs/Base_Omni3D_SUNRGBD.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 32 5 | BASE_LR: 0.02 6 | STEPS: (17400, 23200) 7 | MAX_ITER: 29000 8 | WARMUP_ITERS: 906 9 | TEST: 10 | EVAL_PERIOD: 7250 11 | VIS_PERIOD: 580 12 | DATASETS: 13 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val') 14 | TEST: ('SUNRGBD_test',) 15 | CATEGORY_NAMES: ('bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine') 16 | FOLDER_NAME: 'Omni3D_pl' 17 | MODEL: 18 | META_ARCHITECTURE: RCNN3D_text 19 | ROI_HEADS: 20 | NAME : ROIHeads3D_Text 21 | NUM_CLASSES: 38 22 | STABILIZE: 0.5 -------------------------------------------------------------------------------- /configs/Base_Omni3D_nuScenes.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 32 5 | BASE_LR: 0.01 6 | STEPS: (17400, 23200) 7 | MAX_ITER: 29000 8 | WARMUP_ITERS: 906 9 | TEST: 10 | EVAL_PERIOD: 7250 11 | VIS_PERIOD: 580 12 | DATASETS: 13 | TRAIN: ('nuScenes_train', 'nuScenes_val') 14 | TEST: ('nuScenes_test',) 15 | CATEGORY_NAMES: ('pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer') 16 | FOLDER_NAME: 'Omni3D_pl' 17 | MODEL: 18 | META_ARCHITECTURE: RCNN3D_text 19 | ROI_HEADS: 20 | NAME : ROIHeads3D_Text 21 | NUM_CLASSES: 9 22 | STABILIZE: 0.5 -------------------------------------------------------------------------------- /configs/cubercnn_DLA34_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_dla_from_vision_fpn_backbone' 5 | DLA: 6 | TYPE: 'dla34' -------------------------------------------------------------------------------- /configs/cubercnn_ResNet34_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_resnet_from_vision_fpn_backbone' 5 | RESNETS: 6 | DEPTH: 34 7 | TORCHVISION: True -------------------------------------------------------------------------------- /configs/cubercnn_densenet_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_densenet_fpn_backbone' -------------------------------------------------------------------------------- /configs/cubercnn_mnasnet_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_mnasnet_fpn_backbone' -------------------------------------------------------------------------------- /configs/cubercnn_shufflenet_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_shufflenet_fpn_backbone' -------------------------------------------------------------------------------- /cubercnn/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import * 
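
The YAML configs listed above are layered: each `Base_Omni3D_*.yaml` declares `_BASE_: "Base.yaml"` and only overrides solver, dataset, and head settings. Below is a minimal sketch of how detectron2 resolves that inheritance, assuming stock detectron2 only; the real entry point (`tools/train_net.py`) presumably goes through `cubercnn/config/config.py` (not excerpted here) to register the custom keys, and `set_new_allowed(True)` is just a stand-in for that step.

```python
# Minimal sketch of the config inheritance, assuming stock detectron2 only.
# Custom keys such as MODEL.ROI_CUBE_HEAD, MODEL.STABILIZE and
# DATASETS.CATEGORY_NAMES are presumably registered by cubercnn.config in the
# real code path; set_new_allowed(True) approximates that and may not cover
# every corner case.
from detectron2.config import get_cfg

cfg = get_cfg()
cfg.set_new_allowed(True)  # tolerate keys absent from detectron2's defaults

# Base_Omni3D_KITTI.yaml declares `_BASE_: "Base.yaml"`, so detectron2 loads
# Base.yaml first and then applies the KITTI-specific overrides on top.
cfg.merge_from_file("configs/Base_Omni3D_KITTI.yaml", allow_unsafe=True)

print(cfg.DATASETS.TRAIN)               # ('KITTI_train', 'KITTI_val')
print(cfg.MODEL.ROI_HEADS.NUM_CLASSES)  # 5 categories in the KITTI config
print(cfg.MODEL.META_ARCHITECTURE)      # 'RCNN3D_text'
```
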
-------------------------------------------------------------------------------- /cubercnn/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .dataset_mapper import * 3 | from .build import * 4 | from .builtin import * -------------------------------------------------------------------------------- /cubercnn/data/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | 3 | def get_omni3d_categories(dataset="omni3d"): 4 | """ 5 | Returns the Omni3D categories for dataset 6 | Args: 7 | dataset: str 8 | Returns: 9 | cats: set of strings with category names 10 | """ 11 | 12 | if dataset == "omni3d": 13 | cats = set({'chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin'}) 14 | assert len(cats) == 50 15 | elif dataset == "omni3d_in": 16 | cats = set({'stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet'}) 17 | assert len(cats) == 38 18 | elif dataset == "omni3d_out": 19 | cats = set({'cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle'}) 20 | assert len(cats) == 11 21 | elif dataset in ["SUNRGBD_train", "SUNRGBD_val", "SUNRGBD_test"]: 22 | cats = set({'bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine'}) 23 | assert len(cats) == 38 24 | elif dataset in ["Hypersim_train", "Hypersim_val"]: 25 | cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) 26 | assert len(cats) == 29 27 | elif dataset == "Hypersim_test": 28 | # Hypersim test annotation does not contain toilet 29 | cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) 30 | assert len(cats) == 28 31 | elif dataset in ["ARKitScenes_train", "ARKitScenes_val", "ARKitScenes_test"]: 32 | cats = set({'table', 'bed', 'sofa', 'television', 'refrigerator', 'chair', 'oven', 'machine', 'stove', 'shelves', 'sink', 
'cabinet', 'bathtub', 'toilet'}) 33 | assert len(cats) == 14 34 | elif dataset in ["Objectron_train", "Objectron_val", "Objectron_test"]: 35 | cats = set({'bicycle', 'books', 'bottle', 'camera', 'cereal box', 'chair', 'cup', 'laptop', 'shoes'}) 36 | assert len(cats) == 9 37 | elif dataset in ["KITTI_train", "KITTI_val", "KITTI_test"]: 38 | cats = set({'pedestrian', 'car', 'cyclist', 'van', 'truck'}) 39 | assert len(cats) == 5 40 | elif dataset in ["nuScenes_train", "nuScenes_val", "nuScenes_test"]: 41 | cats = set({'pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer'}) 42 | assert len(cats) == 9 43 | else: 44 | raise ValueError("%s dataset is not registered." % (dataset)) 45 | 46 | return cats -------------------------------------------------------------------------------- /cubercnn/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .omni3d_evaluation import * -------------------------------------------------------------------------------- /cubercnn/generate_label/__init__.py: -------------------------------------------------------------------------------- 1 | from .process_indoor import * 2 | from .process_outdoor import * 3 | from .priors import * -------------------------------------------------------------------------------- /cubercnn/generate_label/priors.py: -------------------------------------------------------------------------------- 1 | llm_generated_prior = { 2 | 'SUNRGBD': {'bicycle': [0.5, 1, 1.5], 'books': [0.2, 0.1, 0.3], 'bottle': [0.1, 0.3, 0.1], 'chair': [0.5, 1, 0.5], 'cup': [0.1, 0.1, 0.1], 'laptop': [0.3, 0.1, 0.4], 'shoes': [0.2, 0.1, 0.3], 'towel': [0.2, 0.1, 0.3], 'blinds': [0.1, 1, 1.5], 'window': [0.1, 1, 1.5], 'lamp': [0.3, 0.6, 0.3], 'shelves': [0.3, 1.5, 1.5], 'mirror': [0.1, 1, 0.5], 'sink': [0.5, 0.2, 0.8], 'cabinet': [0.5, 1.5, 1], 'bathtub': [0.8, 0.5, 1.5], 'door': [0.1, 2, 1], 'toilet': [0.4, 0.8, 0.5], 'desk': [0.6, 0.8, 1.2], 'box': [0.5, 0.5, 0.5], 'bookcase': [0.3, 2, 1], 'picture': [0.1, 0.5, 0.5], 'table': [0.8, 0.8, 1.5], 'counter': [0.6, 1, 1.5], 'bed': [1.5, 0.5, 2], 'night stand': [0.4, 0.5, 0.5], 'pillow': [0.3, 0.3, 0.5], 'sofa': [1, 1, 2], 'television': [1, 0.5, 0.1], 'floor mat': [1, 0.1, 1.5], 'curtain': [0.1, 1.5, 1], 'clothes': [0.5, 1, 0.5], 'stationery': [0.3, 0.3, 0.3], 'refrigerator': [0.8, 1.5, 0.8], 'bin': [0.5, 0.5, 0.5], 'stove': [0.6, 0.8, 0.8], 'oven': [0.6, 0.8, 0.8], 'machine': [0.8, 1, 1]}, 3 | 'KITTI': { "car": [1.8, 1.5, 4.5], "van": [2.0, 2.0, 5.0], "truck": [2.5, 3.5, 10.0], "pedestrian": [0.5, 1.7, 0.8], "cyclist": [0.6, 1.7, 1.5]}, 4 | 'ARKitScenes': {'refrigerator': [0.8, 1.5, 0.8], 'chair': [0.5, 1, 0.5], 'oven': [0.6, 0.8, 0.8], 'machine': [0.8, 1, 1], 'stove': [0.6, 0.8, 0.8], 'shelves': [0.3, 1.5, 1.5], 'sink': [0.5, 0.2, 0.8], 'cabinet': [0.5, 1.5, 1], 'bathtub': [0.8, 0.5, 1.5], 'toilet': [0.4, 0.8, 0.5], 'table': [0.8, 0.8, 1.5], 'bed': [1.5, 0.5, 2], 'sofa': [1, 1, 2], 'television': [1, 0.5, 0.1]}, 5 | 'nuScenes': { "pedestrian": [0.5, 1.7, 0.8], "car": [1.8, 1.5, 4.5], "truck": [2.5, 3.5, 8.0], "traffic cone": [0.3, 0.7, 0.3], "barrier": [0.5, 2.0, 2.0], "motorcycle": [0.8, 1.2, 2.0], "bicycle": [0.6, 1.2, 1.8], "bus": [2.75, 3.5, 11.0], "trailer": [2.75, 3.25, 11.0]}, 6 | } -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .densenet import * 
2 | from .mnasnet import * 3 | from .resnet import * 4 | from .shufflenet import * 5 | from .dla import * -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/densenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class DenseNetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.densenet121(pretrained) 15 | base = base.features 16 | 17 | self.base = base 18 | 19 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 1024, 'p6': 1024} 20 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 21 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 22 | 23 | def forward(self, x): 24 | 25 | outputs = {} 26 | 27 | db1 = self.base[0:5](x) 28 | db2 = self.base[5:7](db1) 29 | db3 = self.base[7:9](db2) 30 | p5 = self.base[9:](db3) 31 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 32 | outputs['p2'] = db1 33 | outputs['p3'] = db2 34 | outputs['p4'] = db3 35 | outputs['p5'] = p5 36 | outputs['p6'] = p6 37 | 38 | return outputs 39 | 40 | 41 | @BACKBONE_REGISTRY.register() 42 | def build_densenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 43 | """ 44 | Args: 45 | cfg: a detectron2 CfgNode 46 | 47 | Returns: 48 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 49 | """ 50 | 51 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 52 | 53 | bottom_up = DenseNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 54 | in_features = cfg.MODEL.FPN.IN_FEATURES 55 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 56 | 57 | backbone = FPN( 58 | bottom_up=bottom_up, 59 | in_features=in_features, 60 | out_channels=out_channels, 61 | norm=cfg.MODEL.FPN.NORM, 62 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE 63 | ) 64 | return backbone -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/mnasnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class MNASNetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.mnasnet1_0(pretrained) 15 | base = base.layers 16 | 17 | self.base = base 18 | 19 | self._out_feature_channels = {'p2': 24, 'p3': 40, 'p4': 96, 'p5': 320, 'p6': 320} 20 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 21 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 22 | 23 | def forward(self, x): 24 | 25 | outputs = {} 26 | 27 | p2 = self.base[0:9](x) 28 | p3 = self.base[9](p2) 29 | p4 = self.base[10:12](p3) 30 | p5 = self.base[12:14](p4) 31 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 32 | outputs['p2'] = p2 33 | outputs['p3'] = p3 34 | outputs['p4'] = p4 35 | outputs['p5'] = p5 36 | outputs['p6'] = p6 37 | 38 | return outputs 39 | 40 | @BACKBONE_REGISTRY.register() 41 | def build_mnasnet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 42 | """ 43 | Args: 44 | cfg: a detectron2 CfgNode 45 | 46 | Returns: 47 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 48 | """ 49 | 50 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 51 | 52 | bottom_up = MNASNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 53 | in_features = cfg.MODEL.FPN.IN_FEATURES 54 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 55 | 56 | backbone = FPN( 57 | bottom_up=bottom_up, 58 | in_features=in_features, 59 | out_channels=out_channels, 60 | norm=cfg.MODEL.FPN.NORM, 61 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 62 | ) 63 | return backbone 64 | -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/resnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 6 | from detectron2.modeling.backbone.resnet import build_resnet_backbone 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | import torch.nn.functional as F 9 | 10 | from detectron2.modeling.backbone.fpn import FPN 11 | 12 | class ResNet(Backbone): 13 | def __init__(self, cfg, input_shape, pretrained=True): 14 | super().__init__() 15 | 16 | if cfg.MODEL.RESNETS.DEPTH == 18: 17 | base = models.resnet18(pretrained) 18 | self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} 19 | elif cfg.MODEL.RESNETS.DEPTH == 34: 20 | base = models.resnet34(pretrained) 21 | self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} 22 | elif cfg.MODEL.RESNETS.DEPTH == 50: 23 | base = models.resnet50(pretrained) 24 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} 25 | elif cfg.MODEL.RESNETS.DEPTH == 101: 26 | base = models.resnet101(pretrained) 27 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} 28 | else: 29 | raise ValueError('No configuration currently supporting depth of {}'.format(cfg.MODEL.RESNETS.DEPTH)) 30 | 31 | self.conv1 = base.conv1 32 | self.bn1 = base.bn1 33 | self.relu = base.relu 34 | self.maxpool = base.maxpool 35 | self.layer1 = base.layer1 36 | self.layer2 = base.layer2 37 | self.layer3 = base.layer3 38 | self.layer4 = base.layer4 39 | 40 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 41 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 42 | 43 | def forward(self, x): 44 | 45 | outputs = {} 46 | 47 | x = self.conv1(x) 48 | x = self.bn1(x) 49 | x = self.relu(x) 50 | x = self.maxpool(x) 51 | p2 = self.layer1(x) 52 | p3 = self.layer2(p2) 53 | p4 = self.layer3(p3) 54 | p5 = self.layer4(p4) 55 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 56 | 57 | outputs['p2'] = p2 58 | outputs['p3'] = p3 59 | outputs['p4'] = p4 60 | outputs['p5'] = p5 61 | outputs['p6'] = p6 62 | 63 | return outputs 64 | 65 | 66 | @BACKBONE_REGISTRY.register() 67 | def build_resnet_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 68 | """ 69 | Args: 70 | cfg: a detectron2 CfgNode 71 | 72 | Returns: 73 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 74 | """ 75 | 76 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 77 | 78 | if cfg.MODEL.RESNETS.TORCHVISION: 79 | bottom_up = ResNet(cfg, input_shape, pretrained=imagenet_pretrain) 80 | 81 | else: 82 | # use the MSRA modeling logic to build the backbone. 83 | bottom_up = build_resnet_backbone(cfg, input_shape) 84 | 85 | in_features = cfg.MODEL.FPN.IN_FEATURES 86 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 87 | 88 | backbone = FPN( 89 | bottom_up=bottom_up, 90 | in_features=in_features, 91 | out_channels=out_channels, 92 | norm=cfg.MODEL.FPN.NORM, 93 | top_block=LastLevelMaxPool(), 94 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 95 | ) 96 | return backbone 97 | -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/shufflenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class ShufflenetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.shufflenet_v2_x1_0(pretrained) 15 | self.conv1 = base.conv1 16 | self.maxpool = base.maxpool 17 | self.stage2 = base.stage2 18 | self.stage3 = base.stage3 19 | self.stage4 = base.stage4 20 | self.conv5 = base.conv5 21 | 22 | self._out_feature_channels = {'p2': 24, 'p3': 116, 'p4': 232, 'p5': 464, 'p6': 464} 23 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 24 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 25 | 26 | def forward(self, x): 27 | 28 | outputs = {} 29 | 30 | x = self.conv1(x) 31 | p2 = self.maxpool(x) 32 | p3 = self.stage2(p2) 33 | p4 = self.stage3(p3) 34 | p5 = self.stage4(p4) 35 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 36 | 37 | outputs['p2'] = p2 38 | outputs['p3'] = p3 39 | outputs['p4'] = p4 40 | outputs['p5'] = p5 41 | outputs['p6'] = p6 42 | 43 | return outputs 44 | 45 | 46 | @BACKBONE_REGISTRY.register() 47 | def build_shufflenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 48 | """ 49 | Args: 50 | cfg: a detectron2 CfgNode 51 | 52 | Returns: 53 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 54 | """ 55 | 56 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 57 | 58 | bottom_up = ShufflenetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 59 | in_features = cfg.MODEL.FPN.IN_FEATURES 60 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 61 | 62 | backbone = FPN( 63 | bottom_up=bottom_up, 64 | in_features=in_features, 65 | out_channels=out_channels, 66 | norm=cfg.MODEL.FPN.NORM, 67 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 68 | ) 69 | return backbone 70 | -------------------------------------------------------------------------------- /cubercnn/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # from .rcnn3d import * 2 | from .rcnn3d_text import * -------------------------------------------------------------------------------- /cubercnn/modeling/proposal_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .rpn import * 2 | -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # from .roi_heads import * 2 | from .roi_heads_text import * -------------------------------------------------------------------------------- /cubercnn/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import * 2 | from .checkpoint import * -------------------------------------------------------------------------------- /cubercnn/solver/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | import torch 3 | from typing import Any, Dict, List, Set 4 | from detectron2.solver.build import maybe_add_gradient_clipping 5 | 6 | def build_optimizer(cfg, model): 7 | norm_module_types = ( 8 | torch.nn.BatchNorm1d, 9 | torch.nn.BatchNorm2d, 10 | torch.nn.BatchNorm3d, 11 | torch.nn.SyncBatchNorm, 12 | torch.nn.GroupNorm, 13 | torch.nn.InstanceNorm1d, 14 | torch.nn.InstanceNorm2d, 15 | torch.nn.InstanceNorm3d, 16 | torch.nn.LayerNorm, 17 | torch.nn.LocalResponseNorm, 18 | ) 19 | params: List[Dict[str, Any]] = [] 20 | memo: Set[torch.nn.parameter.Parameter] = set() 21 | for module in model.modules(): 22 | for key, value in module.named_parameters(recurse=False): 23 | if not value.requires_grad: 24 | continue 25 | # Avoid duplicating parameters 26 | if value in memo: 27 | continue 28 | memo.add(value) 29 | 30 | lr = cfg.SOLVER.BASE_LR 31 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 32 | 33 | if isinstance(module, norm_module_types) and (cfg.SOLVER.WEIGHT_DECAY_NORM is not None): 34 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM 35 | 36 | elif key == "bias": 37 | if (cfg.SOLVER.BIAS_LR_FACTOR is not None): 38 | lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR 39 | if (cfg.SOLVER.WEIGHT_DECAY_BIAS is not None): 40 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS 41 | 42 | # these params do not need weight decay at all 43 | # TODO parameterize these in configs instead. 44 | if key in ['priors_dims_per_cat', 'priors_z_scales', 'priors_z_stats']: 45 | weight_decay = 0.0 46 | 47 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 48 | 49 | if cfg.SOLVER.TYPE == 'sgd': 50 | optimizer = torch.optim.SGD( 51 | params, 52 | cfg.SOLVER.BASE_LR, 53 | momentum=cfg.SOLVER.MOMENTUM, 54 | nesterov=cfg.SOLVER.NESTEROV, 55 | weight_decay=cfg.SOLVER.WEIGHT_DECAY 56 | ) 57 | elif cfg.SOLVER.TYPE == 'adam': 58 | optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, eps=1e-02) 59 | elif cfg.SOLVER.TYPE == 'adam+amsgrad': 60 | optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02) 61 | elif cfg.SOLVER.TYPE == 'adamw': 62 | optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, eps=1e-02) 63 | elif cfg.SOLVER.TYPE == 'adamw+amsgrad': 64 | optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02) 65 | else: 66 | raise ValueError('{} is not supported as an optimizer.'.format(cfg.SOLVER.TYPE)) 67 | 68 | optimizer = maybe_add_gradient_clipping(cfg, optimizer) 69 | return optimizer 70 | 71 | def freeze_bn(network): 72 | 73 | for _, module in network.named_modules(): 74 | if isinstance(module, torch.nn.BatchNorm2d): 75 | module.eval() 76 | module.track_running_stats = False 77 | -------------------------------------------------------------------------------- /cubercnn/solver/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.checkpoint import PeriodicCheckpointer 3 | from typing import Any 4 | 5 | class PeriodicCheckpointerOnlyOne(PeriodicCheckpointer): 6 | def step(self, iteration: int, **kwargs: Any) -> None: 7 | """ 8 | Perform the appropriate action at the given iteration. 9 | 10 | Args: 11 | iteration (int): the current iteration, ranged in [0, max_iter-1]. 12 | kwargs (Any): extra data to save, same as in 13 | :meth:`Checkpointer.save`. 
14 | """ 15 | iteration = int(iteration) 16 | additional_state = {"iteration": iteration} 17 | additional_state.update(kwargs) 18 | 19 | if (iteration + 1) % self.period == 0: 20 | 21 | # simply save a single recent model 22 | self.checkpointer.save( 23 | "{}_recent".format(self.file_prefix), **additional_state 24 | ) 25 | 26 | if self.max_iter is not None: 27 | if iteration >= self.max_iter - 1: 28 | self.checkpointer.save(f"{self.file_prefix}_final", **additional_state) -------------------------------------------------------------------------------- /cubercnn/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .model_zoo import * 3 | from .math_util import * -------------------------------------------------------------------------------- /cubercnn/util/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.utils.file_io import PathHandler, PathManager 3 | 4 | __all__ = ["CubeRCNNHandler"] 5 | 6 | class CubeRCNNHandler(PathHandler): 7 | """ 8 | Resolves CubeRCNN's model zoo files. 9 | """ 10 | 11 | PREFIX = "cubercnn://" 12 | CUBERCNN_PREFIX = "https://dl.fbaipublicfiles.com/cubercnn/" 13 | 14 | def _get_supported_prefixes(self): 15 | return [self.PREFIX] 16 | 17 | def _get_local_path(self, path): 18 | name = path[len(self.PREFIX) :] 19 | return PathManager.get_local_path(self.CUBERCNN_PREFIX + name) 20 | 21 | def _open(self, path, mode="r", **kwargs): 22 | return PathManager.open(self._get_local_path(path), mode, **kwargs) 23 | 24 | 25 | PathManager.register_handler(CubeRCNNHandler()) -------------------------------------------------------------------------------- /cubercnn/vis/__init__.py: -------------------------------------------------------------------------------- 1 | from .vis import * -------------------------------------------------------------------------------- /cubercnn/vis/logperf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from termcolor import colored 3 | import itertools 4 | from tabulate import tabulate 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def print_ap_category_histogram(dataset, results): 10 | """ 11 | Prints AP performance for each category. 12 | Args: 13 | results: dictionary; each entry contains information for a dataset 14 | """ 15 | num_classes = len(results) 16 | N_COLS = 9 17 | data = list( 18 | itertools.chain( 19 | *[ 20 | [ 21 | cat, 22 | out["AP2D"], 23 | out["AP3D"], 24 | ] 25 | for cat, out in results.items() 26 | ] 27 | ) 28 | ) 29 | data.extend([None] * (N_COLS - (len(data) % N_COLS))) 30 | data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) 31 | table = tabulate( 32 | data, 33 | headers=["category", "AP2D", "AP3D"] * (N_COLS // 2), 34 | tablefmt="pipe", 35 | numalign="left", 36 | stralign="center", 37 | ) 38 | logger.info( 39 | "Performance for each of {} categories on {}:\n".format(num_classes, dataset) 40 | + colored(table, "cyan") 41 | ) 42 | 43 | 44 | def print_ap_analysis_histogram(results): 45 | """ 46 | Prints AP performance for various IoU thresholds and (near, medium, far) objects. 47 | Args: 48 | results: dictionary. 
Each entry in results contains outputs for a dataset 49 | """ 50 | metric_names = ["AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F"] 51 | N_COLS = 10 52 | data = [] 53 | for name, metrics in results.items(): 54 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"], metrics["AP3D@15"], metrics["AP3D@25"], metrics["AP3D@50"], metrics["AP3D-N"], metrics["AP3D-M"], metrics["AP3D-F"]] 55 | data.append(data_item) 56 | table = tabulate( 57 | data, 58 | headers=["Dataset", "#iters", "AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F"], 59 | tablefmt="grid", 60 | numalign="left", 61 | stralign="center", 62 | ) 63 | logger.info( 64 | "Per-dataset performance analysis on test set:\n" 65 | + colored(table, "cyan") 66 | ) 67 | 68 | 69 | def print_ap_dataset_histogram(results): 70 | """ 71 | Prints AP performance for each dataset. 72 | Args: 73 | results: list of dicts. Each entry in results contains outputs for a dataset 74 | """ 75 | metric_names = ["AP2D", "AP3D"] 76 | N_COLS = 4 77 | data = [] 78 | for name, metrics in results.items(): 79 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"]] 80 | data.append(data_item) 81 | table = tabulate( 82 | data, 83 | headers=["Dataset", "#iters", "AP2D", "AP3D"], 84 | tablefmt="grid", 85 | numalign="left", 86 | stralign="center", 87 | ) 88 | logger.info( 89 | "Per-dataset performance on test set:\n" 90 | + colored(table, "cyan") 91 | ) 92 | 93 | 94 | def print_ap_omni_histogram(results): 95 | """ 96 | Prints AP performance for Omni3D dataset. 97 | Args: 98 | results: list of dicts. Each entry in results contains outputs for a dataset 99 | """ 100 | metric_names = ["AP2D", "AP3D"] 101 | N_COLS = 4 102 | data = [] 103 | for name, metrics in results.items(): 104 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"]] 105 | data.append(data_item) 106 | table = tabulate( 107 | data, 108 | headers=["Dataset", "#iters", "AP2D", "AP3D"], 109 | tablefmt="grid", 110 | numalign="left", 111 | stralign="center", 112 | ) 113 | logger.info("Omni3D performance on test set. The numbers below should be used to compare to others approaches on Omni3D, such as Cube R-CNN") 114 | logger.info( 115 | "Performance on Omni3D:\n" 116 | + colored(table, "magenta") 117 | ) 118 | -------------------------------------------------------------------------------- /datasets/Omni3D/download_omni3d_json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Meta, Inc. and its affiliates. 
All Rights Reserved 4 | 5 | wget https://dl.fbaipublicfiles.com/omni3d_data/Omni3D_json.zip 6 | unzip Omni3D_json.zip -------------------------------------------------------------------------------- /docs/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/docs/teaser.png -------------------------------------------------------------------------------- /scripts/generate_pseudo_label.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASET=$1 4 | 5 | # Step 1: Predict depth using UniDepth 6 | CUDA_VISIBLE_DEVICES=0 python third_party/UniDepth/run_unidepth.py --dataset $DATASET 7 | 8 | # Step 2: Segment novel objects using Grounded-SAM 9 | CUDA_VISIBLE_DEVICES=0 python third_party/Grounded-Segment-Anything/grounded_sam_detect.py --dataset $DATASET 10 | CUDA_VISIBLE_DEVICES=0 python third_party/Grounded-Segment-Anything/grounded_sam_detect_ground.py --dataset $DATASET 11 | 12 | # Step 3: Generate pseudo 3D bounding boxes 13 | python tools/generate_pseudo_bbox.py \ 14 | --config-file configs/Base_Omni3D_${DATASET}.yaml \ 15 | OUTPUT_DIR output/generate_pseudo_label/$DATASET \ 16 | 17 | # Step 4: Convert to COCO dataset format 18 | python tools/transform_to_coco.py --dataset_name $DATASET -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | DATASET=$1 2 | 3 | CUDA_VISIBLE_DEVICES=0 python tools/train_net.py \ 4 | --eval-only --config-file checkpoints/$DATASET/config.yaml --dist-url tcp://0.0.0.0:12345 --num-gpus 1 \ 5 | MODEL.WEIGHTS checkpoints/$DATASET/model_recent.pth \ 6 | OUTPUT_DIR output/test/$DATASET 7 | -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | DATASET=$1 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 python tools/train_net.py \ 4 | --config-file configs/Base_Omni3D_$DATASET.yaml --dist-url tcp://0.0.0.0:12345 --num-gpus 2 \ 5 | DATASETS.FOLDER_NAME "Omni3D_pl" \ 6 | OUTPUT_DIR output/training/$DATASET 7 | 8 | -------------------------------------------------------------------------------- /scripts/train_KITTI.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python tools/train_net.py \ 2 | --config-file configs/Base_Omni3D_KITTI.yaml --dist-url tcp://0.0.0.0:12345 --num-gpus 1 \ 3 | DATASETS.FOLDER_NAME "Omni3D_pl" \ 4 | OUTPUT_DIR output/training/KITTI 5 | 6 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # checkpoint 132 | *.pth 133 | outputs/ 134 | 135 | .idea/ 136 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/.gitmodules: -------------------------------------------------------------------------------- 1 | 2 | [submodule "grounded-sam-osx"] 3 | path = grounded-sam-osx 4 | url = https://github.com/linjing7/grounded-sam-osx.git 5 | [submodule "VISAM"] 6 | path = VISAM 7 | url = https://github.com/BingfengYan/VISAM 8 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - name: "Grounded-SAM Contributors" 5 | title: "Grounded-Segment-Anything" 6 | date-released: 2023-04-06 7 | url: "https://github.com/IDEA-Research/Grounded-Segment-Anything" 8 | license: Apache-2.0 9 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel 2 | 3 | # Arguments to build Docker Image using CUDA 4 | ARG USE_CUDA=0 5 | ARG TORCH_ARCH= 6 | 7 | ENV AM_I_DOCKER True 8 | ENV BUILD_WITH_CUDA "${USE_CUDA}" 9 | ENV TORCH_CUDA_ARCH_LIST "${TORCH_ARCH}" 10 | ENV CUDA_HOME /usr/local/cuda-11.6/ 11 | 12 | RUN mkdir -p /home/appuser/Grounded-Segment-Anything 13 | COPY . 
/home/appuser/Grounded-Segment-Anything/ 14 | 15 | RUN apt-get update && apt-get install --no-install-recommends wget ffmpeg=7:* \ 16 | libsm6=2:* libxext6=2:* git=1:* nano=2.* \ 17 | vim=2:* -y \ 18 | && apt-get clean && apt-get autoremove && rm -rf /var/lib/apt/lists/* 19 | 20 | WORKDIR /home/appuser/Grounded-Segment-Anything 21 | RUN python -m pip install --no-cache-dir -e segment_anything 22 | 23 | # When using build isolation, PyTorch with newer CUDA is installed and can't compile GroundingDINO 24 | RUN python -m pip install --no-cache-dir wheel 25 | RUN python -m pip install --no-cache-dir --no-build-isolation -e GroundingDINO 26 | 27 | WORKDIR /home/appuser 28 | RUN pip install --no-cache-dir diffusers[torch]==0.15.1 opencv-python==4.7.0.72 \ 29 | pycocotools==2.0.6 matplotlib==3.5.3 \ 30 | onnxruntime==1.14.1 onnx==1.13.1 ipykernel==6.16.2 scipy gradio openai 31 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/EdgeSAM/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | from typing import Type 12 | 13 | 14 | class MLPBlock(nn.Module): 15 | def __init__( 16 | self, 17 | embedding_dim: int, 18 | mlp_dim: int, 19 | act: Type[nn.Module] = nn.GELU, 20 | ) -> None: 21 | super().__init__() 22 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 23 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 24 | self.act = act() 25 | 26 | def forward(self, x: torch.Tensor) -> torch.Tensor: 27 | return self.lin2(self.act(self.lin1(x))) 28 | 29 | 30 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 31 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 32 | class LayerNorm2d(nn.Module): 33 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 34 | super().__init__() 35 | self.weight = nn.Parameter(torch.ones(num_channels)) 36 | self.bias = nn.Parameter(torch.zeros(num_channels)) 37 | self.eps = eps 38 | 39 | def forward(self, x: torch.Tensor) -> torch.Tensor: 40 | u = x.mean(1, keepdim=True) 41 | s = (x - u).pow(2).mean(1, keepdim=True) 42 | x = (x - u) / torch.sqrt(s + self.eps) 43 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 44 | return x 45 | 46 | 47 | def val2list(x: list or tuple or any, repeat_time=1) -> list: 48 | if isinstance(x, (list, tuple)): 49 | return list(x) 50 | return [x for _ in range(repeat_time)] 51 | 52 | 53 | def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple: 54 | x = val2list(x) 55 | 56 | # repeat elements if necessary 57 | if len(x) > 0: 58 | x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))] 59 | 60 | return tuple(x) 61 | 62 | 63 | def list_sum(x: list) -> any: 64 | return x[0] if len(x) == 1 else x[0] + list_sum(x[1:]) 65 | 66 | 67 | def resize( 68 | x: torch.Tensor, 69 | size: any or None = None, 70 | scale_factor=None, 71 | mode: str = "bicubic", 72 | align_corners: bool or None = False, 73 | ) -> torch.Tensor: 74 | if mode in ["bilinear", "bicubic"]: 75 | return F.interpolate( 76 | x, 77 | size=size, 78 | 
scale_factor=scale_factor, 79 | mode=mode, 80 | align_corners=align_corners, 81 | ) 82 | elif mode in ["nearest", "area"]: 83 | return F.interpolate(x, size=size, scale_factor=scale_factor, mode=mode) 84 | else: 85 | raise NotImplementedError(f"resize(mode={mode}) not implemented.") 86 | 87 | 88 | class UpSampleLayer(nn.Module): 89 | def __init__( 90 | self, 91 | mode="bicubic", 92 | size=None, 93 | factor=2, 94 | align_corners=False, 95 | ): 96 | super(UpSampleLayer, self).__init__() 97 | self.mode = mode 98 | self.size = val2list(size, 2) if size is not None else None 99 | self.factor = None if self.size is not None else factor 100 | self.align_corners = align_corners 101 | 102 | def forward(self, x: torch.Tensor) -> torch.Tensor: 103 | return resize(x, self.size, self.factor, self.mode, self.align_corners) 104 | 105 | 106 | class OpSequential(nn.Module): 107 | def __init__(self, op_list): 108 | super(OpSequential, self).__init__() 109 | valid_op_list = [] 110 | for op in op_list: 111 | if op is not None: 112 | valid_op_list.append(op) 113 | self.op_list = nn.ModuleList(valid_op_list) 114 | 115 | def forward(self, x: torch.Tensor) -> torch.Tensor: 116 | for op in self.op_list: 117 | x = op(x) 118 | return x -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/EdgeSAM/setup_edge_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from segment_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | from EdgeSAM.rep_vit import RepViT 13 | 14 | 15 | prompt_embed_dim = 256 16 | image_size = 1024 17 | vit_patch_size = 16 18 | image_embedding_size = image_size // vit_patch_size 19 | 20 | 21 | def build_edge_sam(checkpoint=None, upsample_mode="bicubic"): 22 | image_encoder = RepViT( 23 | arch="m1", 24 | img_size=image_size, 25 | upsample_mode=upsample_mode 26 | ) 27 | return _build_sam(image_encoder, checkpoint) 28 | 29 | 30 | sam_model_registry = { 31 | "default": build_edge_sam, 32 | "edge_sam": build_edge_sam, 33 | } 34 | 35 | def _build_sam_encoder( 36 | encoder_embed_dim, 37 | encoder_depth, 38 | encoder_num_heads, 39 | encoder_global_attn_indexes, 40 | ): 41 | image_encoder = ImageEncoderViT( 42 | depth=encoder_depth, 43 | embed_dim=encoder_embed_dim, 44 | img_size=image_size, 45 | mlp_ratio=4, 46 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 47 | num_heads=encoder_num_heads, 48 | patch_size=vit_patch_size, 49 | qkv_bias=True, 50 | use_rel_pos=True, 51 | global_attn_indexes=encoder_global_attn_indexes, 52 | window_size=14, 53 | out_chans=prompt_embed_dim, 54 | ) 55 | return image_encoder 56 | 57 | 58 | def _build_sam( 59 | image_encoder, 60 | checkpoint=None, 61 | ): 62 | sam = Sam( 63 | image_encoder=image_encoder, 64 | prompt_encoder=PromptEncoder( 65 | embed_dim=prompt_embed_dim, 66 | image_embedding_size=(image_embedding_size, image_embedding_size), 67 | input_image_size=(image_size, image_size), 68 | mask_in_chans=16, 69 | ), 70 | mask_decoder=MaskDecoder( 71 | num_multimask_outputs=3, 72 | transformer=TwoWayTransformer( 73 | depth=2, 74 | embedding_dim=prompt_embed_dim, 75 | mlp_dim=2048, 76 | num_heads=8, 77 | ), 78 | 
transformer_dim=prompt_embed_dim, 79 | iou_head_depth=3, 80 | iou_head_hidden_dim=256, 81 | ), 82 | pixel_mean=[123.675, 116.28, 103.53], 83 | pixel_std=[58.395, 57.12, 57.375], 84 | ) 85 | sam.eval() 86 | if checkpoint is not None: 87 | with open(checkpoint, "rb") as f: 88 | state_dict = torch.load(f, map_location="cpu") 89 | sam.load_state_dict(state_dict) 90 | return sam -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/example_light_hqsam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/example_light_hqsam.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/setup_light_hqsam.py: -------------------------------------------------------------------------------- 1 | from LightHQSAM.tiny_vit_sam import TinyViT 2 | from segment_anything.modeling import MaskDecoderHQ, PromptEncoder, Sam, TwoWayTransformer 3 | 4 | def setup_model(): 5 | prompt_embed_dim = 256 6 | image_size = 1024 7 | vit_patch_size = 16 8 | image_embedding_size = image_size // vit_patch_size 9 | mobile_sam = Sam( 10 | image_encoder=TinyViT(img_size=1024, in_chans=3, num_classes=1000, 11 | embed_dims=[64, 128, 160, 320], 12 | depths=[2, 2, 6, 2], 13 | num_heads=[2, 4, 5, 10], 14 | window_sizes=[7, 7, 14, 7], 15 | mlp_ratio=4., 16 | drop_rate=0., 17 | drop_path_rate=0.0, 18 | use_checkpoint=False, 19 | mbconv_expand_ratio=4.0, 20 | local_conv_size=3, 21 | layer_lr_decay=0.8 22 | ), 23 | prompt_encoder=PromptEncoder( 24 | embed_dim=prompt_embed_dim, 25 | image_embedding_size=(image_embedding_size, image_embedding_size), 26 | input_image_size=(image_size, image_size), 27 | mask_in_chans=16, 28 | ), 29 | mask_decoder=MaskDecoderHQ( 30 | num_multimask_outputs=3, 31 | transformer=TwoWayTransformer( 32 | depth=2, 33 | embedding_dim=prompt_embed_dim, 34 | mlp_dim=2048, 35 | num_heads=8, 36 | ), 37 | transformer_dim=prompt_embed_dim, 38 | iou_head_depth=3, 39 | iou_head_hidden_dim=256, 40 | vit_dim=160, 41 | ), 42 | pixel_mean=[123.675, 116.28, 103.53], 43 | pixel_std=[58.395, 57.12, 57.375], 44 | ) 45 | return mobile_sam -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/MobileSAM/setup_mobile_sam.py: -------------------------------------------------------------------------------- 1 | from MobileSAM.tiny_vit_sam import TinyViT 2 | from segment_anything.modeling import MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 3 | 4 | def setup_model(): 5 | prompt_embed_dim = 256 6 | image_size = 1024 7 | vit_patch_size = 16 8 | image_embedding_size = image_size // vit_patch_size 9 | mobile_sam = Sam( 10 | image_encoder=TinyViT(img_size=1024, in_chans=3, num_classes=1000, 11 | embed_dims=[64, 128, 160, 320], 12 | depths=[2, 2, 
6, 2], 13 | num_heads=[2, 4, 5, 10], 14 | window_sizes=[7, 7, 14, 7], 15 | mlp_ratio=4., 16 | drop_rate=0., 17 | drop_path_rate=0.0, 18 | use_checkpoint=False, 19 | mbconv_expand_ratio=4.0, 20 | local_conv_size=3, 21 | layer_lr_decay=0.8 22 | ), 23 | prompt_encoder=PromptEncoder( 24 | embed_dim=prompt_embed_dim, 25 | image_embedding_size=(image_embedding_size, image_embedding_size), 26 | input_image_size=(image_size, image_size), 27 | mask_in_chans=16, 28 | ), 29 | mask_decoder=MaskDecoder( 30 | num_multimask_outputs=3, 31 | transformer=TwoWayTransformer( 32 | depth=2, 33 | embedding_dim=prompt_embed_dim, 34 | mlp_dim=2048, 35 | num_heads=8, 36 | ), 37 | transformer_dim=prompt_embed_dim, 38 | iou_head_depth=3, 39 | iou_head_hidden_dim=256, 40 | ), 41 | pixel_mean=[123.675, 116.28, 103.53], 42 | pixel_std=[58.395, 57.12, 57.375], 43 | ) 44 | return mobile_sam -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/RepViTSAM/setup_repvit_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | from functools import partial 9 | from segment_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 10 | from RepViTSAM import repvit 11 | from timm.models import create_model 12 | 13 | def build_sam_repvit(checkpoint=None): 14 | prompt_embed_dim = 256 15 | image_size = 1024 16 | vit_patch_size = 16 17 | image_embedding_size = image_size // vit_patch_size 18 | repvit_sam = Sam( 19 | image_encoder=create_model('repvit'), 20 | prompt_encoder=PromptEncoder( 21 | embed_dim=prompt_embed_dim, 22 | image_embedding_size=(image_embedding_size, image_embedding_size), 23 | input_image_size=(image_size, image_size), 24 | mask_in_chans=16, 25 | ), 26 | mask_decoder=MaskDecoder( 27 | num_multimask_outputs=3, 28 | transformer=TwoWayTransformer( 29 | depth=2, 30 | embedding_dim=prompt_embed_dim, 31 | mlp_dim=2048, 32 | num_heads=8, 33 | ), 34 | transformer_dim=prompt_embed_dim, 35 | iou_head_depth=3, 36 | iou_head_hidden_dim=256, 37 | ), 38 | pixel_mean=[123.675, 116.28, 103.53], 39 | pixel_std=[58.395, 57.12, 57.375], 40 | ) 41 | 42 | repvit_sam.eval() 43 | if checkpoint is not None: 44 | with open(checkpoint, "rb") as f: 45 | state_dict = torch.load(f) 46 | repvit_sam.load_state_dict(state_dict) 47 | return repvit_sam 48 | 49 | from functools import partial 50 | 51 | sam_model_registry = { 52 | "repvit": partial(build_sam_repvit), 53 | } 54 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/grounded_edge_sam.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import supervision as sv 4 | 5 | import torch 6 | import torchvision 7 | 8 | from groundingdino.util.inference import Model 9 | from segment_anything import SamPredictor 10 | from EdgeSAM.setup_edge_sam import build_edge_sam 11 | 12 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | 14 | # GroundingDINO config and checkpoint 15 | GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py" 16 | GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth" 17 | 18 
| # Building GroundingDINO inference model 19 | grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH) 20 | 21 | # Building MobileSAM predictor 22 | EdgeSAM_CHECKPOINT_PATH = "./EfficientSAM/edge_sam_3x.pth" 23 | edge_sam = build_edge_sam(checkpoint=EdgeSAM_CHECKPOINT_PATH) 24 | edge_sam.to(device=DEVICE) 25 | 26 | sam_predictor = SamPredictor(edge_sam) 27 | 28 | 29 | # Predict classes and hyper-param for GroundingDINO 30 | SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png" 31 | CLASSES = ["bench"] 32 | BOX_THRESHOLD = 0.25 33 | TEXT_THRESHOLD = 0.25 34 | NMS_THRESHOLD = 0.8 35 | 36 | 37 | # load image 38 | image = cv2.imread(SOURCE_IMAGE_PATH) 39 | 40 | # detect objects 41 | detections = grounding_dino_model.predict_with_classes( 42 | image=image, 43 | classes=CLASSES, 44 | box_threshold=BOX_THRESHOLD, 45 | text_threshold=TEXT_THRESHOLD 46 | ) 47 | 48 | # annotate image with detections 49 | box_annotator = sv.BoxAnnotator() 50 | labels = [ 51 | f"{CLASSES[class_id]} {confidence:0.2f}" 52 | for _, _, confidence, class_id, _, _ 53 | in detections] 54 | annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels) 55 | 56 | # save the annotated grounding dino image 57 | cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame) 58 | 59 | 60 | # NMS post process 61 | print(f"Before NMS: {len(detections.xyxy)} boxes") 62 | nms_idx = torchvision.ops.nms( 63 | torch.from_numpy(detections.xyxy), 64 | torch.from_numpy(detections.confidence), 65 | NMS_THRESHOLD 66 | ).numpy().tolist() 67 | 68 | detections.xyxy = detections.xyxy[nms_idx] 69 | detections.confidence = detections.confidence[nms_idx] 70 | detections.class_id = detections.class_id[nms_idx] 71 | 72 | print(f"After NMS: {len(detections.xyxy)} boxes") 73 | 74 | # Prompting SAM with detected boxes 75 | def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray: 76 | sam_predictor.set_image(image) 77 | result_masks = [] 78 | for box in xyxy: 79 | masks, scores, logits = sam_predictor.predict( 80 | box=box, 81 | multimask_output=False, 82 | hq_token_only=True, 83 | ) 84 | index = np.argmax(scores) 85 | result_masks.append(masks[index]) 86 | return np.array(result_masks) 87 | 88 | 89 | # convert detections to masks 90 | detections.mask = segment( 91 | sam_predictor=sam_predictor, 92 | image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB), 93 | xyxy=detections.xyxy 94 | ) 95 | 96 | # annotate image with detections 97 | box_annotator = sv.BoxAnnotator() 98 | mask_annotator = sv.MaskAnnotator() 99 | labels = [ 100 | f"{CLASSES[class_id]} {confidence:0.2f}" 101 | for _, _, confidence, class_id, _, _ 102 | in detections] 103 | annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections) 104 | annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels) 105 | 106 | # save the annotated grounded-sam image 107 | cv2.imwrite("EfficientSAM/grounded_edge_sam_annotated_image.jpg", annotated_image) 108 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/grounded_light_hqsam.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import supervision as sv 4 | 5 | import torch 6 | import torchvision 7 | 8 | from groundingdino.util.inference import Model 9 | from 
segment_anything import SamPredictor 10 | from LightHQSAM.setup_light_hqsam import setup_model 11 | 12 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | 14 | # GroundingDINO config and checkpoint 15 | GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py" 16 | GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth" 17 | 18 | # Building GroundingDINO inference model 19 | grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH) 20 | 21 | # Building MobileSAM predictor 22 | HQSAM_CHECKPOINT_PATH = "./EfficientSAM/sam_hq_vit_tiny.pth" 23 | checkpoint = torch.load(HQSAM_CHECKPOINT_PATH) 24 | light_hqsam = setup_model() 25 | light_hqsam.load_state_dict(checkpoint, strict=True) 26 | light_hqsam.to(device=DEVICE) 27 | 28 | sam_predictor = SamPredictor(light_hqsam) 29 | 30 | 31 | # Predict classes and hyper-param for GroundingDINO 32 | SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png" 33 | CLASSES = ["bench"] 34 | BOX_THRESHOLD = 0.25 35 | TEXT_THRESHOLD = 0.25 36 | NMS_THRESHOLD = 0.8 37 | 38 | 39 | # load image 40 | image = cv2.imread(SOURCE_IMAGE_PATH) 41 | 42 | # detect objects 43 | detections = grounding_dino_model.predict_with_classes( 44 | image=image, 45 | classes=CLASSES, 46 | box_threshold=BOX_THRESHOLD, 47 | text_threshold=TEXT_THRESHOLD 48 | ) 49 | 50 | # annotate image with detections 51 | box_annotator = sv.BoxAnnotator() 52 | labels = [ 53 | f"{CLASSES[class_id]} {confidence:0.2f}" 54 | for _, _, confidence, class_id, _, _ 55 | in detections] 56 | annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels) 57 | 58 | # save the annotated grounding dino image 59 | cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame) 60 | 61 | 62 | # NMS post process 63 | print(f"Before NMS: {len(detections.xyxy)} boxes") 64 | nms_idx = torchvision.ops.nms( 65 | torch.from_numpy(detections.xyxy), 66 | torch.from_numpy(detections.confidence), 67 | NMS_THRESHOLD 68 | ).numpy().tolist() 69 | 70 | detections.xyxy = detections.xyxy[nms_idx] 71 | detections.confidence = detections.confidence[nms_idx] 72 | detections.class_id = detections.class_id[nms_idx] 73 | 74 | print(f"After NMS: {len(detections.xyxy)} boxes") 75 | 76 | # Prompting SAM with detected boxes 77 | def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray: 78 | sam_predictor.set_image(image) 79 | result_masks = [] 80 | for box in xyxy: 81 | masks, scores, logits = sam_predictor.predict( 82 | box=box, 83 | multimask_output=False, 84 | hq_token_only=True, 85 | ) 86 | index = np.argmax(scores) 87 | result_masks.append(masks[index]) 88 | return np.array(result_masks) 89 | 90 | 91 | # convert detections to masks 92 | detections.mask = segment( 93 | sam_predictor=sam_predictor, 94 | image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB), 95 | xyxy=detections.xyxy 96 | ) 97 | 98 | # annotate image with detections 99 | box_annotator = sv.BoxAnnotator() 100 | mask_annotator = sv.MaskAnnotator() 101 | labels = [ 102 | f"{CLASSES[class_id]} {confidence:0.2f}" 103 | for _, _, confidence, class_id, _, _ 104 | in detections] 105 | annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections) 106 | annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels) 107 | 108 | # save the annotated grounded-sam image 109 | 
cv2.imwrite("EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg", annotated_image) 110 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/grounded_repvit_sam.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import supervision as sv 4 | 5 | import torch 6 | import torchvision 7 | 8 | from groundingdino.util.inference import Model 9 | from segment_anything import SamPredictor 10 | from RepViTSAM.setup_repvit_sam import build_sam_repvit 11 | 12 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | 14 | # GroundingDINO config and checkpoint 15 | GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py" 16 | GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth" 17 | 18 | # Building GroundingDINO inference model 19 | grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH) 20 | 21 | # Building MobileSAM predictor 22 | RepViTSAM_CHECKPOINT_PATH = "./EfficientSAM/repvit_sam.pt" 23 | repvit_sam = build_sam_repvit(checkpoint=RepViTSAM_CHECKPOINT_PATH) 24 | repvit_sam.to(device=DEVICE) 25 | 26 | sam_predictor = SamPredictor(repvit_sam) 27 | 28 | 29 | # Predict classes and hyper-param for GroundingDINO 30 | SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png" 31 | CLASSES = ["bench"] 32 | BOX_THRESHOLD = 0.25 33 | TEXT_THRESHOLD = 0.25 34 | NMS_THRESHOLD = 0.8 35 | 36 | 37 | # load image 38 | image = cv2.imread(SOURCE_IMAGE_PATH) 39 | 40 | # detect objects 41 | detections = grounding_dino_model.predict_with_classes( 42 | image=image, 43 | classes=CLASSES, 44 | box_threshold=BOX_THRESHOLD, 45 | text_threshold=TEXT_THRESHOLD 46 | ) 47 | 48 | # annotate image with detections 49 | box_annotator = sv.BoxAnnotator() 50 | labels = [ 51 | f"{CLASSES[class_id]} {confidence:0.2f}" 52 | for _, _, confidence, class_id, _, _ 53 | in detections] 54 | annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels) 55 | 56 | # save the annotated grounding dino image 57 | cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame) 58 | 59 | 60 | # NMS post process 61 | print(f"Before NMS: {len(detections.xyxy)} boxes") 62 | nms_idx = torchvision.ops.nms( 63 | torch.from_numpy(detections.xyxy), 64 | torch.from_numpy(detections.confidence), 65 | NMS_THRESHOLD 66 | ).numpy().tolist() 67 | 68 | detections.xyxy = detections.xyxy[nms_idx] 69 | detections.confidence = detections.confidence[nms_idx] 70 | detections.class_id = detections.class_id[nms_idx] 71 | 72 | print(f"After NMS: {len(detections.xyxy)} boxes") 73 | 74 | # Prompting SAM with detected boxes 75 | def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray: 76 | sam_predictor.set_image(image) 77 | result_masks = [] 78 | for box in xyxy: 79 | masks, scores, logits = sam_predictor.predict( 80 | box=box, 81 | multimask_output=False, 82 | hq_token_only=True, 83 | ) 84 | index = np.argmax(scores) 85 | result_masks.append(masks[index]) 86 | return np.array(result_masks) 87 | 88 | 89 | # convert detections to masks 90 | detections.mask = segment( 91 | sam_predictor=sam_predictor, 92 | image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB), 93 | xyxy=detections.xyxy 94 | ) 95 | 96 | # annotate image with detections 97 | box_annotator = sv.BoxAnnotator() 
98 | mask_annotator = sv.MaskAnnotator() 99 | labels = [ 100 | f"{CLASSES[class_id]} {confidence:0.2f}" 101 | for _, _, confidence, class_id, _, _ 102 | in detections] 103 | annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections) 104 | annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels) 105 | 106 | # save the annotated grounded-sam image 107 | cv2.imwrite("EfficientSAM/grounded_repvit_sam_annotated_image.jpg", annotated_image) 108 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/COCO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/COCO.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/GD_GLIGEN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/GD_GLIGEN.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/GD_SD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/GD_SD.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/ODinW.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/ODinW.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/arch.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/cats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/cats.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/hero_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/hero_figure.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/__init__.py -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinB.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_B_384_22k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_T_224_1k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/datasets/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/datasets/__init__.py -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Conditional DETR 8 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 9 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 10 | # ------------------------------------------------------------------------ 11 | # Copied from DETR (https://github.com/facebookresearch/detr) 12 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 13 | # ------------------------------------------------------------------------ 14 | 15 | from .groundingdino import build_groundingdino 16 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone 2 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | namespace groundingdino { 20 | 21 | at::Tensor 22 | ms_deform_attn_forward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const int im2col_step) 29 | { 30 | if (value.type().is_cuda()) 31 | { 32 | #ifdef WITH_CUDA 33 | return ms_deform_attn_cuda_forward( 34 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 35 | #else 36 | AT_ERROR("Not compiled with GPU support"); 37 | #endif 38 | } 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | std::vector 43 | ms_deform_attn_backward( 44 | const at::Tensor &value, 45 | const at::Tensor &spatial_shapes, 46 | const at::Tensor &level_start_index, 47 | const at::Tensor &sampling_loc, 48 | const at::Tensor &attn_weight, 49 | const at::Tensor &grad_output, 50 | const int im2col_step) 51 | { 52 | if (value.type().is_cuda()) 53 | { 54 | #ifdef WITH_CUDA 55 | return ms_deform_attn_cuda_backward( 56 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 57 | #else 58 | AT_ERROR("Not compiled with GPU support"); 59 | #endif 60 | } 61 | AT_ERROR("Not implemented on the CPU"); 62 | } 63 | 64 | } // namespace groundingdino -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | namespace groundingdino { 17 | 18 | at::Tensor 19 | ms_deform_attn_cpu_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step) 26 | { 27 | AT_ERROR("Not implement on cpu"); 28 | } 29 | 30 | std::vector 31 | ms_deform_attn_cpu_backward( 32 | const at::Tensor &value, 33 | const at::Tensor &spatial_shapes, 34 | const at::Tensor &level_start_index, 35 | const at::Tensor &sampling_loc, 36 | const at::Tensor &attn_weight, 37 | const at::Tensor &grad_output, 38 | const int im2col_step) 39 | { 40 | AT_ERROR("Not implement on cpu"); 41 | } 42 | 43 | } // namespace groundingdino 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | namespace groundingdino { 15 | 16 | at::Tensor 17 | ms_deform_attn_cpu_forward( 18 | const at::Tensor &value, 19 | const at::Tensor &spatial_shapes, 20 | const at::Tensor &level_start_index, 21 | const at::Tensor &sampling_loc, 22 | const at::Tensor &attn_weight, 23 | const int im2col_step); 24 | 25 | std::vector 26 | ms_deform_attn_cpu_backward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const at::Tensor &grad_output, 33 | const int im2col_step); 34 | 35 | } // namespace groundingdino 36 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | namespace groundingdino { 15 | 16 | at::Tensor ms_deform_attn_cuda_forward( 17 | const at::Tensor &value, 18 | const at::Tensor &spatial_shapes, 19 | const at::Tensor &level_start_index, 20 | const at::Tensor &sampling_loc, 21 | const at::Tensor &attn_weight, 22 | const int im2col_step); 23 | 24 | std::vector ms_deform_attn_cuda_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | } // namespace groundingdino -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/cuda_version.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace groundingdino { 4 | int get_cudart_version() { 5 | return CUDART_VERSION; 6 | } 7 | } // namespace groundingdino 8 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | #include "MsDeformAttn/ms_deform_attn.h" 4 | 5 | namespace groundingdino { 6 | 7 | #ifdef WITH_CUDA 8 | extern int get_cudart_version(); 9 | #endif 10 | 11 | std::string get_cuda_version() { 12 | #ifdef WITH_CUDA 13 | std::ostringstream oss; 14 | 15 | // copied from 16 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 17 | auto printCudaStyleVersion = [&](int v) { 18 | oss << (v / 1000) << "." << (v / 10 % 100); 19 | if (v % 10 != 0) { 20 | oss << "." << (v % 10); 21 | } 22 | }; 23 | printCudaStyleVersion(get_cudart_version()); 24 | return oss.str(); 25 | #else 26 | return std::string("not available"); 27 | #endif 28 | } 29 | 30 | // similar to 31 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp 32 | std::string get_compiler_version() { 33 | std::ostringstream ss; 34 | #if defined(__GNUC__) 35 | #ifndef __clang__ 36 | { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } 37 | #endif 38 | #endif 39 | 40 | #if defined(__clang_major__) 41 | { 42 | ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
43 | << __clang_patchlevel__; 44 | } 45 | #endif 46 | 47 | #if defined(_MSC_VER) 48 | { ss << "MSVC " << _MSC_FULL_VER; } 49 | #endif 50 | return ss.str(); 51 | } 52 | 53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 54 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 55 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 56 | } 57 | 58 | } // namespace groundingdino -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | from .GroundingDINO import build_groundingdino 9 | 10 | 11 | def build_model(args): 12 | # we use register to maintain models from catdet6 on. 13 | from .registry import MODULE_BUILD_FUNCS 14 | 15 | assert args.modelname in MODULE_BUILD_FUNCS._module_dict 16 | build_func = MODULE_BUILD_FUNCS.get(args.modelname) 17 | model = build_func(args) 18 | return model 19 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/registry.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # -*- coding: utf-8 -*- 8 | # @Author: Yihao Chen 9 | # @Date: 2021-08-16 16:03:17 10 | # @Last Modified by: Shilong Liu 11 | # @Last Modified time: 2022-01-23 15:26 12 | # modified from mmcv 13 | 14 | import inspect 15 | from functools import partial 16 | 17 | 18 | class Registry(object): 19 | def __init__(self, name): 20 | self._name = name 21 | self._module_dict = dict() 22 | 23 | def __repr__(self): 24 | format_str = self.__class__.__name__ + "(name={}, items={})".format( 25 | self._name, list(self._module_dict.keys()) 26 | ) 27 | return format_str 28 | 29 | def __len__(self): 30 | return len(self._module_dict) 31 | 32 | @property 33 | def name(self): 34 | return self._name 35 | 36 | @property 37 | def module_dict(self): 38 | return self._module_dict 39 | 40 | def get(self, key): 41 | return self._module_dict.get(key, None) 42 | 43 | def registe_with_name(self, module_name=None, force=False): 44 | return partial(self.register, module_name=module_name, force=force) 45 | 46 | def register(self, module_build_function, module_name=None, force=False): 47 | """Register a module build function. 48 | Args: 49 | module (:obj:`nn.Module`): Module to be registered. 
50 | """ 51 | if not inspect.isfunction(module_build_function): 52 | raise TypeError( 53 | "module_build_function must be a function, but got {}".format( 54 | type(module_build_function) 55 | ) 56 | ) 57 | if module_name is None: 58 | module_name = module_build_function.__name__ 59 | if not force and module_name in self._module_dict: 60 | raise KeyError("{} is already registered in {}".format(module_name, self.name)) 61 | self._module_dict[module_name] = module_build_function 62 | 63 | return module_build_function 64 | 65 | 66 | MODULE_BUILD_FUNCS = Registry("model build functions") 67 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/get_tokenlizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast 2 | 3 | 4 | def get_tokenlizer(text_encoder_type, bert_base_uncased_path): 5 | if not isinstance(text_encoder_type, str): 6 | # print("text_encoder_type is not a str") 7 | if hasattr(text_encoder_type, "text_encoder_type"): 8 | text_encoder_type = text_encoder_type.text_encoder_type 9 | elif text_encoder_type.get("text_encoder_type", False): 10 | text_encoder_type = text_encoder_type.get("text_encoder_type") 11 | else: 12 | raise ValueError( 13 | "Unknown type of text_encoder_type: {}".format(type(text_encoder_type)) 14 | ) 15 | 16 | # solve huggingface connect issue 17 | if is_bert_model_use_local_path(bert_base_uncased_path) and text_encoder_type == "bert-base-uncased": 18 | print("use local bert model path: {}".format(bert_base_uncased_path)) 19 | return AutoTokenizer.from_pretrained(bert_base_uncased_path) 20 | 21 | print("final text_encoder_type: {}".format(text_encoder_type)) 22 | 23 | tokenizer = AutoTokenizer.from_pretrained(text_encoder_type) 24 | return tokenizer 25 | 26 | 27 | def get_pretrained_language_model(text_encoder_type, bert_base_uncased_path): 28 | if text_encoder_type == "bert-base-uncased": 29 | if is_bert_model_use_local_path(bert_base_uncased_path): 30 | return BertModel.from_pretrained(bert_base_uncased_path) 31 | return BertModel.from_pretrained(text_encoder_type) 32 | if text_encoder_type == "roberta-base": 33 | return RobertaModel.from_pretrained(text_encoder_type) 34 | raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type)) 35 | 36 | def is_bert_model_use_local_path(bert_base_uncased_path): 37 | return bert_base_uncased_path is not None and len(bert_base_uncased_path) > 0 38 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import functools 3 | import logging 4 | import os 5 | import sys 6 | 7 | from termcolor import colored 8 | 9 | 10 | class _ColorfulFormatter(logging.Formatter): 11 | def __init__(self, *args, **kwargs): 12 | self._root_name = kwargs.pop("root_name") + "." 
13 | self._abbrev_name = kwargs.pop("abbrev_name", "") 14 | if len(self._abbrev_name): 15 | self._abbrev_name = self._abbrev_name + "." 16 | super(_ColorfulFormatter, self).__init__(*args, **kwargs) 17 | 18 | def formatMessage(self, record): 19 | record.name = record.name.replace(self._root_name, self._abbrev_name) 20 | log = super(_ColorfulFormatter, self).formatMessage(record) 21 | if record.levelno == logging.WARNING: 22 | prefix = colored("WARNING", "red", attrs=["blink"]) 23 | elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: 24 | prefix = colored("ERROR", "red", attrs=["blink", "underline"]) 25 | else: 26 | return log 27 | return prefix + " " + log 28 | 29 | 30 | # so that calling setup_logger multiple times won't add many handlers 31 | @functools.lru_cache() 32 | def setup_logger(output=None, distributed_rank=0, *, color=True, name="imagenet", abbrev_name=None): 33 | """ 34 | Initialize the detectron2 logger and set its verbosity level to "INFO". 35 | 36 | Args: 37 | output (str): a file name or a directory to save log. If None, will not save log file. 38 | If ends with ".txt" or ".log", assumed to be a file name. 39 | Otherwise, logs will be saved to `output/log.txt`. 40 | name (str): the root module name of this logger 41 | 42 | Returns: 43 | logging.Logger: a logger 44 | """ 45 | logger = logging.getLogger(name) 46 | logger.setLevel(logging.DEBUG) 47 | logger.propagate = False 48 | 49 | if abbrev_name is None: 50 | abbrev_name = name 51 | 52 | plain_formatter = logging.Formatter( 53 | "[%(asctime)s.%(msecs)03d]: %(message)s", datefmt="%m/%d %H:%M:%S" 54 | ) 55 | # stdout logging: master only 56 | if distributed_rank == 0: 57 | ch = logging.StreamHandler(stream=sys.stdout) 58 | ch.setLevel(logging.DEBUG) 59 | if color: 60 | formatter = _ColorfulFormatter( 61 | colored("[%(asctime)s.%(msecs)03d]: ", "green") + "%(message)s", 62 | datefmt="%m/%d %H:%M:%S", 63 | root_name=name, 64 | abbrev_name=str(abbrev_name), 65 | ) 66 | else: 67 | formatter = plain_formatter 68 | ch.setFormatter(formatter) 69 | logger.addHandler(ch) 70 | 71 | # file logging: all workers 72 | if output is not None: 73 | if output.endswith(".txt") or output.endswith(".log"): 74 | filename = output 75 | else: 76 | filename = os.path.join(output, "log.txt") 77 | if distributed_rank > 0: 78 | filename = filename + f".rank{distributed_rank}" 79 | os.makedirs(os.path.dirname(filename), exist_ok=True) 80 | 81 | fh = logging.StreamHandler(_cached_log_stream(filename)) 82 | fh.setLevel(logging.DEBUG) 83 | fh.setFormatter(plain_formatter) 84 | logger.addHandler(fh) 85 | 86 | return logger 87 | 88 | 89 | # cache the opened file object, so that different calls to `setup_logger` 90 | # with the same file name can safely write to the same file. 
91 | @functools.lru_cache(maxsize=None) 92 | def _cached_log_stream(filename): 93 | return open(filename, "a") 94 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/time_counter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | 5 | class TimeCounter: 6 | def __init__(self) -> None: 7 | pass 8 | 9 | def clear(self): 10 | self.timedict = {} 11 | self.basetime = time.perf_counter() 12 | 13 | def timeit(self, name): 14 | nowtime = time.perf_counter() - self.basetime 15 | self.timedict[name] = nowtime 16 | self.basetime = time.perf_counter() 17 | 18 | 19 | class TimeHolder: 20 | def __init__(self) -> None: 21 | self.timedict = {} 22 | 23 | def update(self, _timedict: dict): 24 | for k, v in _timedict.items(): 25 | if k not in self.timedict: 26 | self.timedict[k] = AverageMeter(name=k, val_only=True) 27 | self.timedict[k].update(val=v) 28 | 29 | def final_res(self): 30 | return {k: v.avg for k, v in self.timedict.items()} 31 | 32 | def __str__(self): 33 | return json.dumps(self.final_res(), indent=2) 34 | 35 | 36 | class AverageMeter(object): 37 | """Computes and stores the average and current value""" 38 | 39 | def __init__(self, name, fmt=":f", val_only=False): 40 | self.name = name 41 | self.fmt = fmt 42 | self.val_only = val_only 43 | self.reset() 44 | 45 | def reset(self): 46 | self.val = 0 47 | self.avg = 0 48 | self.sum = 0 49 | self.count = 0 50 | 51 | def update(self, val, n=1): 52 | self.val = val 53 | self.sum += val * n 54 | self.count += n 55 | self.avg = self.sum / self.count 56 | 57 | def __str__(self): 58 | if self.val_only: 59 | fmtstr = "{name} {val" + self.fmt + "}" 60 | else: 61 | fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" 62 | return fmtstr.format(**self.__dict__) 63 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/vl_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import List 4 | 5 | import torch 6 | 7 | 8 | def create_positive_map_from_span(tokenized, token_span, max_text_len=256): 9 | """construct a map such that positive_map[i,j] = True iff box i is associated to token j 10 | Input: 11 | - tokenized: 12 | - input_ids: Tensor[1, ntokens] 13 | - attention_mask: Tensor[1, ntokens] 14 | - token_span: list with length num_boxes. 
15 | - each item: [start_idx, end_idx] 16 | """ 17 | positive_map = torch.zeros((len(token_span), max_text_len), dtype=torch.float) 18 | for j, tok_list in enumerate(token_span): 19 | for (beg, end) in tok_list: 20 | beg_pos = tokenized.char_to_token(beg) 21 | end_pos = tokenized.char_to_token(end - 1) 22 | if beg_pos is None: 23 | try: 24 | beg_pos = tokenized.char_to_token(beg + 1) 25 | if beg_pos is None: 26 | beg_pos = tokenized.char_to_token(beg + 2) 27 | except: 28 | beg_pos = None 29 | if end_pos is None: 30 | try: 31 | end_pos = tokenized.char_to_token(end - 2) 32 | if end_pos is None: 33 | end_pos = tokenized.char_to_token(end - 3) 34 | except: 35 | end_pos = None 36 | if beg_pos is None or end_pos is None: 37 | continue 38 | 39 | assert beg_pos is not None and end_pos is not None 40 | if os.environ.get("SHILONG_DEBUG_ONLY_ONE_POS", None) == "TRUE": 41 | positive_map[j, beg_pos] = 1 42 | break 43 | else: 44 | positive_map[j, beg_pos : end_pos + 1].fill_(1) 45 | 46 | return positive_map / (positive_map.sum(-1)[:, None] + 1e-6) 47 | 48 | 49 | def build_captions_and_token_span(cat_list, force_lowercase): 50 | """ 51 | Return: 52 | captions: str 53 | cat2tokenspan: dict 54 | { 55 | 'dog': [[0, 2]], 56 | ... 57 | } 58 | """ 59 | 60 | cat2tokenspan = {} 61 | captions = "" 62 | for catname in cat_list: 63 | class_name = catname 64 | if force_lowercase: 65 | class_name = class_name.lower() 66 | if "/" in class_name: 67 | class_name_list: List = class_name.strip().split("/") 68 | class_name_list.append(class_name) 69 | class_name: str = random.choice(class_name_list) 70 | 71 | tokens_positive_i = [] 72 | subnamelist = [i.strip() for i in class_name.strip().split(" ")] 73 | for subname in subnamelist: 74 | if len(subname) == 0: 75 | continue 76 | if len(captions) > 0: 77 | captions = captions + " " 78 | strat_idx = len(captions) 79 | end_idx = strat_idx + len(subname) 80 | tokens_positive_i.append([strat_idx, end_idx]) 81 | captions = captions + subname 82 | 83 | if len(tokens_positive_i) > 0: 84 | captions = captions + " ." 
85 | cat2tokenspan[class_name] = tokens_positive_i 86 | 87 | return captions, cat2tokenspan 88 | 89 | 90 | def build_id2posspan_and_caption(category_dict: dict): 91 | """Build id2pos_span and caption from category_dict 92 | 93 | Args: 94 | category_dict (dict): category_dict 95 | """ 96 | cat_list = [item["name"].lower() for item in category_dict] 97 | id2catname = {item["id"]: item["name"].lower() for item in category_dict} 98 | caption, cat2posspan = build_captions_and_token_span(cat_list, force_lowercase=True) 99 | id2posspan = {catid: cat2posspan[catname] for catid, catname in id2catname.items()} 100 | return id2posspan, caption 101 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "torch", 5 | "wheel", 6 | "torch" 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | transformers 4 | addict 5 | yapf 6 | timm 7 | numpy 8 | opencv-python 9 | supervision 10 | pycocotools -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/Makefile: -------------------------------------------------------------------------------- 1 | # Get version of CUDA and enable it for compilation if CUDA > 11.0 2 | # This solves https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/53 3 | # and https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/84 4 | # when running in Docker 5 | # Check if nvcc is installed 6 | NVCC := $(shell which nvcc) 7 | ifeq ($(NVCC),) 8 | # NVCC not found 9 | USE_CUDA := 0 10 | NVCC_VERSION := "not installed" 11 | else 12 | NVCC_VERSION := $(shell nvcc --version | grep -oP 'release \K[0-9.]+') 13 | USE_CUDA := $(shell echo "$(NVCC_VERSION) > 11" | bc -l) 14 | endif 15 | 16 | # Add the list of supported ARCHs 17 | ifeq ($(USE_CUDA), 1) 18 | TORCH_CUDA_ARCH_LIST := "3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" 19 | BUILD_MESSAGE := "I will try to build the image with CUDA support" 20 | else 21 | TORCH_CUDA_ARCH_LIST := 22 | BUILD_MESSAGE := "CUDA $(NVCC_VERSION) is not supported" 23 | endif 24 | 25 | 26 | build-image: 27 | @echo $(BUILD_MESSAGE) 28 | docker build --build-arg USE_CUDA=$(USE_CUDA) \ 29 | --build-arg TORCH_ARCH=$(TORCH_CUDA_ARCH_LIST) \ 30 | -t gsa:v0 . 
31 | run: 32 | ifeq (,$(wildcard ./sam_vit_h_4b8939.pth)) 33 | wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth 34 | endif 35 | ifeq (,$(wildcard ./groundingdino_swint_ogc.pth)) 36 | wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth 37 | endif 38 | docker run --gpus all -it --rm --net=host --privileged \ 39 | -v /tmp/.X11-unix:/tmp/.X11-unix \ 40 | -v "${PWD}":/home/appuser/Grounded-Segment-Anything \ 41 | -e DISPLAY=$DISPLAY \ 42 | --name=gsa \ 43 | --ipc=host -it gsa:v0 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | cuda: "11.7" 7 | system_packages: 8 | - "libgl1-mesa-glx" 9 | - "libglib2.0-0" 10 | python_version: "3.10" 11 | python_packages: 12 | - "timm==0.9.2" 13 | - "transformers==4.30.2" 14 | - "fairscale==0.4.13" 15 | - "pycocoevalcap==1.2" 16 | - "torch==1.13.0" 17 | - "torchvision==0.14.0" 18 | - "Pillow==9.5.0" 19 | - "scipy==1.10.1" 20 | - "opencv-python==4.7.0.72" 21 | - "addict==2.4.0" 22 | - "yapf==0.40.0" 23 | - "supervision==0.10.0" 24 | - git+https://github.com/openai/CLIP.git 25 | - ipython 26 | 27 | predict: "predict.py:Predictor" 28 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/DeepFloyd/dream.py: -------------------------------------------------------------------------------- 1 | from deepfloyd_if.modules import IFStageI, IFStageII, StableStageIII 2 | from deepfloyd_if.modules.t5 import T5Embedder 3 | from deepfloyd_if.pipelines import dream 4 | 5 | # Run locally 6 | device = 'cuda' 7 | cache_dir = "/path/to/storage/IF" 8 | if_I = IFStageI('IF-I-L-v1.0', device=device, cache_dir=cache_dir) 9 | if_II = IFStageII('IF-II-L-v1.0', device=device, cache_dir=cache_dir) 10 | if_III = StableStageIII('stable-diffusion-x4-upscaler', device=device, cache_dir=cache_dir) 11 | t5 = T5Embedder(device=device, cache_dir=cache_dir) 12 | 13 | prompt = "In the heart of the wilderness, an enchanting forest reveals itself. \ 14 | Towering trees, their trunks sturdy and thick, reach skyward, their leafy canopies \ 15 | forming a natural cathedral. Verdant moss clings to bark, and tendrils of ivy climb ambitiously towards the sun-dappled treetops. \ 16 | The forest floor is a tapestry of fallen leaves, sprinkled with delicate wildflowers. The soft chatter of wildlife resonates, while a nearby brook babbles, its clear waters winking in the dappled light. \ 17 | Sunrays filter through the foliage, casting an emerald glow that dances on the woodland floor. Amidst the tranquility, the forest teems with life, whispering ancient secrets on the breeze." 
18 | count = 1 19 | 20 | result = dream( 21 | t5=t5, if_I=if_I, if_II=if_II, if_III=if_III, 22 | prompt=[prompt]*count, 23 | seed=42, 24 | if_I_kwargs={ 25 | "guidance_scale": 7.0, 26 | "sample_timestep_respacing": "smart100", 27 | }, 28 | if_II_kwargs={ 29 | "guidance_scale": 4.0, 30 | "sample_timestep_respacing": "smart50", 31 | }, 32 | if_III_kwargs={ 33 | "guidance_scale": 9.0, 34 | "noise_level": 20, 35 | "sample_timestep_respacing": "75", 36 | }, 37 | ) 38 | result['III'][0].save("./dream_figure.jpg") 39 | 40 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/DeepFloyd/inpaint.py: -------------------------------------------------------------------------------- 1 | import PIL 2 | import requests 3 | from io import BytesIO 4 | from torchvision.transforms import ToTensor 5 | 6 | from deepfloyd_if.modules import IFStageI, IFStageII, StableStageIII 7 | from deepfloyd_if.modules.t5 import T5Embedder 8 | from deepfloyd_if.pipelines import inpainting 9 | 10 | def download_image(url): 11 | response = requests.get(url) 12 | return PIL.Image.open(BytesIO(response.content)).convert("RGB") 13 | 14 | img_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png" 15 | mask_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png" 16 | 17 | init_image = download_image(img_url).resize((512, 512)) 18 | mask_image = download_image(mask_url).resize((512, 512)) 19 | 20 | # convert mask_image to torch.Tensor to avoid bug 21 | mask_image = ToTensor()(mask_image).unsqueeze(0) # (1, 3, 512, 512) 22 | 23 | # Run locally 24 | device = 'cuda:5' 25 | cache_dir = "/comp_robot/rentianhe/weights/IF/" 26 | if_I = IFStageI('IF-I-L-v1.0', device=device, cache_dir=cache_dir) 27 | if_II = IFStageII('IF-II-L-v1.0', device=device, cache_dir=cache_dir) 28 | if_III = StableStageIII('stable-diffusion-x4-upscaler', device=device, cache_dir=cache_dir) 29 | t5 = T5Embedder(device=device, cache_dir=cache_dir) 30 | result = inpainting( 31 | t5=t5, if_I=if_I, 32 | if_II=if_II, 33 | if_III=if_III, 34 | support_pil_img=init_image, 35 | inpainting_mask=mask_image, 36 | prompt=[ 37 | 'A Panda' 38 | ], 39 | seed=42, 40 | if_I_kwargs={ 41 | "guidance_scale": 7.0, 42 | "sample_timestep_respacing": "10,10,10,10,10,0,0,0,0,0", 43 | 'support_noise_less_qsample_steps': 0, 44 | }, 45 | if_II_kwargs={ 46 | "guidance_scale": 4.0, 47 | 'aug_level': 0.0, 48 | "sample_timestep_respacing": '100', 49 | }, 50 | if_III_kwargs={ 51 | "guidance_scale": 9.0, 52 | "noise_level": 20, 53 | "sample_timestep_respacing": "75", 54 | }, 55 | ) 56 | if_I.show(result['I'], 2, 3) 57 | if_I.show(result['II'], 2, 6) 58 | if_I.show(result['III'], 2, 14) 59 | 60 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/DeepFloyd/style_transfer.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | 3 | from deepfloyd_if.modules import IFStageI, IFStageII 4 | from deepfloyd_if.modules.t5 import T5Embedder 5 | from deepfloyd_if.pipelines import style_transfer 6 | 7 | # Run locally 8 | device = 'cuda' 9 | cache_dir = "/path/to/storage/IF" 10 | if_I = IFStageI('IF-I-XL-v1.0', device=device, cache_dir=cache_dir) 11 | if_II = IFStageII('IF-II-L-v1.0', device=device, cache_dir=cache_dir) 12 | t5 = T5Embedder(device=device, cache_dir=cache_dir) 13 | 14 | # Style 
generate from GPT-4 15 | style_prompt = [ 16 | "in style of colorful and cute kawaii art", 17 | "in style of boho-chic textile patterns", 18 | ] 19 | 20 | raw_pil_image = Image.open("/path/to/image") 21 | 22 | result = style_transfer( 23 | t5=t5, if_I=if_I, if_II=if_II, 24 | support_pil_img=raw_pil_image, 25 | style_prompt=style_prompt, 26 | seed=42, 27 | if_I_kwargs={ 28 | "guidance_scale": 10.0, 29 | "sample_timestep_respacing": "10,10,10,10,10,10,10,10,0,0", 30 | 'support_noise_less_qsample_steps': 5, 31 | }, 32 | if_II_kwargs={ 33 | "guidance_scale": 4.0, 34 | "sample_timestep_respacing": 'smart50', 35 | "support_noise_less_qsample_steps": 5, 36 | }, 37 | ) 38 | 39 | # save all the images generated in StageII 40 | for i, image in enumerate(result["II"]): 41 | image.save("./style_transfer_{}.jpg".format(i)) 42 | 43 | 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/bird_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/bird_audio.wav -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/bird_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/bird_image.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/car_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/car_audio.wav -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/car_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/car_image.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/dog_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/dog_audio.wav -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/dog_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/dog_image.jpg -------------------------------------------------------------------------------- 
/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/README.md: -------------------------------------------------------------------------------- 1 | ## ImageBind with SAM 2 | 3 | This is an experimental demo that aims to combine [ImageBind](https://github.com/facebookresearch/ImageBind) and [SAM](https://github.com/facebookresearch/segment-anything) to generate masks **with different modalities**. 4 | 5 | The basic idea follows [IEA: Image Editing Anything](https://github.com/feizc/IEA) and [CLIP-SAM](https://github.com/maxi-w/CLIP-SAM), which generate the referring mask with the following steps (a condensed sketch of these steps is shown after the results table below): 6 | 7 | - Step 1: Generate auto masks with `SamAutomaticMaskGenerator` 8 | - Step 2: Crop all the box regions from the masks 9 | - Step 3: Compute the similarity between the cropped images and the different modalities 10 | - Step 4: Merge the mask regions with the highest similarity 11 | 12 | ## Table of contents 13 | - [Installation](#installation) 14 | - [ImageBind-SAM Demo](#run-the-demo) 15 | - [Audio Referring Segment](#run-audio-referring-segmentation-demo) 16 | - [Text Referring Segment](#run-text-referring-segmentation-demo) 17 | - [Image Referring Segment](#run-image-referring-segmentation-demo) 18 | 19 | 20 | 21 | ## Installation 22 | - Download the pretrained checkpoints 23 | 24 | ```bash 25 | cd playground/ImageBind_SAM 26 | 27 | mkdir .checkpoints 28 | cd .checkpoints 29 | 30 | # download imagebind weights 31 | wget https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth 32 | wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth 33 | ``` 34 | 35 | - Install ImageBind following the [official installation guidance](https://github.com/facebookresearch/ImageBind#usage). 36 | - Install Grounded-SAM following [install Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything#installation). 37 | 38 | 39 | ## Run the demo 40 | ```bash 41 | python demo.py 42 | ``` 43 | 44 | We implement `Text Seg` and `Audio Seg` in this demo; the generated masks will be saved as `text_sam_merged_mask.jpg` and `audio_sam_merged_mask.jpg`: 45 | 46 |
47 | 48 | | Input Image | Modality | Generated Mask | 49 | |:----:|:----:|:----:| 50 | | ![](./.assets/car_image.jpg) | [car audio](./.assets/car_audio.wav) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/imagebind_sam/audio_sam_merged_mask_new.jpg?raw=true) | 51 | | ![](./.assets/car_image.jpg) | "A car" | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/imagebind_sam/text_sam_merged_mask.jpg?raw=true) | 52 | | ![](./.assets/car_image.jpg) | [referring image](./.assets/referring_car_image.jpg) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/imagebind_sam/image_referring_sam_merged_mask.jpg?raw=true) | 53 | 54 |
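Below is a condensed sketch of how the four steps above fit together for the text modality. It is a simplified illustration distilled from the full demo scripts later in this folder rather than a separate demo: the helpers `segment_image`, `convert_box_xywh_to_xyxy`, and `get_indices_of_values_above_threshold` come from `utils.py`, `data.load_and_transform_vision_data_from_pil_image` is the extended ImageBind data loader these demos use, and the boolean-OR merge at the end stands in for the overlay drawing done in the scripts.

```python
import numpy as np
import torch
from PIL import Image

import data
from models import imagebind_model
from models.imagebind_model import ModalityType
from segment_anything import build_sam, SamAutomaticMaskGenerator
from utils import segment_image, convert_box_xywh_to_xyxy, get_indices_of_values_above_threshold

device = "cuda" if torch.cuda.is_available() else "cpu"

# Step 1: generate automatic mask proposals with SAM
mask_generator = SamAutomaticMaskGenerator(
    build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device),
    points_per_side=16,
)
bind_model = imagebind_model.imagebind_huge(pretrained=True).eval().to(device)

image = Image.open(".assets/car_image.jpg").convert("RGB")
masks = mask_generator.generate(np.array(image))

# Step 2: crop every proposed box region out of the image
crops = [
    segment_image(image, m["segmentation"]).crop(convert_box_xywh_to_xyxy(m["bbox"]))
    for m in masks
]

# Step 3: ImageBind similarity between the crops and a text prompt
inputs = {
    ModalityType.VISION: data.load_and_transform_vision_data_from_pil_image(crops, device),
    ModalityType.TEXT: data.load_and_transform_text(["A car"], device),
}
with torch.no_grad():
    emb = bind_model(inputs)
scores = torch.softmax(emb[ModalityType.VISION] @ emb[ModalityType.TEXT].T, dim=0)

# Step 4: merge every mask whose similarity clears the threshold
keep = get_indices_of_values_above_threshold(scores, 0.05)
merged = np.zeros(masks[0]["segmentation"].shape, dtype=bool)
for i in keep:
    merged |= masks[i]["segmentation"]
Image.fromarray(merged.astype("uint8") * 255).save("text_sam_merged_mask.jpg")
```

Swapping the `TEXT` entry for `ModalityType.AUDIO` with `data.load_and_transform_audio_data([...], device)` gives the audio variant used in `audio_referring_seg_demo.py`.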
56 | 57 | Setting a different threshold can significantly change the final results. 58 | 59 | ## Run image referring segmentation demo 60 | ```bash 61 | # download the referring image 62 | cd .assets 63 | wget https://github.com/IDEA-Research/detrex-storage/releases/download/grounded-sam-storage/referring_car_image.jpg 64 | cd .. 65 | 66 | python image_referring_seg_demo.py 67 | ``` 68 | 69 | ## Run audio referring segmentation demo 70 | ```bash 71 | python audio_referring_seg_demo.py 72 | ``` 73 | 74 | ## Run text referring segmentation demo 75 | ```bash 76 | python text_referring_seg_demo.py 77 | ``` -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/audio_referring_seg_demo.py: -------------------------------------------------------------------------------- 1 | import data 2 | import cv2 3 | import torch 4 | from PIL import Image, ImageDraw 5 | from tqdm import tqdm 6 | from models import imagebind_model 7 | from models.imagebind_model import ModalityType 8 | 9 | from segment_anything import build_sam, SamAutomaticMaskGenerator 10 | 11 | from utils import ( 12 | segment_image, 13 | convert_box_xywh_to_xyxy, 14 | get_indices_of_values_above_threshold, 15 | ) 16 | 17 | 18 | device = "cuda" if torch.cuda.is_available() else "cpu" 19 | 20 | 21 | """ 22 | Step 1: Instantiate model 23 | """ 24 | # Segment Anything 25 | mask_generator = SamAutomaticMaskGenerator( 26 | build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device), 27 | points_per_side=16, 28 | ) 29 | 30 | # ImageBind 31 | bind_model = imagebind_model.imagebind_huge(pretrained=True) 32 | bind_model.eval() 33 | bind_model.to(device) 34 | 35 | 36 | """ 37 | Step 2: Generate auto masks with SAM 38 | """ 39 | image_path = ".assets/car_image.jpg" 40 | image = cv2.imread(image_path) 41 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 42 | masks = mask_generator.generate(image) 43 | 44 | 45 | """ 46 | Step 3: Get cropped images based on mask and box 47 | """ 48 | cropped_boxes = [] 49 | image = Image.open(image_path) 50 | for mask in tqdm(masks): 51 | cropped_boxes.append(segment_image(image, mask["segmentation"]).crop(convert_box_xywh_to_xyxy(mask["bbox"]))) 52 | 53 | 54 | """ 55 | Step 4: Run ImageBind model to get similarity between cropped image and different modalities 56 | """ 57 | def retriev_vision_and_audio(elements, audio_list): 58 | inputs = { 59 | ModalityType.VISION: data.load_and_transform_vision_data_from_pil_image(elements, device), 60 | ModalityType.AUDIO: data.load_and_transform_audio_data(audio_list, device), 61 | } 62 | with torch.no_grad(): 63 | embeddings = bind_model(inputs) 64 | vision_audio = torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.AUDIO].T, dim=0), 65 | return vision_audio 66 | 67 | vision_audio_result = retriev_vision_and_audio(cropped_boxes, [".assets/car_audio.wav"]) 68 | 69 | 70 | """ 71 | Step 5: Merge the top similarity masks to get the final mask and save the merged mask 72 | 73 | This is the audio retrieval result 74 | """ 75 | 76 | # get highest similar mask with threshold 77 | # result[0] shape: [113, 1] 78 | threshold = 0.025 79 | index = get_indices_of_values_above_threshold(vision_audio_result[0], threshold) 80 | 81 | segmentation_masks = [] 82 | for seg_idx in index: 83 | segmentation_mask_image = Image.fromarray(masks[seg_idx]["segmentation"].astype('uint8') * 255) 84 | segmentation_masks.append(segmentation_mask_image) 85 | 86 | original_image =
Image.open(image_path) 87 | overlay_image = Image.new('RGBA', image.size, (0, 0, 0, 255)) 88 | overlay_color = (255, 255, 255, 0) 89 | 90 | draw = ImageDraw.Draw(overlay_image) 91 | for segmentation_mask_image in segmentation_masks: 92 | draw.bitmap((0, 0), segmentation_mask_image, fill=overlay_color) 93 | 94 | # return Image.alpha_composite(original_image.convert('RGBA'), overlay_image) 95 | mask_image = overlay_image.convert("RGB") 96 | mask_image.save("./audio_sam_merged_mask.jpg") 97 | 98 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/bpe/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/bpe/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/image_referring_seg_demo.py: -------------------------------------------------------------------------------- 1 | import data 2 | import cv2 3 | import torch 4 | from PIL import Image, ImageDraw 5 | from tqdm import tqdm 6 | from models import imagebind_model 7 | from models.imagebind_model import ModalityType 8 | 9 | from segment_anything import build_sam, SamAutomaticMaskGenerator 10 | 11 | from utils import ( 12 | segment_image, 13 | convert_box_xywh_to_xyxy, 14 | get_indices_of_values_above_threshold, 15 | ) 16 | 17 | 18 | device = "cuda" if torch.cuda.is_available() else "cpu" 19 | 20 | 21 | """ 22 | Step 1: Instantiate model 23 | """ 24 | # Segment Anything 25 | mask_generator = SamAutomaticMaskGenerator( 26 | build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device), 27 | points_per_side=16, 28 | ) 29 | 30 | # ImageBind 31 | bind_model = imagebind_model.imagebind_huge(pretrained=True) 32 | bind_model.eval() 33 | bind_model.to(device) 34 | 35 | 36 | """ 37 | Step 2: Generate auto masks with SAM 38 | """ 39 | image_path = ".assets/car_image.jpg" 40 | image = cv2.imread(image_path) 41 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 42 | masks = mask_generator.generate(image) 43 | 44 | 45 | """ 46 | Step 3: Get cropped images based on mask and box 47 | """ 48 | cropped_boxes = [] 49 | image = Image.open(image_path) 50 | for mask in tqdm(masks): 51 | cropped_boxes.append(segment_image(image, mask["segmentation"]).crop(convert_box_xywh_to_xyxy(mask["bbox"]))) 52 | 53 | 54 | """ 55 | Step 4: Run ImageBind model to get similarity between cropped image and different modalities 56 | """ 57 | # load referring image 58 | referring_image_path = ".assets/referring_car_image.jpg" 59 | referring_image = Image.open(referring_image_path) 60 | 61 | image_list = [] 62 | image_list += cropped_boxes 63 | image_list.append(referring_image) 64 | 65 | def retriev_vision_and_vision(elements): 66 | inputs = { 67 | ModalityType.VISION: data.load_and_transform_vision_data_from_pil_image(elements, device), 68 | } 69 | with torch.no_grad(): 70 | embeddings = bind_model(inputs) 71 | 72 | # cropped box region embeddings 73 | cropped_box_embeddings = embeddings[ModalityType.VISION][:-1, :] 74 | referring_image_embeddings = embeddings[ModalityType.VISION][-1, :] 75 | 76 | vision_referring_result = torch.softmax(cropped_box_embeddings @ referring_image_embeddings.T, dim=0), 77 | return vision_referring_result # [113, 1] 78 | 79 | 
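# Note: retriev_vision_and_vision above returns a one-element tuple (because of the trailing comma
# after torch.softmax), so the per-crop similarity scores are accessed as vision_referring_result[0]
# when thresholding in Step 5 below.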
80 | vision_referring_result = retriev_vision_and_vision(image_list) 81 | 82 | 83 | """ 84 | Step 5: Merge the top similarity masks to get the final mask and save the merged mask 85 | 86 | Image / Text mask 87 | """ 88 | 89 | # get highest similar mask with threshold 90 | # result[0] shape: [113, 1] 91 | threshold = 0.017 92 | index = get_indices_of_values_above_threshold(vision_referring_result[0], threshold) 93 | 94 | 95 | segmentation_masks = [] 96 | for seg_idx in index: 97 | segmentation_mask_image = Image.fromarray(masks[seg_idx]["segmentation"].astype('uint8') * 255) 98 | segmentation_masks.append(segmentation_mask_image) 99 | 100 | original_image = Image.open(image_path) 101 | overlay_image = Image.new('RGBA', image.size, (0, 0, 0, 255)) 102 | overlay_color = (255, 255, 255, 0) 103 | 104 | draw = ImageDraw.Draw(overlay_image) 105 | for segmentation_mask_image in segmentation_masks: 106 | draw.bitmap((0, 0), segmentation_mask_image, fill=overlay_color) 107 | 108 | # return Image.alpha_composite(original_image.convert('RGBA'), overlay_image) 109 | mask_image = overlay_image.convert("RGB") 110 | mask_image.save("./image_referring_sam_merged_mask.jpg") 111 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/models/__init__.py -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/text_referring_seg_demo.py: -------------------------------------------------------------------------------- 1 | import data 2 | import cv2 3 | import torch 4 | from PIL import Image, ImageDraw 5 | from tqdm import tqdm 6 | from models import imagebind_model 7 | from models.imagebind_model import ModalityType 8 | 9 | from segment_anything import build_sam, SamAutomaticMaskGenerator 10 | 11 | from utils import ( 12 | segment_image, 13 | convert_box_xywh_to_xyxy, 14 | get_indices_of_values_above_threshold, 15 | ) 16 | 17 | 18 | device = "cuda" if torch.cuda.is_available() else "cpu" 19 | 20 | 21 | """ 22 | Step 1: Instantiate model 23 | """ 24 | # Segment Anything 25 | mask_generator = SamAutomaticMaskGenerator( 26 | build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device), 27 | points_per_side=16, 28 | ) 29 | 30 | # ImageBind 31 | bind_model = imagebind_model.imagebind_huge(pretrained=True) 32 | bind_model.eval() 33 | bind_model.to(device) 34 | 35 | 36 | """ 37 | Step 2: Generate auto masks with SAM 38 | """ 39 | image_path = ".assets/car_image.jpg" 40 | image = cv2.imread(image_path) 41 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 42 | masks = mask_generator.generate(image) 43 | 44 | 45 | """ 46 | Step 3: Get cropped images based on mask and box 47 | """ 48 | cropped_boxes = [] 49 | image = Image.open(image_path) 50 | for mask in tqdm(masks): 51 | cropped_boxes.append(segment_image(image, mask["segmentation"]).crop(convert_box_xywh_to_xyxy(mask["bbox"]))) 52 | 53 | 54 | """ 55 | Step 4: Run ImageBind model to get similarity between cropped image and different modalities 56 | """ 57 | def retriev_vision_and_text(elements, text_list): 58 | inputs = { 59 | ModalityType.VISION: data.load_and_transform_vision_data_from_pil_image(elements, device), 60 | 
ModalityType.TEXT: data.load_and_transform_text(text_list, device), 61 | } 62 | with torch.no_grad(): 63 | embeddings = bind_model(inputs) 64 | vision_audio = torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=0), 65 | return vision_audio # [113, 1] 66 | 67 | 68 | vision_text_result = retriev_vision_and_text(cropped_boxes, ["A car"] ) 69 | 70 | 71 | """ 72 | Step 5: Merge the top similarity masks to get the final mask and save the merged mask 73 | 74 | Image / Text mask 75 | """ 76 | 77 | # get highest similar mask with threshold 78 | # result[0] shape: [113, 1] 79 | threshold = 0.05 80 | index = get_indices_of_values_above_threshold(vision_text_result[0], threshold) 81 | 82 | segmentation_masks = [] 83 | for seg_idx in index: 84 | segmentation_mask_image = Image.fromarray(masks[seg_idx]["segmentation"].astype('uint8') * 255) 85 | segmentation_masks.append(segmentation_mask_image) 86 | 87 | original_image = Image.open(image_path) 88 | overlay_image = Image.new('RGBA', image.size, (0, 0, 0, 255)) 89 | overlay_color = (255, 255, 255, 0) 90 | 91 | draw = ImageDraw.Draw(overlay_image) 92 | for segmentation_mask_image in segmentation_masks: 93 | draw.bitmap((0, 0), segmentation_mask_image, fill=overlay_color) 94 | 95 | # return Image.alpha_composite(original_image.convert('RGBA'), overlay_image) 96 | mask_image = overlay_image.convert("RGB") 97 | mask_image.save("./text_sam_merged_mask.jpg") 98 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import numpy as np 3 | 4 | def segment_image(image, segmentation_mask): 5 | image_array = np.array(image) 6 | segmented_image_array = np.zeros_like(image_array) 7 | segmented_image_array[segmentation_mask] = image_array[segmentation_mask] 8 | segmented_image = Image.fromarray(segmented_image_array) 9 | black_image = Image.new("RGB", image.size, (0, 0, 0)) 10 | transparency_mask = np.zeros_like(segmentation_mask, dtype=np.uint8) 11 | transparency_mask[segmentation_mask] = 255 12 | transparency_mask_image = Image.fromarray(transparency_mask, mode='L') 13 | black_image.paste(segmented_image, mask=transparency_mask_image) 14 | return black_image 15 | 16 | 17 | def convert_box_xywh_to_xyxy(box): 18 | x1 = box[0] 19 | y1 = box[1] 20 | x2 = box[0] + box[2] 21 | y2 = box[1] + box[3] 22 | return [x1, y1, x2, y2] 23 | 24 | 25 | def get_indices_of_values_above_threshold(values, threshold): 26 | return [i for i, v in enumerate(values) if v > threshold] -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/LaMa/lama_inpaint_demo.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import PIL 3 | import requests 4 | import numpy as np 5 | from lama_cleaner.model.lama import LaMa 6 | from lama_cleaner.schema import Config 7 | 8 | 9 | def download_image(url): 10 | image = PIL.Image.open(requests.get(url, stream=True).raw) 11 | image = PIL.ImageOps.exif_transpose(image) 12 | image = image.convert("RGB") 13 | return image 14 | 15 | 16 | img_url = "https://raw.githubusercontent.com/Sanster/lama-cleaner/main/assets/dog.jpg" 17 | mask_url = "https://user-images.githubusercontent.com/3998421/202105351-9fcc4bf8-129d-461a-8524-92e4caad431f.png" 18 | 19 | image = np.asarray(download_image(img_url)) 20 | 
mask = np.asarray(download_image(mask_url).convert("L")) 21 | 22 | # set to GPU for faster inference 23 | model = LaMa("cpu") 24 | result = model(image, mask, Config(hd_strategy="Original", ldm_steps=20, hd_strategy_crop_margin=128, hd_strategy_crop_trigger_size=800, hd_strategy_resize_limit=800)) 25 | cv2.imwrite("lama_inpaint_demo.jpg", result) -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/LaMa/sam_lama.py: -------------------------------------------------------------------------------- 1 | # !pip install diffusers transformers 2 | 3 | import requests 4 | import cv2 5 | import numpy as np 6 | import PIL 7 | from PIL import Image 8 | from io import BytesIO 9 | 10 | from segment_anything import sam_model_registry, SamPredictor 11 | 12 | from lama_cleaner.model.lama import LaMa 13 | from lama_cleaner.schema import Config 14 | 15 | """ 16 | Step 1: Download and preprocess demo images 17 | """ 18 | def download_image(url): 19 | image = PIL.Image.open(requests.get(url, stream=True).raw) 20 | image = PIL.ImageOps.exif_transpose(image) 21 | image = image.convert("RGB") 22 | return image 23 | 24 | 25 | img_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/input_image.png?raw=true" 26 | 27 | 28 | init_image = download_image(img_url) 29 | init_image = np.asarray(init_image) 30 | 31 | 32 | """ 33 | Step 2: Initialize SAM and LaMa models 34 | """ 35 | 36 | DEVICE = "cuda:1" 37 | 38 | # SAM 39 | SAM_ENCODER_VERSION = "vit_h" 40 | SAM_CHECKPOINT_PATH = "/comp_robot/rentianhe/code/Grounded-Segment-Anything/sam_vit_h_4b8939.pth" 41 | sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH).to(device=DEVICE) 42 | sam_predictor = SamPredictor(sam) 43 | sam_predictor.set_image(init_image) 44 | 45 | # LaMa 46 | model = LaMa(DEVICE) 47 | 48 | 49 | """ 50 | Step 3: Get masks with SAM by prompt (box or point) and inpaint the mask region by example image. 51 | """ 52 | 53 | input_point = np.array([[350, 256]]) 54 | input_label = np.array([1]) # positive label 55 | 56 | masks, _, _ = sam_predictor.predict( 57 | point_coords=input_point, 58 | point_labels=input_label, 59 | multimask_output=False 60 | ) 61 | masks = masks.astype(np.uint8) * 255 62 | # mask_pil = Image.fromarray(masks[0]) # simply save the first mask 63 | 64 | 65 | """ 66 | Step 4: Dilate Mask to make it more suitable for LaMa inpainting 67 | 68 | The idea behind dilate mask is to mask a larger region which will be better for inpainting. 
69 | 70 | Borrowed from Inpaint-Anything: https://github.com/geekyutao/Inpaint-Anything/blob/main/utils/utils.py#L18 71 | """ 72 | 73 | def dilate_mask(mask, dilate_factor=15): 74 | mask = mask.astype(np.uint8) 75 | mask = cv2.dilate( 76 | mask, 77 | np.ones((dilate_factor, dilate_factor), np.uint8), 78 | iterations=1 79 | ) 80 | return mask 81 | 82 | def save_array_to_img(img_arr, img_p): 83 | Image.fromarray(img_arr.astype(np.uint8)).save(img_p) 84 | 85 | # [1, 512, 512] to [512, 512] and save mask 86 | save_array_to_img(masks[0], "./mask.png") 87 | 88 | mask = dilate_mask(masks[0], dilate_factor=15) 89 | 90 | save_array_to_img(mask, "./dilated_mask.png") 91 | 92 | """ 93 | Step 5: Run LaMa inpaint model 94 | """ 95 | result = model(init_image, mask, Config(hd_strategy="Original", ldm_steps=20, hd_strategy_crop_margin=128, hd_strategy_crop_trigger_size=800, hd_strategy_resize_limit=800)) 96 | cv2.imwrite("sam_lama_demo.jpg", result) 97 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/PaintByExample/paint_by_example.py: -------------------------------------------------------------------------------- 1 | # !pip install diffusers transformers 2 | 3 | import PIL 4 | import requests 5 | import torch 6 | from io import BytesIO 7 | from diffusers import DiffusionPipeline 8 | 9 | 10 | """ 11 | Step 1: Download demo images 12 | """ 13 | def download_image(url): 14 | response = requests.get(url) 15 | return PIL.Image.open(BytesIO(response.content)).convert("RGB") 16 | 17 | 18 | img_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/input_image.png?raw=true" 19 | mask_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/mask.png?raw=true" 20 | example_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/pomeranian_example.jpg?raw=True" 21 | # example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg" 22 | 23 | init_image = download_image(img_url).resize((512, 512)) 24 | mask_image = download_image(mask_url).resize((512, 512)) 25 | example_image = download_image(example_url).resize((512, 512)) 26 | 27 | 28 | """ 29 | Step 2: Download pretrained weights and initialize model 30 | """ 31 | # set cache dir to store the weights 32 | cache_dir = "/comp_robot/rentianhe/weights/diffusers/" 33 | 34 | pipe = DiffusionPipeline.from_pretrained( 35 | "Fantasy-Studio/Paint-by-Example", 36 | torch_dtype=torch.float16, 37 | cache_dir=cache_dir, 38 | ) 39 | # set to device 40 | pipe = pipe.to("cuda:1") 41 | 42 | 43 | """ 44 | Step 3: Run PaintByExample pipeline and save image 45 | """ 46 | image = pipe( 47 | image=init_image, 48 | mask_image=mask_image, 49 | example_image=example_image, 50 | num_inference_steps=200, 51 | ).images[0] 52 | 53 | image.save("./paint_by_example_demo.jpg") 54 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/PaintByExample/sam_paint_by_example.py: -------------------------------------------------------------------------------- 1 | # !pip install diffusers transformers 2 | 3 | import requests 4 | import torch 5 | import numpy as np 6 | from PIL import Image 7 | from io import BytesIO 8 | from diffusers import DiffusionPipeline 9 | 10 | from segment_anything import sam_model_registry, SamPredictor 11 | 12 | 13 | """ 14 | 
Step 1: Download and preprocess example demo images 15 | """ 16 | def download_image(url): 17 | response = requests.get(url) 18 | return Image.open(BytesIO(response.content)).convert("RGB") 19 | 20 | 21 | img_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/input_image.png?raw=true" 22 | # example_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/pomeranian_example.jpg?raw=True" 23 | # example_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/example_image.jpg?raw=true" 24 | example_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/labrador_example.jpg?raw=true" 25 | 26 | init_image = download_image(img_url).resize((512, 512)) 27 | example_image = download_image(example_url).resize((512, 512)) 28 | 29 | 30 | """ 31 | Step 2: Initialize SAM and PaintByExample models 32 | """ 33 | 34 | DEVICE = "cuda:1" 35 | 36 | # SAM 37 | SAM_ENCODER_VERSION = "vit_h" 38 | SAM_CHECKPOINT_PATH = "/comp_robot/rentianhe/code/Grounded-Segment-Anything/sam_vit_h_4b8939.pth" 39 | sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH).to(device=DEVICE) 40 | sam_predictor = SamPredictor(sam) 41 | sam_predictor.set_image(np.array(init_image)) 42 | 43 | # PaintByExample Pipeline 44 | CACHE_DIR = "/comp_robot/rentianhe/weights/diffusers/" 45 | pipe = DiffusionPipeline.from_pretrained( 46 | "Fantasy-Studio/Paint-by-Example", 47 | torch_dtype=torch.float16, 48 | cache_dir=CACHE_DIR, 49 | ) 50 | pipe = pipe.to(DEVICE) 51 | 52 | 53 | """ 54 | Step 3: Get masks with SAM by prompt (box or point) and inpaint the mask region by example image. 55 | """ 56 | 57 | input_point = np.array([[350, 256]]) 58 | input_label = np.array([1]) # positive label 59 | 60 | masks, _, _ = sam_predictor.predict( 61 | point_coords=input_point, 62 | point_labels=input_label, 63 | multimask_output=False 64 | ) 65 | mask = masks[0] # [1, 512, 512] to [512, 512] np.ndarray 66 | mask_pil = Image.fromarray(mask) 67 | 68 | mask_pil.save("./mask.jpg") 69 | 70 | image = pipe( 71 | image=init_image, 72 | mask_image=mask_pil, 73 | example_image=example_image, 74 | num_inference_steps=500, 75 | guidance_scale=9.0 76 | ).images[0] 77 | 78 | image.save("./paint_by_example_demo.jpg") 79 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/README.md: -------------------------------------------------------------------------------- 1 | ## Playground 2 | 3 | We will try more interesting **base models** and **build more fun demos** in the playground. In the playground, we will: 4 | 5 | - **Simplify the demo code** to make it easier for users to get started. 6 | - **Keep complete usage notes** and some pitfalls to reduce the burden on users. 
7 | 8 | ## Table of Contents 9 | - [DeepFloyd: Text-to-Image Generation](./DeepFloyd/) 10 | - [Dream: Text-to-Image Generation](./DeepFloyd/dream.py) 11 | - [Style Transfer](./DeepFloyd/style_transfer.py) 12 | - [Paint by Example: Exemplar-based Image Editing with Diffusion Models](./PaintByExample/) 13 | - [Diffuser Demo](./PaintByExample/paint_by_example.py) 14 | - [PaintByExample with SAM](./PaintByExample/sam_paint_by_example.py) 15 | - [LaMa: Resolution-robust Large Mask Inpainting with Fourier Convolutions](./LaMa/) 16 | - [LaMa Demo](./LaMa/lama_inpaint_demo.py) 17 | - [LaMa with SAM](./LaMa/sam_lama.py) 18 | - [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](./RePaint/) 19 | - [RePaint Demo](./RePaint/repaint.py) 20 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/RePaint/README.md: -------------------------------------------------------------------------------- 1 | ## RePaint: Inpainting using Denoising Diffusion Probabilistic Models 2 | 3 | :grapes: [[Official Project Page](https://github.com/andreas128/RePaint)] 4 | 5 |
6 | 7 | ![](https://user-images.githubusercontent.com/11280511/150803812-a4729ef8-6ad4-46aa-ae99-8c27fbb2ea2e.png) 8 | 9 |
10 | 11 | ## Abstract 12 | 13 | > Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces highquality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks. RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions. 14 | 15 | 16 | ## Table of Contents 17 | - [Installation](#installation) 18 | - [Repaint Demos](#repaint-demos) 19 | - [Diffuser Demo](#repaint-diffuser-demos) 20 | 21 | 22 | ## TODO 23 | - [x] RePaint Diffuser Demo 24 | - [ ] RePaint with SAM 25 | - [ ] RePaint with GroundingDINO 26 | - [ ] RePaint with Grounded-SAM 27 | 28 | ## Installation 29 | We're using RePaint with diffusers; install diffusers as follows: 30 | ```bash 31 | pip install diffusers==0.16.1 32 | ``` 33 | Then install Grounded-SAM following [Grounded-SAM Installation](https://github.com/IDEA-Research/Grounded-Segment-Anything#installation) for some extension demos. 34 | 35 | ## RePaint Demos 36 | Here we provide the demos for `RePaint`. 37 | 38 | 39 | ### RePaint Diffuser Demos 40 | ```bash 41 | cd playground/RePaint 42 | python repaint.py 43 | ``` 44 | **Notes:** set `cache_dir` to save the pretrained weights to a specific folder. The inpainting result will be saved as `repaint_demo.jpg`: 45 | 46 |
47 | 48 | | Input Image | Mask | Inpaint Result | 49 | |:----:|:----:|:----:| 50 | | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/repaint/celeba_hq_256.png?raw=true) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/repaint/mask_256.png?raw=true) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/repaint/repaint_demo.jpg?raw=true) | 51 | 52 | 53 |
54 | 55 | 56 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/RePaint/repaint.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import torch 4 | 5 | import PIL 6 | import requests 7 | from diffusers import RePaintPipeline, RePaintScheduler 8 | 9 | 10 | def download_image(url): 11 | response = requests.get(url) 12 | return PIL.Image.open(BytesIO(response.content)).convert("RGB") 13 | 14 | 15 | img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png" 16 | mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png" 17 | 18 | # Load the original image and the mask as PIL images 19 | original_image = download_image(img_url).resize((256, 256)) 20 | mask_image = download_image(mask_url).resize((256, 256)) 21 | 22 | # Load the RePaint scheduler and pipeline based on a pretrained DDPM model 23 | DEVICE = "cuda:1" 24 | CACHE_DIR = "/comp_robot/rentianhe/weights/diffusers/" 25 | scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", cache_dir=CACHE_DIR) 26 | pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler, cache_dir=CACHE_DIR) 27 | pipe = pipe.to(DEVICE) 28 | 29 | generator = torch.Generator(device=DEVICE).manual_seed(0) 30 | output = pipe( 31 | image=original_image, 32 | mask_image=mask_image, 33 | num_inference_steps=250, 34 | eta=0.0, 35 | jump_length=10, 36 | jump_n_sample=10, 37 | generator=generator, 38 | ) 39 | inpainted_image = output.images[0] 40 | inpainted_image.save("./repaint_demo.jpg") -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | diffusers 3 | gradio 4 | huggingface_hub 5 | matplotlib 6 | numpy 7 | onnxruntime 8 | opencv_python 9 | Pillow 10 | pycocotools 11 | PyYAML 12 | requests 13 | setuptools 14 | supervision 15 | termcolor 16 | timm 17 | torch 18 | torchvision 19 | transformers 20 | yapf 21 | nltk 22 | fairscale 23 | litellm 24 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = W503, E203, E221, C901, C408, E741, C407, B017, F811, C101, EXE001, EXE002 3 | max-line-length = 100 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | per-file-ignores = 7 | **/__init__.py:F401,F403,E402 8 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to segment-anything 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints, using the `linter.sh` script in the project's root directory. Linting requires `black==23.*`, `isort==5.12.0`, `flake8`, and `mypy`. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to segment-anything, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 
32 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/masks1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/masks1.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/masks2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/masks2.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/model_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/model_diagram.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/notebook1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/notebook1.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/notebook2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/notebook2.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | { 5 | black --version | grep -E "23\." > /dev/null 6 | } || { 7 | echo "Linter requires 'black==23.*' !" 8 | exit 1 9 | } 10 | 11 | ISORT_VERSION=$(isort --version-number) 12 | if [[ "$ISORT_VERSION" != 5.12* ]]; then 13 | echo "Linter requires isort==5.12.0 !" 14 | exit 1 15 | fi 16 | 17 | echo "Running isort ..." 18 | isort . --atomic 19 | 20 | echo "Running black ..." 21 | black -l 100 . 22 | 23 | echo "Running flake8 ..." 24 | if [ -x "$(command -v flake8)" ]; then 25 | flake8 . 26 | else 27 | python3 -m flake8 . 28 | fi 29 | 30 | echo "Running mypy..." 31 | 32 | mypy --exclude 'setup.py|notebooks' . 
33 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/dog.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/groceries.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/groceries.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/truck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/truck.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .build_sam_hq import ( 15 | build_sam_hq, 16 | build_sam_hq_vit_h, 17 | build_sam_hq_vit_l, 18 | build_sam_hq_vit_b, 19 | sam_hq_model_registry, 20 | ) 21 | from .predictor import SamPredictor 22 | from .automatic_mask_generator import SamAutomaticMaskGenerator 23 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
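# Note: this module builds the three SAM ViT variants (ViT-H, ViT-L, ViT-B) and exposes them via
# sam_model_registry; _build_sam wires the ViT image encoder, prompt encoder, and mask decoder
# together and optionally loads weights from a checkpoint file.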
6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/build_sam_hq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
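# Note: this module mirrors build_sam.py but plugs in MaskDecoderHQ; the loop at the end keeps only
# the HQ-specific modules (hf_token, hf_mlp, compress_vit_feat, embedding_encoder,
# embedding_maskfeature) trainable by setting requires_grad=False on everything else.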
6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoderHQ, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_hq_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam_hq = build_sam_hq_vit_h 25 | 26 | 27 | def build_sam_hq_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_hq_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_hq_model_registry = { 48 | "default": build_sam_hq_vit_h, 49 | "vit_h": build_sam_hq_vit_h, 50 | "vit_l": build_sam_hq_vit_l, 51 | "vit_b": build_sam_hq_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoderHQ( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | vit_dim=encoder_embed_dim, 99 | ), 100 | pixel_mean=[123.675, 116.28, 103.53], 101 | pixel_std=[58.395, 57.12, 57.375], 102 | ) 103 | # sam.eval() 104 | if checkpoint is not None: 105 | with open(checkpoint, "rb") as f: 106 | device = "cuda" if torch.cuda.is_available() else "cpu" 107 | state_dict = torch.load(f, map_location=device) 108 | info = sam.load_state_dict(state_dict, strict=False) 109 | print(info) 110 | for n, p in sam.named_parameters(): 111 | if 'hf_token' not in n and 'hf_mlp' not in n and 'compress_vit_feat' not in n and 'embedding_encoder' not in n and 'embedding_maskfeature' not in n: 112 | p.requires_grad = False 113 | 114 | return sam 115 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder_hq import MaskDecoderHQ 10 | from .mask_decoder import MaskDecoder 11 | from .prompt_encoder import PromptEncoder 12 | from .transformer import TwoWayTransformer 13 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 
21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 
97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools 6 | skip_glob=*/__init__.py 7 | known_myself=segment_anything 8 | known_third_party=matplotlib,cv2,torch,torchvision,pycocotools,onnx,black,isort 9 | no_lines_before=STDLIB,THIRDPARTY 10 | sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER 11 | default_section=FIRSTPARTY 12 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from setuptools import find_packages, setup 8 | 9 | setup( 10 | name="segment_anything", 11 | version="1.0", 12 | install_requires=[], 13 | packages=find_packages(exclude="notebooks"), 14 | extras_require={ 15 | "all": ["matplotlib", "pycocotools", "opencv-python", "onnx", "onnxruntime"], 16 | "dev": ["flake8", "isort", "black", "mypy"], 17 | }, 18 | ) 19 | -------------------------------------------------------------------------------- /third_party/UniDepth/.gitignore: -------------------------------------------------------------------------------- 1 | # don't upload macOS folder info 2 | *.DS_Store 3 | 4 | #python 5 | *.pyc 6 | __pycache__/ 7 | 8 | #scripts 9 | *.sh 10 | 11 | # package 12 | unidepth.egg-info 13 | -------------------------------------------------------------------------------- /third_party/UniDepth/assets/demo/depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/demo/depth.png -------------------------------------------------------------------------------- /third_party/UniDepth/assets/demo/intrinsics.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/demo/intrinsics.npy -------------------------------------------------------------------------------- /third_party/UniDepth/assets/demo/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/demo/output.png -------------------------------------------------------------------------------- /third_party/UniDepth/assets/demo/rgb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/demo/rgb.png -------------------------------------------------------------------------------- /third_party/UniDepth/assets/docs/V2_README.md: 
-------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | 4 | ### Input shape and ratio flexibility. 5 | 6 | 1. Input images will not be reshaped to a specific image size. Training image ratios are in the range `[2/3, 2/1]`, so if your image ratio falls outside these boundaries, we suggest cropping or padding it to be within the image ratio bounds. 7 | 8 | 2. UniDepthV2 exposes the attribute `self.resolution_level` (with range `[0,10]`), which is used in the preprocess function and can be used to trade off resolution and speed, with a **possible effect** on the output scale. In particular, the level describes the linear interpolation degree of the processed image area within the training bounds. The training image area (named "pixels") for ViT is in the range `[1400, 2400]` (see `pixels_bounds` in the config). If the attribute is not set, the max level, i.e. 10, will be used. We use the concept of "pixels" loosely: it refers to the image area after patchification, e.g. for ViT it is `1/14**2` of the actual original image area. 9 | 10 | 3. The infer method uses the interpolation mode defined by the attribute `self.interpolation_mode`; the default is `bilinear`. 11 | 12 | 13 | ### Confidence output 14 | 15 | The model outputs a confidence in the range `[0, 1]` that represents the ARel error after affine matching with the GT. The confidence itself is shift invariant, i.e. it is a ranking that is relative within one input. In particular, it does not have an absolute meaning (e.g. no heteroscedastic noise modelling). 16 | 17 | 18 | ### Decoder design: separately predicting scale-shift-invariant depth and a global scale and shift to allow more diverse training. 19 | 20 | The decoder presents three heads: `Camera`, `Depth` and `Global`. The `Depth` head predicts scale- and shift-invariant depth: the exponential of normalized values. 21 | The `Global` head predicts the scale and shift that match the `Depth` head output to metric depth. 22 | With this design we can seamlessly mix datasets with metric GT, scale-invariant GT (i.e., SfM) or scale-shift-invariant GT by turning down the gradient to the `Global` head when the GT is either scale- or shift-invariant. 23 | This allows us to scale up the training variety. 24 | Versions 1 and 2 show similar performance, but the output of version 2 may look noisier because the greater diversity is linked to lower GT quality, which introduces artifacts... 25 | 26 | 27 | ### Faster inference 28 | 29 | The model is >30% faster than V1, tested on an RTX 4090 with the float16 data type. 30 | 31 | 32 | ### ONNX support 33 | 34 | We added support for UniDepthV2 in __ONNX__ format, 35 | both with and without GT intrinsics. 36 | It does not allow for dynamic shapes at test time. 37 | For instance, you can run from the root of the repo: 38 | ```bash 39 | 40 | python ./unidepth/models/unidepthv2/export.py --version v2 --backbone vitl14 --shape (462, 616) --output-path unidepthv2.onnx --with-camera 41 | ``` 42 | 43 | The shape will be changed to the closest shape that is a multiple of 14, i.e. the ViT patch size. 44 | Your input shape at inference time will have to match the (resized) shape passed to the exporter! 45 | The corresponding __ONNX__ model does not do any pre- or post-processing. 46 | Therefore, you should input an ImageNet-statistics-normalized RGB image rescaled to the given input shape and, if `--with-camera` is set, the corresponding (properly rescaled) camera intrinsics.
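A minimal, untested sketch of what inference with the exported graph could look like in ONNX Runtime is given below. The input/output names and their order are assumptions (inspect the exported model, e.g. via `session.get_inputs()`, to confirm), the demo asset paths are reused from this repo, and the intrinsics input only exists if the model was exported with `--with-camera`:
```python
# Hypothetical ONNX Runtime inference sketch; names and ordering are assumptions.
import numpy as np
import onnxruntime as ort
from PIL import Image

H, W = 462, 616  # must match the (resized) shape passed to the exporter

# ImageNet-statistics normalization, since the exported graph does no preprocessing
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

rgb = np.asarray(Image.open("assets/demo/rgb.png").convert("RGB").resize((W, H)), dtype=np.float32) / 255.0
image = ((rgb - mean) / std).transpose(2, 0, 1)[None].astype(np.float32)  # 1x3xHxW

# Intrinsics are assumed to be already rescaled by the same factors used to resize the image
K = np.load("assets/demo/intrinsics.npy").astype(np.float32)[None]  # 1x3x3

session = ort.InferenceSession("unidepthv2.onnx")
# Feed inputs in the order the exporter defined them; if exported without
# --with-camera there is only the image input.
feed = {inp.name: arr for inp, arr in zip(session.get_inputs(), (image, K))}
outputs = session.run(None, feed)
depth = outputs[0]  # assumed to be the metric depth map
```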
47 | 48 | 49 | Disclaimer: Not fully tested -------------------------------------------------------------------------------- /third_party/UniDepth/assets/docs/nuscenes_surround.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/docs/nuscenes_surround.gif -------------------------------------------------------------------------------- /third_party/UniDepth/assets/docs/theoffice.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/docs/theoffice.gif -------------------------------------------------------------------------------- /third_party/UniDepth/assets/docs/unidepth-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/docs/unidepth-banner.png -------------------------------------------------------------------------------- /third_party/UniDepth/configs/config_v1_cnvnxtl.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13 4 | }, 5 | "training": { 6 | }, 7 | "data": { 8 | "image_shape": [462, 616] 9 | }, 10 | "model": { 11 | "name": "UniDepthV1", 12 | "num_heads": 8, 13 | "expansion": 4, 14 | "pixel_decoder": { 15 | "hidden_dim": 512, 16 | "depths": [3, 2, 1], 17 | "dropout": 0.0 18 | }, 19 | "pixel_encoder": { 20 | "name": "convnext_large", 21 | "pretrained": null 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /third_party/UniDepth/configs/config_v1_vitl14.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13 4 | }, 5 | "training": {}, 6 | "data": { 7 | "image_shape": [462, 616] 8 | }, 9 | "model": { 10 | "name": "UniDepthV1", 11 | "num_heads": 8, 12 | "expansion": 4, 13 | "pixel_decoder": { 14 | "hidden_dim": 512, 15 | "depths": [3, 2, 1], 16 | "dropout": 0.0 17 | }, 18 | "pixel_encoder": { 19 | "name": "dinov2_vitl14", 20 | "pretrained": null 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /third_party/UniDepth/configs/config_v2_vitl14.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13, 4 | "deterministic": true 5 | }, 6 | "training": {}, 7 | "data": { 8 | "image_shape": [420, 560], 9 | "shape_constraints": { 10 | "ratio_bounds": [0.66, 2.0], 11 | "pixels_bounds": [1400, 2400], 12 | "patch_size": 14 13 | } 14 | }, 15 | "model": { 16 | "name": "UniDepthV2", 17 | "num_heads": 8, 18 | "expansion": 4, 19 | "pixel_decoder": { 20 | "hidden_dim": 512, 21 | "depths": [6, 0, 0], 22 | "dropout": 0.0 23 | }, 24 | "pixel_encoder": { 25 | "name": "dinov2_vitl14", 26 | "pretrained": null, 27 | "use_norm": true, 28 | "stacking_fn": "last", 29 | "output_idx": [21,22,23,24] 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /third_party/UniDepth/configs/config_v2_vits14.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13, 4 | "deterministic": true 5 | }, 6 | "training": {}, 7 | "data": { 8 
| "image_shape": [420, 560], 9 | "shape_constraints": { 10 | "ratio_bounds": [0.66, 2.0], 11 | "pixels_bounds": [1400, 2400], 12 | "patch_size": 14 13 | } 14 | }, 15 | "model": { 16 | "name": "UniDepthV2", 17 | "num_heads": 8, 18 | "expansion": 4, 19 | "pixel_decoder": { 20 | "hidden_dim": 512, 21 | "depths": [6, 0, 0], 22 | "dropout": 0.0 23 | }, 24 | "pixel_encoder": { 25 | "name": "dinov2_vits14", 26 | "pretrained": null, 27 | "use_norm": true, 28 | "stacking_fn": "last", 29 | "output_idx": [9,10,11,12] 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /third_party/UniDepth/hubconf.py: -------------------------------------------------------------------------------- 1 | dependencies = ["torch", "huggingface_hub"] 2 | 3 | import os 4 | import json 5 | 6 | import torch 7 | import huggingface_hub 8 | 9 | from unidepth.models import UniDepthV1, UniDepthV2 10 | 11 | 12 | MAP_VERSIONS = { 13 | "v1": UniDepthV1, 14 | "v2": UniDepthV2 15 | } 16 | 17 | BACKBONES = { 18 | "v1": ["vitl14", "cnvnxtl"], 19 | "v2": ["vitl14", "vits14"] 20 | } 21 | 22 | 23 | def UniDepth(version="v2", backbone="vitl14", pretrained=True): 24 | assert version in MAP_VERSIONS.keys(), f"version must be one of {list(MAP_VERSIONS.keys())}" 25 | assert backbone in BACKBONES[version], f"backbone for current version ({version}) must be one of {list(BACKBONES[version])}" 26 | repo_dir = os.path.dirname(os.path.realpath(__file__)) 27 | with open(os.path.join(repo_dir, "configs", f"config_{version}_{backbone}.json")) as f: 28 | config = json.load(f) 29 | 30 | model = MAP_VERSIONS[version](config) 31 | if pretrained: 32 | path = huggingface_hub.hf_hub_download(repo_id=f"lpiccinelli/unidepth-{version}-{backbone}", filename=f"pytorch_model.bin", repo_type="model") 33 | info = model.load_state_dict(torch.load(path), strict=False) 34 | print(f"UniDepth_{version}_{backbone} is loaded with:") 35 | print(f"\t missing keys: {info.missing_keys}") 36 | print(f"\t additional keys: {info.unexpected_keys}") 37 | 38 | return model 39 | 40 | -------------------------------------------------------------------------------- /third_party/UniDepth/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.pyright] 6 | include = ["unidepth"] 7 | 8 | [project] 9 | name = "unidepth" 10 | version = "0.1" 11 | authors = [{name = "Luigi Piccinelli", email = "lpiccinelli@ethz.ch"}] 12 | description = "UniDepth: Universal Monocular Metric Depth Estimation" 13 | readme = "README.md" 14 | license = { text="Creatives Common BY-NC 4.0 license"} 15 | requires-python = ">=3.10.0" 16 | dynamic = ["dependencies"] 17 | 18 | [tool.setuptools.dynamic] 19 | dependencies = {file = ["requirements.txt"]} 20 | 21 | [tool.setuptools.package-data] 22 | "*" = ["py.typed"] 23 | 24 | [tool.setuptools.packages.find] 25 | include = ["unidepth*"] 26 | -------------------------------------------------------------------------------- /third_party/UniDepth/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs 2 | attrs 3 | black 4 | blosc2 5 | botocore==1.34.54 6 | certifi==2022.12.7 7 | charset-normalizer 8 | click 9 | contourpy 10 | cycler 11 | docker-pycreds 12 | einops==0.7.0 13 | filelock 14 | flake8==7.0.0 15 | flake8-bugbear==24.2.6 16 | flake8-comprehensions==3.14.0 17 | fonttools 18 | fsspec 19 | fvcore==0.1.5.post20221221 20 | 
gitdb 21 | GitPython 22 | h5py>=3.10.0 23 | huggingface-hub>=0.22.0 24 | idna 25 | imageio 26 | imath 27 | iopath 28 | isort 29 | Jinja2 30 | jmespath 31 | kiwisolver 32 | MarkupSafe 33 | matplotlib 34 | mccabe 35 | mpmath 36 | msgpack 37 | mypy-extensions 38 | ndindex 39 | networkx 40 | ninja 41 | numexpr 42 | numpy<2.0.0 43 | opencv-python 44 | OpenEXR 45 | packaging 46 | pandas 47 | pathspec 48 | pillow==10.2.0 49 | platformdirs 50 | portalocker 51 | protobuf==4.25.3 52 | psutil 53 | py-cpuinfo 54 | pycodestyle 55 | pyflakes 56 | pyparsing 57 | python-dateutil 58 | pytz 59 | PyYAML 60 | requests 61 | safetensors 62 | scipy 63 | sentry-sdk 64 | setproctitle 65 | six 66 | smmap 67 | sympy 68 | tables 69 | tabulate 70 | termcolor 71 | timm 72 | tqdm 73 | triton==2.2.0 74 | typing_extensions 75 | tzdata==2024.1 76 | urllib3==1.26.13 77 | wandb 78 | yacs 79 | torch==2.2.0 80 | torchvision==0.17.0 81 | torchaudio==2.2.0 82 | xformers==0.0.24 -------------------------------------------------------------------------------- /third_party/UniDepth/run_unidepth.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | torch.backends.cudnn.enabled = False 7 | 8 | from PIL import Image 9 | from tqdm import tqdm 10 | import json 11 | import argparse 12 | from unidepth.models import UniDepthV1 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description="Dataset Configuration") 17 | parser.add_argument('--dataset', type=str, default='SUNRGBD', help='Name of the dataset') 18 | return parser.parse_args() 19 | 20 | version="v1" 21 | backbone="ViTL14" 22 | 23 | model = UniDepthV1.from_pretrained("lpiccinelli/unidepth-v1-vitl14") 24 | 25 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 26 | model = model.to(device) 27 | 28 | 29 | def process(dataset): 30 | for mode in ['train', 'val']: 31 | with open(f'datasets/Omni3D/{dataset}_{mode}.json', 'r') as file: 32 | data = json.load(file) 33 | 34 | for i in tqdm(range(len(data['images']))): 35 | filename = data['images'][i]['file_path'] 36 | rgb = torch.from_numpy(np.array(Image.open(f'datasets/{filename}'))).permute(2, 0, 1) 37 | intrinsics = np.array(data['images'][i]['K']).reshape(3,3) 38 | intrinsics = torch.from_numpy(intrinsics).float() 39 | file_name = data['images'][i]['id'] 40 | 41 | predictions = model.infer(rgb, intrinsics) 42 | depth = predictions["depth"] 43 | intrinsics = predictions["intrinsics"] 44 | 45 | outdir = f'pseudo_label/{dataset}/{mode}/depth' 46 | os.makedirs(outdir, exist_ok=True) 47 | np.save(os.path.join(outdir, f"{file_name}"), depth.cpu().numpy().squeeze(0).squeeze(0)) 48 | 49 | 50 | if __name__ == "__main__": 51 | args = parse_args() 52 | print(f"Dataset name: {args.dataset}") 53 | process(args.dataset) 54 | -------------------------------------------------------------------------------- /third_party/UniDepth/scripts/demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from PIL import Image 4 | 5 | from unidepth.models import UniDepthV1, UniDepthV2 6 | from unidepth.utils import colorize, image_grid 7 | 8 | 9 | def demo(model): 10 | rgb = np.array(Image.open("assets/demo/rgb.png")) 11 | rgb_torch = torch.from_numpy(rgb).permute(2, 0, 1) 12 | intrinsics_torch = torch.from_numpy(np.load("assets/demo/intrinsics.npy")) 13 | 14 | # predict 15 | predictions = 
model.infer(rgb_torch, intrinsics_torch) 16 | 17 | # get GT and pred 18 | depth_pred = predictions["depth"].squeeze().cpu().numpy() 19 | depth_gt = np.array(Image.open("assets/demo/depth.png")).astype(float) / 1000.0 20 | 21 | # compute error, you have zero divison where depth_gt == 0.0 22 | depth_arel = np.abs(depth_gt - depth_pred) / depth_gt 23 | depth_arel[depth_gt == 0.0] = 0.0 24 | 25 | # colorize 26 | depth_pred_col = colorize(depth_pred, vmin=0.01, vmax=10.0, cmap="magma_r") 27 | depth_gt_col = colorize(depth_gt, vmin=0.01, vmax=10.0, cmap="magma_r") 28 | depth_error_col = colorize(depth_arel, vmin=0.0, vmax=0.2, cmap="coolwarm") 29 | 30 | # save image with pred and error 31 | artifact = image_grid([rgb, depth_gt_col, depth_pred_col, depth_error_col], 2, 2) 32 | Image.fromarray(artifact).save("assets/demo/output.png") 33 | 34 | print("Available predictions:", list(predictions.keys())) 35 | print(f"ARel: {depth_arel[depth_gt > 0].mean() * 100:.2f}%") 36 | 37 | 38 | if __name__ == "__main__": 39 | print("Torch version:", torch.__version__) 40 | name = "unidepth-v2-vitl14" 41 | # model = UniDepthV1.from_pretrained("lpiccinelli/unidepth-v1-vitl14") 42 | model = UniDepthV2.from_pretrained(f"lpiccinelli/{name}") 43 | 44 | # set resolution level (only V2) 45 | # model.resolution_level = 0 46 | 47 | # set interpolation mode (only V2) 48 | # model.interpolation_mode = "bilinear" 49 | 50 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 51 | model = model.to(device) 52 | demo(model) 53 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .activation import GEGLU, SwiGLU 2 | from .attention import AttentionBlock, AttentionDecoderBlock 3 | from .convnext import CvnxtBlock 4 | from .mlp import MLP 5 | from .nystrom_attention import NystromBlock 6 | from .positional_encoding import PositionEmbeddingSine 7 | from .upsample import (ConvUpsample, ConvUpsampleShuffle, 8 | ConvUpsampleShuffleResidual) 9 | 10 | __all__ = [ 11 | "SwiGLU", 12 | "GEGLU", 13 | "CvnxtBlock", 14 | "AttentionBlock", 15 | "NystromBlock", 16 | "PositionEmbeddingSine", 17 | "ConvUpsample", 18 | "MLP", 19 | "ConvUpsampleShuffle", 20 | "AttentionDecoderBlock", 21 | "ConvUpsampleShuffleResidual", 22 | ] 23 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/activation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class SwiGLU(nn.Module): 7 | def forward(self, x: torch.Tensor) -> torch.Tensor: 8 | x, gates = x.chunk(2, dim=-1) 9 | return x * F.silu(gates) 10 | 11 | 12 | class GEGLU(nn.Module): 13 | def forward(self, x: torch.Tensor) -> torch.Tensor: 14 | x, gates = x.chunk(2, dim=-1) 15 | return x * F.gelu(gates) 16 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/convnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class CvnxtBlock(nn.Module): 6 | def __init__( 7 | self, 8 | dim, 9 | kernel_size=7, 10 | layer_scale=1.0, 11 | expansion=4, 12 | dilation=1, 13 | padding_mode: str = "zeros", 14 | ): 15 | super().__init__() 16 | self.dwconv = nn.Conv2d( 17 | dim, 18 | dim, 19 | 
kernel_size=kernel_size, 20 | padding=dilation * (kernel_size - 1) // 2, 21 | groups=dim, 22 | dilation=dilation, 23 | padding_mode=padding_mode, 24 | ) # depthwise conv 25 | self.norm = nn.LayerNorm(dim) 26 | self.pwconv1 = nn.Linear(dim, expansion * dim) 27 | self.act = nn.GELU() 28 | self.pwconv2 = nn.Linear(expansion * dim, dim) 29 | self.gamma = ( 30 | nn.Parameter(layer_scale * torch.ones((dim))) if layer_scale > 0.0 else 1.0 31 | ) 32 | 33 | def forward(self, x): 34 | input = x 35 | x = self.dwconv(x) 36 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 37 | x = self.norm(x) 38 | x = self.pwconv1(x) 39 | x = self.act(x) 40 | x = self.pwconv2(x) 41 | 42 | x = self.gamma * x 43 | x = input + x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 44 | return x 45 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def drop_path(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False): 6 | if drop_prob == 0.0 or not training: 7 | return x 8 | keep_prob = 1 - drop_prob 9 | shape = (x.shape[0],) + (1,) * ( 10 | x.ndim - 1 11 | ) # work with diff dim tensors, not just 2D ConvNets 12 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 13 | if keep_prob > 0.0: 14 | random_tensor.div_(keep_prob) 15 | output = x * random_tensor 16 | return output 17 | 18 | 19 | class DropPath(nn.Module): 20 | def __init__(self, drop_prob=None): 21 | super(DropPath, self).__init__() 22 | self.drop_prob = drop_prob 23 | 24 | def forward(self, x): 25 | return drop_path(x, self.drop_prob, self.training) 26 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LayerScale(nn.Module): 6 | def __init__( 7 | self, 8 | dim: int, 9 | init_values: float | torch.Tensor = 1e-5, 10 | inplace: bool = False, 11 | ) -> None: 12 | super().__init__() 13 | self.inplace = inplace 14 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 18 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from unidepth.utils.misc import default 5 | 6 | from .activation import SwiGLU 7 | 8 | 9 | class MLP(nn.Module): 10 | def __init__( 11 | self, 12 | input_dim: int, 13 | expansion: int = 4, 14 | dropout: float = 0.0, 15 | gated: bool = False, 16 | output_dim: int | None = None, 17 | ): 18 | super().__init__() 19 | if gated: 20 | expansion = int(expansion * 2 / 3) 21 | hidden_dim = int(input_dim * expansion) 22 | output_dim = default(output_dim, input_dim) 23 | self.norm = nn.LayerNorm(input_dim) 24 | self.proj1 = nn.Linear(input_dim, hidden_dim) 25 | self.proj2 = nn.Linear(hidden_dim, output_dim) 26 | self.act = nn.GELU() if not gated else SwiGLU() 27 | self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity() 28 | 29 | def forward(self, x: torch.Tensor) -> torch.Tensor: 30 | x = self.norm(x) 31 | x = self.proj1(x) 32 | x = self.act(x) 33 | 
x = self.proj2(x) 34 | x = self.dropout(x) 35 | return x 36 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/nystrom_attention.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from einops import rearrange 7 | from xformers.components.attention import NystromAttention 8 | 9 | from .attention import AttentionBlock 10 | 11 | 12 | class NystromBlock(AttentionBlock): 13 | def __init__( 14 | self, 15 | dim: int, 16 | num_heads: int = 4, 17 | expansion: int = 4, 18 | dropout: float = 0.0, 19 | cosine: bool = False, 20 | gated: bool = False, 21 | layer_scale: float = 1.0, 22 | context_dim: int | None = None, 23 | ): 24 | super().__init__( 25 | dim=dim, 26 | num_heads=num_heads, 27 | expansion=expansion, 28 | dropout=dropout, 29 | cosine=cosine, 30 | gated=gated, 31 | layer_scale=layer_scale, 32 | context_dim=context_dim, 33 | ) 34 | self.attention_fn = NystromAttention( 35 | num_landmarks=128, num_heads=num_heads, dropout=dropout 36 | ) 37 | 38 | def attn( 39 | self, 40 | x: torch.Tensor, 41 | attn_bias: torch.Tensor | None = None, 42 | context: torch.Tensor | None = None, 43 | pos_embed: torch.Tensor | None = None, 44 | pos_embed_context: torch.Tensor | None = None, 45 | rope: nn.Module | None = None, 46 | ) -> torch.Tensor: 47 | x = self.norm_attnx(x) 48 | context = self.norm_attnctx(context) 49 | k, v = rearrange( 50 | self.kv(context), "b n (kv h d) -> b n h d kv", h=self.num_heads, kv=2 51 | ).unbind(dim=-1) 52 | q = rearrange(self.q(x), "b n (h d) -> b n h d", h=self.num_heads) 53 | 54 | if rope is not None: 55 | q = rope(q) 56 | k = rope(k) 57 | else: 58 | if pos_embed is not None: 59 | pos_embed = rearrange( 60 | pos_embed, "b n (h d) -> b n h d", h=self.num_heads 61 | ) 62 | q = q + pos_embed 63 | if pos_embed_context is not None: 64 | pos_embed_context = rearrange( 65 | pos_embed_context, "b n (h d) -> b n h d", h=self.num_heads 66 | ) 67 | k = k + pos_embed_context 68 | 69 | if self.cosine: 70 | q, k = map(partial(F.normalize, p=2, dim=-1), (q, k)) # cosine sim 71 | x = self.attention_fn(q, k, v, key_padding_mask=attn_bias) 72 | x = rearrange(x, "b n h d -> b n (h d)") 73 | x = self.out(x) 74 | return x 75 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/upsample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Luigi Piccinelli 3 | Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/) 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | from einops import rearrange 9 | 10 | from .convnext import CvnxtBlock 11 | 12 | 13 | class ConvUpsample(nn.Module): 14 | def __init__( 15 | self, 16 | hidden_dim, 17 | num_layers: int = 2, 18 | expansion: int = 4, 19 | layer_scale: float = 1.0, 20 | kernel_size: int = 7, 21 | **kwargs, 22 | ): 23 | super().__init__() 24 | self.convs = nn.ModuleList([]) 25 | for _ in range(num_layers): 26 | self.convs.append( 27 | CvnxtBlock( 28 | hidden_dim, 29 | kernel_size=kernel_size, 30 | expansion=expansion, 31 | layer_scale=layer_scale, 32 | ) 33 | ) 34 | self.up = nn.Sequential( 35 | nn.Conv2d(hidden_dim, hidden_dim // 2, kernel_size=1, padding=0), 36 | nn.UpsamplingBilinear2d(scale_factor=2), 37 | nn.Conv2d(hidden_dim // 2, hidden_dim // 2, kernel_size=3, padding=1), 38 | 
) 39 | 40 | def forward(self, x: torch.Tensor): 41 | for conv in self.convs: 42 | x = conv(x) 43 | x = self.up(x) 44 | x = rearrange(x, "b c h w -> b (h w) c") 45 | return x 46 | 47 | 48 | class ConvUpsampleShuffle(nn.Module): 49 | def __init__( 50 | self, 51 | hidden_dim, 52 | num_layers: int = 2, 53 | expansion: int = 4, 54 | layer_scale: float = 1.0, 55 | kernel_size: int = 7, 56 | **kwargs, 57 | ): 58 | super().__init__() 59 | self.convs = nn.ModuleList([]) 60 | for _ in range(num_layers): 61 | self.convs.append( 62 | CvnxtBlock( 63 | hidden_dim, 64 | kernel_size=kernel_size, 65 | expansion=expansion, 66 | layer_scale=layer_scale, 67 | ) 68 | ) 69 | self.up = nn.Sequential( 70 | nn.PixelShuffle(2), 71 | nn.Conv2d(hidden_dim // 4, hidden_dim // 2, kernel_size=3, padding=1), 72 | ) 73 | 74 | def forward(self, x: torch.Tensor): 75 | for conv in self.convs: 76 | x = conv(x) 77 | x = self.up(x) 78 | x = rearrange(x, "b c h w -> b (h w) c") 79 | return x 80 | 81 | 82 | class ConvUpsampleShuffleResidual(nn.Module): 83 | def __init__( 84 | self, 85 | hidden_dim, 86 | num_layers: int = 2, 87 | expansion: int = 4, 88 | layer_scale: float = 1.0, 89 | kernel_size: int = 7, 90 | padding_mode: str = "zeros", 91 | **kwargs, 92 | ): 93 | super().__init__() 94 | self.convs = nn.ModuleList([]) 95 | for _ in range(num_layers): 96 | self.convs.append( 97 | CvnxtBlock( 98 | hidden_dim, 99 | kernel_size=kernel_size, 100 | expansion=expansion, 101 | layer_scale=layer_scale, 102 | padding_mode=padding_mode, 103 | ) 104 | ) 105 | self.up = nn.Sequential( 106 | nn.PixelShuffle(2), 107 | nn.Conv2d( 108 | hidden_dim // 4, 109 | hidden_dim // 4, 110 | kernel_size=7, 111 | padding=3, 112 | padding_mode=padding_mode, 113 | groups=hidden_dim // 4, 114 | ), 115 | nn.ReLU(), 116 | nn.Conv2d( 117 | hidden_dim // 4, 118 | hidden_dim // 2, 119 | kernel_size=3, 120 | padding=1, 121 | padding_mode=padding_mode, 122 | ), 123 | ) 124 | self.residual = nn.Sequential( 125 | nn.Conv2d(hidden_dim, hidden_dim // 2, kernel_size=1, padding=0), 126 | nn.UpsamplingBilinear2d(scale_factor=2), 127 | ) 128 | 129 | def forward(self, x: torch.Tensor): 130 | for conv in self.convs: 131 | x = conv(x) 132 | x = self.up(x) + self.residual(x) 133 | x = rearrange(x, "b c h w -> b (h w) c") 134 | return x 135 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .unidepthv1 import UniDepthV1 2 | from .unidepthv2 import UniDepthV2 3 | 4 | __all__ = [ 5 | "UniDepthV1", 6 | "UniDepthV2", 7 | ] 8 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .convnext import ConvNeXt 2 | from .convnext2 import ConvNeXtV2 3 | from .dinov2 import _make_dinov2_model 4 | 5 | __all__ = [ 6 | "ConvNeXt", 7 | "ConvNeXtV2", 8 | "_make_dinov2_model", 9 | ] 10 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from .attention import Attention, MemEffAttention 8 | from .block import NestedTensorBlock 9 | from .dino_head import DINOHead 10 | from .mlp import Mlp 11 | from .patch_embed import PatchEmbed 12 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 13 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | import torch.nn as nn 14 | from torch import Tensor 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | try: 20 | from xformers.ops import fmha, memory_efficient_attention, unbind 21 | 22 | XFORMERS_AVAILABLE = True 23 | except ImportError: 24 | logger.warning("xFormers not available") 25 | XFORMERS_AVAILABLE = False 26 | 27 | 28 | class Attention(nn.Module): 29 | def __init__( 30 | self, 31 | dim: int, 32 | num_heads: int = 8, 33 | qkv_bias: bool = False, 34 | proj_bias: bool = True, 35 | attn_drop: float = 0.0, 36 | proj_drop: float = 0.0, 37 | ) -> None: 38 | super().__init__() 39 | self.num_heads = num_heads 40 | head_dim = dim // num_heads 41 | self.scale = head_dim**-0.5 42 | 43 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 44 | self.attn_drop = nn.Dropout(attn_drop) 45 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 46 | self.proj_drop = nn.Dropout(proj_drop) 47 | 48 | def forward(self, x: Tensor) -> Tensor: 49 | B, N, C = x.shape 50 | qkv = ( 51 | self.qkv(x) 52 | .reshape(B, N, 3, self.num_heads, C // self.num_heads) 53 | .permute(2, 0, 3, 1, 4) 54 | ) 55 | 56 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 57 | attn = q @ k.transpose(-2, -1) 58 | 59 | attn = attn.softmax(dim=-1) 60 | attn = self.attn_drop(attn) 61 | 62 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 63 | x = self.proj(x) 64 | x = self.proj_drop(x) 65 | return x 66 | 67 | 68 | class MemEffAttention(Attention): 69 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 70 | if not XFORMERS_AVAILABLE: 71 | assert attn_bias is None, "xFormers is required for nested tensors usage" 72 | return super().forward(x) 73 | 74 | B, N, C = x.shape 75 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 76 | 77 | q, k, v = unbind(qkv, 2) 78 | 79 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 80 | x = x.reshape([B, N, C]) 81 | 82 | x = self.proj(x) 83 | x = self.proj_drop(x) 84 | return x 85 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.init import trunc_normal_ 10 | from torch.nn.utils import weight_norm 11 | 12 | 13 | class DINOHead(nn.Module): 14 | def __init__( 15 | self, 16 | in_dim, 17 | out_dim, 18 | use_bn=False, 19 | nlayers=3, 20 | hidden_dim=2048, 21 | bottleneck_dim=256, 22 | mlp_bias=True, 23 | ): 24 | super().__init__() 25 | nlayers = max(nlayers, 1) 26 | self.mlp = _build_mlp( 27 | nlayers, 28 | in_dim, 29 | bottleneck_dim, 30 | hidden_dim=hidden_dim, 31 | use_bn=use_bn, 32 | bias=mlp_bias, 33 | ) 34 | self.apply(self._init_weights) 35 | self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 36 | self.last_layer.weight_g.data.fill_(1) 37 | 38 | def _init_weights(self, m): 39 | if isinstance(m, nn.Linear): 40 | trunc_normal_(m.weight, std=0.02) 41 | if isinstance(m, nn.Linear) and m.bias is not None: 42 | nn.init.constant_(m.bias, 0) 43 | 44 | def forward(self, x): 45 | x = self.mlp(x) 46 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 47 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 48 | x = self.last_layer(x) 49 | return x 50 | 51 | 52 | def _build_mlp( 53 | nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True 54 | ): 55 | if nlayers == 1: 56 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 57 | else: 58 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 59 | if use_bn: 60 | layers.append(nn.BatchNorm1d(hidden_dim)) 61 | layers.append(nn.GELU()) 62 | for _ in range(nlayers - 2): 63 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 64 | if use_bn: 65 | layers.append(nn.BatchNorm1d(hidden_dim)) 66 | layers.append(nn.GELU()) 67 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 68 | return nn.Sequential(*layers) 69 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | import torch.nn as nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * ( 20 | x.ndim - 1 21 | ) # work with diff dim tensors, not just 2D ConvNets 22 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 23 | if keep_prob > 0.0: 24 | random_tensor.div_(keep_prob) 25 | output = x * random_tensor 26 | return output 27 | 28 | 29 | class DropPath(nn.Module): 30 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 31 | 32 | def __init__(self, drop_prob=None): 33 | super(DropPath, self).__init__() 34 | self.drop_prob = drop_prob 35 | 36 | def forward(self, x): 37 | return drop_path(x, self.drop_prob, self.training) 38 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch import Tensor 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | import torch.nn as nn 14 | from torch import Tensor 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 
34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d( 67 | in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW 68 | ) 69 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 70 | 71 | def forward(self, x: Tensor) -> Tensor: 72 | _, _, H, W = x.shape 73 | patch_H, patch_W = self.patch_size 74 | 75 | assert ( 76 | H % patch_H == 0 77 | ), f"Input image height {H} is not a multiple of patch height {patch_H}" 78 | assert ( 79 | W % patch_W == 0 80 | ), f"Input image width {W} is not a multiple of patch width: {patch_W}" 81 | 82 | x = self.proj(x) # B C H W 83 | H, W = x.size(2), x.size(3) 84 | x = x.flatten(2).transpose(1, 2) # B HW C 85 | x = self.norm(x) 86 | if not self.flatten_embedding: 87 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 88 | return x 89 | 90 | def flops(self) -> float: 91 | Ho, Wo = self.patches_resolution 92 | flops = ( 93 | Ho 94 | * Wo 95 | * self.embed_dim 96 | * self.in_chans 97 | * (self.patch_size[0] * self.patch_size[1]) 98 | ) 99 | if self.norm is not None: 100 | flops += Ho * Wo * self.embed_dim 101 | return flops 102 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Callable, Optional 8 | 9 | import torch.nn.functional as F 10 | from torch import Tensor, nn 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/unidepthv1/__init__.py: -------------------------------------------------------------------------------- 1 | from .unidepthv1 import UniDepthV1 2 | 3 | __all__ = [ 4 | "UniDepthV1", 5 | ] 6 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/unidepthv2/__init__.py: -------------------------------------------------------------------------------- 1 | from .unidepthv2 import UniDepthV2 2 | 3 | __all__ = [ 4 | "UniDepthV2", 5 | ] 6 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import MSE, SelfCons, SILog 2 | from .scheduler import CosineScheduler 3 | 4 | __all__ = [ 5 | "SILog", 6 | "MSE", 7 | "SelfCons", 8 | "CosineScheduler", 9 | ] 10 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/ops/scheduler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Luigi Piccinelli 3 | Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/) 4 | """ 5 | 6 | import numpy as np 7 | 8 | 9 | class CosineScheduler(object): 10 | def __init__( 11 | self, 12 | optimizer, 13 | warmup_iters, 14 | total_iters, 15 | key, 16 | overwrite=False, 17 | init_value=None, 18 | base_value=None, 19 | final_value=None, 20 | step_init=-1, 21 | ): 22 | super().__init__() 23 | self.iter = step_init 24 | self.overwrite = overwrite 25 | self.optimizer = optimizer 26 | self.base_value = base_value 27 | self.init_value = init_value 28 | self.final_value = final_value 29 | self.total_iters = total_iters 30 | self.warmup_iters 
= warmup_iters 31 | self.key = key 32 | self.schedulers = [ 33 | self.get_schedulers(group) for group in optimizer.param_groups 34 | ] 35 | 36 | def get_schedulers(self, group): 37 | init_value = group.get(self.key + "_init", self.init_value) 38 | base_value = group.get(self.key + "_base", self.base_value) 39 | final_value = group.get(self.key + "_final", self.final_value) 40 | warmup_iters = self.warmup_iters 41 | total_iters = self.total_iters 42 | if self.overwrite: 43 | final_value = self.final_value 44 | 45 | # normalize in 0,1, then apply function (power) and denormalize 46 | normalized_schedule = np.linspace(0, 1, warmup_iters, endpoint=True) 47 | normalized_schedule = np.power(normalized_schedule, 2) 48 | warmup_schedule = (base_value - init_value) * normalized_schedule + init_value 49 | 50 | # main scheduling 51 | iters = np.arange(total_iters - warmup_iters) 52 | schedule = final_value + 0.5 * (base_value - final_value) * ( 53 | 1 + np.cos(np.pi * iters / len(iters)) 54 | ) 55 | return np.concatenate((warmup_schedule, schedule)) 56 | 57 | def step(self): 58 | self.iter = self.iter + 1 59 | vals = self[self.iter] 60 | for group, val in zip(self.optimizer.param_groups, vals): 61 | if isinstance(group[self.key], (tuple, list)): 62 | val = (val, *group[self.key][1:]) 63 | group[self.key] = val 64 | 65 | def __getitem__(self, it): 66 | it = min(it, self.total_iters - 1) 67 | return [scheduler[it] for scheduler in self.schedulers] 68 | 69 | def get(self): 70 | return [group[self.key] for group in self.optimizer.param_groups] 71 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import (barrier, get_dist_info, get_rank, is_main_process, 2 | setup_multi_processes, setup_slurm, 3 | sync_tensor_across_gpus) 4 | from .evaluation_depth import DICT_METRICS, eval_depth 5 | from .geometric import spherical_zbuffer_to_euclidean, unproject_points 6 | from .misc import format_seconds, get_params, identity, remove_padding 7 | from .visualization import colorize, image_grid, log_train_artifacts 8 | 9 | __all__ = [ 10 | "eval_depth", 11 | "DICT_METRICS", 12 | "colorize", 13 | "image_grid", 14 | "log_train_artifacts", 15 | "format_seconds", 16 | "remove_padding", 17 | "get_params", 18 | "identity", 19 | "is_main_process", 20 | "setup_multi_processes", 21 | "setup_slurm", 22 | "sync_tensor_across_gpus", 23 | "barrier", 24 | "get_rank", 25 | "unproject_points", 26 | "spherical_zbuffer_to_euclidean", 27 | "validate", 28 | "get_dist_info", 29 | ] 30 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/utils/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Luigi Piccinelli 3 | Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/) 4 | """ 5 | 6 | import math 7 | 8 | import torch 9 | 10 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 11 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 12 | IMAGENET_DATASET_MEAN = (0.485, 0.456, 0.406) 13 | IMAGENET_DATASET_STD = (0.229, 0.224, 0.225) 14 | DEPTH_BINS = torch.cat( 15 | ( 16 | torch.logspace(math.log10(0.1), math.log10(180.0), steps=512), 17 | torch.tensor([260.0]), 18 | ), 19 | dim=0, 20 | ) 21 | LOGERR_BINS = torch.linspace(-2, 2, steps=128 + 1) 22 | LINERR_BINS = torch.linspace(-50, 50, 
steps=256 + 1) 23 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/tools/__init__.py --------------------------------------------------------------------------------