├── .gitignore ├── LICENSE ├── README.md ├── configs ├── Base.yaml ├── Base_Omni3D.yaml ├── Base_Omni3D_ARKitScenes.yaml ├── Base_Omni3D_KITTI.yaml ├── Base_Omni3D_SUNRGBD.yaml ├── Base_Omni3D_nuScenes.yaml ├── cubercnn_DLA34_FPN.yaml ├── cubercnn_ResNet34_FPN.yaml ├── cubercnn_densenet_FPN.yaml ├── cubercnn_mnasnet_FPN.yaml └── cubercnn_shufflenet_FPN.yaml ├── cubercnn ├── config │ ├── __init__.py │ └── config.py ├── data │ ├── __init__.py │ ├── build.py │ ├── builtin.py │ ├── dataset_mapper.py │ └── datasets.py ├── evaluation │ ├── __init__.py │ └── omni3d_evaluation.py ├── generate_label │ ├── __init__.py │ ├── priors.py │ ├── process_indoor.py │ ├── process_outdoor.py │ ├── raytrace.py │ └── util.py ├── modeling │ ├── backbone │ │ ├── __init__.py │ │ ├── densenet.py │ │ ├── dla.py │ │ ├── mnasnet.py │ │ ├── resnet.py │ │ └── shufflenet.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── rcnn3d.py │ │ └── rcnn3d_text.py │ ├── proposal_generator │ │ ├── __init__.py │ │ └── rpn.py │ └── roi_heads │ │ ├── __init__.py │ │ ├── cube_head.py │ │ ├── fast_rcnn.py │ │ ├── fast_rcnn_text.py │ │ ├── roi_heads.py │ │ └── roi_heads_text.py ├── solver │ ├── __init__.py │ ├── build.py │ └── checkpoint.py ├── util │ ├── __init__.py │ ├── math_util.py │ ├── model_zoo.py │ └── util.py └── vis │ ├── __init__.py │ ├── logperf.py │ └── vis.py ├── datasets └── Omni3D │ ├── download_omni3d_json.sh │ └── stats.json ├── docs └── teaser.png ├── scripts ├── generate_pseudo_label.sh ├── test.sh ├── train.sh └── train_KITTI.sh ├── third_party ├── Grounded-Segment-Anything │ ├── .gitignore │ ├── .gitmodules │ ├── CITATION.cff │ ├── Dockerfile │ ├── EfficientSAM │ │ ├── EdgeSAM │ │ │ ├── common.py │ │ │ ├── rep_vit.py │ │ │ └── setup_edge_sam.py │ │ ├── FastSAM │ │ │ └── tools.py │ │ ├── LightHQSAM │ │ │ ├── example_light_hqsam.png │ │ │ ├── grounded_light_hqsam_annotated_image.jpg │ │ │ ├── setup_light_hqsam.py │ │ │ └── tiny_vit_sam.py │ │ ├── MobileSAM │ │ │ ├── setup_mobile_sam.py │ │ │ └── tiny_vit_sam.py │ │ ├── README.md │ │ ├── RepViTSAM │ │ │ ├── repvit.py │ │ │ └── setup_repvit_sam.py │ │ ├── grounded_edge_sam.py │ │ ├── grounded_efficient_sam.py │ │ ├── grounded_fast_sam.py │ │ ├── grounded_light_hqsam.py │ │ ├── grounded_mobile_sam.py │ │ └── grounded_repvit_sam.py │ ├── GroundingDINO │ │ ├── .asset │ │ │ ├── COCO.png │ │ │ ├── GD_GLIGEN.png │ │ │ ├── GD_SD.png │ │ │ ├── ODinW.png │ │ │ ├── arch.png │ │ │ ├── cats.png │ │ │ └── hero_figure.png │ │ ├── LICENSE │ │ ├── README.md │ │ ├── demo │ │ │ ├── gradio_app.py │ │ │ └── inference_on_a_image.py │ │ ├── groundingdino │ │ │ ├── __init__.py │ │ │ ├── config │ │ │ │ ├── GroundingDINO_SwinB.py │ │ │ │ └── GroundingDINO_SwinT_OGC.py │ │ │ ├── datasets │ │ │ │ ├── __init__.py │ │ │ │ └── transforms.py │ │ │ ├── models │ │ │ │ ├── GroundingDINO │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── backbone │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── backbone.py │ │ │ │ │ │ ├── position_encoding.py │ │ │ │ │ │ └── swin_transformer.py │ │ │ │ │ ├── bertwarper.py │ │ │ │ │ ├── csrc │ │ │ │ │ │ ├── MsDeformAttn │ │ │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ │ ├── cuda_version.cu │ │ │ │ │ │ └── vision.cpp │ │ │ │ │ ├── fuse_modules.py │ │ │ │ │ ├── groundingdino.py │ │ │ │ │ ├── ms_deform_attn.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ ├── transformer_vanilla.py │ │ │ │ │ └── utils.py 
│ │ │ │ ├── __init__.py │ │ │ │ └── registry.py │ │ │ ├── util │ │ │ │ ├── __init__.py │ │ │ │ ├── box_ops.py │ │ │ │ ├── get_tokenlizer.py │ │ │ │ ├── inference.py │ │ │ │ ├── logger.py │ │ │ │ ├── misc.py │ │ │ │ ├── slconfig.py │ │ │ │ ├── slio.py │ │ │ │ ├── time_counter.py │ │ │ │ ├── utils.py │ │ │ │ ├── visualizer.py │ │ │ │ └── vl_utils.py │ │ │ └── version.py │ │ ├── pyproject.toml │ │ ├── requirements.txt │ │ └── setup.py │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── automatic_label_demo.py │ ├── automatic_label_ram_demo.py │ ├── automatic_label_simple_demo.py │ ├── automatic_label_tag2text_demo.py │ ├── chatbot.py │ ├── cog.yaml │ ├── grounded_sam_detect.py │ ├── grounded_sam_detect_ground.py │ ├── playground │ │ ├── DeepFloyd │ │ │ ├── README.md │ │ │ ├── dream.py │ │ │ ├── inpaint.py │ │ │ └── style_transfer.py │ │ ├── ImageBind_SAM │ │ │ ├── .assets │ │ │ │ ├── bird_audio.wav │ │ │ │ ├── bird_image.jpg │ │ │ │ ├── car_audio.wav │ │ │ │ ├── car_image.jpg │ │ │ │ ├── dog_audio.wav │ │ │ │ └── dog_image.jpg │ │ │ ├── README.md │ │ │ ├── audio_referring_seg_demo.py │ │ │ ├── bpe │ │ │ │ └── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── data.py │ │ │ ├── demo.py │ │ │ ├── image_referring_seg_demo.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── helpers.py │ │ │ │ ├── imagebind_model.py │ │ │ │ ├── multimodal_preprocessors.py │ │ │ │ └── transformer.py │ │ │ ├── text_referring_seg_demo.py │ │ │ └── utils.py │ │ ├── LaMa │ │ │ ├── README.md │ │ │ ├── lama_inpaint_demo.py │ │ │ └── sam_lama.py │ │ ├── PaintByExample │ │ │ ├── README.md │ │ │ ├── paint_by_example.py │ │ │ └── sam_paint_by_example.py │ │ ├── README.md │ │ └── RePaint │ │ │ ├── README.md │ │ │ └── repaint.py │ ├── predict.py │ ├── requirements.txt │ └── segment_anything │ │ ├── .flake8 │ │ ├── CODE_OF_CONDUCT.md │ │ ├── CONTRIBUTING.md │ │ ├── LICENSE │ │ ├── README.md │ │ ├── assets │ │ ├── masks1.png │ │ ├── masks2.jpg │ │ ├── model_diagram.png │ │ ├── notebook1.png │ │ └── notebook2.png │ │ ├── linter.sh │ │ ├── notebooks │ │ └── images │ │ │ ├── dog.jpg │ │ │ ├── groceries.jpg │ │ │ └── truck.jpg │ │ ├── scripts │ │ ├── amg.py │ │ └── export_onnx_model.py │ │ ├── segment_anything │ │ ├── __init__.py │ │ ├── automatic_mask_generator.py │ │ ├── build_sam.py │ │ ├── build_sam_hq.py │ │ ├── modeling │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── image_encoder.py │ │ │ ├── mask_decoder.py │ │ │ ├── mask_decoder_hq.py │ │ │ ├── prompt_encoder.py │ │ │ ├── sam.py │ │ │ └── transformer.py │ │ ├── predictor.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── amg.py │ │ │ ├── onnx.py │ │ │ └── transforms.py │ │ ├── setup.cfg │ │ └── setup.py └── UniDepth │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── assets │ ├── demo │ │ ├── depth.png │ │ ├── intrinsics.npy │ │ ├── output.png │ │ └── rgb.png │ └── docs │ │ ├── V2_README.md │ │ ├── nuscenes_surround.gif │ │ ├── theoffice.gif │ │ └── unidepth-banner.png │ ├── configs │ ├── config_v1_cnvnxtl.json │ ├── config_v1_vitl14.json │ ├── config_v2_vitl14.json │ └── config_v2_vits14.json │ ├── hubconf.py │ ├── pyproject.toml │ ├── requirements.txt │ ├── run_unidepth.py │ ├── scripts │ └── demo.py │ └── unidepth │ ├── layers │ ├── __init__.py │ ├── activation.py │ ├── attention.py │ ├── convnext.py │ ├── drop_path.py │ ├── layer_scale.py │ ├── mlp.py │ ├── nystrom_attention.py │ ├── positional_encoding.py │ └── upsample.py │ ├── models │ ├── __init__.py │ ├── backbones │ │ ├── __init__.py │ │ ├── convnext.py │ │ ├── convnext2.py │ │ ├── dinov2.py │ │ └── metadinov2 │ │ │ ├── 
__init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── dino_head.py │ │ │ ├── drop_path.py │ │ │ ├── layer_scale.py │ │ │ ├── mlp.py │ │ │ ├── patch_embed.py │ │ │ └── swiglu_ffn.py │ ├── encoder.py │ ├── unidepthv1 │ │ ├── __init__.py │ │ ├── decoder.py │ │ └── unidepthv1.py │ └── unidepthv2 │ │ ├── __init__.py │ │ ├── decoder.py │ │ ├── decoder_old.py │ │ ├── export.py │ │ └── unidepthv2.py │ ├── ops │ ├── __init__.py │ ├── losses.py │ └── scheduler.py │ └── utils │ ├── __init__.py │ ├── constants.py │ ├── distributed.py │ ├── ema_torch.py │ ├── evaluation_depth.py │ ├── geometric.py │ ├── misc.py │ ├── positional_embedding.py │ ├── sht.py │ └── visualization.py └── tools ├── __init__.py ├── generate_pseudo_bbox.py ├── train_net.py └── transform_to_coco.py /.gitignore: -------------------------------------------------------------------------------- 1 | # folders or files 2 | datasets/* 3 | !datasets/Omni3D 4 | .vscode/ 5 | .ipynb_checkpoints/ 6 | .idea/ 7 | output/* 8 | checkpoints/* 9 | pseudo_label/* 10 | third_party/detectron2 11 | 12 | cubercnn/external/ 13 | 14 | # filetypes 15 | *.pyc 16 | *.mexa64 17 | */output/* 18 | */output*/* 19 | *~ 20 | *.so 21 | *.ipynb 22 | -------------------------------------------------------------------------------- /configs/Base.yaml: -------------------------------------------------------------------------------- 1 | SOLVER: 2 | TYPE: "sgd" 3 | IMS_PER_BATCH: 32 4 | BASE_LR: 0.02 5 | STEPS: (19200, 25600) 6 | MAX_ITER: 32000 7 | WEIGHT_DECAY: 0.0001 8 | LR_SCHEDULER_NAME: "WarmupMultiStepLR" 9 | INPUT: 10 | MIN_SIZE_TRAIN: (256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640,) 11 | MIN_SIZE_TEST: 512 12 | MAX_SIZE_TRAIN: 4096 13 | MAX_SIZE_TEST: 4096 14 | TEST: 15 | VISIBILITY_THRES: 0.33333333 16 | TRUNCATION_THRES: 0.33333333 17 | EVAL_PERIOD: 16000 18 | DATASETS: 19 | TRAIN: ('KITTI_train', 'KITTI_val') 20 | TEST: ('KITTI_test',) 21 | CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck', 'tram', 'person') 22 | IGNORE_NAMES: "['dontcare', 'ignore', 'void']" 23 | MIN_HEIGHT_THRES: 0.05 24 | TRUNCATION_THRES: 0.75 25 | VISIBILITY_THRES: 0.25 26 | TRUNC_2D_BOXES: True 27 | VIS_PERIOD: 640 28 | DATALOADER: 29 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 30 | REPEAT_THRESHOLD: 0.1 31 | MODEL: 32 | PIXEL_MEAN: [103.530, 116.280, 123.675] 33 | PIXEL_STD: [57.375, 57.120, 58.395] 34 | META_ARCHITECTURE: "RCNN3D" 35 | MASK_ON: False 36 | STABILIZE: 0.02 37 | USE_BN: True 38 | BACKBONE: 39 | FREEZE_AT: 0 40 | NAME: 'build_dla_from_vision_fpn_backbone' 41 | DLA: 42 | TYPE: 'dla34' 43 | FPN: 44 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] 45 | ANCHOR_GENERATOR: 46 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 47 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 48 | RPN: 49 | HEAD_NAME: "StandardRPNHead" 50 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] 51 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 52 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 53 | POST_NMS_TOPK_TRAIN: 1000 54 | POST_NMS_TOPK_TEST: 1000 55 | BOUNDARY_THRESH: -1 56 | OBJECTNESS_UNCERTAINTY: "IoUness" 57 | IOU_THRESHOLDS: [0.05, 0.05] 58 | POSITIVE_FRACTION: 1.0 59 | PROPOSAL_GENERATOR: 60 | NAME: "RPNWithIgnore" 61 | ROI_HEADS: 62 | NAME: "ROIHeads3D" 63 | IN_FEATURES: ["p2", "p3", "p4", "p5", 'p6'] 64 | BATCH_SIZE_PER_IMAGE: 512 65 | SCORE_THRESH_TEST: 0.01 66 | NUM_CLASSES: 43 67 | ROI_BOX_HEAD: 68 | NAME: "FastRCNNConvFCHead" 69 | 
NUM_FC: 2 70 | POOLER_RESOLUTION: 7 71 | ROI_CUBE_HEAD: 72 | NAME: 'CubeHead' 73 | Z_TYPE: 'direct' 74 | POSE_TYPE: '6d' 75 | NUM_FC: 2 76 | SHARED_FC: True 77 | USE_CONFIDENCE: 1.0 78 | LOSS_W_3D: 1.0 79 | POOLER_TYPE: 'ROIAlignV2' 80 | POOLER_RESOLUTION: 7 81 | DIMS_PRIORS_ENABLED: True 82 | DISENTANGLED_LOSS: True 83 | ALLOCENTRIC_POSE: True 84 | VIRTUAL_FOCAL: 512.0 85 | VIRTUAL_DEPTH: True 86 | CHAMFER_POSE: True 87 | VERSION: 2 -------------------------------------------------------------------------------- /configs/Base_Omni3D.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | TEST: 10 | EVAL_PERIOD: 29000 11 | VIS_PERIOD: 2320 12 | DATASETS: 13 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 14 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 15 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 16 | MODEL: 17 | ROI_HEADS: 18 | NUM_CLASSES: 50 -------------------------------------------------------------------------------- /configs/Base_Omni3D_ARKitScenes.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 32 5 | BASE_LR: 0.02 6 | STEPS: (17400, 23200) 7 | MAX_ITER: 29000 8 | WARMUP_ITERS: 906 9 | TEST: 10 | EVAL_PERIOD: 7250 11 | VIS_PERIOD: 580 12 | DATASETS: 13 | TRAIN: ('ARKitScenes_train', 'ARKitScenes_val') 14 | TEST: ('ARKitScenes_test',) 15 | CATEGORY_NAMES: ('bed', 'table', 'chair', 'fireplace', 'machine', 'cabinet', 'oven', 'shelves', 'sink', 'stove', 'bathtub', 'toilet', 'sofa', 'television', 'refrigerator') 16 | FOLDER_NAME: 'Omni3D_pl' 17 | MODEL: 18 | META_ARCHITECTURE: RCNN3D_text 19 | ROI_HEADS: 20 | NAME : ROIHeads3D_Text 21 | NUM_CLASSES: 15 22 | STABILIZE: 0.5 23 | 24 | -------------------------------------------------------------------------------- /configs/Base_Omni3D_KITTI.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 16 5 | BASE_LR: 0.01 6 | STEPS: (17400, 23200) 7 | MAX_ITER: 29000 8 | WARMUP_ITERS: 906 9 | TEST: 10 | EVAL_PERIOD: 7250 11 | VIS_PERIOD: 580 12 | DATASETS: 13 | TRAIN: ('KITTI_train', 'KITTI_val') 14 | TEST: ('KITTI_test',) 15 | CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck') 16 | FOLDER_NAME: 'Omni3D_pl' 17 | MODEL: 18 | META_ARCHITECTURE: RCNN3D_text 19 | ROI_HEADS: 20 | NAME : ROIHeads3D_Text 21 | NUM_CLASSES: 5 22 | STABILIZE: 0.5 -------------------------------------------------------------------------------- /configs/Base_Omni3D_SUNRGBD.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 32 5 | BASE_LR: 0.02 6 | STEPS: (17400, 23200) 7 | MAX_ITER: 29000 8 | WARMUP_ITERS: 906 9 | TEST: 10 | EVAL_PERIOD: 7250 11 | VIS_PERIOD: 580 12 | DATASETS: 13 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val') 14 | TEST: ('SUNRGBD_test',) 15 | CATEGORY_NAMES: ('bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine') 16 | FOLDER_NAME: 'Omni3D_pl' 17 | MODEL: 18 | META_ARCHITECTURE: RCNN3D_text 19 | ROI_HEADS: 20 | NAME : ROIHeads3D_Text 21 | NUM_CLASSES: 38 22 | STABILIZE: 0.5 -------------------------------------------------------------------------------- /configs/Base_Omni3D_nuScenes.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 32 5 | BASE_LR: 0.01 6 | STEPS: (17400, 23200) 7 | MAX_ITER: 29000 8 | WARMUP_ITERS: 906 9 | TEST: 10 | EVAL_PERIOD: 7250 11 | VIS_PERIOD: 580 12 | DATASETS: 13 | TRAIN: ('nuScenes_train', 'nuScenes_val') 14 | TEST: ('nuScenes_test',) 15 | CATEGORY_NAMES: ('pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer') 16 | FOLDER_NAME: 'Omni3D_pl' 17 | MODEL: 18 | META_ARCHITECTURE: RCNN3D_text 19 | ROI_HEADS: 20 | NAME : ROIHeads3D_Text 21 | NUM_CLASSES: 9 22 | STABILIZE: 0.5 -------------------------------------------------------------------------------- /configs/cubercnn_DLA34_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_dla_from_vision_fpn_backbone' 5 | DLA: 6 | TYPE: 'dla34' -------------------------------------------------------------------------------- /configs/cubercnn_ResNet34_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_resnet_from_vision_fpn_backbone' 5 | RESNETS: 6 | DEPTH: 34 7 | TORCHVISION: True -------------------------------------------------------------------------------- /configs/cubercnn_densenet_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_densenet_fpn_backbone' -------------------------------------------------------------------------------- /configs/cubercnn_mnasnet_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_mnasnet_fpn_backbone' -------------------------------------------------------------------------------- /configs/cubercnn_shufflenet_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_Omni3D.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: 'build_shufflenet_fpn_backbone' -------------------------------------------------------------------------------- /cubercnn/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import * 
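
The YAML configs listed above are layered: each `Base_Omni3D_*.yaml` declares `_BASE_: "Base.yaml"` and only overrides solver, dataset, and head settings. Below is a minimal sketch of how detectron2 resolves that inheritance, assuming stock detectron2 only; the real entry point (`tools/train_net.py`) presumably goes through `cubercnn/config/config.py` (not excerpted here) to register the custom keys, and `set_new_allowed(True)` is just a stand-in for that step.

```python
# Minimal sketch of the config inheritance, assuming stock detectron2 only.
# Custom keys such as MODEL.ROI_CUBE_HEAD, MODEL.STABILIZE and
# DATASETS.CATEGORY_NAMES are presumably registered by cubercnn.config in the
# real code path; set_new_allowed(True) approximates that and may not cover
# every corner case.
from detectron2.config import get_cfg

cfg = get_cfg()
cfg.set_new_allowed(True)  # tolerate keys absent from detectron2's defaults

# Base_Omni3D_KITTI.yaml declares `_BASE_: "Base.yaml"`, so detectron2 loads
# Base.yaml first and then applies the KITTI-specific overrides on top.
cfg.merge_from_file("configs/Base_Omni3D_KITTI.yaml", allow_unsafe=True)

print(cfg.DATASETS.TRAIN)               # ('KITTI_train', 'KITTI_val')
print(cfg.MODEL.ROI_HEADS.NUM_CLASSES)  # 5 categories in the KITTI config
print(cfg.MODEL.META_ARCHITECTURE)      # 'RCNN3D_text'
```
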
-------------------------------------------------------------------------------- /cubercnn/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .dataset_mapper import * 3 | from .build import * 4 | from .builtin import * -------------------------------------------------------------------------------- /cubercnn/data/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | 3 | def get_omni3d_categories(dataset="omni3d"): 4 | """ 5 | Returns the Omni3D categories for dataset 6 | Args: 7 | dataset: str 8 | Returns: 9 | cats: set of strings with category names 10 | """ 11 | 12 | if dataset == "omni3d": 13 | cats = set({'chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin'}) 14 | assert len(cats) == 50 15 | elif dataset == "omni3d_in": 16 | cats = set({'stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet'}) 17 | assert len(cats) == 38 18 | elif dataset == "omni3d_out": 19 | cats = set({'cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle'}) 20 | assert len(cats) == 11 21 | elif dataset in ["SUNRGBD_train", "SUNRGBD_val", "SUNRGBD_test"]: 22 | cats = set({'bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine'}) 23 | assert len(cats) == 38 24 | elif dataset in ["Hypersim_train", "Hypersim_val"]: 25 | cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) 26 | assert len(cats) == 29 27 | elif dataset == "Hypersim_test": 28 | # Hypersim test annotation does not contain toilet 29 | cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) 30 | assert len(cats) == 28 31 | elif dataset in ["ARKitScenes_train", "ARKitScenes_val", "ARKitScenes_test"]: 32 | cats = set({'table', 'bed', 'sofa', 'television', 'refrigerator', 'chair', 'oven', 'machine', 'stove', 'shelves', 'sink', 
'cabinet', 'bathtub', 'toilet'}) 33 | assert len(cats) == 14 34 | elif dataset in ["Objectron_train", "Objectron_val", "Objectron_test"]: 35 | cats = set({'bicycle', 'books', 'bottle', 'camera', 'cereal box', 'chair', 'cup', 'laptop', 'shoes'}) 36 | assert len(cats) == 9 37 | elif dataset in ["KITTI_train", "KITTI_val", "KITTI_test"]: 38 | cats = set({'pedestrian', 'car', 'cyclist', 'van', 'truck'}) 39 | assert len(cats) == 5 40 | elif dataset in ["nuScenes_train", "nuScenes_val", "nuScenes_test"]: 41 | cats = set({'pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer'}) 42 | assert len(cats) == 9 43 | else: 44 | raise ValueError("%s dataset is not registered." % (dataset)) 45 | 46 | return cats -------------------------------------------------------------------------------- /cubercnn/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .omni3d_evaluation import * -------------------------------------------------------------------------------- /cubercnn/generate_label/__init__.py: -------------------------------------------------------------------------------- 1 | from .process_indoor import * 2 | from .process_outdoor import * 3 | from .priors import * -------------------------------------------------------------------------------- /cubercnn/generate_label/priors.py: -------------------------------------------------------------------------------- 1 | llm_generated_prior = { 2 | 'SUNRGBD': {'bicycle': [0.5, 1, 1.5], 'books': [0.2, 0.1, 0.3], 'bottle': [0.1, 0.3, 0.1], 'chair': [0.5, 1, 0.5], 'cup': [0.1, 0.1, 0.1], 'laptop': [0.3, 0.1, 0.4], 'shoes': [0.2, 0.1, 0.3], 'towel': [0.2, 0.1, 0.3], 'blinds': [0.1, 1, 1.5], 'window': [0.1, 1, 1.5], 'lamp': [0.3, 0.6, 0.3], 'shelves': [0.3, 1.5, 1.5], 'mirror': [0.1, 1, 0.5], 'sink': [0.5, 0.2, 0.8], 'cabinet': [0.5, 1.5, 1], 'bathtub': [0.8, 0.5, 1.5], 'door': [0.1, 2, 1], 'toilet': [0.4, 0.8, 0.5], 'desk': [0.6, 0.8, 1.2], 'box': [0.5, 0.5, 0.5], 'bookcase': [0.3, 2, 1], 'picture': [0.1, 0.5, 0.5], 'table': [0.8, 0.8, 1.5], 'counter': [0.6, 1, 1.5], 'bed': [1.5, 0.5, 2], 'night stand': [0.4, 0.5, 0.5], 'pillow': [0.3, 0.3, 0.5], 'sofa': [1, 1, 2], 'television': [1, 0.5, 0.1], 'floor mat': [1, 0.1, 1.5], 'curtain': [0.1, 1.5, 1], 'clothes': [0.5, 1, 0.5], 'stationery': [0.3, 0.3, 0.3], 'refrigerator': [0.8, 1.5, 0.8], 'bin': [0.5, 0.5, 0.5], 'stove': [0.6, 0.8, 0.8], 'oven': [0.6, 0.8, 0.8], 'machine': [0.8, 1, 1]}, 3 | 'KITTI': { "car": [1.8, 1.5, 4.5], "van": [2.0, 2.0, 5.0], "truck": [2.5, 3.5, 10.0], "pedestrian": [0.5, 1.7, 0.8], "cyclist": [0.6, 1.7, 1.5]}, 4 | 'ARKitScenes': {'refrigerator': [0.8, 1.5, 0.8], 'chair': [0.5, 1, 0.5], 'oven': [0.6, 0.8, 0.8], 'machine': [0.8, 1, 1], 'stove': [0.6, 0.8, 0.8], 'shelves': [0.3, 1.5, 1.5], 'sink': [0.5, 0.2, 0.8], 'cabinet': [0.5, 1.5, 1], 'bathtub': [0.8, 0.5, 1.5], 'toilet': [0.4, 0.8, 0.5], 'table': [0.8, 0.8, 1.5], 'bed': [1.5, 0.5, 2], 'sofa': [1, 1, 2], 'television': [1, 0.5, 0.1]}, 5 | 'nuScenes': { "pedestrian": [0.5, 1.7, 0.8], "car": [1.8, 1.5, 4.5], "truck": [2.5, 3.5, 8.0], "traffic cone": [0.3, 0.7, 0.3], "barrier": [0.5, 2.0, 2.0], "motorcycle": [0.8, 1.2, 2.0], "bicycle": [0.6, 1.2, 1.8], "bus": [2.75, 3.5, 11.0], "trailer": [2.75, 3.25, 11.0]}, 6 | } -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .densenet import * 
2 | from .mnasnet import * 3 | from .resnet import * 4 | from .shufflenet import * 5 | from .dla import * -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/densenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class DenseNetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.densenet121(pretrained) 15 | base = base.features 16 | 17 | self.base = base 18 | 19 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 1024, 'p6': 1024} 20 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 21 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 22 | 23 | def forward(self, x): 24 | 25 | outputs = {} 26 | 27 | db1 = self.base[0:5](x) 28 | db2 = self.base[5:7](db1) 29 | db3 = self.base[7:9](db2) 30 | p5 = self.base[9:](db3) 31 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 32 | outputs['p2'] = db1 33 | outputs['p3'] = db2 34 | outputs['p4'] = db3 35 | outputs['p5'] = p5 36 | outputs['p6'] = p6 37 | 38 | return outputs 39 | 40 | 41 | @BACKBONE_REGISTRY.register() 42 | def build_densenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 43 | """ 44 | Args: 45 | cfg: a detectron2 CfgNode 46 | 47 | Returns: 48 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 49 | """ 50 | 51 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 52 | 53 | bottom_up = DenseNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 54 | in_features = cfg.MODEL.FPN.IN_FEATURES 55 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 56 | 57 | backbone = FPN( 58 | bottom_up=bottom_up, 59 | in_features=in_features, 60 | out_channels=out_channels, 61 | norm=cfg.MODEL.FPN.NORM, 62 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE 63 | ) 64 | return backbone -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/mnasnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class MNASNetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.mnasnet1_0(pretrained) 15 | base = base.layers 16 | 17 | self.base = base 18 | 19 | self._out_feature_channels = {'p2': 24, 'p3': 40, 'p4': 96, 'p5': 320, 'p6': 320} 20 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 21 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 22 | 23 | def forward(self, x): 24 | 25 | outputs = {} 26 | 27 | p2 = self.base[0:9](x) 28 | p3 = self.base[9](p2) 29 | p4 = self.base[10:12](p3) 30 | p5 = self.base[12:14](p4) 31 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 32 | outputs['p2'] = p2 33 | outputs['p3'] = p3 34 | outputs['p4'] = p4 35 | outputs['p5'] = p5 36 | outputs['p6'] = p6 37 | 38 | return outputs 39 | 40 | @BACKBONE_REGISTRY.register() 41 | def build_mnasnet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 42 | """ 43 | Args: 44 | cfg: a detectron2 CfgNode 45 | 46 | Returns: 47 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 48 | """ 49 | 50 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 51 | 52 | bottom_up = MNASNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 53 | in_features = cfg.MODEL.FPN.IN_FEATURES 54 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 55 | 56 | backbone = FPN( 57 | bottom_up=bottom_up, 58 | in_features=in_features, 59 | out_channels=out_channels, 60 | norm=cfg.MODEL.FPN.NORM, 61 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 62 | ) 63 | return backbone 64 | -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/resnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 6 | from detectron2.modeling.backbone.resnet import build_resnet_backbone 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | import torch.nn.functional as F 9 | 10 | from detectron2.modeling.backbone.fpn import FPN 11 | 12 | class ResNet(Backbone): 13 | def __init__(self, cfg, input_shape, pretrained=True): 14 | super().__init__() 15 | 16 | if cfg.MODEL.RESNETS.DEPTH == 18: 17 | base = models.resnet18(pretrained) 18 | self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} 19 | elif cfg.MODEL.RESNETS.DEPTH == 34: 20 | base = models.resnet34(pretrained) 21 | self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} 22 | elif cfg.MODEL.RESNETS.DEPTH == 50: 23 | base = models.resnet50(pretrained) 24 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} 25 | elif cfg.MODEL.RESNETS.DEPTH == 101: 26 | base = models.resnet101(pretrained) 27 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} 28 | else: 29 | raise ValueError('No configuration currently supporting depth of {}'.format(cfg.MODEL.RESNETS.DEPTH)) 30 | 31 | self.conv1 = base.conv1 32 | self.bn1 = base.bn1 33 | self.relu = base.relu 34 | self.maxpool = base.maxpool 35 | self.layer1 = base.layer1 36 | self.layer2 = base.layer2 37 | self.layer3 = base.layer3 38 | self.layer4 = base.layer4 39 | 40 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 41 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 42 | 43 | def forward(self, x): 44 | 45 | outputs = {} 46 | 47 | x = self.conv1(x) 48 | x = self.bn1(x) 49 | x = self.relu(x) 50 | x = self.maxpool(x) 51 | p2 = self.layer1(x) 52 | p3 = self.layer2(p2) 53 | p4 = self.layer3(p3) 54 | p5 = self.layer4(p4) 55 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 56 | 57 | outputs['p2'] = p2 58 | outputs['p3'] = p3 59 | outputs['p4'] = p4 60 | outputs['p5'] = p5 61 | outputs['p6'] = p6 62 | 63 | return outputs 64 | 65 | 66 | @BACKBONE_REGISTRY.register() 67 | def build_resnet_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 68 | """ 69 | Args: 70 | cfg: a detectron2 CfgNode 71 | 72 | Returns: 73 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 74 | """ 75 | 76 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 77 | 78 | if cfg.MODEL.RESNETS.TORCHVISION: 79 | bottom_up = ResNet(cfg, input_shape, pretrained=imagenet_pretrain) 80 | 81 | else: 82 | # use the MSRA modeling logic to build the backbone. 83 | bottom_up = build_resnet_backbone(cfg, input_shape) 84 | 85 | in_features = cfg.MODEL.FPN.IN_FEATURES 86 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 87 | 88 | backbone = FPN( 89 | bottom_up=bottom_up, 90 | in_features=in_features, 91 | out_channels=out_channels, 92 | norm=cfg.MODEL.FPN.NORM, 93 | top_block=LastLevelMaxPool(), 94 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 95 | ) 96 | return backbone 97 | -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/shufflenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class ShufflenetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.shufflenet_v2_x1_0(pretrained) 15 | self.conv1 = base.conv1 16 | self.maxpool = base.maxpool 17 | self.stage2 = base.stage2 18 | self.stage3 = base.stage3 19 | self.stage4 = base.stage4 20 | self.conv5 = base.conv5 21 | 22 | self._out_feature_channels = {'p2': 24, 'p3': 116, 'p4': 232, 'p5': 464, 'p6': 464} 23 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 24 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 25 | 26 | def forward(self, x): 27 | 28 | outputs = {} 29 | 30 | x = self.conv1(x) 31 | p2 = self.maxpool(x) 32 | p3 = self.stage2(p2) 33 | p4 = self.stage3(p3) 34 | p5 = self.stage4(p4) 35 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 36 | 37 | outputs['p2'] = p2 38 | outputs['p3'] = p3 39 | outputs['p4'] = p4 40 | outputs['p5'] = p5 41 | outputs['p6'] = p6 42 | 43 | return outputs 44 | 45 | 46 | @BACKBONE_REGISTRY.register() 47 | def build_shufflenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 48 | """ 49 | Args: 50 | cfg: a detectron2 CfgNode 51 | 52 | Returns: 53 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 54 | """ 55 | 56 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 57 | 58 | bottom_up = ShufflenetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 59 | in_features = cfg.MODEL.FPN.IN_FEATURES 60 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 61 | 62 | backbone = FPN( 63 | bottom_up=bottom_up, 64 | in_features=in_features, 65 | out_channels=out_channels, 66 | norm=cfg.MODEL.FPN.NORM, 67 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 68 | ) 69 | return backbone 70 | -------------------------------------------------------------------------------- /cubercnn/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # from .rcnn3d import * 2 | from .rcnn3d_text import * -------------------------------------------------------------------------------- /cubercnn/modeling/proposal_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .rpn import * 2 | -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # from .roi_heads import * 2 | from .roi_heads_text import * -------------------------------------------------------------------------------- /cubercnn/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import * 2 | from .checkpoint import * -------------------------------------------------------------------------------- /cubercnn/solver/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | import torch 3 | from typing import Any, Dict, List, Set 4 | from detectron2.solver.build import maybe_add_gradient_clipping 5 | 6 | def build_optimizer(cfg, model): 7 | norm_module_types = ( 8 | torch.nn.BatchNorm1d, 9 | torch.nn.BatchNorm2d, 10 | torch.nn.BatchNorm3d, 11 | torch.nn.SyncBatchNorm, 12 | torch.nn.GroupNorm, 13 | torch.nn.InstanceNorm1d, 14 | torch.nn.InstanceNorm2d, 15 | torch.nn.InstanceNorm3d, 16 | torch.nn.LayerNorm, 17 | torch.nn.LocalResponseNorm, 18 | ) 19 | params: List[Dict[str, Any]] = [] 20 | memo: Set[torch.nn.parameter.Parameter] = set() 21 | for module in model.modules(): 22 | for key, value in module.named_parameters(recurse=False): 23 | if not value.requires_grad: 24 | continue 25 | # Avoid duplicating parameters 26 | if value in memo: 27 | continue 28 | memo.add(value) 29 | 30 | lr = cfg.SOLVER.BASE_LR 31 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 32 | 33 | if isinstance(module, norm_module_types) and (cfg.SOLVER.WEIGHT_DECAY_NORM is not None): 34 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM 35 | 36 | elif key == "bias": 37 | if (cfg.SOLVER.BIAS_LR_FACTOR is not None): 38 | lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR 39 | if (cfg.SOLVER.WEIGHT_DECAY_BIAS is not None): 40 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS 41 | 42 | # these params do not need weight decay at all 43 | # TODO parameterize these in configs instead. 44 | if key in ['priors_dims_per_cat', 'priors_z_scales', 'priors_z_stats']: 45 | weight_decay = 0.0 46 | 47 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 48 | 49 | if cfg.SOLVER.TYPE == 'sgd': 50 | optimizer = torch.optim.SGD( 51 | params, 52 | cfg.SOLVER.BASE_LR, 53 | momentum=cfg.SOLVER.MOMENTUM, 54 | nesterov=cfg.SOLVER.NESTEROV, 55 | weight_decay=cfg.SOLVER.WEIGHT_DECAY 56 | ) 57 | elif cfg.SOLVER.TYPE == 'adam': 58 | optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, eps=1e-02) 59 | elif cfg.SOLVER.TYPE == 'adam+amsgrad': 60 | optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02) 61 | elif cfg.SOLVER.TYPE == 'adamw': 62 | optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, eps=1e-02) 63 | elif cfg.SOLVER.TYPE == 'adamw+amsgrad': 64 | optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02) 65 | else: 66 | raise ValueError('{} is not supported as an optimizer.'.format(cfg.SOLVER.TYPE)) 67 | 68 | optimizer = maybe_add_gradient_clipping(cfg, optimizer) 69 | return optimizer 70 | 71 | def freeze_bn(network): 72 | 73 | for _, module in network.named_modules(): 74 | if isinstance(module, torch.nn.BatchNorm2d): 75 | module.eval() 76 | module.track_running_stats = False 77 | -------------------------------------------------------------------------------- /cubercnn/solver/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.checkpoint import PeriodicCheckpointer 3 | from typing import Any 4 | 5 | class PeriodicCheckpointerOnlyOne(PeriodicCheckpointer): 6 | def step(self, iteration: int, **kwargs: Any) -> None: 7 | """ 8 | Perform the appropriate action at the given iteration. 9 | 10 | Args: 11 | iteration (int): the current iteration, ranged in [0, max_iter-1]. 12 | kwargs (Any): extra data to save, same as in 13 | :meth:`Checkpointer.save`. 
14 | """ 15 | iteration = int(iteration) 16 | additional_state = {"iteration": iteration} 17 | additional_state.update(kwargs) 18 | 19 | if (iteration + 1) % self.period == 0: 20 | 21 | # simply save a single recent model 22 | self.checkpointer.save( 23 | "{}_recent".format(self.file_prefix), **additional_state 24 | ) 25 | 26 | if self.max_iter is not None: 27 | if iteration >= self.max_iter - 1: 28 | self.checkpointer.save(f"{self.file_prefix}_final", **additional_state) -------------------------------------------------------------------------------- /cubercnn/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .model_zoo import * 3 | from .math_util import * -------------------------------------------------------------------------------- /cubercnn/util/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.utils.file_io import PathHandler, PathManager 3 | 4 | __all__ = ["CubeRCNNHandler"] 5 | 6 | class CubeRCNNHandler(PathHandler): 7 | """ 8 | Resolves CubeRCNN's model zoo files. 9 | """ 10 | 11 | PREFIX = "cubercnn://" 12 | CUBERCNN_PREFIX = "https://dl.fbaipublicfiles.com/cubercnn/" 13 | 14 | def _get_supported_prefixes(self): 15 | return [self.PREFIX] 16 | 17 | def _get_local_path(self, path): 18 | name = path[len(self.PREFIX) :] 19 | return PathManager.get_local_path(self.CUBERCNN_PREFIX + name) 20 | 21 | def _open(self, path, mode="r", **kwargs): 22 | return PathManager.open(self._get_local_path(path), mode, **kwargs) 23 | 24 | 25 | PathManager.register_handler(CubeRCNNHandler()) -------------------------------------------------------------------------------- /cubercnn/vis/__init__.py: -------------------------------------------------------------------------------- 1 | from .vis import * -------------------------------------------------------------------------------- /cubercnn/vis/logperf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from termcolor import colored 3 | import itertools 4 | from tabulate import tabulate 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def print_ap_category_histogram(dataset, results): 10 | """ 11 | Prints AP performance for each category. 12 | Args: 13 | results: dictionary; each entry contains information for a dataset 14 | """ 15 | num_classes = len(results) 16 | N_COLS = 9 17 | data = list( 18 | itertools.chain( 19 | *[ 20 | [ 21 | cat, 22 | out["AP2D"], 23 | out["AP3D"], 24 | ] 25 | for cat, out in results.items() 26 | ] 27 | ) 28 | ) 29 | data.extend([None] * (N_COLS - (len(data) % N_COLS))) 30 | data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) 31 | table = tabulate( 32 | data, 33 | headers=["category", "AP2D", "AP3D"] * (N_COLS // 2), 34 | tablefmt="pipe", 35 | numalign="left", 36 | stralign="center", 37 | ) 38 | logger.info( 39 | "Performance for each of {} categories on {}:\n".format(num_classes, dataset) 40 | + colored(table, "cyan") 41 | ) 42 | 43 | 44 | def print_ap_analysis_histogram(results): 45 | """ 46 | Prints AP performance for various IoU thresholds and (near, medium, far) objects. 47 | Args: 48 | results: dictionary. 
Each entry in results contains outputs for a dataset 49 | """ 50 | metric_names = ["AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F"] 51 | N_COLS = 10 52 | data = [] 53 | for name, metrics in results.items(): 54 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"], metrics["AP3D@15"], metrics["AP3D@25"], metrics["AP3D@50"], metrics["AP3D-N"], metrics["AP3D-M"], metrics["AP3D-F"]] 55 | data.append(data_item) 56 | table = tabulate( 57 | data, 58 | headers=["Dataset", "#iters", "AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F"], 59 | tablefmt="grid", 60 | numalign="left", 61 | stralign="center", 62 | ) 63 | logger.info( 64 | "Per-dataset performance analysis on test set:\n" 65 | + colored(table, "cyan") 66 | ) 67 | 68 | 69 | def print_ap_dataset_histogram(results): 70 | """ 71 | Prints AP performance for each dataset. 72 | Args: 73 | results: list of dicts. Each entry in results contains outputs for a dataset 74 | """ 75 | metric_names = ["AP2D", "AP3D"] 76 | N_COLS = 4 77 | data = [] 78 | for name, metrics in results.items(): 79 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"]] 80 | data.append(data_item) 81 | table = tabulate( 82 | data, 83 | headers=["Dataset", "#iters", "AP2D", "AP3D"], 84 | tablefmt="grid", 85 | numalign="left", 86 | stralign="center", 87 | ) 88 | logger.info( 89 | "Per-dataset performance on test set:\n" 90 | + colored(table, "cyan") 91 | ) 92 | 93 | 94 | def print_ap_omni_histogram(results): 95 | """ 96 | Prints AP performance for Omni3D dataset. 97 | Args: 98 | results: list of dicts. Each entry in results contains outputs for a dataset 99 | """ 100 | metric_names = ["AP2D", "AP3D"] 101 | N_COLS = 4 102 | data = [] 103 | for name, metrics in results.items(): 104 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"]] 105 | data.append(data_item) 106 | table = tabulate( 107 | data, 108 | headers=["Dataset", "#iters", "AP2D", "AP3D"], 109 | tablefmt="grid", 110 | numalign="left", 111 | stralign="center", 112 | ) 113 | logger.info("Omni3D performance on test set. The numbers below should be used to compare to others approaches on Omni3D, such as Cube R-CNN") 114 | logger.info( 115 | "Performance on Omni3D:\n" 116 | + colored(table, "magenta") 117 | ) 118 | -------------------------------------------------------------------------------- /datasets/Omni3D/download_omni3d_json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Meta, Inc. and its affiliates. 
All Rights Reserved 4 | 5 | wget https://dl.fbaipublicfiles.com/omni3d_data/Omni3D_json.zip 6 | unzip Omni3D_json.zip -------------------------------------------------------------------------------- /docs/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/docs/teaser.png -------------------------------------------------------------------------------- /scripts/generate_pseudo_label.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASET=$1 4 | 5 | # Step 1: Predict depth using UniDepth 6 | CUDA_VISIBLE_DEVICES=0 python third_party/UniDepth/run_unidepth.py --dataset $DATASET 7 | 8 | # Step 2: Segment novel objects using Grounded-SAM 9 | CUDA_VISIBLE_DEVICES=0 python third_party/Grounded-Segment-Anything/grounded_sam_detect.py --dataset $DATASET 10 | CUDA_VISIBLE_DEVICES=0 python third_party/Grounded-Segment-Anything/grounded_sam_detect_ground.py --dataset $DATASET 11 | 12 | # Step 3: Generate pseudo 3D bounding boxes 13 | python tools/generate_pseudo_bbox.py \ 14 | --config-file configs/Base_Omni3D_${DATASET}.yaml \ 15 | OUTPUT_DIR output/generate_pseudo_label/$DATASET \ 16 | 17 | # Step 4: Convert to COCO dataset format 18 | python tools/transform_to_coco.py --dataset_name $DATASET -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | DATASET=$1 2 | 3 | CUDA_VISIBLE_DEVICES=0 python tools/train_net.py \ 4 | --eval-only --config-file checkpoints/$DATASET/config.yaml --dist-url tcp://0.0.0.0:12345 --num-gpus 1 \ 5 | MODEL.WEIGHTS checkpoints/$DATASET/model_recent.pth \ 6 | OUTPUT_DIR output/test/$DATASET 7 | -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | DATASET=$1 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 python tools/train_net.py \ 4 | --config-file configs/Base_Omni3D_$DATASET.yaml --dist-url tcp://0.0.0.0:12345 --num-gpus 2 \ 5 | DATASETS.FOLDER_NAME "Omni3D_pl" \ 6 | OUTPUT_DIR output/training/$DATASET 7 | 8 | -------------------------------------------------------------------------------- /scripts/train_KITTI.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python tools/train_net.py \ 2 | --config-file configs/Base_Omni3D_KITTI.yaml --dist-url tcp://0.0.0.0:12345 --num-gpus 1 \ 3 | DATASETS.FOLDER_NAME "Omni3D_pl" \ 4 | OUTPUT_DIR output/training/KITTI 5 | 6 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # checkpoint 132 | *.pth 133 | outputs/ 134 | 135 | .idea/ 136 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/.gitmodules: -------------------------------------------------------------------------------- 1 | 2 | [submodule "grounded-sam-osx"] 3 | path = grounded-sam-osx 4 | url = https://github.com/linjing7/grounded-sam-osx.git 5 | [submodule "VISAM"] 6 | path = VISAM 7 | url = https://github.com/BingfengYan/VISAM 8 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - name: "Grounded-SAM Contributors" 5 | title: "Grounded-Segment-Anything" 6 | date-released: 2023-04-06 7 | url: "https://github.com/IDEA-Research/Grounded-Segment-Anything" 8 | license: Apache-2.0 9 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel 2 | 3 | # Arguments to build Docker Image using CUDA 4 | ARG USE_CUDA=0 5 | ARG TORCH_ARCH= 6 | 7 | ENV AM_I_DOCKER True 8 | ENV BUILD_WITH_CUDA "${USE_CUDA}" 9 | ENV TORCH_CUDA_ARCH_LIST "${TORCH_ARCH}" 10 | ENV CUDA_HOME /usr/local/cuda-11.6/ 11 | 12 | RUN mkdir -p /home/appuser/Grounded-Segment-Anything 13 | COPY . 
/home/appuser/Grounded-Segment-Anything/ 14 | 15 | RUN apt-get update && apt-get install --no-install-recommends wget ffmpeg=7:* \ 16 | libsm6=2:* libxext6=2:* git=1:* nano=2.* \ 17 | vim=2:* -y \ 18 | && apt-get clean && apt-get autoremove && rm -rf /var/lib/apt/lists/* 19 | 20 | WORKDIR /home/appuser/Grounded-Segment-Anything 21 | RUN python -m pip install --no-cache-dir -e segment_anything 22 | 23 | # When using build isolation, PyTorch with newer CUDA is installed and can't compile GroundingDINO 24 | RUN python -m pip install --no-cache-dir wheel 25 | RUN python -m pip install --no-cache-dir --no-build-isolation -e GroundingDINO 26 | 27 | WORKDIR /home/appuser 28 | RUN pip install --no-cache-dir diffusers[torch]==0.15.1 opencv-python==4.7.0.72 \ 29 | pycocotools==2.0.6 matplotlib==3.5.3 \ 30 | onnxruntime==1.14.1 onnx==1.13.1 ipykernel==6.16.2 scipy gradio openai 31 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/EdgeSAM/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | from typing import Type 12 | 13 | 14 | class MLPBlock(nn.Module): 15 | def __init__( 16 | self, 17 | embedding_dim: int, 18 | mlp_dim: int, 19 | act: Type[nn.Module] = nn.GELU, 20 | ) -> None: 21 | super().__init__() 22 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 23 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 24 | self.act = act() 25 | 26 | def forward(self, x: torch.Tensor) -> torch.Tensor: 27 | return self.lin2(self.act(self.lin1(x))) 28 | 29 | 30 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 31 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 32 | class LayerNorm2d(nn.Module): 33 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 34 | super().__init__() 35 | self.weight = nn.Parameter(torch.ones(num_channels)) 36 | self.bias = nn.Parameter(torch.zeros(num_channels)) 37 | self.eps = eps 38 | 39 | def forward(self, x: torch.Tensor) -> torch.Tensor: 40 | u = x.mean(1, keepdim=True) 41 | s = (x - u).pow(2).mean(1, keepdim=True) 42 | x = (x - u) / torch.sqrt(s + self.eps) 43 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 44 | return x 45 | 46 | 47 | def val2list(x: list or tuple or any, repeat_time=1) -> list: 48 | if isinstance(x, (list, tuple)): 49 | return list(x) 50 | return [x for _ in range(repeat_time)] 51 | 52 | 53 | def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple: 54 | x = val2list(x) 55 | 56 | # repeat elements if necessary 57 | if len(x) > 0: 58 | x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))] 59 | 60 | return tuple(x) 61 | 62 | 63 | def list_sum(x: list) -> any: 64 | return x[0] if len(x) == 1 else x[0] + list_sum(x[1:]) 65 | 66 | 67 | def resize( 68 | x: torch.Tensor, 69 | size: any or None = None, 70 | scale_factor=None, 71 | mode: str = "bicubic", 72 | align_corners: bool or None = False, 73 | ) -> torch.Tensor: 74 | if mode in ["bilinear", "bicubic"]: 75 | return F.interpolate( 76 | x, 77 | size=size, 78 | 
scale_factor=scale_factor, 79 | mode=mode, 80 | align_corners=align_corners, 81 | ) 82 | elif mode in ["nearest", "area"]: 83 | return F.interpolate(x, size=size, scale_factor=scale_factor, mode=mode) 84 | else: 85 | raise NotImplementedError(f"resize(mode={mode}) not implemented.") 86 | 87 | 88 | class UpSampleLayer(nn.Module): 89 | def __init__( 90 | self, 91 | mode="bicubic", 92 | size=None, 93 | factor=2, 94 | align_corners=False, 95 | ): 96 | super(UpSampleLayer, self).__init__() 97 | self.mode = mode 98 | self.size = val2list(size, 2) if size is not None else None 99 | self.factor = None if self.size is not None else factor 100 | self.align_corners = align_corners 101 | 102 | def forward(self, x: torch.Tensor) -> torch.Tensor: 103 | return resize(x, self.size, self.factor, self.mode, self.align_corners) 104 | 105 | 106 | class OpSequential(nn.Module): 107 | def __init__(self, op_list): 108 | super(OpSequential, self).__init__() 109 | valid_op_list = [] 110 | for op in op_list: 111 | if op is not None: 112 | valid_op_list.append(op) 113 | self.op_list = nn.ModuleList(valid_op_list) 114 | 115 | def forward(self, x: torch.Tensor) -> torch.Tensor: 116 | for op in self.op_list: 117 | x = op(x) 118 | return x -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/EdgeSAM/setup_edge_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from segment_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | from EdgeSAM.rep_vit import RepViT 13 | 14 | 15 | prompt_embed_dim = 256 16 | image_size = 1024 17 | vit_patch_size = 16 18 | image_embedding_size = image_size // vit_patch_size 19 | 20 | 21 | def build_edge_sam(checkpoint=None, upsample_mode="bicubic"): 22 | image_encoder = RepViT( 23 | arch="m1", 24 | img_size=image_size, 25 | upsample_mode=upsample_mode 26 | ) 27 | return _build_sam(image_encoder, checkpoint) 28 | 29 | 30 | sam_model_registry = { 31 | "default": build_edge_sam, 32 | "edge_sam": build_edge_sam, 33 | } 34 | 35 | def _build_sam_encoder( 36 | encoder_embed_dim, 37 | encoder_depth, 38 | encoder_num_heads, 39 | encoder_global_attn_indexes, 40 | ): 41 | image_encoder = ImageEncoderViT( 42 | depth=encoder_depth, 43 | embed_dim=encoder_embed_dim, 44 | img_size=image_size, 45 | mlp_ratio=4, 46 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 47 | num_heads=encoder_num_heads, 48 | patch_size=vit_patch_size, 49 | qkv_bias=True, 50 | use_rel_pos=True, 51 | global_attn_indexes=encoder_global_attn_indexes, 52 | window_size=14, 53 | out_chans=prompt_embed_dim, 54 | ) 55 | return image_encoder 56 | 57 | 58 | def _build_sam( 59 | image_encoder, 60 | checkpoint=None, 61 | ): 62 | sam = Sam( 63 | image_encoder=image_encoder, 64 | prompt_encoder=PromptEncoder( 65 | embed_dim=prompt_embed_dim, 66 | image_embedding_size=(image_embedding_size, image_embedding_size), 67 | input_image_size=(image_size, image_size), 68 | mask_in_chans=16, 69 | ), 70 | mask_decoder=MaskDecoder( 71 | num_multimask_outputs=3, 72 | transformer=TwoWayTransformer( 73 | depth=2, 74 | embedding_dim=prompt_embed_dim, 75 | mlp_dim=2048, 76 | num_heads=8, 77 | ), 78 | 
transformer_dim=prompt_embed_dim, 79 | iou_head_depth=3, 80 | iou_head_hidden_dim=256, 81 | ), 82 | pixel_mean=[123.675, 116.28, 103.53], 83 | pixel_std=[58.395, 57.12, 57.375], 84 | ) 85 | sam.eval() 86 | if checkpoint is not None: 87 | with open(checkpoint, "rb") as f: 88 | state_dict = torch.load(f, map_location="cpu") 89 | sam.load_state_dict(state_dict) 90 | return sam -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/example_light_hqsam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/example_light_hqsam.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/LightHQSAM/setup_light_hqsam.py: -------------------------------------------------------------------------------- 1 | from LightHQSAM.tiny_vit_sam import TinyViT 2 | from segment_anything.modeling import MaskDecoderHQ, PromptEncoder, Sam, TwoWayTransformer 3 | 4 | def setup_model(): 5 | prompt_embed_dim = 256 6 | image_size = 1024 7 | vit_patch_size = 16 8 | image_embedding_size = image_size // vit_patch_size 9 | mobile_sam = Sam( 10 | image_encoder=TinyViT(img_size=1024, in_chans=3, num_classes=1000, 11 | embed_dims=[64, 128, 160, 320], 12 | depths=[2, 2, 6, 2], 13 | num_heads=[2, 4, 5, 10], 14 | window_sizes=[7, 7, 14, 7], 15 | mlp_ratio=4., 16 | drop_rate=0., 17 | drop_path_rate=0.0, 18 | use_checkpoint=False, 19 | mbconv_expand_ratio=4.0, 20 | local_conv_size=3, 21 | layer_lr_decay=0.8 22 | ), 23 | prompt_encoder=PromptEncoder( 24 | embed_dim=prompt_embed_dim, 25 | image_embedding_size=(image_embedding_size, image_embedding_size), 26 | input_image_size=(image_size, image_size), 27 | mask_in_chans=16, 28 | ), 29 | mask_decoder=MaskDecoderHQ( 30 | num_multimask_outputs=3, 31 | transformer=TwoWayTransformer( 32 | depth=2, 33 | embedding_dim=prompt_embed_dim, 34 | mlp_dim=2048, 35 | num_heads=8, 36 | ), 37 | transformer_dim=prompt_embed_dim, 38 | iou_head_depth=3, 39 | iou_head_hidden_dim=256, 40 | vit_dim=160, 41 | ), 42 | pixel_mean=[123.675, 116.28, 103.53], 43 | pixel_std=[58.395, 57.12, 57.375], 44 | ) 45 | return mobile_sam -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/MobileSAM/setup_mobile_sam.py: -------------------------------------------------------------------------------- 1 | from MobileSAM.tiny_vit_sam import TinyViT 2 | from segment_anything.modeling import MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 3 | 4 | def setup_model(): 5 | prompt_embed_dim = 256 6 | image_size = 1024 7 | vit_patch_size = 16 8 | image_embedding_size = image_size // vit_patch_size 9 | mobile_sam = Sam( 10 | image_encoder=TinyViT(img_size=1024, in_chans=3, num_classes=1000, 11 | embed_dims=[64, 128, 160, 320], 12 | depths=[2, 2, 
6, 2], 13 | num_heads=[2, 4, 5, 10], 14 | window_sizes=[7, 7, 14, 7], 15 | mlp_ratio=4., 16 | drop_rate=0., 17 | drop_path_rate=0.0, 18 | use_checkpoint=False, 19 | mbconv_expand_ratio=4.0, 20 | local_conv_size=3, 21 | layer_lr_decay=0.8 22 | ), 23 | prompt_encoder=PromptEncoder( 24 | embed_dim=prompt_embed_dim, 25 | image_embedding_size=(image_embedding_size, image_embedding_size), 26 | input_image_size=(image_size, image_size), 27 | mask_in_chans=16, 28 | ), 29 | mask_decoder=MaskDecoder( 30 | num_multimask_outputs=3, 31 | transformer=TwoWayTransformer( 32 | depth=2, 33 | embedding_dim=prompt_embed_dim, 34 | mlp_dim=2048, 35 | num_heads=8, 36 | ), 37 | transformer_dim=prompt_embed_dim, 38 | iou_head_depth=3, 39 | iou_head_hidden_dim=256, 40 | ), 41 | pixel_mean=[123.675, 116.28, 103.53], 42 | pixel_std=[58.395, 57.12, 57.375], 43 | ) 44 | return mobile_sam -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/RepViTSAM/setup_repvit_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | from functools import partial 9 | from segment_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 10 | from RepViTSAM import repvit 11 | from timm.models import create_model 12 | 13 | def build_sam_repvit(checkpoint=None): 14 | prompt_embed_dim = 256 15 | image_size = 1024 16 | vit_patch_size = 16 17 | image_embedding_size = image_size // vit_patch_size 18 | repvit_sam = Sam( 19 | image_encoder=create_model('repvit'), 20 | prompt_encoder=PromptEncoder( 21 | embed_dim=prompt_embed_dim, 22 | image_embedding_size=(image_embedding_size, image_embedding_size), 23 | input_image_size=(image_size, image_size), 24 | mask_in_chans=16, 25 | ), 26 | mask_decoder=MaskDecoder( 27 | num_multimask_outputs=3, 28 | transformer=TwoWayTransformer( 29 | depth=2, 30 | embedding_dim=prompt_embed_dim, 31 | mlp_dim=2048, 32 | num_heads=8, 33 | ), 34 | transformer_dim=prompt_embed_dim, 35 | iou_head_depth=3, 36 | iou_head_hidden_dim=256, 37 | ), 38 | pixel_mean=[123.675, 116.28, 103.53], 39 | pixel_std=[58.395, 57.12, 57.375], 40 | ) 41 | 42 | repvit_sam.eval() 43 | if checkpoint is not None: 44 | with open(checkpoint, "rb") as f: 45 | state_dict = torch.load(f) 46 | repvit_sam.load_state_dict(state_dict) 47 | return repvit_sam 48 | 49 | from functools import partial 50 | 51 | sam_model_registry = { 52 | "repvit": partial(build_sam_repvit), 53 | } 54 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/grounded_edge_sam.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import supervision as sv 4 | 5 | import torch 6 | import torchvision 7 | 8 | from groundingdino.util.inference import Model 9 | from segment_anything import SamPredictor 10 | from EdgeSAM.setup_edge_sam import build_edge_sam 11 | 12 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | 14 | # GroundingDINO config and checkpoint 15 | GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py" 16 | GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth" 17 | 18 
| # Building GroundingDINO inference model 19 | grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH) 20 | 21 | # Building MobileSAM predictor 22 | EdgeSAM_CHECKPOINT_PATH = "./EfficientSAM/edge_sam_3x.pth" 23 | edge_sam = build_edge_sam(checkpoint=EdgeSAM_CHECKPOINT_PATH) 24 | edge_sam.to(device=DEVICE) 25 | 26 | sam_predictor = SamPredictor(edge_sam) 27 | 28 | 29 | # Predict classes and hyper-param for GroundingDINO 30 | SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png" 31 | CLASSES = ["bench"] 32 | BOX_THRESHOLD = 0.25 33 | TEXT_THRESHOLD = 0.25 34 | NMS_THRESHOLD = 0.8 35 | 36 | 37 | # load image 38 | image = cv2.imread(SOURCE_IMAGE_PATH) 39 | 40 | # detect objects 41 | detections = grounding_dino_model.predict_with_classes( 42 | image=image, 43 | classes=CLASSES, 44 | box_threshold=BOX_THRESHOLD, 45 | text_threshold=TEXT_THRESHOLD 46 | ) 47 | 48 | # annotate image with detections 49 | box_annotator = sv.BoxAnnotator() 50 | labels = [ 51 | f"{CLASSES[class_id]} {confidence:0.2f}" 52 | for _, _, confidence, class_id, _, _ 53 | in detections] 54 | annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels) 55 | 56 | # save the annotated grounding dino image 57 | cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame) 58 | 59 | 60 | # NMS post process 61 | print(f"Before NMS: {len(detections.xyxy)} boxes") 62 | nms_idx = torchvision.ops.nms( 63 | torch.from_numpy(detections.xyxy), 64 | torch.from_numpy(detections.confidence), 65 | NMS_THRESHOLD 66 | ).numpy().tolist() 67 | 68 | detections.xyxy = detections.xyxy[nms_idx] 69 | detections.confidence = detections.confidence[nms_idx] 70 | detections.class_id = detections.class_id[nms_idx] 71 | 72 | print(f"After NMS: {len(detections.xyxy)} boxes") 73 | 74 | # Prompting SAM with detected boxes 75 | def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray: 76 | sam_predictor.set_image(image) 77 | result_masks = [] 78 | for box in xyxy: 79 | masks, scores, logits = sam_predictor.predict( 80 | box=box, 81 | multimask_output=False, 82 | hq_token_only=True, 83 | ) 84 | index = np.argmax(scores) 85 | result_masks.append(masks[index]) 86 | return np.array(result_masks) 87 | 88 | 89 | # convert detections to masks 90 | detections.mask = segment( 91 | sam_predictor=sam_predictor, 92 | image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB), 93 | xyxy=detections.xyxy 94 | ) 95 | 96 | # annotate image with detections 97 | box_annotator = sv.BoxAnnotator() 98 | mask_annotator = sv.MaskAnnotator() 99 | labels = [ 100 | f"{CLASSES[class_id]} {confidence:0.2f}" 101 | for _, _, confidence, class_id, _, _ 102 | in detections] 103 | annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections) 104 | annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels) 105 | 106 | # save the annotated grounded-sam image 107 | cv2.imwrite("EfficientSAM/grounded_edge_sam_annotated_image.jpg", annotated_image) 108 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/grounded_light_hqsam.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import supervision as sv 4 | 5 | import torch 6 | import torchvision 7 | 8 | from groundingdino.util.inference import Model 9 | from 
segment_anything import SamPredictor 10 | from LightHQSAM.setup_light_hqsam import setup_model 11 | 12 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | 14 | # GroundingDINO config and checkpoint 15 | GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py" 16 | GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth" 17 | 18 | # Building GroundingDINO inference model 19 | grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH) 20 | 21 | # Building MobileSAM predictor 22 | HQSAM_CHECKPOINT_PATH = "./EfficientSAM/sam_hq_vit_tiny.pth" 23 | checkpoint = torch.load(HQSAM_CHECKPOINT_PATH) 24 | light_hqsam = setup_model() 25 | light_hqsam.load_state_dict(checkpoint, strict=True) 26 | light_hqsam.to(device=DEVICE) 27 | 28 | sam_predictor = SamPredictor(light_hqsam) 29 | 30 | 31 | # Predict classes and hyper-param for GroundingDINO 32 | SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png" 33 | CLASSES = ["bench"] 34 | BOX_THRESHOLD = 0.25 35 | TEXT_THRESHOLD = 0.25 36 | NMS_THRESHOLD = 0.8 37 | 38 | 39 | # load image 40 | image = cv2.imread(SOURCE_IMAGE_PATH) 41 | 42 | # detect objects 43 | detections = grounding_dino_model.predict_with_classes( 44 | image=image, 45 | classes=CLASSES, 46 | box_threshold=BOX_THRESHOLD, 47 | text_threshold=TEXT_THRESHOLD 48 | ) 49 | 50 | # annotate image with detections 51 | box_annotator = sv.BoxAnnotator() 52 | labels = [ 53 | f"{CLASSES[class_id]} {confidence:0.2f}" 54 | for _, _, confidence, class_id, _, _ 55 | in detections] 56 | annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels) 57 | 58 | # save the annotated grounding dino image 59 | cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame) 60 | 61 | 62 | # NMS post process 63 | print(f"Before NMS: {len(detections.xyxy)} boxes") 64 | nms_idx = torchvision.ops.nms( 65 | torch.from_numpy(detections.xyxy), 66 | torch.from_numpy(detections.confidence), 67 | NMS_THRESHOLD 68 | ).numpy().tolist() 69 | 70 | detections.xyxy = detections.xyxy[nms_idx] 71 | detections.confidence = detections.confidence[nms_idx] 72 | detections.class_id = detections.class_id[nms_idx] 73 | 74 | print(f"After NMS: {len(detections.xyxy)} boxes") 75 | 76 | # Prompting SAM with detected boxes 77 | def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray: 78 | sam_predictor.set_image(image) 79 | result_masks = [] 80 | for box in xyxy: 81 | masks, scores, logits = sam_predictor.predict( 82 | box=box, 83 | multimask_output=False, 84 | hq_token_only=True, 85 | ) 86 | index = np.argmax(scores) 87 | result_masks.append(masks[index]) 88 | return np.array(result_masks) 89 | 90 | 91 | # convert detections to masks 92 | detections.mask = segment( 93 | sam_predictor=sam_predictor, 94 | image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB), 95 | xyxy=detections.xyxy 96 | ) 97 | 98 | # annotate image with detections 99 | box_annotator = sv.BoxAnnotator() 100 | mask_annotator = sv.MaskAnnotator() 101 | labels = [ 102 | f"{CLASSES[class_id]} {confidence:0.2f}" 103 | for _, _, confidence, class_id, _, _ 104 | in detections] 105 | annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections) 106 | annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels) 107 | 108 | # save the annotated grounded-sam image 109 | 
cv2.imwrite("EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg", annotated_image) 110 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/EfficientSAM/grounded_repvit_sam.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import supervision as sv 4 | 5 | import torch 6 | import torchvision 7 | 8 | from groundingdino.util.inference import Model 9 | from segment_anything import SamPredictor 10 | from RepViTSAM.setup_repvit_sam import build_sam_repvit 11 | 12 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | 14 | # GroundingDINO config and checkpoint 15 | GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py" 16 | GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth" 17 | 18 | # Building GroundingDINO inference model 19 | grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH) 20 | 21 | # Building MobileSAM predictor 22 | RepViTSAM_CHECKPOINT_PATH = "./EfficientSAM/repvit_sam.pt" 23 | repvit_sam = build_sam_repvit(checkpoint=RepViTSAM_CHECKPOINT_PATH) 24 | repvit_sam.to(device=DEVICE) 25 | 26 | sam_predictor = SamPredictor(repvit_sam) 27 | 28 | 29 | # Predict classes and hyper-param for GroundingDINO 30 | SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png" 31 | CLASSES = ["bench"] 32 | BOX_THRESHOLD = 0.25 33 | TEXT_THRESHOLD = 0.25 34 | NMS_THRESHOLD = 0.8 35 | 36 | 37 | # load image 38 | image = cv2.imread(SOURCE_IMAGE_PATH) 39 | 40 | # detect objects 41 | detections = grounding_dino_model.predict_with_classes( 42 | image=image, 43 | classes=CLASSES, 44 | box_threshold=BOX_THRESHOLD, 45 | text_threshold=TEXT_THRESHOLD 46 | ) 47 | 48 | # annotate image with detections 49 | box_annotator = sv.BoxAnnotator() 50 | labels = [ 51 | f"{CLASSES[class_id]} {confidence:0.2f}" 52 | for _, _, confidence, class_id, _, _ 53 | in detections] 54 | annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels) 55 | 56 | # save the annotated grounding dino image 57 | cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame) 58 | 59 | 60 | # NMS post process 61 | print(f"Before NMS: {len(detections.xyxy)} boxes") 62 | nms_idx = torchvision.ops.nms( 63 | torch.from_numpy(detections.xyxy), 64 | torch.from_numpy(detections.confidence), 65 | NMS_THRESHOLD 66 | ).numpy().tolist() 67 | 68 | detections.xyxy = detections.xyxy[nms_idx] 69 | detections.confidence = detections.confidence[nms_idx] 70 | detections.class_id = detections.class_id[nms_idx] 71 | 72 | print(f"After NMS: {len(detections.xyxy)} boxes") 73 | 74 | # Prompting SAM with detected boxes 75 | def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray: 76 | sam_predictor.set_image(image) 77 | result_masks = [] 78 | for box in xyxy: 79 | masks, scores, logits = sam_predictor.predict( 80 | box=box, 81 | multimask_output=False, 82 | hq_token_only=True, 83 | ) 84 | index = np.argmax(scores) 85 | result_masks.append(masks[index]) 86 | return np.array(result_masks) 87 | 88 | 89 | # convert detections to masks 90 | detections.mask = segment( 91 | sam_predictor=sam_predictor, 92 | image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB), 93 | xyxy=detections.xyxy 94 | ) 95 | 96 | # annotate image with detections 97 | box_annotator = sv.BoxAnnotator() 
98 | mask_annotator = sv.MaskAnnotator() 99 | labels = [ 100 | f"{CLASSES[class_id]} {confidence:0.2f}" 101 | for _, _, confidence, class_id, _, _ 102 | in detections] 103 | annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections) 104 | annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels) 105 | 106 | # save the annotated grounded-sam image 107 | cv2.imwrite("EfficientSAM/grounded_repvit_sam_annotated_image.jpg", annotated_image) 108 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/COCO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/COCO.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/GD_GLIGEN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/GD_GLIGEN.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/GD_SD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/GD_SD.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/ODinW.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/ODinW.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/arch.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/cats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/cats.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/.asset/hero_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/.asset/hero_figure.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/__init__.py -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinB.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_B_384_22k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_T_224_1k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/datasets/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/datasets/__init__.py -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Conditional DETR 8 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 9 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 10 | # ------------------------------------------------------------------------ 11 | # Copied from DETR (https://github.com/facebookresearch/detr) 12 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 13 | # ------------------------------------------------------------------------ 14 | 15 | from .groundingdino import build_groundingdino 16 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone 2 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | namespace groundingdino { 20 | 21 | at::Tensor 22 | ms_deform_attn_forward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const int im2col_step) 29 | { 30 | if (value.type().is_cuda()) 31 | { 32 | #ifdef WITH_CUDA 33 | return ms_deform_attn_cuda_forward( 34 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 35 | #else 36 | AT_ERROR("Not compiled with GPU support"); 37 | #endif 38 | } 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | std::vector 43 | ms_deform_attn_backward( 44 | const at::Tensor &value, 45 | const at::Tensor &spatial_shapes, 46 | const at::Tensor &level_start_index, 47 | const at::Tensor &sampling_loc, 48 | const at::Tensor &attn_weight, 49 | const at::Tensor &grad_output, 50 | const int im2col_step) 51 | { 52 | if (value.type().is_cuda()) 53 | { 54 | #ifdef WITH_CUDA 55 | return ms_deform_attn_cuda_backward( 56 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 57 | #else 58 | AT_ERROR("Not compiled with GPU support"); 59 | #endif 60 | } 61 | AT_ERROR("Not implemented on the CPU"); 62 | } 63 | 64 | } // namespace groundingdino -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | namespace groundingdino { 17 | 18 | at::Tensor 19 | ms_deform_attn_cpu_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step) 26 | { 27 | AT_ERROR("Not implement on cpu"); 28 | } 29 | 30 | std::vector 31 | ms_deform_attn_cpu_backward( 32 | const at::Tensor &value, 33 | const at::Tensor &spatial_shapes, 34 | const at::Tensor &level_start_index, 35 | const at::Tensor &sampling_loc, 36 | const at::Tensor &attn_weight, 37 | const at::Tensor &grad_output, 38 | const int im2col_step) 39 | { 40 | AT_ERROR("Not implement on cpu"); 41 | } 42 | 43 | } // namespace groundingdino 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | namespace groundingdino { 15 | 16 | at::Tensor 17 | ms_deform_attn_cpu_forward( 18 | const at::Tensor &value, 19 | const at::Tensor &spatial_shapes, 20 | const at::Tensor &level_start_index, 21 | const at::Tensor &sampling_loc, 22 | const at::Tensor &attn_weight, 23 | const int im2col_step); 24 | 25 | std::vector 26 | ms_deform_attn_cpu_backward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const at::Tensor &grad_output, 33 | const int im2col_step); 34 | 35 | } // namespace groundingdino 36 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | namespace groundingdino { 15 | 16 | at::Tensor ms_deform_attn_cuda_forward( 17 | const at::Tensor &value, 18 | const at::Tensor &spatial_shapes, 19 | const at::Tensor &level_start_index, 20 | const at::Tensor &sampling_loc, 21 | const at::Tensor &attn_weight, 22 | const int im2col_step); 23 | 24 | std::vector ms_deform_attn_cuda_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | } // namespace groundingdino -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/cuda_version.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace groundingdino { 4 | int get_cudart_version() { 5 | return CUDART_VERSION; 6 | } 7 | } // namespace groundingdino 8 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | #include "MsDeformAttn/ms_deform_attn.h" 4 | 5 | namespace groundingdino { 6 | 7 | #ifdef WITH_CUDA 8 | extern int get_cudart_version(); 9 | #endif 10 | 11 | std::string get_cuda_version() { 12 | #ifdef WITH_CUDA 13 | std::ostringstream oss; 14 | 15 | // copied from 16 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 17 | auto printCudaStyleVersion = [&](int v) { 18 | oss << (v / 1000) << "." << (v / 10 % 100); 19 | if (v % 10 != 0) { 20 | oss << "." << (v % 10); 21 | } 22 | }; 23 | printCudaStyleVersion(get_cudart_version()); 24 | return oss.str(); 25 | #else 26 | return std::string("not available"); 27 | #endif 28 | } 29 | 30 | // similar to 31 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp 32 | std::string get_compiler_version() { 33 | std::ostringstream ss; 34 | #if defined(__GNUC__) 35 | #ifndef __clang__ 36 | { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } 37 | #endif 38 | #endif 39 | 40 | #if defined(__clang_major__) 41 | { 42 | ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
43 | << __clang_patchlevel__; 44 | } 45 | #endif 46 | 47 | #if defined(_MSC_VER) 48 | { ss << "MSVC " << _MSC_FULL_VER; } 49 | #endif 50 | return ss.str(); 51 | } 52 | 53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 54 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 55 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 56 | } 57 | 58 | } // namespace groundingdino -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | from .GroundingDINO import build_groundingdino 9 | 10 | 11 | def build_model(args): 12 | # we use register to maintain models from catdet6 on. 13 | from .registry import MODULE_BUILD_FUNCS 14 | 15 | assert args.modelname in MODULE_BUILD_FUNCS._module_dict 16 | build_func = MODULE_BUILD_FUNCS.get(args.modelname) 17 | model = build_func(args) 18 | return model 19 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/registry.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # -*- coding: utf-8 -*- 8 | # @Author: Yihao Chen 9 | # @Date: 2021-08-16 16:03:17 10 | # @Last Modified by: Shilong Liu 11 | # @Last Modified time: 2022-01-23 15:26 12 | # modified from mmcv 13 | 14 | import inspect 15 | from functools import partial 16 | 17 | 18 | class Registry(object): 19 | def __init__(self, name): 20 | self._name = name 21 | self._module_dict = dict() 22 | 23 | def __repr__(self): 24 | format_str = self.__class__.__name__ + "(name={}, items={})".format( 25 | self._name, list(self._module_dict.keys()) 26 | ) 27 | return format_str 28 | 29 | def __len__(self): 30 | return len(self._module_dict) 31 | 32 | @property 33 | def name(self): 34 | return self._name 35 | 36 | @property 37 | def module_dict(self): 38 | return self._module_dict 39 | 40 | def get(self, key): 41 | return self._module_dict.get(key, None) 42 | 43 | def registe_with_name(self, module_name=None, force=False): 44 | return partial(self.register, module_name=module_name, force=force) 45 | 46 | def register(self, module_build_function, module_name=None, force=False): 47 | """Register a module build function. 48 | Args: 49 | module (:obj:`nn.Module`): Module to be registered. 
50 | """ 51 | if not inspect.isfunction(module_build_function): 52 | raise TypeError( 53 | "module_build_function must be a function, but got {}".format( 54 | type(module_build_function) 55 | ) 56 | ) 57 | if module_name is None: 58 | module_name = module_build_function.__name__ 59 | if not force and module_name in self._module_dict: 60 | raise KeyError("{} is already registered in {}".format(module_name, self.name)) 61 | self._module_dict[module_name] = module_build_function 62 | 63 | return module_build_function 64 | 65 | 66 | MODULE_BUILD_FUNCS = Registry("model build functions") 67 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/get_tokenlizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast 2 | 3 | 4 | def get_tokenlizer(text_encoder_type, bert_base_uncased_path): 5 | if not isinstance(text_encoder_type, str): 6 | # print("text_encoder_type is not a str") 7 | if hasattr(text_encoder_type, "text_encoder_type"): 8 | text_encoder_type = text_encoder_type.text_encoder_type 9 | elif text_encoder_type.get("text_encoder_type", False): 10 | text_encoder_type = text_encoder_type.get("text_encoder_type") 11 | else: 12 | raise ValueError( 13 | "Unknown type of text_encoder_type: {}".format(type(text_encoder_type)) 14 | ) 15 | 16 | # solve huggingface connect issue 17 | if is_bert_model_use_local_path(bert_base_uncased_path) and text_encoder_type == "bert-base-uncased": 18 | print("use local bert model path: {}".format(bert_base_uncased_path)) 19 | return AutoTokenizer.from_pretrained(bert_base_uncased_path) 20 | 21 | print("final text_encoder_type: {}".format(text_encoder_type)) 22 | 23 | tokenizer = AutoTokenizer.from_pretrained(text_encoder_type) 24 | return tokenizer 25 | 26 | 27 | def get_pretrained_language_model(text_encoder_type, bert_base_uncased_path): 28 | if text_encoder_type == "bert-base-uncased": 29 | if is_bert_model_use_local_path(bert_base_uncased_path): 30 | return BertModel.from_pretrained(bert_base_uncased_path) 31 | return BertModel.from_pretrained(text_encoder_type) 32 | if text_encoder_type == "roberta-base": 33 | return RobertaModel.from_pretrained(text_encoder_type) 34 | raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type)) 35 | 36 | def is_bert_model_use_local_path(bert_base_uncased_path): 37 | return bert_base_uncased_path is not None and len(bert_base_uncased_path) > 0 38 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import functools 3 | import logging 4 | import os 5 | import sys 6 | 7 | from termcolor import colored 8 | 9 | 10 | class _ColorfulFormatter(logging.Formatter): 11 | def __init__(self, *args, **kwargs): 12 | self._root_name = kwargs.pop("root_name") + "." 
13 | self._abbrev_name = kwargs.pop("abbrev_name", "") 14 | if len(self._abbrev_name): 15 | self._abbrev_name = self._abbrev_name + "." 16 | super(_ColorfulFormatter, self).__init__(*args, **kwargs) 17 | 18 | def formatMessage(self, record): 19 | record.name = record.name.replace(self._root_name, self._abbrev_name) 20 | log = super(_ColorfulFormatter, self).formatMessage(record) 21 | if record.levelno == logging.WARNING: 22 | prefix = colored("WARNING", "red", attrs=["blink"]) 23 | elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: 24 | prefix = colored("ERROR", "red", attrs=["blink", "underline"]) 25 | else: 26 | return log 27 | return prefix + " " + log 28 | 29 | 30 | # so that calling setup_logger multiple times won't add many handlers 31 | @functools.lru_cache() 32 | def setup_logger(output=None, distributed_rank=0, *, color=True, name="imagenet", abbrev_name=None): 33 | """ 34 | Initialize the detectron2 logger and set its verbosity level to "INFO". 35 | 36 | Args: 37 | output (str): a file name or a directory to save log. If None, will not save log file. 38 | If ends with ".txt" or ".log", assumed to be a file name. 39 | Otherwise, logs will be saved to `output/log.txt`. 40 | name (str): the root module name of this logger 41 | 42 | Returns: 43 | logging.Logger: a logger 44 | """ 45 | logger = logging.getLogger(name) 46 | logger.setLevel(logging.DEBUG) 47 | logger.propagate = False 48 | 49 | if abbrev_name is None: 50 | abbrev_name = name 51 | 52 | plain_formatter = logging.Formatter( 53 | "[%(asctime)s.%(msecs)03d]: %(message)s", datefmt="%m/%d %H:%M:%S" 54 | ) 55 | # stdout logging: master only 56 | if distributed_rank == 0: 57 | ch = logging.StreamHandler(stream=sys.stdout) 58 | ch.setLevel(logging.DEBUG) 59 | if color: 60 | formatter = _ColorfulFormatter( 61 | colored("[%(asctime)s.%(msecs)03d]: ", "green") + "%(message)s", 62 | datefmt="%m/%d %H:%M:%S", 63 | root_name=name, 64 | abbrev_name=str(abbrev_name), 65 | ) 66 | else: 67 | formatter = plain_formatter 68 | ch.setFormatter(formatter) 69 | logger.addHandler(ch) 70 | 71 | # file logging: all workers 72 | if output is not None: 73 | if output.endswith(".txt") or output.endswith(".log"): 74 | filename = output 75 | else: 76 | filename = os.path.join(output, "log.txt") 77 | if distributed_rank > 0: 78 | filename = filename + f".rank{distributed_rank}" 79 | os.makedirs(os.path.dirname(filename), exist_ok=True) 80 | 81 | fh = logging.StreamHandler(_cached_log_stream(filename)) 82 | fh.setLevel(logging.DEBUG) 83 | fh.setFormatter(plain_formatter) 84 | logger.addHandler(fh) 85 | 86 | return logger 87 | 88 | 89 | # cache the opened file object, so that different calls to `setup_logger` 90 | # with the same file name can safely write to the same file. 
91 | @functools.lru_cache(maxsize=None) 92 | def _cached_log_stream(filename): 93 | return open(filename, "a") 94 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/time_counter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | 5 | class TimeCounter: 6 | def __init__(self) -> None: 7 | pass 8 | 9 | def clear(self): 10 | self.timedict = {} 11 | self.basetime = time.perf_counter() 12 | 13 | def timeit(self, name): 14 | nowtime = time.perf_counter() - self.basetime 15 | self.timedict[name] = nowtime 16 | self.basetime = time.perf_counter() 17 | 18 | 19 | class TimeHolder: 20 | def __init__(self) -> None: 21 | self.timedict = {} 22 | 23 | def update(self, _timedict: dict): 24 | for k, v in _timedict.items(): 25 | if k not in self.timedict: 26 | self.timedict[k] = AverageMeter(name=k, val_only=True) 27 | self.timedict[k].update(val=v) 28 | 29 | def final_res(self): 30 | return {k: v.avg for k, v in self.timedict.items()} 31 | 32 | def __str__(self): 33 | return json.dumps(self.final_res(), indent=2) 34 | 35 | 36 | class AverageMeter(object): 37 | """Computes and stores the average and current value""" 38 | 39 | def __init__(self, name, fmt=":f", val_only=False): 40 | self.name = name 41 | self.fmt = fmt 42 | self.val_only = val_only 43 | self.reset() 44 | 45 | def reset(self): 46 | self.val = 0 47 | self.avg = 0 48 | self.sum = 0 49 | self.count = 0 50 | 51 | def update(self, val, n=1): 52 | self.val = val 53 | self.sum += val * n 54 | self.count += n 55 | self.avg = self.sum / self.count 56 | 57 | def __str__(self): 58 | if self.val_only: 59 | fmtstr = "{name} {val" + self.fmt + "}" 60 | else: 61 | fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" 62 | return fmtstr.format(**self.__dict__) 63 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/vl_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import List 4 | 5 | import torch 6 | 7 | 8 | def create_positive_map_from_span(tokenized, token_span, max_text_len=256): 9 | """construct a map such that positive_map[i,j] = True iff box i is associated to token j 10 | Input: 11 | - tokenized: 12 | - input_ids: Tensor[1, ntokens] 13 | - attention_mask: Tensor[1, ntokens] 14 | - token_span: list with length num_boxes. 
15 | - each item: [start_idx, end_idx] 16 | """ 17 | positive_map = torch.zeros((len(token_span), max_text_len), dtype=torch.float) 18 | for j, tok_list in enumerate(token_span): 19 | for (beg, end) in tok_list: 20 | beg_pos = tokenized.char_to_token(beg) 21 | end_pos = tokenized.char_to_token(end - 1) 22 | if beg_pos is None: 23 | try: 24 | beg_pos = tokenized.char_to_token(beg + 1) 25 | if beg_pos is None: 26 | beg_pos = tokenized.char_to_token(beg + 2) 27 | except: 28 | beg_pos = None 29 | if end_pos is None: 30 | try: 31 | end_pos = tokenized.char_to_token(end - 2) 32 | if end_pos is None: 33 | end_pos = tokenized.char_to_token(end - 3) 34 | except: 35 | end_pos = None 36 | if beg_pos is None or end_pos is None: 37 | continue 38 | 39 | assert beg_pos is not None and end_pos is not None 40 | if os.environ.get("SHILONG_DEBUG_ONLY_ONE_POS", None) == "TRUE": 41 | positive_map[j, beg_pos] = 1 42 | break 43 | else: 44 | positive_map[j, beg_pos : end_pos + 1].fill_(1) 45 | 46 | return positive_map / (positive_map.sum(-1)[:, None] + 1e-6) 47 | 48 | 49 | def build_captions_and_token_span(cat_list, force_lowercase): 50 | """ 51 | Return: 52 | captions: str 53 | cat2tokenspan: dict 54 | { 55 | 'dog': [[0, 2]], 56 | ... 57 | } 58 | """ 59 | 60 | cat2tokenspan = {} 61 | captions = "" 62 | for catname in cat_list: 63 | class_name = catname 64 | if force_lowercase: 65 | class_name = class_name.lower() 66 | if "/" in class_name: 67 | class_name_list: List = class_name.strip().split("/") 68 | class_name_list.append(class_name) 69 | class_name: str = random.choice(class_name_list) 70 | 71 | tokens_positive_i = [] 72 | subnamelist = [i.strip() for i in class_name.strip().split(" ")] 73 | for subname in subnamelist: 74 | if len(subname) == 0: 75 | continue 76 | if len(captions) > 0: 77 | captions = captions + " " 78 | strat_idx = len(captions) 79 | end_idx = strat_idx + len(subname) 80 | tokens_positive_i.append([strat_idx, end_idx]) 81 | captions = captions + subname 82 | 83 | if len(tokens_positive_i) > 0: 84 | captions = captions + " ." 
85 | cat2tokenspan[class_name] = tokens_positive_i 86 | 87 | return captions, cat2tokenspan 88 | 89 | 90 | def build_id2posspan_and_caption(category_dict: dict): 91 | """Build id2pos_span and caption from category_dict 92 | 93 | Args: 94 | category_dict (dict): category_dict 95 | """ 96 | cat_list = [item["name"].lower() for item in category_dict] 97 | id2catname = {item["id"]: item["name"].lower() for item in category_dict} 98 | caption, cat2posspan = build_captions_and_token_span(cat_list, force_lowercase=True) 99 | id2posspan = {catid: cat2posspan[catname] for catid, catname in id2catname.items()} 100 | return id2posspan, caption 101 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "torch", 5 | "wheel", 6 | "torch" 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/GroundingDINO/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | transformers 4 | addict 5 | yapf 6 | timm 7 | numpy 8 | opencv-python 9 | supervision 10 | pycocotools -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/Makefile: -------------------------------------------------------------------------------- 1 | # Get version of CUDA and enable it for compilation if CUDA > 11.0 2 | # This solves https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/53 3 | # and https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/84 4 | # when running in Docker 5 | # Check if nvcc is installed 6 | NVCC := $(shell which nvcc) 7 | ifeq ($(NVCC),) 8 | # NVCC not found 9 | USE_CUDA := 0 10 | NVCC_VERSION := "not installed" 11 | else 12 | NVCC_VERSION := $(shell nvcc --version | grep -oP 'release \K[0-9.]+') 13 | USE_CUDA := $(shell echo "$(NVCC_VERSION) > 11" | bc -l) 14 | endif 15 | 16 | # Add the list of supported ARCHs 17 | ifeq ($(USE_CUDA), 1) 18 | TORCH_CUDA_ARCH_LIST := "3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" 19 | BUILD_MESSAGE := "I will try to build the image with CUDA support" 20 | else 21 | TORCH_CUDA_ARCH_LIST := 22 | BUILD_MESSAGE := "CUDA $(NVCC_VERSION) is not supported" 23 | endif 24 | 25 | 26 | build-image: 27 | @echo $(BUILD_MESSAGE) 28 | docker build --build-arg USE_CUDA=$(USE_CUDA) \ 29 | --build-arg TORCH_ARCH=$(TORCH_CUDA_ARCH_LIST) \ 30 | -t gsa:v0 . 
31 | run: 32 | ifeq (,$(wildcard ./sam_vit_h_4b8939.pth)) 33 | wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth 34 | endif 35 | ifeq (,$(wildcard ./groundingdino_swint_ogc.pth)) 36 | wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth 37 | endif 38 | docker run --gpus all -it --rm --net=host --privileged \ 39 | -v /tmp/.X11-unix:/tmp/.X11-unix \ 40 | -v "${PWD}":/home/appuser/Grounded-Segment-Anything \ 41 | -e DISPLAY=$DISPLAY \ 42 | --name=gsa \ 43 | --ipc=host -it gsa:v0 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | cuda: "11.7" 7 | system_packages: 8 | - "libgl1-mesa-glx" 9 | - "libglib2.0-0" 10 | python_version: "3.10" 11 | python_packages: 12 | - "timm==0.9.2" 13 | - "transformers==4.30.2" 14 | - "fairscale==0.4.13" 15 | - "pycocoevalcap==1.2" 16 | - "torch==1.13.0" 17 | - "torchvision==0.14.0" 18 | - "Pillow==9.5.0" 19 | - "scipy==1.10.1" 20 | - "opencv-python==4.7.0.72" 21 | - "addict==2.4.0" 22 | - "yapf==0.40.0" 23 | - "supervision==0.10.0" 24 | - git+https://github.com/openai/CLIP.git 25 | - ipython 26 | 27 | predict: "predict.py:Predictor" 28 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/DeepFloyd/dream.py: -------------------------------------------------------------------------------- 1 | from deepfloyd_if.modules import IFStageI, IFStageII, StableStageIII 2 | from deepfloyd_if.modules.t5 import T5Embedder 3 | from deepfloyd_if.pipelines import dream 4 | 5 | # Run locally 6 | device = 'cuda' 7 | cache_dir = "/path/to/storage/IF" 8 | if_I = IFStageI('IF-I-L-v1.0', device=device, cache_dir=cache_dir) 9 | if_II = IFStageII('IF-II-L-v1.0', device=device, cache_dir=cache_dir) 10 | if_III = StableStageIII('stable-diffusion-x4-upscaler', device=device, cache_dir=cache_dir) 11 | t5 = T5Embedder(device=device, cache_dir=cache_dir) 12 | 13 | prompt = "In the heart of the wilderness, an enchanting forest reveals itself. \ 14 | Towering trees, their trunks sturdy and thick, reach skyward, their leafy canopies \ 15 | forming a natural cathedral. Verdant moss clings to bark, and tendrils of ivy climb ambitiously towards the sun-dappled treetops. \ 16 | The forest floor is a tapestry of fallen leaves, sprinkled with delicate wildflowers. The soft chatter of wildlife resonates, while a nearby brook babbles, its clear waters winking in the dappled light. \ 17 | Sunrays filter through the foliage, casting an emerald glow that dances on the woodland floor. Amidst the tranquility, the forest teems with life, whispering ancient secrets on the breeze." 
18 | count = 1 19 | 20 | result = dream( 21 | t5=t5, if_I=if_I, if_II=if_II, if_III=if_III, 22 | prompt=[prompt]*count, 23 | seed=42, 24 | if_I_kwargs={ 25 | "guidance_scale": 7.0, 26 | "sample_timestep_respacing": "smart100", 27 | }, 28 | if_II_kwargs={ 29 | "guidance_scale": 4.0, 30 | "sample_timestep_respacing": "smart50", 31 | }, 32 | if_III_kwargs={ 33 | "guidance_scale": 9.0, 34 | "noise_level": 20, 35 | "sample_timestep_respacing": "75", 36 | }, 37 | ) 38 | result['III'][0].save("./dream_figure.jpg") 39 | 40 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/DeepFloyd/inpaint.py: -------------------------------------------------------------------------------- 1 | import PIL 2 | import requests 3 | from io import BytesIO 4 | from torchvision.transforms import ToTensor 5 | 6 | from deepfloyd_if.modules import IFStageI, IFStageII, StableStageIII 7 | from deepfloyd_if.modules.t5 import T5Embedder 8 | from deepfloyd_if.pipelines import inpainting 9 | 10 | def download_image(url): 11 | response = requests.get(url) 12 | return PIL.Image.open(BytesIO(response.content)).convert("RGB") 13 | 14 | img_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png" 15 | mask_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png" 16 | 17 | init_image = download_image(img_url).resize((512, 512)) 18 | mask_image = download_image(mask_url).resize((512, 512)) 19 | 20 | # convert mask_image to torch.Tensor to avoid bug 21 | mask_image = ToTensor()(mask_image).unsqueeze(0) # (1, 3, 512, 512) 22 | 23 | # Run locally 24 | device = 'cuda:5' 25 | cache_dir = "/comp_robot/rentianhe/weights/IF/" 26 | if_I = IFStageI('IF-I-L-v1.0', device=device, cache_dir=cache_dir) 27 | if_II = IFStageII('IF-II-L-v1.0', device=device, cache_dir=cache_dir) 28 | if_III = StableStageIII('stable-diffusion-x4-upscaler', device=device, cache_dir=cache_dir) 29 | t5 = T5Embedder(device=device, cache_dir=cache_dir) 30 | result = inpainting( 31 | t5=t5, if_I=if_I, 32 | if_II=if_II, 33 | if_III=if_III, 34 | support_pil_img=init_image, 35 | inpainting_mask=mask_image, 36 | prompt=[ 37 | 'A Panda' 38 | ], 39 | seed=42, 40 | if_I_kwargs={ 41 | "guidance_scale": 7.0, 42 | "sample_timestep_respacing": "10,10,10,10,10,0,0,0,0,0", 43 | 'support_noise_less_qsample_steps': 0, 44 | }, 45 | if_II_kwargs={ 46 | "guidance_scale": 4.0, 47 | 'aug_level': 0.0, 48 | "sample_timestep_respacing": '100', 49 | }, 50 | if_III_kwargs={ 51 | "guidance_scale": 9.0, 52 | "noise_level": 20, 53 | "sample_timestep_respacing": "75", 54 | }, 55 | ) 56 | if_I.show(result['I'], 2, 3) 57 | if_I.show(result['II'], 2, 6) 58 | if_I.show(result['III'], 2, 14) 59 | 60 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/DeepFloyd/style_transfer.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | 3 | from deepfloyd_if.modules import IFStageI, IFStageII 4 | from deepfloyd_if.modules.t5 import T5Embedder 5 | from deepfloyd_if.pipelines import style_transfer 6 | 7 | # Run locally 8 | device = 'cuda' 9 | cache_dir = "/path/to/storage/IF" 10 | if_I = IFStageI('IF-I-XL-v1.0', device=device, cache_dir=cache_dir) 11 | if_II = IFStageII('IF-II-L-v1.0', device=device, cache_dir=cache_dir) 12 | t5 = T5Embedder(device=device, cache_dir=cache_dir) 13 | 14 | # Style 
generate from GPT-4 15 | style_prompt = [ 16 | "in style of colorful and cute kawaii art", 17 | "in style of boho-chic textile patterns", 18 | ] 19 | 20 | raw_pil_image = Image.open("/path/to/image") 21 | 22 | result = style_transfer( 23 | t5=t5, if_I=if_I, if_II=if_II, 24 | support_pil_img=raw_pil_image, 25 | style_prompt=style_prompt, 26 | seed=42, 27 | if_I_kwargs={ 28 | "guidance_scale": 10.0, 29 | "sample_timestep_respacing": "10,10,10,10,10,10,10,10,0,0", 30 | 'support_noise_less_qsample_steps': 5, 31 | }, 32 | if_II_kwargs={ 33 | "guidance_scale": 4.0, 34 | "sample_timestep_respacing": 'smart50', 35 | "support_noise_less_qsample_steps": 5, 36 | }, 37 | ) 38 | 39 | # save all the images generated in StageII 40 | for i, image in enumerate(result["II"]): 41 | image.save("./style_transfer_{}.jpg".format(i)) 42 | 43 | 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/bird_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/bird_audio.wav -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/bird_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/bird_image.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/car_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/car_audio.wav -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/car_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/car_image.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/dog_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/dog_audio.wav -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/dog_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/.assets/dog_image.jpg -------------------------------------------------------------------------------- 
/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/README.md: -------------------------------------------------------------------------------- 1 | ## ImageBind with SAM 2 | 3 | This is an experimental demo that aims to combine [ImageBind](https://github.com/facebookresearch/ImageBind) and [SAM](https://github.com/facebookresearch/segment-anything) to generate masks **with different modalities**. 4 | 5 | The basic idea follows [IEA: Image Editing Anything](https://github.com/feizc/IEA) and [CLIP-SAM](https://github.com/maxi-w/CLIP-SAM), which generate the referring mask with the following steps (a condensed sketch of these steps is shown after the results table below): 6 | 7 | - Step 1: Generate auto masks with `SamAutomaticMaskGenerator` 8 | - Step 2: Crop all the box regions from the masks 9 | - Step 3: Compute the similarity between the cropped images and the different modalities 10 | - Step 4: Merge the mask regions with the highest similarity 11 | 12 | ## Table of contents 13 | - [Installation](#installation) 14 | - [ImageBind-SAM Demo](#run-the-demo) 15 | - [Audio Referring Segment](#run-audio-referring-segmentation-demo) 16 | - [Text Referring Segment](#run-text-referring-segmentation-demo) 17 | - [Image Referring Segment](#run-image-referring-segmentation-demo) 18 | 19 | 20 | 21 | ## Installation 22 | - Download the pretrained checkpoints 23 | 24 | ```bash 25 | cd playground/ImageBind_SAM 26 | 27 | mkdir .checkpoints 28 | cd .checkpoints 29 | 30 | # download imagebind weights 31 | wget https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth 32 | wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth 33 | ``` 34 | 35 | - Install ImageBind following the [official installation guidance](https://github.com/facebookresearch/ImageBind#usage). 36 | - Install Grounded-SAM following [install Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything#installation). 37 | 38 | 39 | ## Run the demo 40 | ```bash 41 | python demo.py 42 | ``` 43 | 44 | We implement `Text Seg` and `Audio Seg` in this demo; the generated masks will be saved as `text_sam_merged_mask.jpg` and `audio_sam_merged_mask.jpg`: 45 | 46 |
47 | 48 | | Input Image | Modality | Generated Mask | 49 | |:----:|:----:|:----:| 50 | | ![](./.assets/car_image.jpg) | [car audio](./.assets/car_audio.wav) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/imagebind_sam/audio_sam_merged_mask_new.jpg?raw=true) | 51 | | ![](./.assets/car_image.jpg) | "A car" | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/imagebind_sam/text_sam_merged_mask.jpg?raw=true) | 52 | | ![](./.assets/car_image.jpg) | [referring image](./.assets/referring_car_image.jpg) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/imagebind_sam/image_referring_sam_merged_mask.jpg?raw=true) | 53 | 54 |
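Below is a condensed sketch of how the four steps above fit together for the text modality. It is a simplified illustration distilled from the full demo scripts later in this folder rather than a separate demo: the helpers `segment_image`, `convert_box_xywh_to_xyxy`, and `get_indices_of_values_above_threshold` come from `utils.py`, `data.load_and_transform_vision_data_from_pil_image` is the extended ImageBind data loader these demos use, and the boolean-OR merge at the end stands in for the overlay drawing done in the scripts.

```python
import numpy as np
import torch
from PIL import Image

import data
from models import imagebind_model
from models.imagebind_model import ModalityType
from segment_anything import build_sam, SamAutomaticMaskGenerator
from utils import segment_image, convert_box_xywh_to_xyxy, get_indices_of_values_above_threshold

device = "cuda" if torch.cuda.is_available() else "cpu"

# Step 1: generate automatic mask proposals with SAM
mask_generator = SamAutomaticMaskGenerator(
    build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device),
    points_per_side=16,
)
bind_model = imagebind_model.imagebind_huge(pretrained=True).eval().to(device)

image = Image.open(".assets/car_image.jpg").convert("RGB")
masks = mask_generator.generate(np.array(image))

# Step 2: crop every proposed box region out of the image
crops = [
    segment_image(image, m["segmentation"]).crop(convert_box_xywh_to_xyxy(m["bbox"]))
    for m in masks
]

# Step 3: ImageBind similarity between the crops and a text prompt
inputs = {
    ModalityType.VISION: data.load_and_transform_vision_data_from_pil_image(crops, device),
    ModalityType.TEXT: data.load_and_transform_text(["A car"], device),
}
with torch.no_grad():
    emb = bind_model(inputs)
scores = torch.softmax(emb[ModalityType.VISION] @ emb[ModalityType.TEXT].T, dim=0)

# Step 4: merge every mask whose similarity clears the threshold
keep = get_indices_of_values_above_threshold(scores, 0.05)
merged = np.zeros(masks[0]["segmentation"].shape, dtype=bool)
for i in keep:
    merged |= masks[i]["segmentation"]
Image.fromarray(merged.astype("uint8") * 255).save("text_sam_merged_mask.jpg")
```

Swapping the `TEXT` entry for `ModalityType.AUDIO` with `data.load_and_transform_audio_data([...], device)` gives the audio variant used in `audio_referring_seg_demo.py`.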
56 | 57 | Setting a different threshold can significantly change the final results. 58 | 59 | ## Run image referring segmentation demo 60 | ```bash 61 | # download the referring image 62 | cd .assets 63 | wget https://github.com/IDEA-Research/detrex-storage/releases/download/grounded-sam-storage/referring_car_image.jpg 64 | cd .. 65 | 66 | python image_referring_seg_demo.py 67 | ``` 68 | 69 | ## Run audio referring segmentation demo 70 | ```bash 71 | python audio_referring_seg_demo.py 72 | ``` 73 | 74 | ## Run text referring segmentation demo 75 | ```bash 76 | python text_referring_seg_demo.py 77 | ``` -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/audio_referring_seg_demo.py: -------------------------------------------------------------------------------- 1 | import data 2 | import cv2 3 | import torch 4 | from PIL import Image, ImageDraw 5 | from tqdm import tqdm 6 | from models import imagebind_model 7 | from models.imagebind_model import ModalityType 8 | 9 | from segment_anything import build_sam, SamAutomaticMaskGenerator 10 | 11 | from utils import ( 12 | segment_image, 13 | convert_box_xywh_to_xyxy, 14 | get_indices_of_values_above_threshold, 15 | ) 16 | 17 | 18 | device = "cuda" if torch.cuda.is_available() else "cpu" 19 | 20 | 21 | """ 22 | Step 1: Instantiate model 23 | """ 24 | # Segment Anything 25 | mask_generator = SamAutomaticMaskGenerator( 26 | build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device), 27 | points_per_side=16, 28 | ) 29 | 30 | # ImageBind 31 | bind_model = imagebind_model.imagebind_huge(pretrained=True) 32 | bind_model.eval() 33 | bind_model.to(device) 34 | 35 | 36 | """ 37 | Step 2: Generate auto masks with SAM 38 | """ 39 | image_path = ".assets/car_image.jpg" 40 | image = cv2.imread(image_path) 41 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 42 | masks = mask_generator.generate(image) 43 | 44 | 45 | """ 46 | Step 3: Get cropped images based on mask and box 47 | """ 48 | cropped_boxes = [] 49 | image = Image.open(image_path) 50 | for mask in tqdm(masks): 51 | cropped_boxes.append(segment_image(image, mask["segmentation"]).crop(convert_box_xywh_to_xyxy(mask["bbox"]))) 52 | 53 | 54 | """ 55 | Step 4: Run ImageBind model to get similarity between cropped image and different modalities 56 | """ 57 | def retriev_vision_and_audio(elements, audio_list): 58 | inputs = { 59 | ModalityType.VISION: data.load_and_transform_vision_data_from_pil_image(elements, device), 60 | ModalityType.AUDIO: data.load_and_transform_audio_data(audio_list, device), 61 | } 62 | with torch.no_grad(): 63 | embeddings = bind_model(inputs) 64 | vision_audio = torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.AUDIO].T, dim=0), 65 | return vision_audio 66 | 67 | vision_audio_result = retriev_vision_and_audio(cropped_boxes, [".assets/car_audio.wav"]) 68 | 69 | 70 | """ 71 | Step 5: Merge the top similarity masks to get the final mask and save the merged mask 72 | 73 | This is the audio retrieval result 74 | """ 75 | 76 | # get highest similar mask with threshold 77 | # result[0] shape: [113, 1] 78 | threshold = 0.025 79 | index = get_indices_of_values_above_threshold(vision_audio_result[0], threshold) 80 | 81 | segmentation_masks = [] 82 | for seg_idx in index: 83 | segmentation_mask_image = Image.fromarray(masks[seg_idx]["segmentation"].astype('uint8') * 255) 84 | segmentation_masks.append(segmentation_mask_image) 85 | 86 | original_image =
Image.open(image_path) 87 | overlay_image = Image.new('RGBA', image.size, (0, 0, 0, 255)) 88 | overlay_color = (255, 255, 255, 0) 89 | 90 | draw = ImageDraw.Draw(overlay_image) 91 | for segmentation_mask_image in segmentation_masks: 92 | draw.bitmap((0, 0), segmentation_mask_image, fill=overlay_color) 93 | 94 | # return Image.alpha_composite(original_image.convert('RGBA'), overlay_image) 95 | mask_image = overlay_image.convert("RGB") 96 | mask_image.save("./audio_sam_merged_mask.jpg") 97 | 98 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/bpe/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/bpe/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/image_referring_seg_demo.py: -------------------------------------------------------------------------------- 1 | import data 2 | import cv2 3 | import torch 4 | from PIL import Image, ImageDraw 5 | from tqdm import tqdm 6 | from models import imagebind_model 7 | from models.imagebind_model import ModalityType 8 | 9 | from segment_anything import build_sam, SamAutomaticMaskGenerator 10 | 11 | from utils import ( 12 | segment_image, 13 | convert_box_xywh_to_xyxy, 14 | get_indices_of_values_above_threshold, 15 | ) 16 | 17 | 18 | device = "cuda" if torch.cuda.is_available() else "cpu" 19 | 20 | 21 | """ 22 | Step 1: Instantiate model 23 | """ 24 | # Segment Anything 25 | mask_generator = SamAutomaticMaskGenerator( 26 | build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device), 27 | points_per_side=16, 28 | ) 29 | 30 | # ImageBind 31 | bind_model = imagebind_model.imagebind_huge(pretrained=True) 32 | bind_model.eval() 33 | bind_model.to(device) 34 | 35 | 36 | """ 37 | Step 2: Generate auto masks with SAM 38 | """ 39 | image_path = ".assets/car_image.jpg" 40 | image = cv2.imread(image_path) 41 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 42 | masks = mask_generator.generate(image) 43 | 44 | 45 | """ 46 | Step 3: Get cropped images based on mask and box 47 | """ 48 | cropped_boxes = [] 49 | image = Image.open(image_path) 50 | for mask in tqdm(masks): 51 | cropped_boxes.append(segment_image(image, mask["segmentation"]).crop(convert_box_xywh_to_xyxy(mask["bbox"]))) 52 | 53 | 54 | """ 55 | Step 4: Run ImageBind model to get similarity between cropped image and different modalities 56 | """ 57 | # load referring image 58 | referring_image_path = ".assets/referring_car_image.jpg" 59 | referring_image = Image.open(referring_image_path) 60 | 61 | image_list = [] 62 | image_list += cropped_boxes 63 | image_list.append(referring_image) 64 | 65 | def retriev_vision_and_vision(elements): 66 | inputs = { 67 | ModalityType.VISION: data.load_and_transform_vision_data_from_pil_image(elements, device), 68 | } 69 | with torch.no_grad(): 70 | embeddings = bind_model(inputs) 71 | 72 | # cropped box region embeddings 73 | cropped_box_embeddings = embeddings[ModalityType.VISION][:-1, :] 74 | referring_image_embeddings = embeddings[ModalityType.VISION][-1, :] 75 | 76 | vision_referring_result = torch.softmax(cropped_box_embeddings @ referring_image_embeddings.T, dim=0), 77 | return vision_referring_result # [113, 1] 78 | 79 | 
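# Note: retriev_vision_and_vision above returns a one-element tuple (because of the trailing comma
# after torch.softmax), so the per-crop similarity scores are accessed as vision_referring_result[0]
# when thresholding in Step 5 below.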
80 | vision_referring_result = retriev_vision_and_vision(image_list) 81 | 82 | 83 | """ 84 | Step 5: Merge the top similarity masks to get the final mask and save the merged mask 85 | 86 | Image / Text mask 87 | """ 88 | 89 | # get highest similar mask with threshold 90 | # result[0] shape: [113, 1] 91 | threshold = 0.017 92 | index = get_indices_of_values_above_threshold(vision_referring_result[0], threshold) 93 | 94 | 95 | segmentation_masks = [] 96 | for seg_idx in index: 97 | segmentation_mask_image = Image.fromarray(masks[seg_idx]["segmentation"].astype('uint8') * 255) 98 | segmentation_masks.append(segmentation_mask_image) 99 | 100 | original_image = Image.open(image_path) 101 | overlay_image = Image.new('RGBA', image.size, (0, 0, 0, 255)) 102 | overlay_color = (255, 255, 255, 0) 103 | 104 | draw = ImageDraw.Draw(overlay_image) 105 | for segmentation_mask_image in segmentation_masks: 106 | draw.bitmap((0, 0), segmentation_mask_image, fill=overlay_color) 107 | 108 | # return Image.alpha_composite(original_image.convert('RGBA'), overlay_image) 109 | mask_image = overlay_image.convert("RGB") 110 | mask_image.save("./image_referring_sam_merged_mask.jpg") 111 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/models/__init__.py -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/text_referring_seg_demo.py: -------------------------------------------------------------------------------- 1 | import data 2 | import cv2 3 | import torch 4 | from PIL import Image, ImageDraw 5 | from tqdm import tqdm 6 | from models import imagebind_model 7 | from models.imagebind_model import ModalityType 8 | 9 | from segment_anything import build_sam, SamAutomaticMaskGenerator 10 | 11 | from utils import ( 12 | segment_image, 13 | convert_box_xywh_to_xyxy, 14 | get_indices_of_values_above_threshold, 15 | ) 16 | 17 | 18 | device = "cuda" if torch.cuda.is_available() else "cpu" 19 | 20 | 21 | """ 22 | Step 1: Instantiate model 23 | """ 24 | # Segment Anything 25 | mask_generator = SamAutomaticMaskGenerator( 26 | build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device), 27 | points_per_side=16, 28 | ) 29 | 30 | # ImageBind 31 | bind_model = imagebind_model.imagebind_huge(pretrained=True) 32 | bind_model.eval() 33 | bind_model.to(device) 34 | 35 | 36 | """ 37 | Step 2: Generate auto masks with SAM 38 | """ 39 | image_path = ".assets/car_image.jpg" 40 | image = cv2.imread(image_path) 41 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 42 | masks = mask_generator.generate(image) 43 | 44 | 45 | """ 46 | Step 3: Get cropped images based on mask and box 47 | """ 48 | cropped_boxes = [] 49 | image = Image.open(image_path) 50 | for mask in tqdm(masks): 51 | cropped_boxes.append(segment_image(image, mask["segmentation"]).crop(convert_box_xywh_to_xyxy(mask["bbox"]))) 52 | 53 | 54 | """ 55 | Step 4: Run ImageBind model to get similarity between cropped image and different modalities 56 | """ 57 | def retriev_vision_and_text(elements, text_list): 58 | inputs = { 59 | ModalityType.VISION: data.load_and_transform_vision_data_from_pil_image(elements, device), 60 | 
ModalityType.TEXT: data.load_and_transform_text(text_list, device), 61 | } 62 | with torch.no_grad(): 63 | embeddings = bind_model(inputs) 64 | vision_audio = torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=0), 65 | return vision_audio # [113, 1] 66 | 67 | 68 | vision_text_result = retriev_vision_and_text(cropped_boxes, ["A car"] ) 69 | 70 | 71 | """ 72 | Step 5: Merge the top similarity masks to get the final mask and save the merged mask 73 | 74 | Image / Text mask 75 | """ 76 | 77 | # get highest similar mask with threshold 78 | # result[0] shape: [113, 1] 79 | threshold = 0.05 80 | index = get_indices_of_values_above_threshold(vision_text_result[0], threshold) 81 | 82 | segmentation_masks = [] 83 | for seg_idx in index: 84 | segmentation_mask_image = Image.fromarray(masks[seg_idx]["segmentation"].astype('uint8') * 255) 85 | segmentation_masks.append(segmentation_mask_image) 86 | 87 | original_image = Image.open(image_path) 88 | overlay_image = Image.new('RGBA', image.size, (0, 0, 0, 255)) 89 | overlay_color = (255, 255, 255, 0) 90 | 91 | draw = ImageDraw.Draw(overlay_image) 92 | for segmentation_mask_image in segmentation_masks: 93 | draw.bitmap((0, 0), segmentation_mask_image, fill=overlay_color) 94 | 95 | # return Image.alpha_composite(original_image.convert('RGBA'), overlay_image) 96 | mask_image = overlay_image.convert("RGB") 97 | mask_image.save("./text_sam_merged_mask.jpg") 98 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/ImageBind_SAM/utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import numpy as np 3 | 4 | def segment_image(image, segmentation_mask): 5 | image_array = np.array(image) 6 | segmented_image_array = np.zeros_like(image_array) 7 | segmented_image_array[segmentation_mask] = image_array[segmentation_mask] 8 | segmented_image = Image.fromarray(segmented_image_array) 9 | black_image = Image.new("RGB", image.size, (0, 0, 0)) 10 | transparency_mask = np.zeros_like(segmentation_mask, dtype=np.uint8) 11 | transparency_mask[segmentation_mask] = 255 12 | transparency_mask_image = Image.fromarray(transparency_mask, mode='L') 13 | black_image.paste(segmented_image, mask=transparency_mask_image) 14 | return black_image 15 | 16 | 17 | def convert_box_xywh_to_xyxy(box): 18 | x1 = box[0] 19 | y1 = box[1] 20 | x2 = box[0] + box[2] 21 | y2 = box[1] + box[3] 22 | return [x1, y1, x2, y2] 23 | 24 | 25 | def get_indices_of_values_above_threshold(values, threshold): 26 | return [i for i, v in enumerate(values) if v > threshold] -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/LaMa/lama_inpaint_demo.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import PIL 3 | import requests 4 | import numpy as np 5 | from lama_cleaner.model.lama import LaMa 6 | from lama_cleaner.schema import Config 7 | 8 | 9 | def download_image(url): 10 | image = PIL.Image.open(requests.get(url, stream=True).raw) 11 | image = PIL.ImageOps.exif_transpose(image) 12 | image = image.convert("RGB") 13 | return image 14 | 15 | 16 | img_url = "https://raw.githubusercontent.com/Sanster/lama-cleaner/main/assets/dog.jpg" 17 | mask_url = "https://user-images.githubusercontent.com/3998421/202105351-9fcc4bf8-129d-461a-8524-92e4caad431f.png" 18 | 19 | image = np.asarray(download_image(img_url)) 20 | 
mask = np.asarray(download_image(mask_url).convert("L")) 21 | 22 | # set to GPU for faster inference 23 | model = LaMa("cpu") 24 | result = model(image, mask, Config(hd_strategy="Original", ldm_steps=20, hd_strategy_crop_margin=128, hd_strategy_crop_trigger_size=800, hd_strategy_resize_limit=800)) 25 | cv2.imwrite("lama_inpaint_demo.jpg", result) -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/LaMa/sam_lama.py: -------------------------------------------------------------------------------- 1 | # !pip install diffusers transformers 2 | 3 | import requests 4 | import cv2 5 | import numpy as np 6 | import PIL 7 | from PIL import Image 8 | from io import BytesIO 9 | 10 | from segment_anything import sam_model_registry, SamPredictor 11 | 12 | from lama_cleaner.model.lama import LaMa 13 | from lama_cleaner.schema import Config 14 | 15 | """ 16 | Step 1: Download and preprocess demo images 17 | """ 18 | def download_image(url): 19 | image = PIL.Image.open(requests.get(url, stream=True).raw) 20 | image = PIL.ImageOps.exif_transpose(image) 21 | image = image.convert("RGB") 22 | return image 23 | 24 | 25 | img_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/input_image.png?raw=true" 26 | 27 | 28 | init_image = download_image(img_url) 29 | init_image = np.asarray(init_image) 30 | 31 | 32 | """ 33 | Step 2: Initialize SAM and LaMa models 34 | """ 35 | 36 | DEVICE = "cuda:1" 37 | 38 | # SAM 39 | SAM_ENCODER_VERSION = "vit_h" 40 | SAM_CHECKPOINT_PATH = "/comp_robot/rentianhe/code/Grounded-Segment-Anything/sam_vit_h_4b8939.pth" 41 | sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH).to(device=DEVICE) 42 | sam_predictor = SamPredictor(sam) 43 | sam_predictor.set_image(init_image) 44 | 45 | # LaMa 46 | model = LaMa(DEVICE) 47 | 48 | 49 | """ 50 | Step 3: Get masks with SAM by prompt (box or point) and inpaint the mask region by example image. 51 | """ 52 | 53 | input_point = np.array([[350, 256]]) 54 | input_label = np.array([1]) # positive label 55 | 56 | masks, _, _ = sam_predictor.predict( 57 | point_coords=input_point, 58 | point_labels=input_label, 59 | multimask_output=False 60 | ) 61 | masks = masks.astype(np.uint8) * 255 62 | # mask_pil = Image.fromarray(masks[0]) # simply save the first mask 63 | 64 | 65 | """ 66 | Step 4: Dilate Mask to make it more suitable for LaMa inpainting 67 | 68 | The idea behind dilate mask is to mask a larger region which will be better for inpainting. 
69 | 70 | Borrowed from Inpaint-Anything: https://github.com/geekyutao/Inpaint-Anything/blob/main/utils/utils.py#L18 71 | """ 72 | 73 | def dilate_mask(mask, dilate_factor=15): 74 | mask = mask.astype(np.uint8) 75 | mask = cv2.dilate( 76 | mask, 77 | np.ones((dilate_factor, dilate_factor), np.uint8), 78 | iterations=1 79 | ) 80 | return mask 81 | 82 | def save_array_to_img(img_arr, img_p): 83 | Image.fromarray(img_arr.astype(np.uint8)).save(img_p) 84 | 85 | # [1, 512, 512] to [512, 512] and save mask 86 | save_array_to_img(masks[0], "./mask.png") 87 | 88 | mask = dilate_mask(masks[0], dilate_factor=15) 89 | 90 | save_array_to_img(mask, "./dilated_mask.png") 91 | 92 | """ 93 | Step 5: Run LaMa inpaint model 94 | """ 95 | result = model(init_image, mask, Config(hd_strategy="Original", ldm_steps=20, hd_strategy_crop_margin=128, hd_strategy_crop_trigger_size=800, hd_strategy_resize_limit=800)) 96 | cv2.imwrite("sam_lama_demo.jpg", result) 97 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/PaintByExample/paint_by_example.py: -------------------------------------------------------------------------------- 1 | # !pip install diffusers transformers 2 | 3 | import PIL 4 | import requests 5 | import torch 6 | from io import BytesIO 7 | from diffusers import DiffusionPipeline 8 | 9 | 10 | """ 11 | Step 1: Download demo images 12 | """ 13 | def download_image(url): 14 | response = requests.get(url) 15 | return PIL.Image.open(BytesIO(response.content)).convert("RGB") 16 | 17 | 18 | img_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/input_image.png?raw=true" 19 | mask_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/mask.png?raw=true" 20 | example_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/pomeranian_example.jpg?raw=True" 21 | # example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg" 22 | 23 | init_image = download_image(img_url).resize((512, 512)) 24 | mask_image = download_image(mask_url).resize((512, 512)) 25 | example_image = download_image(example_url).resize((512, 512)) 26 | 27 | 28 | """ 29 | Step 2: Download pretrained weights and initialize model 30 | """ 31 | # set cache dir to store the weights 32 | cache_dir = "/comp_robot/rentianhe/weights/diffusers/" 33 | 34 | pipe = DiffusionPipeline.from_pretrained( 35 | "Fantasy-Studio/Paint-by-Example", 36 | torch_dtype=torch.float16, 37 | cache_dir=cache_dir, 38 | ) 39 | # set to device 40 | pipe = pipe.to("cuda:1") 41 | 42 | 43 | """ 44 | Step 3: Run PaintByExample pipeline and save image 45 | """ 46 | image = pipe( 47 | image=init_image, 48 | mask_image=mask_image, 49 | example_image=example_image, 50 | num_inference_steps=200, 51 | ).images[0] 52 | 53 | image.save("./paint_by_example_demo.jpg") 54 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/PaintByExample/sam_paint_by_example.py: -------------------------------------------------------------------------------- 1 | # !pip install diffusers transformers 2 | 3 | import requests 4 | import torch 5 | import numpy as np 6 | from PIL import Image 7 | from io import BytesIO 8 | from diffusers import DiffusionPipeline 9 | 10 | from segment_anything import sam_model_registry, SamPredictor 11 | 12 | 13 | """ 14 | 
Step 1: Download and preprocess example demo images 15 | """ 16 | def download_image(url): 17 | response = requests.get(url) 18 | return Image.open(BytesIO(response.content)).convert("RGB") 19 | 20 | 21 | img_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/input_image.png?raw=true" 22 | # example_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/pomeranian_example.jpg?raw=True" 23 | # example_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/example_image.jpg?raw=true" 24 | example_url = "https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/paint_by_example/labrador_example.jpg?raw=true" 25 | 26 | init_image = download_image(img_url).resize((512, 512)) 27 | example_image = download_image(example_url).resize((512, 512)) 28 | 29 | 30 | """ 31 | Step 2: Initialize SAM and PaintByExample models 32 | """ 33 | 34 | DEVICE = "cuda:1" 35 | 36 | # SAM 37 | SAM_ENCODER_VERSION = "vit_h" 38 | SAM_CHECKPOINT_PATH = "/comp_robot/rentianhe/code/Grounded-Segment-Anything/sam_vit_h_4b8939.pth" 39 | sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH).to(device=DEVICE) 40 | sam_predictor = SamPredictor(sam) 41 | sam_predictor.set_image(np.array(init_image)) 42 | 43 | # PaintByExample Pipeline 44 | CACHE_DIR = "/comp_robot/rentianhe/weights/diffusers/" 45 | pipe = DiffusionPipeline.from_pretrained( 46 | "Fantasy-Studio/Paint-by-Example", 47 | torch_dtype=torch.float16, 48 | cache_dir=CACHE_DIR, 49 | ) 50 | pipe = pipe.to(DEVICE) 51 | 52 | 53 | """ 54 | Step 3: Get masks with SAM by prompt (box or point) and inpaint the mask region by example image. 55 | """ 56 | 57 | input_point = np.array([[350, 256]]) 58 | input_label = np.array([1]) # positive label 59 | 60 | masks, _, _ = sam_predictor.predict( 61 | point_coords=input_point, 62 | point_labels=input_label, 63 | multimask_output=False 64 | ) 65 | mask = masks[0] # [1, 512, 512] to [512, 512] np.ndarray 66 | mask_pil = Image.fromarray(mask) 67 | 68 | mask_pil.save("./mask.jpg") 69 | 70 | image = pipe( 71 | image=init_image, 72 | mask_image=mask_pil, 73 | example_image=example_image, 74 | num_inference_steps=500, 75 | guidance_scale=9.0 76 | ).images[0] 77 | 78 | image.save("./paint_by_example_demo.jpg") 79 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/README.md: -------------------------------------------------------------------------------- 1 | ## Playground 2 | 3 | We will try more interesting **base models** and **build more fun demos** in the playground. In the playground, we will: 4 | 5 | - **Simplify the demo code** to make it easier for users to get started. 6 | - **Keep complete usage notes** and some pitfalls to reduce the burden on users. 
7 | 8 | ## Table of Contents 9 | - [DeepFloyd: Text-to-Image Generation](./DeepFloyd/) 10 | - [Dream: Text-to-Image Generation](./DeepFloyd/dream.py) 11 | - [Style Transfer](./DeepFloyd/style_transfer.py) 12 | - [Paint by Example: Exemplar-based Image Editing with Diffusion Models](./PaintByExample/) 13 | - [Diffuser Demo](./PaintByExample/paint_by_example.py) 14 | - [PaintByExample with SAM](./PaintByExample/sam_paint_by_example.py) 15 | - [LaMa: Resolution-robust Large Mask Inpainting with Fourier Convolutions](./LaMa/) 16 | - [LaMa Demo](./LaMa/lama_inpaint_demo.py) 17 | - [LaMa with SAM](./LaMa/sam_lama.py) 18 | - [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](./RePaint/) 19 | - [RePaint Demo](./RePaint/repaint.py) 20 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/RePaint/README.md: -------------------------------------------------------------------------------- 1 | ## RePaint: Inpainting using Denoising Diffusion Probabilistic Models 2 | 3 | :grapes: [[Official Project Page](https://github.com/andreas128/RePaint)] 4 | 5 |
6 | 7 | ![](https://user-images.githubusercontent.com/11280511/150803812-a4729ef8-6ad4-46aa-ae99-8c27fbb2ea2e.png) 8 | 9 |
10 | 11 | ## Abstract 12 | 13 | > Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces highquality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks. RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions. 14 | 15 | 16 | ## Table of Contents 17 | - [Installation](#installation) 18 | - [Repaint Demos](#repaint-demos) 19 | - [Diffuser Demo](#repaint-diffuser-demos) 20 | 21 | 22 | ## TODO 23 | - [x] RePaint Diffuser Demo 24 | - [ ] RePaint with SAM 25 | - [ ] RePaint with GroundingDINO 26 | - [ ] RePaint with Grounded-SAM 27 | 28 | ## Installation 29 | We're using RePaint with diffusers; install diffusers as follows: 30 | ```bash 31 | pip install diffusers==0.16.1 32 | ``` 33 | Then install Grounded-SAM following [Grounded-SAM Installation](https://github.com/IDEA-Research/Grounded-Segment-Anything#installation) for some extension demos. 34 | 35 | ## RePaint Demos 36 | Here we provide the demos for `RePaint`. 37 | 38 | 39 | ### RePaint Diffuser Demos 40 | ```bash 41 | cd playground/RePaint 42 | python repaint.py 43 | ``` 44 | **Notes:** set `cache_dir` to save the pretrained weights to a specific folder. The inpainting result will be saved as `repaint_demo.jpg`: 45 | 46 |
47 | 48 | | Input Image | Mask | Inpaint Result | 49 | |:----:|:----:|:----:| 50 | | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/repaint/celeba_hq_256.png?raw=true) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/repaint/mask_256.png?raw=true) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/repaint/repaint_demo.jpg?raw=true) | 51 | 52 | 53 |
54 | 55 | 56 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/playground/RePaint/repaint.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import torch 4 | 5 | import PIL 6 | import requests 7 | from diffusers import RePaintPipeline, RePaintScheduler 8 | 9 | 10 | def download_image(url): 11 | response = requests.get(url) 12 | return PIL.Image.open(BytesIO(response.content)).convert("RGB") 13 | 14 | 15 | img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png" 16 | mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png" 17 | 18 | # Load the original image and the mask as PIL images 19 | original_image = download_image(img_url).resize((256, 256)) 20 | mask_image = download_image(mask_url).resize((256, 256)) 21 | 22 | # Load the RePaint scheduler and pipeline based on a pretrained DDPM model 23 | DEVICE = "cuda:1" 24 | CACHE_DIR = "/comp_robot/rentianhe/weights/diffusers/" 25 | scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", cache_dir=CACHE_DIR) 26 | pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler, cache_dir=CACHE_DIR) 27 | pipe = pipe.to(DEVICE) 28 | 29 | generator = torch.Generator(device=DEVICE).manual_seed(0) 30 | output = pipe( 31 | image=original_image, 32 | mask_image=mask_image, 33 | num_inference_steps=250, 34 | eta=0.0, 35 | jump_length=10, 36 | jump_n_sample=10, 37 | generator=generator, 38 | ) 39 | inpainted_image = output.images[0] 40 | inpainted_image.save("./repaint_demo.jpg") -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | diffusers 3 | gradio 4 | huggingface_hub 5 | matplotlib 6 | numpy 7 | onnxruntime 8 | opencv_python 9 | Pillow 10 | pycocotools 11 | PyYAML 12 | requests 13 | setuptools 14 | supervision 15 | termcolor 16 | timm 17 | torch 18 | torchvision 19 | transformers 20 | yapf 21 | nltk 22 | fairscale 23 | litellm 24 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = W503, E203, E221, C901, C408, E741, C407, B017, F811, C101, EXE001, EXE002 3 | max-line-length = 100 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | per-file-ignores = 7 | **/__init__.py:F401,F403,E402 8 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to segment-anything 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints, using the `linter.sh` script in the project's root directory. Linting requires `black==23.*`, `isort==5.12.0`, `flake8`, and `mypy`. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to segment-anything, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 
32 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/masks1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/masks1.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/masks2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/masks2.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/model_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/model_diagram.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/notebook1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/notebook1.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/assets/notebook2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/assets/notebook2.png -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | { 5 | black --version | grep -E "23\." > /dev/null 6 | } || { 7 | echo "Linter requires 'black==23.*' !" 8 | exit 1 9 | } 10 | 11 | ISORT_VERSION=$(isort --version-number) 12 | if [[ "$ISORT_VERSION" != 5.12* ]]; then 13 | echo "Linter requires isort==5.12.0 !" 14 | exit 1 15 | fi 16 | 17 | echo "Running isort ..." 18 | isort . --atomic 19 | 20 | echo "Running black ..." 21 | black -l 100 . 22 | 23 | echo "Running flake8 ..." 24 | if [ -x "$(command -v flake8)" ]; then 25 | flake8 . 26 | else 27 | python3 -m flake8 . 28 | fi 29 | 30 | echo "Running mypy..." 31 | 32 | mypy --exclude 'setup.py|notebooks' . 
33 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/dog.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/groceries.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/groceries.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/truck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/Grounded-Segment-Anything/segment_anything/notebooks/images/truck.jpg -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .build_sam_hq import ( 15 | build_sam_hq, 16 | build_sam_hq_vit_h, 17 | build_sam_hq_vit_l, 18 | build_sam_hq_vit_b, 19 | sam_hq_model_registry, 20 | ) 21 | from .predictor import SamPredictor 22 | from .automatic_mask_generator import SamAutomaticMaskGenerator 23 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
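# Note: this module builds the three SAM ViT variants (ViT-H, ViT-L, ViT-B) and exposes them via
# sam_model_registry; _build_sam wires the ViT image encoder, prompt encoder, and mask decoder
# together and optionally loads weights from a checkpoint file.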
6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/build_sam_hq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
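# Note: this module mirrors build_sam.py but plugs in MaskDecoderHQ; the loop at the end keeps only
# the HQ-specific modules (hf_token, hf_mlp, compress_vit_feat, embedding_encoder,
# embedding_maskfeature) trainable by setting requires_grad=False on everything else.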
6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoderHQ, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_hq_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam_hq = build_sam_hq_vit_h 25 | 26 | 27 | def build_sam_hq_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_hq_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_hq_model_registry = { 48 | "default": build_sam_hq_vit_h, 49 | "vit_h": build_sam_hq_vit_h, 50 | "vit_l": build_sam_hq_vit_l, 51 | "vit_b": build_sam_hq_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoderHQ( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | vit_dim=encoder_embed_dim, 99 | ), 100 | pixel_mean=[123.675, 116.28, 103.53], 101 | pixel_std=[58.395, 57.12, 57.375], 102 | ) 103 | # sam.eval() 104 | if checkpoint is not None: 105 | with open(checkpoint, "rb") as f: 106 | device = "cuda" if torch.cuda.is_available() else "cpu" 107 | state_dict = torch.load(f, map_location=device) 108 | info = sam.load_state_dict(state_dict, strict=False) 109 | print(info) 110 | for n, p in sam.named_parameters(): 111 | if 'hf_token' not in n and 'hf_mlp' not in n and 'compress_vit_feat' not in n and 'embedding_encoder' not in n and 'embedding_maskfeature' not in n: 112 | p.requires_grad = False 113 | 114 | return sam 115 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder_hq import MaskDecoderHQ 10 | from .mask_decoder import MaskDecoder 11 | from .prompt_encoder import PromptEncoder 12 | from .transformer import TwoWayTransformer 13 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 
21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 
97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools 6 | skip_glob=*/__init__.py 7 | known_myself=segment_anything 8 | known_third_party=matplotlib,cv2,torch,torchvision,pycocotools,onnx,black,isort 9 | no_lines_before=STDLIB,THIRDPARTY 10 | sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER 11 | default_section=FIRSTPARTY 12 | -------------------------------------------------------------------------------- /third_party/Grounded-Segment-Anything/segment_anything/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from setuptools import find_packages, setup 8 | 9 | setup( 10 | name="segment_anything", 11 | version="1.0", 12 | install_requires=[], 13 | packages=find_packages(exclude="notebooks"), 14 | extras_require={ 15 | "all": ["matplotlib", "pycocotools", "opencv-python", "onnx", "onnxruntime"], 16 | "dev": ["flake8", "isort", "black", "mypy"], 17 | }, 18 | ) 19 | -------------------------------------------------------------------------------- /third_party/UniDepth/.gitignore: -------------------------------------------------------------------------------- 1 | # don't upload macOS folder info 2 | *.DS_Store 3 | 4 | #python 5 | *.pyc 6 | __pycache__/ 7 | 8 | #scripts 9 | *.sh 10 | 11 | # package 12 | unidepth.egg-info 13 | -------------------------------------------------------------------------------- /third_party/UniDepth/assets/demo/depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/demo/depth.png -------------------------------------------------------------------------------- /third_party/UniDepth/assets/demo/intrinsics.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/demo/intrinsics.npy -------------------------------------------------------------------------------- /third_party/UniDepth/assets/demo/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/demo/output.png -------------------------------------------------------------------------------- /third_party/UniDepth/assets/demo/rgb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/demo/rgb.png -------------------------------------------------------------------------------- /third_party/UniDepth/assets/docs/V2_README.md: 
-------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | 4 | ### Input shape and ratio flexibility. 5 | 6 | 1. Input images will not be reshaped to a specific image size. Training image ratios are in the range `[2/3, 2/1]`, so if your image ratio falls outside these boundaries, we suggest cropping or padding it to be within the image ratio bounds. 7 | 8 | 2. UniDepthV2 exposes the attribute `self.resolution_level` (with range `[0,10]`), which is used in the preprocess function and can be used to trade off resolution and speed, with a **possible effect** on the output scale. In particular, the level describes the linear interpolation degree of the processed image area within the training bounds. The training image area (named "pixels") for ViT is in the range `[1400, 2400]` (see `pixels_bounds` in the config). If the attribute is not set, the max level, i.e. 10, will be used. We use the concept of "pixels" loosely: it refers to the image area after patchification, e.g. for ViT it is `1/14**2` of the actual original image area. 9 | 10 | 3. The infer method uses the interpolation mode defined by the attribute `self.interpolation_mode`; the default is `bilinear`. 11 | 12 | 13 | ### Confidence output 14 | 15 | The model outputs a confidence in the range `[0, 1]` that represents the ARel error after affine matching with the GT. The confidence itself is shift invariant, i.e. it is a ranking that is relative within one input. In particular, it does not have an absolute meaning (e.g. no heteroscedastic noise modelling). 16 | 17 | 18 | ### Decoder design: separately predicting scale-shift-invariant depth and a global scale and shift to allow more diverse training. 19 | 20 | The decoder presents three heads: `Camera`, `Depth` and `Global`. The `Depth` head predicts scale- and shift-invariant depth: the exponential of normalized values. 21 | The `Global` head predicts the scale and shift that match the `Depth` head output to metric depth. 22 | With this design we can seamlessly mix datasets with metric GT, scale-invariant GT (i.e., SfM) or scale-shift-invariant GT by turning down the gradient to the `Global` head when the GT is either scale- or shift-invariant. 23 | This allows us to scale up the training variety. 24 | Versions 1 and 2 show similar performance, but the output of version 2 may look noisier because the greater diversity is linked to lower GT quality, which introduces artifacts... 25 | 26 | 27 | ### Faster inference 28 | 29 | The model is >30% faster than V1, tested on an RTX 4090 with the float16 data type. 30 | 31 | 32 | ### ONNX support 33 | 34 | We added support for UniDepthV2 in __ONNX__ format, 35 | both with and without GT intrinsics. 36 | It does not allow for dynamic shapes at test time. 37 | For instance, you can run from the root of the repo: 38 | ```bash 39 | 40 | python ./unidepth/models/unidepthv2/export.py --version v2 --backbone vitl14 --shape (462, 616) --output-path unidepthv2.onnx --with-camera 41 | ``` 42 | 43 | The shape will be changed to the closest shape that is a multiple of 14, i.e. the ViT patch size. 44 | Your input shape at inference time will have to match the (resized) shape passed to the exporter! 45 | The corresponding __ONNX__ model does not do any pre- or post-processing. 46 | Therefore, you should input an ImageNet-statistics-normalized RGB image rescaled to the given input shape and, if `--with-camera` is set, the corresponding (properly rescaled) camera intrinsics.
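A minimal, untested sketch of what inference with the exported graph could look like in ONNX Runtime is given below. The input/output names and their order are assumptions (inspect the exported model, e.g. via `session.get_inputs()`, to confirm), the demo asset paths are reused from this repo, and the intrinsics input only exists if the model was exported with `--with-camera`:
```python
# Hypothetical ONNX Runtime inference sketch; names and ordering are assumptions.
import numpy as np
import onnxruntime as ort
from PIL import Image

H, W = 462, 616  # must match the (resized) shape passed to the exporter

# ImageNet-statistics normalization, since the exported graph does no preprocessing
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

rgb = np.asarray(Image.open("assets/demo/rgb.png").convert("RGB").resize((W, H)), dtype=np.float32) / 255.0
image = ((rgb - mean) / std).transpose(2, 0, 1)[None].astype(np.float32)  # 1x3xHxW

# Intrinsics are assumed to be already rescaled by the same factors used to resize the image
K = np.load("assets/demo/intrinsics.npy").astype(np.float32)[None]  # 1x3x3

session = ort.InferenceSession("unidepthv2.onnx")
# Feed inputs in the order the exporter defined them; if exported without
# --with-camera there is only the image input.
feed = {inp.name: arr for inp, arr in zip(session.get_inputs(), (image, K))}
outputs = session.run(None, feed)
depth = outputs[0]  # assumed to be the metric depth map
```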
47 | 48 | 49 | Disclaimer: Not fully tested -------------------------------------------------------------------------------- /third_party/UniDepth/assets/docs/nuscenes_surround.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/docs/nuscenes_surround.gif -------------------------------------------------------------------------------- /third_party/UniDepth/assets/docs/theoffice.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/docs/theoffice.gif -------------------------------------------------------------------------------- /third_party/UniDepth/assets/docs/unidepth-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/third_party/UniDepth/assets/docs/unidepth-banner.png -------------------------------------------------------------------------------- /third_party/UniDepth/configs/config_v1_cnvnxtl.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13 4 | }, 5 | "training": { 6 | }, 7 | "data": { 8 | "image_shape": [462, 616] 9 | }, 10 | "model": { 11 | "name": "UniDepthV1", 12 | "num_heads": 8, 13 | "expansion": 4, 14 | "pixel_decoder": { 15 | "hidden_dim": 512, 16 | "depths": [3, 2, 1], 17 | "dropout": 0.0 18 | }, 19 | "pixel_encoder": { 20 | "name": "convnext_large", 21 | "pretrained": null 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /third_party/UniDepth/configs/config_v1_vitl14.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13 4 | }, 5 | "training": {}, 6 | "data": { 7 | "image_shape": [462, 616] 8 | }, 9 | "model": { 10 | "name": "UniDepthV1", 11 | "num_heads": 8, 12 | "expansion": 4, 13 | "pixel_decoder": { 14 | "hidden_dim": 512, 15 | "depths": [3, 2, 1], 16 | "dropout": 0.0 17 | }, 18 | "pixel_encoder": { 19 | "name": "dinov2_vitl14", 20 | "pretrained": null 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /third_party/UniDepth/configs/config_v2_vitl14.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13, 4 | "deterministic": true 5 | }, 6 | "training": {}, 7 | "data": { 8 | "image_shape": [420, 560], 9 | "shape_constraints": { 10 | "ratio_bounds": [0.66, 2.0], 11 | "pixels_bounds": [1400, 2400], 12 | "patch_size": 14 13 | } 14 | }, 15 | "model": { 16 | "name": "UniDepthV2", 17 | "num_heads": 8, 18 | "expansion": 4, 19 | "pixel_decoder": { 20 | "hidden_dim": 512, 21 | "depths": [6, 0, 0], 22 | "dropout": 0.0 23 | }, 24 | "pixel_encoder": { 25 | "name": "dinov2_vitl14", 26 | "pretrained": null, 27 | "use_norm": true, 28 | "stacking_fn": "last", 29 | "output_idx": [21,22,23,24] 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /third_party/UniDepth/configs/config_v2_vits14.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13, 4 | "deterministic": true 5 | }, 6 | "training": {}, 7 | "data": { 8 
| "image_shape": [420, 560], 9 | "shape_constraints": { 10 | "ratio_bounds": [0.66, 2.0], 11 | "pixels_bounds": [1400, 2400], 12 | "patch_size": 14 13 | } 14 | }, 15 | "model": { 16 | "name": "UniDepthV2", 17 | "num_heads": 8, 18 | "expansion": 4, 19 | "pixel_decoder": { 20 | "hidden_dim": 512, 21 | "depths": [6, 0, 0], 22 | "dropout": 0.0 23 | }, 24 | "pixel_encoder": { 25 | "name": "dinov2_vits14", 26 | "pretrained": null, 27 | "use_norm": true, 28 | "stacking_fn": "last", 29 | "output_idx": [9,10,11,12] 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /third_party/UniDepth/hubconf.py: -------------------------------------------------------------------------------- 1 | dependencies = ["torch", "huggingface_hub"] 2 | 3 | import os 4 | import json 5 | 6 | import torch 7 | import huggingface_hub 8 | 9 | from unidepth.models import UniDepthV1, UniDepthV2 10 | 11 | 12 | MAP_VERSIONS = { 13 | "v1": UniDepthV1, 14 | "v2": UniDepthV2 15 | } 16 | 17 | BACKBONES = { 18 | "v1": ["vitl14", "cnvnxtl"], 19 | "v2": ["vitl14", "vits14"] 20 | } 21 | 22 | 23 | def UniDepth(version="v2", backbone="vitl14", pretrained=True): 24 | assert version in MAP_VERSIONS.keys(), f"version must be one of {list(MAP_VERSIONS.keys())}" 25 | assert backbone in BACKBONES[version], f"backbone for current version ({version}) must be one of {list(BACKBONES[version])}" 26 | repo_dir = os.path.dirname(os.path.realpath(__file__)) 27 | with open(os.path.join(repo_dir, "configs", f"config_{version}_{backbone}.json")) as f: 28 | config = json.load(f) 29 | 30 | model = MAP_VERSIONS[version](config) 31 | if pretrained: 32 | path = huggingface_hub.hf_hub_download(repo_id=f"lpiccinelli/unidepth-{version}-{backbone}", filename=f"pytorch_model.bin", repo_type="model") 33 | info = model.load_state_dict(torch.load(path), strict=False) 34 | print(f"UniDepth_{version}_{backbone} is loaded with:") 35 | print(f"\t missing keys: {info.missing_keys}") 36 | print(f"\t additional keys: {info.unexpected_keys}") 37 | 38 | return model 39 | 40 | -------------------------------------------------------------------------------- /third_party/UniDepth/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.pyright] 6 | include = ["unidepth"] 7 | 8 | [project] 9 | name = "unidepth" 10 | version = "0.1" 11 | authors = [{name = "Luigi Piccinelli", email = "lpiccinelli@ethz.ch"}] 12 | description = "UniDepth: Universal Monocular Metric Depth Estimation" 13 | readme = "README.md" 14 | license = { text="Creatives Common BY-NC 4.0 license"} 15 | requires-python = ">=3.10.0" 16 | dynamic = ["dependencies"] 17 | 18 | [tool.setuptools.dynamic] 19 | dependencies = {file = ["requirements.txt"]} 20 | 21 | [tool.setuptools.package-data] 22 | "*" = ["py.typed"] 23 | 24 | [tool.setuptools.packages.find] 25 | include = ["unidepth*"] 26 | -------------------------------------------------------------------------------- /third_party/UniDepth/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs 2 | attrs 3 | black 4 | blosc2 5 | botocore==1.34.54 6 | certifi==2022.12.7 7 | charset-normalizer 8 | click 9 | contourpy 10 | cycler 11 | docker-pycreds 12 | einops==0.7.0 13 | filelock 14 | flake8==7.0.0 15 | flake8-bugbear==24.2.6 16 | flake8-comprehensions==3.14.0 17 | fonttools 18 | fsspec 19 | fvcore==0.1.5.post20221221 20 | 
gitdb 21 | GitPython 22 | h5py>=3.10.0 23 | huggingface-hub>=0.22.0 24 | idna 25 | imageio 26 | imath 27 | iopath 28 | isort 29 | Jinja2 30 | jmespath 31 | kiwisolver 32 | MarkupSafe 33 | matplotlib 34 | mccabe 35 | mpmath 36 | msgpack 37 | mypy-extensions 38 | ndindex 39 | networkx 40 | ninja 41 | numexpr 42 | numpy<2.0.0 43 | opencv-python 44 | OpenEXR 45 | packaging 46 | pandas 47 | pathspec 48 | pillow==10.2.0 49 | platformdirs 50 | portalocker 51 | protobuf==4.25.3 52 | psutil 53 | py-cpuinfo 54 | pycodestyle 55 | pyflakes 56 | pyparsing 57 | python-dateutil 58 | pytz 59 | PyYAML 60 | requests 61 | safetensors 62 | scipy 63 | sentry-sdk 64 | setproctitle 65 | six 66 | smmap 67 | sympy 68 | tables 69 | tabulate 70 | termcolor 71 | timm 72 | tqdm 73 | triton==2.2.0 74 | typing_extensions 75 | tzdata==2024.1 76 | urllib3==1.26.13 77 | wandb 78 | yacs 79 | torch==2.2.0 80 | torchvision==0.17.0 81 | torchaudio==2.2.0 82 | xformers==0.0.24 -------------------------------------------------------------------------------- /third_party/UniDepth/run_unidepth.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | torch.backends.cudnn.enabled = False 7 | 8 | from PIL import Image 9 | from tqdm import tqdm 10 | import json 11 | import argparse 12 | from unidepth.models import UniDepthV1 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description="Dataset Configuration") 17 | parser.add_argument('--dataset', type=str, default='SUNRGBD', help='Name of the dataset') 18 | return parser.parse_args() 19 | 20 | version="v1" 21 | backbone="ViTL14" 22 | 23 | model = UniDepthV1.from_pretrained("lpiccinelli/unidepth-v1-vitl14") 24 | 25 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 26 | model = model.to(device) 27 | 28 | 29 | def process(dataset): 30 | for mode in ['train', 'val']: 31 | with open(f'datasets/Omni3D/{dataset}_{mode}.json', 'r') as file: 32 | data = json.load(file) 33 | 34 | for i in tqdm(range(len(data['images']))): 35 | filename = data['images'][i]['file_path'] 36 | rgb = torch.from_numpy(np.array(Image.open(f'datasets/{filename}'))).permute(2, 0, 1) 37 | intrinsics = np.array(data['images'][i]['K']).reshape(3,3) 38 | intrinsics = torch.from_numpy(intrinsics).float() 39 | file_name = data['images'][i]['id'] 40 | 41 | predictions = model.infer(rgb, intrinsics) 42 | depth = predictions["depth"] 43 | intrinsics = predictions["intrinsics"] 44 | 45 | outdir = f'pseudo_label/{dataset}/{mode}/depth' 46 | os.makedirs(outdir, exist_ok=True) 47 | np.save(os.path.join(outdir, f"{file_name}"), depth.cpu().numpy().squeeze(0).squeeze(0)) 48 | 49 | 50 | if __name__ == "__main__": 51 | args = parse_args() 52 | print(f"Dataset name: {args.dataset}") 53 | process(args.dataset) 54 | -------------------------------------------------------------------------------- /third_party/UniDepth/scripts/demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from PIL import Image 4 | 5 | from unidepth.models import UniDepthV1, UniDepthV2 6 | from unidepth.utils import colorize, image_grid 7 | 8 | 9 | def demo(model): 10 | rgb = np.array(Image.open("assets/demo/rgb.png")) 11 | rgb_torch = torch.from_numpy(rgb).permute(2, 0, 1) 12 | intrinsics_torch = torch.from_numpy(np.load("assets/demo/intrinsics.npy")) 13 | 14 | # predict 15 | predictions = 
model.infer(rgb_torch, intrinsics_torch) 16 | 17 | # get GT and pred 18 | depth_pred = predictions["depth"].squeeze().cpu().numpy() 19 | depth_gt = np.array(Image.open("assets/demo/depth.png")).astype(float) / 1000.0 20 | 21 | # compute error, you have zero divison where depth_gt == 0.0 22 | depth_arel = np.abs(depth_gt - depth_pred) / depth_gt 23 | depth_arel[depth_gt == 0.0] = 0.0 24 | 25 | # colorize 26 | depth_pred_col = colorize(depth_pred, vmin=0.01, vmax=10.0, cmap="magma_r") 27 | depth_gt_col = colorize(depth_gt, vmin=0.01, vmax=10.0, cmap="magma_r") 28 | depth_error_col = colorize(depth_arel, vmin=0.0, vmax=0.2, cmap="coolwarm") 29 | 30 | # save image with pred and error 31 | artifact = image_grid([rgb, depth_gt_col, depth_pred_col, depth_error_col], 2, 2) 32 | Image.fromarray(artifact).save("assets/demo/output.png") 33 | 34 | print("Available predictions:", list(predictions.keys())) 35 | print(f"ARel: {depth_arel[depth_gt > 0].mean() * 100:.2f}%") 36 | 37 | 38 | if __name__ == "__main__": 39 | print("Torch version:", torch.__version__) 40 | name = "unidepth-v2-vitl14" 41 | # model = UniDepthV1.from_pretrained("lpiccinelli/unidepth-v1-vitl14") 42 | model = UniDepthV2.from_pretrained(f"lpiccinelli/{name}") 43 | 44 | # set resolution level (only V2) 45 | # model.resolution_level = 0 46 | 47 | # set interpolation mode (only V2) 48 | # model.interpolation_mode = "bilinear" 49 | 50 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 51 | model = model.to(device) 52 | demo(model) 53 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .activation import GEGLU, SwiGLU 2 | from .attention import AttentionBlock, AttentionDecoderBlock 3 | from .convnext import CvnxtBlock 4 | from .mlp import MLP 5 | from .nystrom_attention import NystromBlock 6 | from .positional_encoding import PositionEmbeddingSine 7 | from .upsample import (ConvUpsample, ConvUpsampleShuffle, 8 | ConvUpsampleShuffleResidual) 9 | 10 | __all__ = [ 11 | "SwiGLU", 12 | "GEGLU", 13 | "CvnxtBlock", 14 | "AttentionBlock", 15 | "NystromBlock", 16 | "PositionEmbeddingSine", 17 | "ConvUpsample", 18 | "MLP", 19 | "ConvUpsampleShuffle", 20 | "AttentionDecoderBlock", 21 | "ConvUpsampleShuffleResidual", 22 | ] 23 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/activation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class SwiGLU(nn.Module): 7 | def forward(self, x: torch.Tensor) -> torch.Tensor: 8 | x, gates = x.chunk(2, dim=-1) 9 | return x * F.silu(gates) 10 | 11 | 12 | class GEGLU(nn.Module): 13 | def forward(self, x: torch.Tensor) -> torch.Tensor: 14 | x, gates = x.chunk(2, dim=-1) 15 | return x * F.gelu(gates) 16 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/convnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class CvnxtBlock(nn.Module): 6 | def __init__( 7 | self, 8 | dim, 9 | kernel_size=7, 10 | layer_scale=1.0, 11 | expansion=4, 12 | dilation=1, 13 | padding_mode: str = "zeros", 14 | ): 15 | super().__init__() 16 | self.dwconv = nn.Conv2d( 17 | dim, 18 | dim, 19 | 
kernel_size=kernel_size, 20 | padding=dilation * (kernel_size - 1) // 2, 21 | groups=dim, 22 | dilation=dilation, 23 | padding_mode=padding_mode, 24 | ) # depthwise conv 25 | self.norm = nn.LayerNorm(dim) 26 | self.pwconv1 = nn.Linear(dim, expansion * dim) 27 | self.act = nn.GELU() 28 | self.pwconv2 = nn.Linear(expansion * dim, dim) 29 | self.gamma = ( 30 | nn.Parameter(layer_scale * torch.ones((dim))) if layer_scale > 0.0 else 1.0 31 | ) 32 | 33 | def forward(self, x): 34 | input = x 35 | x = self.dwconv(x) 36 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 37 | x = self.norm(x) 38 | x = self.pwconv1(x) 39 | x = self.act(x) 40 | x = self.pwconv2(x) 41 | 42 | x = self.gamma * x 43 | x = input + x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 44 | return x 45 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def drop_path(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False): 6 | if drop_prob == 0.0 or not training: 7 | return x 8 | keep_prob = 1 - drop_prob 9 | shape = (x.shape[0],) + (1,) * ( 10 | x.ndim - 1 11 | ) # work with diff dim tensors, not just 2D ConvNets 12 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 13 | if keep_prob > 0.0: 14 | random_tensor.div_(keep_prob) 15 | output = x * random_tensor 16 | return output 17 | 18 | 19 | class DropPath(nn.Module): 20 | def __init__(self, drop_prob=None): 21 | super(DropPath, self).__init__() 22 | self.drop_prob = drop_prob 23 | 24 | def forward(self, x): 25 | return drop_path(x, self.drop_prob, self.training) 26 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LayerScale(nn.Module): 6 | def __init__( 7 | self, 8 | dim: int, 9 | init_values: float | torch.Tensor = 1e-5, 10 | inplace: bool = False, 11 | ) -> None: 12 | super().__init__() 13 | self.inplace = inplace 14 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 18 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from unidepth.utils.misc import default 5 | 6 | from .activation import SwiGLU 7 | 8 | 9 | class MLP(nn.Module): 10 | def __init__( 11 | self, 12 | input_dim: int, 13 | expansion: int = 4, 14 | dropout: float = 0.0, 15 | gated: bool = False, 16 | output_dim: int | None = None, 17 | ): 18 | super().__init__() 19 | if gated: 20 | expansion = int(expansion * 2 / 3) 21 | hidden_dim = int(input_dim * expansion) 22 | output_dim = default(output_dim, input_dim) 23 | self.norm = nn.LayerNorm(input_dim) 24 | self.proj1 = nn.Linear(input_dim, hidden_dim) 25 | self.proj2 = nn.Linear(hidden_dim, output_dim) 26 | self.act = nn.GELU() if not gated else SwiGLU() 27 | self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity() 28 | 29 | def forward(self, x: torch.Tensor) -> torch.Tensor: 30 | x = self.norm(x) 31 | x = self.proj1(x) 32 | x = self.act(x) 33 | 
x = self.proj2(x) 34 | x = self.dropout(x) 35 | return x 36 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/nystrom_attention.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from einops import rearrange 7 | from xformers.components.attention import NystromAttention 8 | 9 | from .attention import AttentionBlock 10 | 11 | 12 | class NystromBlock(AttentionBlock): 13 | def __init__( 14 | self, 15 | dim: int, 16 | num_heads: int = 4, 17 | expansion: int = 4, 18 | dropout: float = 0.0, 19 | cosine: bool = False, 20 | gated: bool = False, 21 | layer_scale: float = 1.0, 22 | context_dim: int | None = None, 23 | ): 24 | super().__init__( 25 | dim=dim, 26 | num_heads=num_heads, 27 | expansion=expansion, 28 | dropout=dropout, 29 | cosine=cosine, 30 | gated=gated, 31 | layer_scale=layer_scale, 32 | context_dim=context_dim, 33 | ) 34 | self.attention_fn = NystromAttention( 35 | num_landmarks=128, num_heads=num_heads, dropout=dropout 36 | ) 37 | 38 | def attn( 39 | self, 40 | x: torch.Tensor, 41 | attn_bias: torch.Tensor | None = None, 42 | context: torch.Tensor | None = None, 43 | pos_embed: torch.Tensor | None = None, 44 | pos_embed_context: torch.Tensor | None = None, 45 | rope: nn.Module | None = None, 46 | ) -> torch.Tensor: 47 | x = self.norm_attnx(x) 48 | context = self.norm_attnctx(context) 49 | k, v = rearrange( 50 | self.kv(context), "b n (kv h d) -> b n h d kv", h=self.num_heads, kv=2 51 | ).unbind(dim=-1) 52 | q = rearrange(self.q(x), "b n (h d) -> b n h d", h=self.num_heads) 53 | 54 | if rope is not None: 55 | q = rope(q) 56 | k = rope(k) 57 | else: 58 | if pos_embed is not None: 59 | pos_embed = rearrange( 60 | pos_embed, "b n (h d) -> b n h d", h=self.num_heads 61 | ) 62 | q = q + pos_embed 63 | if pos_embed_context is not None: 64 | pos_embed_context = rearrange( 65 | pos_embed_context, "b n (h d) -> b n h d", h=self.num_heads 66 | ) 67 | k = k + pos_embed_context 68 | 69 | if self.cosine: 70 | q, k = map(partial(F.normalize, p=2, dim=-1), (q, k)) # cosine sim 71 | x = self.attention_fn(q, k, v, key_padding_mask=attn_bias) 72 | x = rearrange(x, "b n h d -> b n (h d)") 73 | x = self.out(x) 74 | return x 75 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/layers/upsample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Luigi Piccinelli 3 | Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/) 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | from einops import rearrange 9 | 10 | from .convnext import CvnxtBlock 11 | 12 | 13 | class ConvUpsample(nn.Module): 14 | def __init__( 15 | self, 16 | hidden_dim, 17 | num_layers: int = 2, 18 | expansion: int = 4, 19 | layer_scale: float = 1.0, 20 | kernel_size: int = 7, 21 | **kwargs, 22 | ): 23 | super().__init__() 24 | self.convs = nn.ModuleList([]) 25 | for _ in range(num_layers): 26 | self.convs.append( 27 | CvnxtBlock( 28 | hidden_dim, 29 | kernel_size=kernel_size, 30 | expansion=expansion, 31 | layer_scale=layer_scale, 32 | ) 33 | ) 34 | self.up = nn.Sequential( 35 | nn.Conv2d(hidden_dim, hidden_dim // 2, kernel_size=1, padding=0), 36 | nn.UpsamplingBilinear2d(scale_factor=2), 37 | nn.Conv2d(hidden_dim // 2, hidden_dim // 2, kernel_size=3, padding=1), 38 | 
) 39 | 40 | def forward(self, x: torch.Tensor): 41 | for conv in self.convs: 42 | x = conv(x) 43 | x = self.up(x) 44 | x = rearrange(x, "b c h w -> b (h w) c") 45 | return x 46 | 47 | 48 | class ConvUpsampleShuffle(nn.Module): 49 | def __init__( 50 | self, 51 | hidden_dim, 52 | num_layers: int = 2, 53 | expansion: int = 4, 54 | layer_scale: float = 1.0, 55 | kernel_size: int = 7, 56 | **kwargs, 57 | ): 58 | super().__init__() 59 | self.convs = nn.ModuleList([]) 60 | for _ in range(num_layers): 61 | self.convs.append( 62 | CvnxtBlock( 63 | hidden_dim, 64 | kernel_size=kernel_size, 65 | expansion=expansion, 66 | layer_scale=layer_scale, 67 | ) 68 | ) 69 | self.up = nn.Sequential( 70 | nn.PixelShuffle(2), 71 | nn.Conv2d(hidden_dim // 4, hidden_dim // 2, kernel_size=3, padding=1), 72 | ) 73 | 74 | def forward(self, x: torch.Tensor): 75 | for conv in self.convs: 76 | x = conv(x) 77 | x = self.up(x) 78 | x = rearrange(x, "b c h w -> b (h w) c") 79 | return x 80 | 81 | 82 | class ConvUpsampleShuffleResidual(nn.Module): 83 | def __init__( 84 | self, 85 | hidden_dim, 86 | num_layers: int = 2, 87 | expansion: int = 4, 88 | layer_scale: float = 1.0, 89 | kernel_size: int = 7, 90 | padding_mode: str = "zeros", 91 | **kwargs, 92 | ): 93 | super().__init__() 94 | self.convs = nn.ModuleList([]) 95 | for _ in range(num_layers): 96 | self.convs.append( 97 | CvnxtBlock( 98 | hidden_dim, 99 | kernel_size=kernel_size, 100 | expansion=expansion, 101 | layer_scale=layer_scale, 102 | padding_mode=padding_mode, 103 | ) 104 | ) 105 | self.up = nn.Sequential( 106 | nn.PixelShuffle(2), 107 | nn.Conv2d( 108 | hidden_dim // 4, 109 | hidden_dim // 4, 110 | kernel_size=7, 111 | padding=3, 112 | padding_mode=padding_mode, 113 | groups=hidden_dim // 4, 114 | ), 115 | nn.ReLU(), 116 | nn.Conv2d( 117 | hidden_dim // 4, 118 | hidden_dim // 2, 119 | kernel_size=3, 120 | padding=1, 121 | padding_mode=padding_mode, 122 | ), 123 | ) 124 | self.residual = nn.Sequential( 125 | nn.Conv2d(hidden_dim, hidden_dim // 2, kernel_size=1, padding=0), 126 | nn.UpsamplingBilinear2d(scale_factor=2), 127 | ) 128 | 129 | def forward(self, x: torch.Tensor): 130 | for conv in self.convs: 131 | x = conv(x) 132 | x = self.up(x) + self.residual(x) 133 | x = rearrange(x, "b c h w -> b (h w) c") 134 | return x 135 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .unidepthv1 import UniDepthV1 2 | from .unidepthv2 import UniDepthV2 3 | 4 | __all__ = [ 5 | "UniDepthV1", 6 | "UniDepthV2", 7 | ] 8 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .convnext import ConvNeXt 2 | from .convnext2 import ConvNeXtV2 3 | from .dinov2 import _make_dinov2_model 4 | 5 | __all__ = [ 6 | "ConvNeXt", 7 | "ConvNeXtV2", 8 | "_make_dinov2_model", 9 | ] 10 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from .attention import Attention, MemEffAttention 8 | from .block import NestedTensorBlock 9 | from .dino_head import DINOHead 10 | from .mlp import Mlp 11 | from .patch_embed import PatchEmbed 12 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 13 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | import torch.nn as nn 14 | from torch import Tensor 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | try: 20 | from xformers.ops import fmha, memory_efficient_attention, unbind 21 | 22 | XFORMERS_AVAILABLE = True 23 | except ImportError: 24 | logger.warning("xFormers not available") 25 | XFORMERS_AVAILABLE = False 26 | 27 | 28 | class Attention(nn.Module): 29 | def __init__( 30 | self, 31 | dim: int, 32 | num_heads: int = 8, 33 | qkv_bias: bool = False, 34 | proj_bias: bool = True, 35 | attn_drop: float = 0.0, 36 | proj_drop: float = 0.0, 37 | ) -> None: 38 | super().__init__() 39 | self.num_heads = num_heads 40 | head_dim = dim // num_heads 41 | self.scale = head_dim**-0.5 42 | 43 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 44 | self.attn_drop = nn.Dropout(attn_drop) 45 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 46 | self.proj_drop = nn.Dropout(proj_drop) 47 | 48 | def forward(self, x: Tensor) -> Tensor: 49 | B, N, C = x.shape 50 | qkv = ( 51 | self.qkv(x) 52 | .reshape(B, N, 3, self.num_heads, C // self.num_heads) 53 | .permute(2, 0, 3, 1, 4) 54 | ) 55 | 56 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 57 | attn = q @ k.transpose(-2, -1) 58 | 59 | attn = attn.softmax(dim=-1) 60 | attn = self.attn_drop(attn) 61 | 62 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 63 | x = self.proj(x) 64 | x = self.proj_drop(x) 65 | return x 66 | 67 | 68 | class MemEffAttention(Attention): 69 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 70 | if not XFORMERS_AVAILABLE: 71 | assert attn_bias is None, "xFormers is required for nested tensors usage" 72 | return super().forward(x) 73 | 74 | B, N, C = x.shape 75 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 76 | 77 | q, k, v = unbind(qkv, 2) 78 | 79 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 80 | x = x.reshape([B, N, C]) 81 | 82 | x = self.proj(x) 83 | x = self.proj_drop(x) 84 | return x 85 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.init import trunc_normal_ 10 | from torch.nn.utils import weight_norm 11 | 12 | 13 | class DINOHead(nn.Module): 14 | def __init__( 15 | self, 16 | in_dim, 17 | out_dim, 18 | use_bn=False, 19 | nlayers=3, 20 | hidden_dim=2048, 21 | bottleneck_dim=256, 22 | mlp_bias=True, 23 | ): 24 | super().__init__() 25 | nlayers = max(nlayers, 1) 26 | self.mlp = _build_mlp( 27 | nlayers, 28 | in_dim, 29 | bottleneck_dim, 30 | hidden_dim=hidden_dim, 31 | use_bn=use_bn, 32 | bias=mlp_bias, 33 | ) 34 | self.apply(self._init_weights) 35 | self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 36 | self.last_layer.weight_g.data.fill_(1) 37 | 38 | def _init_weights(self, m): 39 | if isinstance(m, nn.Linear): 40 | trunc_normal_(m.weight, std=0.02) 41 | if isinstance(m, nn.Linear) and m.bias is not None: 42 | nn.init.constant_(m.bias, 0) 43 | 44 | def forward(self, x): 45 | x = self.mlp(x) 46 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 47 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 48 | x = self.last_layer(x) 49 | return x 50 | 51 | 52 | def _build_mlp( 53 | nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True 54 | ): 55 | if nlayers == 1: 56 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 57 | else: 58 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 59 | if use_bn: 60 | layers.append(nn.BatchNorm1d(hidden_dim)) 61 | layers.append(nn.GELU()) 62 | for _ in range(nlayers - 2): 63 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 64 | if use_bn: 65 | layers.append(nn.BatchNorm1d(hidden_dim)) 66 | layers.append(nn.GELU()) 67 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 68 | return nn.Sequential(*layers) 69 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | import torch.nn as nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * ( 20 | x.ndim - 1 21 | ) # work with diff dim tensors, not just 2D ConvNets 22 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 23 | if keep_prob > 0.0: 24 | random_tensor.div_(keep_prob) 25 | output = x * random_tensor 26 | return output 27 | 28 | 29 | class DropPath(nn.Module): 30 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 31 | 32 | def __init__(self, drop_prob=None): 33 | super(DropPath, self).__init__() 34 | self.drop_prob = drop_prob 35 | 36 | def forward(self, x): 37 | return drop_path(x, self.drop_prob, self.training) 38 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch import Tensor 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | import torch.nn as nn 14 | from torch import Tensor 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 
34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d( 67 | in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW 68 | ) 69 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 70 | 71 | def forward(self, x: Tensor) -> Tensor: 72 | _, _, H, W = x.shape 73 | patch_H, patch_W = self.patch_size 74 | 75 | assert ( 76 | H % patch_H == 0 77 | ), f"Input image height {H} is not a multiple of patch height {patch_H}" 78 | assert ( 79 | W % patch_W == 0 80 | ), f"Input image width {W} is not a multiple of patch width: {patch_W}" 81 | 82 | x = self.proj(x) # B C H W 83 | H, W = x.size(2), x.size(3) 84 | x = x.flatten(2).transpose(1, 2) # B HW C 85 | x = self.norm(x) 86 | if not self.flatten_embedding: 87 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 88 | return x 89 | 90 | def flops(self) -> float: 91 | Ho, Wo = self.patches_resolution 92 | flops = ( 93 | Ho 94 | * Wo 95 | * self.embed_dim 96 | * self.in_chans 97 | * (self.patch_size[0] * self.patch_size[1]) 98 | ) 99 | if self.norm is not None: 100 | flops += Ho * Wo * self.embed_dim 101 | return flops 102 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/backbones/metadinov2/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Callable, Optional 8 | 9 | import torch.nn.functional as F 10 | from torch import Tensor, nn 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/unidepthv1/__init__.py: -------------------------------------------------------------------------------- 1 | from .unidepthv1 import UniDepthV1 2 | 3 | __all__ = [ 4 | "UniDepthV1", 5 | ] 6 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/models/unidepthv2/__init__.py: -------------------------------------------------------------------------------- 1 | from .unidepthv2 import UniDepthV2 2 | 3 | __all__ = [ 4 | "UniDepthV2", 5 | ] 6 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import MSE, SelfCons, SILog 2 | from .scheduler import CosineScheduler 3 | 4 | __all__ = [ 5 | "SILog", 6 | "MSE", 7 | "SelfCons", 8 | "CosineScheduler", 9 | ] 10 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/ops/scheduler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Luigi Piccinelli 3 | Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/) 4 | """ 5 | 6 | import numpy as np 7 | 8 | 9 | class CosineScheduler(object): 10 | def __init__( 11 | self, 12 | optimizer, 13 | warmup_iters, 14 | total_iters, 15 | key, 16 | overwrite=False, 17 | init_value=None, 18 | base_value=None, 19 | final_value=None, 20 | step_init=-1, 21 | ): 22 | super().__init__() 23 | self.iter = step_init 24 | self.overwrite = overwrite 25 | self.optimizer = optimizer 26 | self.base_value = base_value 27 | self.init_value = init_value 28 | self.final_value = final_value 29 | self.total_iters = total_iters 30 | self.warmup_iters 
= warmup_iters 31 | self.key = key 32 | self.schedulers = [ 33 | self.get_schedulers(group) for group in optimizer.param_groups 34 | ] 35 | 36 | def get_schedulers(self, group): 37 | init_value = group.get(self.key + "_init", self.init_value) 38 | base_value = group.get(self.key + "_base", self.base_value) 39 | final_value = group.get(self.key + "_final", self.final_value) 40 | warmup_iters = self.warmup_iters 41 | total_iters = self.total_iters 42 | if self.overwrite: 43 | final_value = self.final_value 44 | 45 | # normalize in 0,1, then apply function (power) and denormalize 46 | normalized_schedule = np.linspace(0, 1, warmup_iters, endpoint=True) 47 | normalized_schedule = np.power(normalized_schedule, 2) 48 | warmup_schedule = (base_value - init_value) * normalized_schedule + init_value 49 | 50 | # main scheduling 51 | iters = np.arange(total_iters - warmup_iters) 52 | schedule = final_value + 0.5 * (base_value - final_value) * ( 53 | 1 + np.cos(np.pi * iters / len(iters)) 54 | ) 55 | return np.concatenate((warmup_schedule, schedule)) 56 | 57 | def step(self): 58 | self.iter = self.iter + 1 59 | vals = self[self.iter] 60 | for group, val in zip(self.optimizer.param_groups, vals): 61 | if isinstance(group[self.key], (tuple, list)): 62 | val = (val, *group[self.key][1:]) 63 | group[self.key] = val 64 | 65 | def __getitem__(self, it): 66 | it = min(it, self.total_iters - 1) 67 | return [scheduler[it] for scheduler in self.schedulers] 68 | 69 | def get(self): 70 | return [group[self.key] for group in self.optimizer.param_groups] 71 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import (barrier, get_dist_info, get_rank, is_main_process, 2 | setup_multi_processes, setup_slurm, 3 | sync_tensor_across_gpus) 4 | from .evaluation_depth import DICT_METRICS, eval_depth 5 | from .geometric import spherical_zbuffer_to_euclidean, unproject_points 6 | from .misc import format_seconds, get_params, identity, remove_padding 7 | from .visualization import colorize, image_grid, log_train_artifacts 8 | 9 | __all__ = [ 10 | "eval_depth", 11 | "DICT_METRICS", 12 | "colorize", 13 | "image_grid", 14 | "log_train_artifacts", 15 | "format_seconds", 16 | "remove_padding", 17 | "get_params", 18 | "identity", 19 | "is_main_process", 20 | "setup_multi_processes", 21 | "setup_slurm", 22 | "sync_tensor_across_gpus", 23 | "barrier", 24 | "get_rank", 25 | "unproject_points", 26 | "spherical_zbuffer_to_euclidean", 27 | "validate", 28 | "get_dist_info", 29 | ] 30 | -------------------------------------------------------------------------------- /third_party/UniDepth/unidepth/utils/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Luigi Piccinelli 3 | Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/) 4 | """ 5 | 6 | import math 7 | 8 | import torch 9 | 10 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 11 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 12 | IMAGENET_DATASET_MEAN = (0.485, 0.456, 0.406) 13 | IMAGENET_DATASET_STD = (0.229, 0.224, 0.225) 14 | DEPTH_BINS = torch.cat( 15 | ( 16 | torch.logspace(math.log10(0.1), math.log10(180.0), steps=512), 17 | torch.tensor([260.0]), 18 | ), 19 | dim=0, 20 | ) 21 | LOGERR_BINS = torch.linspace(-2, 2, steps=128 + 1) 22 | LINERR_BINS = torch.linspace(-50, 50, 
steps=256 + 1) 23 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeapLabTHU/OVM3D-Det/e599833bc7899c313254ba6e3cd6b61d4d20c993/tools/__init__.py --------------------------------------------------------------------------------