├── .gitignore
├── .gitmodules
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── configs
│   ├── Custom
│   │   └── custom_template.yaml
│   ├── Dynamic
│   │   ├── Bonn
│   │   │   ├── bonn_balloon.yaml
│   │   │   ├── bonn_balloon2.yaml
│   │   │   ├── bonn_crowd.yaml
│   │   │   ├── bonn_crowd2.yaml
│   │   │   ├── bonn_dynamic.yaml
│   │   │   ├── bonn_moving_nonobstructing_box.yaml
│   │   │   ├── bonn_moving_nonobstructing_box2.yaml
│   │   │   ├── bonn_person_tracking.yaml
│   │   │   └── bonn_person_tracking2.yaml
│   │   ├── TUM_RGBD
│   │   │   ├── freiburg2_desk_with_person.yaml
│   │   │   ├── freiburg3_sitting_halfsphere.yaml
│   │   │   ├── freiburg3_sitting_halfsphere_static.yaml
│   │   │   ├── freiburg3_sitting_rpy.yaml
│   │   │   ├── freiburg3_sitting_xyz.yaml
│   │   │   ├── freiburg3_walking_halfsphere.yaml
│   │   │   ├── freiburg3_walking_halfsphere_static.yaml
│   │   │   ├── freiburg3_walking_rpy.yaml
│   │   │   ├── freiburg3_walking_xyz.yaml
│   │   │   └── tum_dynamic.yaml
│   │   ├── Wild_SLAM_Mocap
│   │   │   ├── ANYmal1.yaml
│   │   │   ├── ANYmal2.yaml
│   │   │   ├── ball.yaml
│   │   │   ├── crowd.yaml
│   │   │   ├── crowd_demo.yaml
│   │   │   ├── person_tracking.yaml
│   │   │   ├── racket.yaml
│   │   │   ├── stones.yaml
│   │   │   ├── table_tracking1.yaml
│   │   │   ├── table_tracking2.yaml
│   │   │   ├── umbrella.yaml
│   │   │   └── wild_slam_mocap.yaml
│   │   └── Wild_SLAM_iPhone
│   │       ├── horse.yaml
│   │       ├── parking.yaml
│   │       ├── piano.yaml
│   │       ├── shopping.yaml
│   │       ├── street.yaml
│   │       ├── tower.yaml
│   │       └── wild_slam_iphone.yaml
│   ├── Static
│   │   └── TUM_RGBD
│   │       ├── freiburg1_desk.yaml
│   │       ├── freiburg2_xyz.yaml
│   │       ├── freiburg3_office.yaml
│   │       └── tum.yaml
│   └── wildgs_slam.yaml
├── media
│   └── teaser.png
├── requirements.txt
├── run.py
├── scripts_downloading
│   ├── download_bonn.sh
│   ├── download_demo_data.sh
│   ├── download_tum.sh
│   ├── download_wild_slam_iphone.sh
│   ├── download_wild_slam_mocap_scene1.sh
│   └── download_wild_slam_mocap_scene2.sh
├── scripts_run
│   ├── run_bonn_all.sh
│   ├── run_tum_dynamic_all.sh
│   ├── run_wild_slam_mocap_all.sh
│   └── summarize_pose_eval.py
├── setup.py
├── src
│   ├── __init__.py
│   ├── backend.py
│   ├── config.py
│   ├── depth_video.py
│   ├── factor_graph.py
│   ├── frontend.py
│   ├── geom
│   │   ├── __init__.py
│   │   ├── ba.py
│   │   ├── chol.py
│   │   └── projective_ops.py
│   ├── gui
│   │   ├── gl_render
│   │   │   ├── LICENSE
│   │   │   ├── __init__.py
│   │   │   ├── render_ogl.py
│   │   │   ├── shaders
│   │   │   │   ├── gau_frag.glsl
│   │   │   │   └── gau_vert.glsl
│   │   │   ├── util.py
│   │   │   └── util_gau.py
│   │   ├── gui_utils.py
│   │   └── slam_gui.py
│   ├── lib
│   │   ├── altcorr_kernel.cu
│   │   ├── correlation_kernels.cu
│   │   ├── droid.cpp
│   │   └── droid_kernels.cu
│   ├── mapper.py
│   ├── modules
│   │   └── droid_net
│   │       ├── __init__.py
│   │       ├── clipping.py
│   │       ├── corr.py
│   │       ├── droid_net.py
│   │       ├── extractor.py
│   │       └── gru.py
│   ├── motion_filter.py
│   ├── slam.py
│   ├── tracker.py
│   ├── trajectory_filler.py
│   └── utils
│       ├── Printer.py
│       ├── camera_utils.py
│       ├── common.py
│       ├── datasets.py
│       ├── dyn_uncertainty
│       │   ├── __init__.py
│       │   ├── mapping_utils.py
│       │   ├── median_filter.py
│       │   └── uncertainty_model.py
│       ├── eval_traj.py
│       ├── eval_utils.py
│       ├── mono_priors
│       │   ├── img_feature_extractors.py
│       │   └── metric_depth_estimators.py
│       ├── plot_utils.py
│       ├── pose_utils.py
│       └── slam_utils.py
└── thirdparty
    ├── __init__.py
    ├── depth_anything_v2
    │   ├── DA-2K.md
    │   ├── LICENSE
    │   ├── README.md
    │   ├── app.py
    │   ├── assets
    │   │   ├── DA-2K.png
    │   │   ├── examples
    │   │   │   ├── demo01.jpg
    │   │   │   ├── demo02.jpg
    │   │   │   ├── demo03.jpg
    │   │   │   ├── demo04.jpg
    │   │   │   ├── demo05.jpg
    │   │   │   ├── demo06.jpg
    │   │   │   ├── demo07.jpg
    │   │   │   ├── demo08.jpg
    │   │   │   ├── demo09.jpg
    │   │   │   ├── demo10.jpg
    │   │   │   ├── demo11.jpg
    │   │   │   ├── demo12.jpg
    │   │   │   ├── demo13.jpg
    │   │   │   ├── demo14.jpg
    │   │   │   ├── demo15.jpg
    │   │   │   ├── demo16.jpg
    │   │   │   ├── demo17.jpg
    │   │   │   ├── demo18.jpg
    │   │   │   ├── demo19.jpg
    │   │   │   └── demo20.jpg
    │   │   ├── examples_video
    │   │   │   ├── basketball.mp4
    │   │   │   └── ferris_wheel.mp4
    │   │   └── teaser.png
    │   ├── depth_anything_v2
    │   │   ├── dinov2.py
    │   │   ├── dinov2_layers
    │   │   │   ├── __init__.py
    │   │   │   ├── attention.py
    │   │   │   ├── block.py
    │   │   │   ├── drop_path.py
    │   │   │   ├── layer_scale.py
    │   │   │   ├── mlp.py
    │   │   │   ├── patch_embed.py
    │   │   │   └── swiglu_ffn.py
    │   │   ├── dpt.py
    │   │   └── util
    │   │       ├── blocks.py
    │   │       └── transform.py
    │   ├── metric_depth
    │   │   ├── README.md
    │   │   ├── assets
    │   │   │   └── compare_zoedepth.png
    │   │   ├── dataset
    │   │   │   ├── hypersim.py
    │   │   │   ├── kitti.py
    │   │   │   ├── splits
    │   │   │   │   ├── hypersim
    │   │   │   │   │   ├── train.txt
    │   │   │   │   │   └── val.txt
    │   │   │   │   ├── kitti
    │   │   │   │   │   └── val.txt
    │   │   │   │   └── vkitti2
    │   │   │   │       └── train.txt
    │   │   │   ├── transform.py
    │   │   │   └── vkitti2.py
    │   │   ├── depth_anything_v2
    │   │   │   ├── dinov2.py
    │   │   │   ├── dinov2_layers
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── attention.py
    │   │   │   │   ├── block.py
    │   │   │   │   ├── drop_path.py
    │   │   │   │   ├── layer_scale.py
    │   │   │   │   ├── mlp.py
    │   │   │   │   ├── patch_embed.py
    │   │   │   │   └── swiglu_ffn.py
    │   │   │   ├── dpt.py
    │   │   │   └── util
    │   │   │       ├── blocks.py
    │   │   │       └── transform.py
    │   │   ├── depth_to_pointcloud.py
    │   │   ├── dist_train.sh
    │   │   ├── requirements.txt
    │   │   ├── run.py
    │   │   ├── train.py
    │   │   └── util
    │   │       ├── dist_helper.py
    │   │       ├── loss.py
    │   │       ├── metric.py
    │   │       └── utils.py
    │   ├── requirements.txt
    │   ├── run.py
    │   └── run_video.py
    └── gaussian_splatting
        ├── LICENSE.md
        ├── __init__.py
        ├── gaussian_renderer
        │   └── __init__.py
        ├── scene
        │   └── gaussian_model.py
        └── utils
            ├── general_utils.py
            ├── graphics_utils.py
            ├── image_utils.py
            ├── loss_utils.py
            ├── sh_utils.py
            └── system_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | datasets/
2 | build/
3 | *.egg-info/
4 |
5 | __pycache__/
6 | *.pyc
7 | *.so
8 |
9 |
10 | pretrained/
11 |
12 | output*/
13 |
14 | .vscode/
15 |
16 | temp/
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "thirdparty/evaluate_3d_reconstruction_lib"]
2 | path = thirdparty/evaluate_3d_reconstruction_lib
3 | url = https://github.com/eriksandstroem/evaluate_3d_reconstruction_lib.git
4 | [submodule "thirdparty/lietorch"]
5 | path = thirdparty/lietorch
6 | url = https://github.com/princeton-vl/lietorch.git
7 | [submodule "thirdparty/diff-gaussian-rasterization-w-pose"]
8 | path = thirdparty/diff-gaussian-rasterization-w-pose
9 | url = https://github.com/rmurai0610/diff-gaussian-rasterization-w-pose.git
10 | [submodule "thirdparty/simple-knn"]
11 | path = thirdparty/simple-knn
12 | url = https://github.com/camenduru/simple-knn.git
13 | [submodule "thirdparty/eigen"]
14 | path = thirdparty/eigen
15 | url = https://gitlab.com/libeigen/eigen.git
16 | [submodule "thirdparty/fit3d"]
17 | path = thirdparty/fit3d
18 | url = git@github.com:ywyue/FiT3D.git
19 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | We'd love to accept your patches and contributions to this project.
4 |
5 | ## Before you begin
6 |
7 | ### Sign our Contributor License Agreement
8 |
9 | Contributions to this project must be accompanied by a
10 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
11 | You (or your employer) retain the copyright to your contribution; this simply
12 | gives us permission to use and redistribute your contributions as part of the
13 | project.
14 |
15 | If you or your current employer have already signed the Google CLA (even if it
16 | was for a different project), you probably don't need to do it again.
17 |
18 | Visit <https://cla.developers.google.com/> to see your current agreements or to
19 | sign a new one.
20 |
21 | ### Review our community guidelines
22 |
23 | This project follows
24 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
25 |
26 | ## Contribution process
27 |
28 | ### Code reviews
29 |
30 | All submissions, including submissions by project members, require review. We
31 | use GitHub pull requests for this purpose. Consult
32 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
33 | information on using pull requests.
--------------------------------------------------------------------------------
/configs/Custom/custom_template.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/wildgs_slam.yaml
2 | scene: custom_scene # Replace with your scene name
3 |
4 | dataset: 'wild_slam_iphone'
5 | data:
6 | input_folder: ./datasets/{Path_to_your_data}
7 | output: ./output/Custom
8 |
9 | cam:
10 | H: 1242
11 | W: 2208
12 | H_out: 360
13 | W_out: 480
14 | fx: 1974.4219
15 | fy: 1974.4219
16 | cx: 1134.8486
17 | cy: 655.6515
18 | # H_edge: 0 # Uncomment this and the following line if you have edge cropping like in TUM datasets
19 | # W_edge: 0
20 | # distortion: [0.0, 0.0, 0.0, 0.0, 0.0] # Uncomment if you have distortion coefficients
21 |
22 | mapping:
23 | Training:
24 |     alpha: 0.8 # Increase this value to make the rendering loss weigh more on rgb than on depth
25 | uncertainty_params:
26 |     # For outdoor datasets where metric depth estimation is unstable,
27 |     # we recommend setting this value to 0.1 or even 0.
28 | uncer_depth_mult: 0.2
29 |
30 |
31 | # # Uncomment the following lines to enable fast mode and GUI
32 | # fast_mode: True
33 | # gui: True
34 |
35 | # # Uncomment the following lines to save online plotting data
36 | # mapping:
37 | # online_plotting: True
--------------------------------------------------------------------------------
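The template above only overrides a handful of keys; everything else is pulled in from `./configs/wildgs_slam.yaml` through the `inherit_from` chain, which `load_config` in `src/config.py` resolves recursively (see that file further below). A minimal sketch of loading the merged config from the repository root, the same way `run.py` does:

```python
# Minimal sketch, assuming the repository root as the working directory:
# resolve the custom template through the same loader that run.py uses.
from src import config

cfg = config.load_config('./configs/Custom/custom_template.yaml')
print(cfg['scene'])         # 'custom_scene'
print(cfg['cam']['W_out'])  # 480, as overridden above
```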
/configs/Dynamic/Bonn/bonn_balloon.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml
2 | scene: bonn_balloon
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_balloon
--------------------------------------------------------------------------------
/configs/Dynamic/Bonn/bonn_balloon2.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml
2 | scene: bonn_balloon2
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_balloon2
--------------------------------------------------------------------------------
/configs/Dynamic/Bonn/bonn_crowd.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml
2 | scene: bonn_crowd
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_crowd
--------------------------------------------------------------------------------
/configs/Dynamic/Bonn/bonn_crowd2.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml
2 | scene: bonn_crowd2
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_crowd2
--------------------------------------------------------------------------------
/configs/Dynamic/Bonn/bonn_dynamic.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/wildgs_slam.yaml
2 |
3 | dataset: 'bonn_dynamic'
4 |
5 | data:
6 | root_folder: ./datasets/Bonn
7 | output: ./output/Bonn
8 |
9 | cam: # all Bonn dynamic sequences use the same camera; distortion coefficients are listed below
10 | H: 480
11 | W: 640
12 | fx: 542.822841
13 | fy: 542.576870
14 | cx: 315.593520
15 | cy: 237.756098
16 | distortion: [0.039903, -0.099343, -0.000730, -0.000144, 0.000000]
17 | H_out: 384
18 | W_out: 512
--------------------------------------------------------------------------------
/configs/Dynamic/Bonn/bonn_moving_nonobstructing_box.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml
2 | scene: bonn_moving_nonobstructing_box
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_moving_nonobstructing_box
6 |
--------------------------------------------------------------------------------
/configs/Dynamic/Bonn/bonn_moving_nonobstructing_box2.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml
2 | scene: bonn_moving_nonobstructing_box2
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_moving_nonobstructing_box2
--------------------------------------------------------------------------------
/configs/Dynamic/Bonn/bonn_person_tracking.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml
2 | scene: bonn_person_tracking
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_person_tracking
6 |
--------------------------------------------------------------------------------
/configs/Dynamic/Bonn/bonn_person_tracking2.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml
2 | scene: bonn_person_tracking2
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_person_tracking2
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg2_desk_with_person.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg2_desk_with_person
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg2_desk_with_person
6 | cam: # intrinsics differ per scene in TUM; freiburg2 uses the distortion coefficients listed below
7 | H: 480
8 | W: 640
9 | fx: 520.9
10 | fy: 521.0
11 | cx: 325.1
12 | cy: 249.7
13 | distortion: [0.2312, -0.7849, -0.0033, -0.0001, 0.9172]
14 | H_edge: 8
15 | W_edge: 8
16 | H_out: 240
17 | W_out: 320
18 |
19 | tracking:
20 | # This sequence is too long
21 | force_keyframe_every_n_frames: -1
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg3_sitting_halfsphere.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg3_sitting_halfsphere
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_sitting_halfsphere
6 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
7 | H: 480
8 | W: 640
9 | fx: 535.4
10 | fy: 539.2
11 | cx: 320.1
12 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg3_sitting_halfsphere_static.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg3_sitting_static
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_sitting_static
6 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
7 | H: 480
8 | W: 640
9 | fx: 535.4
10 | fy: 539.2
11 | cx: 320.1
12 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg3_sitting_rpy.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg3_sitting_rpy
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_sitting_rpy
6 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
7 | H: 480
8 | W: 640
9 | fx: 535.4
10 | fy: 539.2
11 | cx: 320.1
12 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg3_sitting_xyz.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg3_sitting_xyz
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_sitting_xyz
6 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
7 | H: 480
8 | W: 640
9 | fx: 535.4
10 | fy: 539.2
11 | cx: 320.1
12 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg3_walking_halfsphere.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg3_walking_halfsphere
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_walking_halfsphere
6 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
7 | H: 480
8 | W: 640
9 | fx: 535.4
10 | fy: 539.2
11 | cx: 320.1
12 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg3_walking_halfsphere_static.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg3_walking_static
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_walking_static
6 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
7 | H: 480
8 | W: 640
9 | fx: 535.4
10 | fy: 539.2
11 | cx: 320.1
12 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg3_walking_rpy.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg3_walking_rpy
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_walking_rpy
6 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
7 | H: 480
8 | W: 640
9 | fx: 535.4
10 | fy: 539.2
11 | cx: 320.1
12 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/freiburg3_walking_xyz.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml
2 | scene: freiburg3_walking_xyz
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_walking_xyz
6 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
7 | H: 480
8 | W: 640
9 | fx: 535.4
10 | fy: 539.2
11 | cx: 320.1
12 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Dynamic/TUM_RGBD/tum_dynamic.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/wildgs_slam.yaml
2 |
3 | dataset: 'tumrgbd'
4 |
5 | tracking:
6 | buffer: 350
7 |
8 | # Less weight on the depth loss for TUM
9 | mapping:
10 | Training:
11 | alpha: 0.8
12 |
13 | data:
14 | root_folder: ./datasets/TUM_RGBD
15 | output: ./output/TUM_RGBD
16 |
17 | cam: #NOTE: intrinsics differ per scene in TUM
18 | # refer to https://vision.in.tum.de/data/datasets/rgbd-dataset/file_formats#intrinsic_camera_calibration_of_the_kinect
19 | png_depth_scale: 5000.0 #for depth image in png format
20 | ### target/output camera settings, camera_size -> resize -> crop -> target_size
21 | H: 480
22 | W: 640
23 | fx: 535.4
24 | fy: 539.2
25 | cx: 320.1
26 | cy: 247.6
27 | H_edge: 8
28 | W_edge: 8
29 | H_out: 384
30 | W_out: 512
31 |
--------------------------------------------------------------------------------
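The `camera_size -> resize -> crop -> target_size` comment describes how the raw 640x480 TUM frames reach the 512x384 working resolution. As a rough sketch of what that implies for the intrinsics, assuming the image is resized to `(H_out + 2*H_edge, W_out + 2*W_edge)` and then `H_edge`/`W_edge` pixels are cropped from every border (the authoritative preprocessing lives in `src/utils/datasets.py`):

```python
# Hedged sketch of the intrinsics update implied by the comment above;
# the exact preprocessing is implemented in src/utils/datasets.py.
def adjust_intrinsics(fx, fy, cx, cy, H, W, H_out, W_out, H_edge=0, W_edge=0):
    sx = (W_out + 2 * W_edge) / W   # horizontal resize factor
    sy = (H_out + 2 * H_edge) / H   # vertical resize factor
    fx, cx = fx * sx, cx * sx
    fy, cy = fy * sy, cy * sy
    # Cropping H_edge/W_edge pixels from each side shifts the principal point.
    cx -= W_edge
    cy -= H_edge
    return fx, fy, cx, cy

# TUM example from this config: 640x480 -> 512x384 with an 8 px edge crop.
print(adjust_intrinsics(535.4, 539.2, 320.1, 247.6, 480, 640, 384, 512, 8, 8))
```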
/configs/Dynamic/Wild_SLAM_Mocap/ANYmal1.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: ANYmal1
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene2/ANYmal1
6 |
7 | cam:
8 | fx: 647.7445068359375
9 | fy: 646.9425659179688
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/ANYmal2.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: ANYmal2
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene2/ANYmal2
6 |
7 | cam:
8 | fx: 647.7445068359375
9 | fy: 646.9425659179688
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/ball.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: basketball
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/ball
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/crowd.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: crowd
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/crowd
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/crowd_demo.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: crowd_demo
3 |
4 | fast_mode: True
5 | gui: True
6 | mapping:
7 | online_plotting: True
8 |
9 | data:
10 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/crowd
11 | output: ./output/Wild_SLAM_Mocap_demo
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/person_tracking.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: person_tracking
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/person_tracking
6 |
7 | cam:
8 | fx: 647.5684814453125
9 | fy: 646.766845703125
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/racket.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: racket
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/racket
6 |
7 | cam:
8 | fx: 647.3926391601562
9 | fy: 646.5911254882812
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/stones.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: stones
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/stones
6 |
7 | cam:
8 | fx: 647.7445068359375
9 | fy: 646.9425659179688
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/table_tracking1.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: table_tracking1
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/table_tracking1
6 |
7 | cam:
8 | fx: 647.9204711914062
9 | fy: 647.1183471679688
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/table_tracking2.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: table_tracking2
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/table_tracking2
6 |
7 | cam:
8 | fx: 647.5684814453125
9 | fy: 646.766845703125
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/umbrella.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml
2 | scene: umbrella
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/umbrella
6 |
7 | cam:
8 | fx: 647.7445068359375
9 | fy: 646.9425659179688
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/wildgs_slam.yaml
2 |
3 | dataset: 'wild_slam_mocap'
4 |
5 | data:
6 | root_folder: ./datasets/Wild_SLAM_Mocap
7 | output: ./output/Wild_SLAM_Mocap
8 |
9 | cam: # intrinsics differ slightly per sequence
10 | H: 720
11 | W: 1280
12 | fx: 647.2167358398438
13 | fy: 646.4154663085938
14 | cx: 643.1209716796875
15 | cy: 365.55963134765625
16 | distortion: [-0.0550149604678154, 0.06560786068439484,-0.0005061274860054255,0.0004771310486830771,-0.021717390045523643]
17 | H_out: 360
18 | W_out: 640
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_iPhone/horse.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml
2 | scene: iphone_horse
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/horse
6 |
7 | cam:
8 | fx: 1341.1414794921875
9 | fy: 1341.1414794921875
10 | cx: 960.2431640625
11 | cy: 729.904052734375
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_iPhone/parking.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml
2 | scene: iphone_parking
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/parking
6 |
7 | cam:
8 | fx: 1336.74609375
9 | fy: 1336.74609375
10 | cx: 957.005859375
11 | cy: 726.88409423828125
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_iPhone/piano.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml
2 | scene: iphone_piano
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/piano
6 |
7 | cam:
8 | fx: 1351.06982421875
9 | fy: 1351.06982421875
10 | cx: 961.050537109375
11 | cy: 730.18597412109375
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_iPhone/shopping.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml
2 | scene: iphone_shopping
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/shopping
6 |
7 | cam:
8 | fx: 1340.6441650390625
9 | fy: 1340.6441650390625
10 | cx: 960.7640380859375
11 | cy: 730.26397705078125
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_iPhone/street.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml
2 | scene: iphone_street
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/street
6 |
7 | cam:
8 | fx: 1331.6123046875
9 | fy: 1331.6123046875
10 | cx: 956.61676025390625
11 | cy: 727.839599609375
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_iPhone/tower.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml
2 | scene: iphone_tower
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/tower
6 |
7 | cam:
8 | fx: 1338.494140625
9 | fy: 1338.494140625
10 | cx: 960.17327880859375
11 | cy: 730.55328369140625
--------------------------------------------------------------------------------
/configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/wildgs_slam.yaml
2 |
3 | dataset: 'wild_slam_iphone'
4 |
5 | data:
6 | root_folder: ./datasets/Wild_SLAM_iPhone
7 | output: ./output/Wild_SLAM_iPhone
8 |
9 | mapping:
10 | Training:
11 |     alpha: 0.8 # Increase this value to make the rendering loss weigh more on rgb than on depth
12 |   uncertainty_params:
13 |     # This parameter weights the depth loss when training the uncertainty MLP.
14 |     # It is lambda_1 in Equation 4 of the paper.
15 |     # We set it to 0 here because metric depth is not reliable on the iPhone dataset.
16 |     # Feel free to tune this parameter when running on your own dataset.
17 | uncer_depth_mult: 0.0
18 |
19 | cam:
20 | H: 1440
21 | W: 1920
22 | H_out: 360
23 | W_out: 480
--------------------------------------------------------------------------------
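As the comments suggest, `alpha` trades the colour term of the rendering loss off against the depth term, and `uncer_depth_mult` (the paper's lambda_1) scales the depth component used when training the uncertainty MLP. Written out only as a sketch of what the comments imply, not as a quote of the paper's Equation 4:

```latex
% Assumed weighting implied by the config comments (not the paper's exact Eq. 4):
% alpha mixes the colour and depth rendering losses; lambda_1 (= uncer_depth_mult)
% multiplies the depth term in the uncertainty-MLP objective.
\mathcal{L}_{\mathrm{render}} = \alpha\,\mathcal{L}_{\mathrm{rgb}} + (1-\alpha)\,\mathcal{L}_{\mathrm{depth}},
\qquad
\mathcal{L}_{\mathrm{uncer}} = \mathcal{L}_{\mathrm{rgb}}^{u} + \lambda_{1}\,\mathcal{L}_{\mathrm{depth}}^{u} + \cdots
```

Setting `uncer_depth_mult: 0.0` therefore drops the depth term entirely for the iPhone sequences.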
/configs/Static/TUM_RGBD/freiburg1_desk.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Static/TUM_RGBD/tum.yaml
2 | scene: freiburg1_desk
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg1_desk
6 | cam:
7 | H: 480
8 | W: 640
9 | fx: 517.3
10 | fy: 516.5
11 | cx: 318.6
12 | cy: 255.3
13 | distortion: [0.2624, -0.9531, -0.0054, 0.0026, 1.1633]
14 |
--------------------------------------------------------------------------------
/configs/Static/TUM_RGBD/freiburg2_xyz.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Static/TUM_RGBD/tum.yaml
2 | scene: freiburg2_xyz
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg2_xyz
6 | cam: # intrinsics differ per scene in TUM
7 | H: 480
8 | W: 640
9 | fx: 520.9
10 | fy: 521.0
11 | cx: 325.1
12 | cy: 249.7
13 | distortion: [0.2312, -0.7849, -0.0033, -0.0001, 0.9172]
14 | H_out: 240
15 | W_out: 320
16 |
17 |
--------------------------------------------------------------------------------
/configs/Static/TUM_RGBD/freiburg3_office.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/Static/TUM_RGBD/tum.yaml
2 | scene: freiburg3_long_office_household
3 |
4 | data:
5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_long_office_household
6 |
7 | cam: # intrinsics differ per scene in TUM; this camera has no distortion
8 | H: 480
9 | W: 640
10 | fx: 535.4
11 | fy: 539.2
12 | cx: 320.1
13 | cy: 247.6
--------------------------------------------------------------------------------
/configs/Static/TUM_RGBD/tum.yaml:
--------------------------------------------------------------------------------
1 | inherit_from: ./configs/wildgs_slam.yaml
2 |
3 | dataset: 'tumrgbd'
4 |
5 | mapping:
6 | Calibration:
7 | depth_scale: 5000.0
8 |
9 | tracking:
10 | buffer: 500
11 | warmup: 12
12 | multiview_filter:
13 | visible_num: 2
14 | frontend:
15 | keyframe_thresh: 3.0
16 | radius: 2
17 | backend:
18 | loop_nms: 10
19 |
20 | cam: #NOTE: intrinsics differ per scene in TUM
21 | # refer to https://vision.in.tum.de/data/datasets/rgbd-dataset/file_formats#intrinsic_camera_calibration_of_the_kinect
22 | png_depth_scale: 5000.0 #for depth image in png format
23 | ### target/output camera settings, camera_size -> resize -> crop -> target_size
24 | H_edge: 8
25 | W_edge: 8
26 | H_out: 384
27 | W_out: 512
28 |
29 | data:
30 |   root_folder: ./datasets/TUM_RGBD
31 | output: ./output/TUM_RGBD
32 |
--------------------------------------------------------------------------------
/media/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/media/teaser.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pillow
2 | imageio
3 | joblib
4 | pandas
5 | scikit-image
6 | scikit-learn
7 | scipy
8 | seaborn
9 | PyOpenGL-accelerate
10 | pyrender
11 | ninja
12 | setuptools
13 | timm==0.9.10
14 | plyfile==0.8.1
15 | tqdm
16 | opencv-python==4.8.1.78
17 | munch
18 | evo
19 | open3d==0.17.0
20 | torchmetrics
21 | imgviz
22 | lpips
23 | rich
24 | kornia
25 | PyQt5
26 | glfw
27 | PyGLM
28 | mmengine
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import argparse
4 | import os
5 |
6 | from src import config
7 | from src.slam import SLAM
8 | from src.utils.datasets import get_dataset
9 | from time import gmtime, strftime
10 | from colorama import Fore,Style
11 |
12 | import random
13 | def setup_seed(seed):
14 | torch.manual_seed(seed)
15 | torch.cuda.manual_seed_all(seed)
16 | np.random.seed(seed)
17 | random.seed(seed)
18 | torch.backends.cudnn.deterministic = True
19 |
20 | if __name__ == '__main__':
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument('config', type=str, help='Path to config file.')
23 | args = parser.parse_args()
24 |
25 | torch.multiprocessing.set_start_method('spawn')
26 |
27 | cfg = config.load_config(args.config)
28 | setup_seed(cfg['setup_seed'])
29 | if cfg['fast_mode']:
30 | # Force the final refine iterations to be 3000 if in fast mode
31 | cfg['mapping']['final_refine_iters'] = 3000
32 |
33 | output_dir = cfg['data']['output']
34 | output_dir = output_dir+f"/{cfg['scene']}"
35 |
36 | start_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
37 | start_info = "-"*30+Fore.LIGHTRED_EX+\
38 | f"\nStart WildGS-SLAM at {start_time},\n"+Style.RESET_ALL+ \
39 | f" scene: {cfg['dataset']}-{cfg['scene']},\n" \
40 | f" output: {output_dir}\n"+ \
41 | "-"*30
42 | print(start_info)
43 |
44 | if not os.path.exists(output_dir):
45 | os.makedirs(output_dir)
46 |
47 | config.save_config(cfg, f'{output_dir}/cfg.yaml')
48 |
49 | dataset = get_dataset(cfg)
50 |
51 | slam = SLAM(cfg,dataset)
52 | slam.run()
53 |
54 | end_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
55 | print("-"*30+Fore.LIGHTRED_EX+f"\nWildGS-SLAM finishes!\n"+Style.RESET_ALL+f"{end_time}\n"+"-"*30)
56 |
57 |
--------------------------------------------------------------------------------
/scripts_downloading/download_bonn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p datasets/Bonn
4 | cd datasets/Bonn
5 |
6 | scenes=(
7 | "balloon"
8 | "balloon2"
9 | "crowd"
10 | "crowd2"
11 | "person_tracking"
12 | "person_tracking2"
13 | "moving_nonobstructing_box"
14 | "moving_nonobstructing_box2"
15 | )
16 |
17 | for scene in "${scenes[@]}"
18 | do
19 | echo "Processing scene: $scene"
20 |
21 | # Check if the folder already exists
22 | if [ -d "$scene" ]; then
23 | echo "Folder $scene already exists, skipping download"
24 | else
25 | zip_file="rgbd_bonn_${scene}.zip"
26 | wget "https://www.ipb.uni-bonn.de/html/projects/rgbd_dynamic2019/${zip_file}"
27 |
28 | if [ $? -eq 0 ]; then
29 | echo "Successfully downloaded ${zip_file}"
30 | unzip -q "${zip_file}"
31 | if [ $? -eq 0 ]; then
32 | echo "Successfully extracted ${zip_file}"
33 | rm "${zip_file}"
34 | echo "Removed ${zip_file}"
35 | else
36 | echo "Failed to extract ${zip_file}"
37 | fi
38 | else
39 | echo "Failed to download ${zip_file}"
40 | fi
41 | fi
42 |
43 | echo "Finished processing ${scene}"
44 | echo "-----------------------------"
45 | done
--------------------------------------------------------------------------------
/scripts_downloading/download_demo_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p datasets/Wild_SLAM_Mocap/scene1
4 | cd datasets/Wild_SLAM_Mocap/scene1
5 |
6 | scenes=(
7 | "crowd"
8 | )
9 |
10 | for scene in "${scenes[@]}"
11 | do
12 | echo "Processing scene: $scene"
13 |
14 | # Check if the folder already exists
15 | if [ -d "$scene" ]; then
16 | echo "Folder $scene already exists, skipping download"
17 | else
18 | zip_file="${scene}.zip"
19 | wget "https://huggingface.co/datasets/gradient-spaces/Wild-SLAM/resolve/main/Mocap/scene1/${zip_file}"
20 |
21 | if [ $? -eq 0 ]; then
22 | echo "Successfully downloaded ${zip_file}"
23 | unzip -q "${zip_file}"
24 | if [ $? -eq 0 ]; then
25 | echo "Successfully extracted ${zip_file}"
26 | rm "${zip_file}"
27 | echo "Removed ${zip_file}"
28 | else
29 | echo "Failed to extract ${zip_file}"
30 | fi
31 | else
32 | echo "Failed to download ${zip_file}"
33 | fi
34 | fi
35 |
36 | echo "Finished processing ${scene}"
37 | echo "-----------------------------"
38 | done
--------------------------------------------------------------------------------
/scripts_downloading/download_tum.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p datasets/TUM_RGBD
4 | cd datasets/TUM_RGBD
5 |
6 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg2/rgbd_dataset_freiburg2_desk_with_person.tgz
7 | tar -xvzf rgbd_dataset_freiburg2_desk_with_person.tgz
8 | rm rgbd_dataset_freiburg2_desk_with_person.tgz
9 |
10 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_sitting_static.tgz
11 | tar -xvzf rgbd_dataset_freiburg3_sitting_static.tgz
12 | rm rgbd_dataset_freiburg3_sitting_static.tgz
13 |
14 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_sitting_xyz.tgz
15 | tar -xvzf rgbd_dataset_freiburg3_sitting_xyz.tgz
16 | rm rgbd_dataset_freiburg3_sitting_xyz.tgz
17 |
18 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_sitting_halfsphere.tgz
19 | tar -xvzf rgbd_dataset_freiburg3_sitting_halfsphere.tgz
20 | rm rgbd_dataset_freiburg3_sitting_halfsphere.tgz
21 |
22 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_sitting_rpy.tgz
23 | tar -xvzf rgbd_dataset_freiburg3_sitting_rpy.tgz
24 | rm rgbd_dataset_freiburg3_sitting_rpy.tgz
25 |
26 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_walking_static.tgz
27 | tar -xvzf rgbd_dataset_freiburg3_walking_static.tgz
28 | rm rgbd_dataset_freiburg3_walking_static.tgz
29 |
30 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_walking_xyz.tgz
31 | tar -xvzf rgbd_dataset_freiburg3_walking_xyz.tgz
32 | rm rgbd_dataset_freiburg3_walking_xyz.tgz
33 |
34 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_walking_halfsphere.tgz
35 | tar -xvzf rgbd_dataset_freiburg3_walking_halfsphere.tgz
36 | rm rgbd_dataset_freiburg3_walking_halfsphere.tgz
37 |
38 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_walking_rpy.tgz
39 | tar -xvzf rgbd_dataset_freiburg3_walking_rpy.tgz
40 | rm rgbd_dataset_freiburg3_walking_rpy.tgz
41 |
42 |
--------------------------------------------------------------------------------
/scripts_downloading/download_wild_slam_iphone.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p datasets/Wild_SLAM_iPhone
4 | cd datasets/Wild_SLAM_iPhone
5 |
6 | scenes=(
7 | "parking"
8 | "piano"
9 | "shopping"
10 | "street"
11 | "tower"
12 | "wall"
13 | "wandering"
14 | )
15 |
16 | for scene in "${scenes[@]}"
17 | do
18 | echo "Processing scene: $scene"
19 |
20 | # Check if the folder already exists
21 | if [ -d "$scene" ]; then
22 | echo "Folder $scene already exists, skipping download"
23 | else
24 | zip_file="${scene}.zip"
25 | wget "https://huggingface.co/datasets/gradient-spaces/Wild-SLAM/resolve/main/iPhone/${zip_file}"
26 |
27 | if [ $? -eq 0 ]; then
28 | echo "Successfully downloaded ${zip_file}"
29 | unzip -q "${zip_file}"
30 | if [ $? -eq 0 ]; then
31 | echo "Successfully extracted ${zip_file}"
32 | rm "${zip_file}"
33 | echo "Removed ${zip_file}"
34 | else
35 | echo "Failed to extract ${zip_file}"
36 | fi
37 | else
38 | echo "Failed to download ${zip_file}"
39 | fi
40 | fi
41 |
42 | echo "Finished processing ${scene}"
43 | echo "-----------------------------"
44 | done
--------------------------------------------------------------------------------
/scripts_downloading/download_wild_slam_mocap_scene1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p datasets/Wild_SLAM_Mocap/scene1
4 | cd datasets/Wild_SLAM_Mocap/scene1
5 |
6 | scenes=(
7 | "ball"
8 | "crowd"
9 | "person_tracking"
10 | "racket"
11 | "stones"
12 | "table_tracking1"
13 | "table_tracking2"
14 | "umbrella"
15 | )
16 |
17 | for scene in "${scenes[@]}"
18 | do
19 | echo "Processing scene: $scene"
20 |
21 | # Check if the folder already exists
22 | if [ -d "$scene" ]; then
23 | echo "Folder $scene already exists, skipping download"
24 | else
25 | zip_file="${scene}.zip"
26 | wget "https://huggingface.co/datasets/gradient-spaces/Wild-SLAM/resolve/main/Mocap/scene1/${zip_file}"
27 |
28 | if [ $? -eq 0 ]; then
29 | echo "Successfully downloaded ${zip_file}"
30 | unzip -q "${zip_file}"
31 | if [ $? -eq 0 ]; then
32 | echo "Successfully extracted ${zip_file}"
33 | rm "${zip_file}"
34 | echo "Removed ${zip_file}"
35 | else
36 | echo "Failed to extract ${zip_file}"
37 | fi
38 | else
39 | echo "Failed to download ${zip_file}"
40 | fi
41 | fi
42 |
43 | echo "Finished processing ${scene}"
44 | echo "-----------------------------"
45 | done
--------------------------------------------------------------------------------
/scripts_downloading/download_wild_slam_mocap_scene2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p datasets/Wild_SLAM_Mocap/scene2
4 | cd datasets/Wild_SLAM_Mocap/scene2
5 |
6 | scenes=(
7 | "ANYmal1"
8 | "ANYmal2"
9 | )
10 |
11 | for scene in "${scenes[@]}"
12 | do
13 | echo "Processing scene: $scene"
14 |
15 | # Check if the folder already exists
16 | if [ -d "$scene" ]; then
17 | echo "Folder $scene already exists, skipping download"
18 | else
19 | zip_file="${scene}.zip"
20 | wget "https://huggingface.co/datasets/gradient-spaces/Wild-SLAM/resolve/main/Mocap/scene2/${zip_file}"
21 |
22 | if [ $? -eq 0 ]; then
23 | echo "Successfully downloaded ${zip_file}"
24 | unzip -q "${zip_file}"
25 | if [ $? -eq 0 ]; then
26 | echo "Successfully extracted ${zip_file}"
27 | rm "${zip_file}"
28 | echo "Removed ${zip_file}"
29 | else
30 | echo "Failed to extract ${zip_file}"
31 | fi
32 | else
33 | echo "Failed to download ${zip_file}"
34 | fi
35 | fi
36 |
37 | echo "Finished processing ${scene}"
38 | echo "-----------------------------"
39 | done
--------------------------------------------------------------------------------
/scripts_run/run_bonn_all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python run.py ./configs/Dynamic/Bonn/bonn_balloon.yaml
4 | python run.py ./configs/Dynamic/Bonn/bonn_balloon2.yaml
5 | python run.py ./configs/Dynamic/Bonn/bonn_crowd.yaml
6 | python run.py ./configs/Dynamic/Bonn/bonn_crowd2.yaml
7 | python run.py ./configs/Dynamic/Bonn/bonn_moving_nonobstructing_box.yaml
8 | python run.py ./configs/Dynamic/Bonn/bonn_moving_nonobstructing_box2.yaml
9 | python run.py ./configs/Dynamic/Bonn/bonn_person_tracking.yaml
10 | python run.py ./configs/Dynamic/Bonn/bonn_person_tracking2.yaml
11 |
--------------------------------------------------------------------------------
/scripts_run/run_tum_dynamic_all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg2_desk_with_person.yaml
4 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_sitting_halfsphere_static.yaml
5 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_sitting_halfsphere.yaml
6 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_sitting_rpy.yaml
7 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_sitting_xyz.yaml
8 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_walking_halfsphere_static.yaml
9 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_walking_halfsphere.yaml
10 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_walking_rpy.yaml
11 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_walking_xyz.yaml
--------------------------------------------------------------------------------
/scripts_run/run_wild_slam_mocap_all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/ball.yaml
4 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/crowd.yaml
5 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/person_tracking.yaml
6 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/racket.yaml
7 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/stones.yaml
8 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/table_tracking1.yaml
9 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/table_tracking2.yaml
10 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/umbrella.yaml
11 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/ANYmal1.yaml
12 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/ANYmal2.yaml
13 |
--------------------------------------------------------------------------------
/scripts_run/summarize_pose_eval.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import json
3 | import pandas as pd
4 | import os
5 |
6 | datasets = os.listdir('./output')
7 | for dataset in datasets:
8 | if not os.path.isdir(os.path.join('output', dataset)):
9 | continue
10 | dataset_path = os.path.join('output', dataset)
11 | scenes = sorted(os.listdir(dataset_path))
12 |
13 | data = {scene: [] for scene in scenes}
14 | averages = []
15 |
16 | row_data = []
17 | rmses = []
18 | for scene in scenes:
19 | exp_folder = os.path.join(dataset_path, scene)
20 | # metrics_full_traj, metrics_kf_traj, metrics_kf_traj_before_ba
21 | result_file = os.path.join(exp_folder, "traj/metrics_full_traj.txt")
22 | if os.path.exists(result_file):
23 |             # Parse the metrics text file; the RMSE dict is on line 9
24 | with open(result_file, "r") as f:
25 | output = f.readlines()
26 |
27 | rmse = float(output[8].split(',')[0].replace("{'rmse': ",''))
28 |
29 | # Add metrics to the row
30 | row_data.append(f"{rmse*1e2:.2f}")
31 | rmses.append(rmse)
32 | else:
33 | row_data.append("N/A") # If file doesn't exist, mark it as N/A
34 | avg_rmse = np.nanmean(rmses)
35 | averages.append(f"{avg_rmse*1e2:.2f}")
36 | for scene, value in zip(scenes, row_data):
37 | data[scene].append(value)
38 |
39 | data['Average'] = averages
40 |
41 | # Convert the data to a Pandas DataFrame
42 | df = pd.DataFrame(data, index=['wildgs-slam'])
43 |
44 | # Save the DataFrame as a CSV file
45 | csv_path = f"./output/{dataset}_eval.csv"
46 | df.to_csv(csv_path)
47 |
48 | # Output the CSV file path
49 | print(f"Results saved to {csv_path}")
50 |
--------------------------------------------------------------------------------
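The RMSE above is recovered by slicing a fixed line of `traj/metrics_full_traj.txt` and stripping a string prefix, which implies that line 9 of that file is a Python-style dict literal beginning with `{'rmse': ...}`. If that assumption holds, the same value can be read a little more defensively; the helper below is purely illustrative and not part of the repository:

```python
# Hypothetical helper, assuming (as the slicing above implies) that line 9 of
# traj/metrics_full_traj.txt is a dict literal such as {'rmse': 0.012, ...}.
import ast

def read_rmse(result_file: str) -> float:
    with open(result_file, "r") as f:
        lines = f.readlines()
    metrics = ast.literal_eval(lines[8].strip())
    return float(metrics['rmse'])
```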
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3 |
4 | import os.path as osp
5 | ROOT = osp.dirname(osp.abspath(__file__))
6 |
7 | setup(
8 | name='droid_backends',
9 | ext_modules=[
10 | CUDAExtension('droid_backends',
11 | include_dirs=[osp.join(ROOT, 'thirdparty/lietorch/eigen')],
12 | sources=[
13 | 'src/lib/droid.cpp',
14 | 'src/lib/droid_kernels.cu',
15 | 'src/lib/correlation_kernels.cu',
16 | 'src/lib/altcorr_kernel.cu',
17 | ],
18 | extra_compile_args={
19 | 'cxx': ['-O3'],
20 | 'nvcc': ['-O3',
21 | '-gencode=arch=compute_60,code=sm_60',
22 | '-gencode=arch=compute_61,code=sm_61',
23 | '-gencode=arch=compute_70,code=sm_70',
24 | '-gencode=arch=compute_75,code=sm_75',
25 | '-gencode=arch=compute_80,code=sm_80',
26 | '-gencode=arch=compute_86,code=sm_86',
27 | ]
28 | }),
29 | ],
30 | cmdclass={ 'build_ext' : BuildExtension }
31 | )
32 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/src/__init__.py
--------------------------------------------------------------------------------
/src/backend.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The GlORIE-SLAM Authors.
2 |
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 |
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch
16 | from src.factor_graph import FactorGraph
17 | from copy import deepcopy
18 |
19 | class Backend:
20 | def __init__(self, net, video, cfg):
21 | self.cfg = cfg
22 | self.video = video
23 | self.update_op = net.update
24 | self.device = cfg['device']
25 | # global optimization window
26 | self.t0 = 0
27 | self.t1 = 0
28 |
29 | self.beta = cfg['tracking']['beta']
30 | self.backend_thresh = cfg['tracking']['backend']['thresh']
31 | self.backend_radius = cfg['tracking']['backend']['radius']
32 | self.backend_nms = cfg['tracking']['backend']['nms']
33 | self.backend_normalize = cfg['tracking']['backend']['normalize']
34 | self.output = f"{cfg['data']['output']}/{cfg['scene']}"
35 |
36 | self.backend_loop_window = cfg['tracking']['backend']['loop_window']
37 | self.backend_loop_thresh = cfg['tracking']['backend']['loop_thresh']
38 | self.backend_loop_radius = cfg['tracking']['backend']['loop_radius']
39 | self.backend_loop_nms = cfg['tracking']['backend']['loop_nms']
40 |
41 | @torch.no_grad()
42 | def backend_ba(self, t_start, t_end, steps, graph, nms, radius, thresh, max_factors, t_start_loop=None, loop=False, motion_only=False, enable_wq=True):
43 | """ main update """
44 | if self.cfg['tracking']["uncertainty_params"]['activate']:
45 | self.video.update_all_uncertainty_mask()
46 |
47 | if t_start_loop is None or not loop:
48 | t_start_loop = t_start
49 | assert t_start_loop >= t_start, f'short: {t_start_loop}, long: {t_start}.'
50 | edge_num = graph.add_backend_proximity_factors(t_start,t_end,nms,radius,thresh,max_factors,self.beta, t_start_loop,loop)
51 | if edge_num == 0:
52 | graph.clear_edges()
53 | return 0
54 |
55 | graph.update_lowmem(
56 | t0=t_start_loop+1, # fix the start point to avoid drift, be sure to use t_start_loop rather than t_start here.
57 | t1=t_end,
58 | itrs=2,
59 | use_inactive=False,
60 | steps=steps,
61 | enable_wq = enable_wq
62 | )
63 |
64 | graph.clear_edges()
65 | return edge_num
66 |
67 | @torch.no_grad()
68 | def dense_ba(self, steps=6, enable_wq=True):
69 | t_start = 0
70 | t_end = self.video.counter.value
71 | nms = self.backend_nms
72 | radius = self.backend_radius
73 | thresh = self.backend_thresh
74 | n = t_end - t_start
75 | max_factors = ((radius + 2) * 2) * n
76 | if self.backend_normalize:
77 | self.video.normalize()
78 | graph = FactorGraph(self.video, self.update_op, device=self.device,
79 | corr_impl='alt', max_factors=max_factors)
80 | n_edges = self.backend_ba(t_start, t_end, steps, graph, nms, radius,
81 | thresh, max_factors, motion_only=False, enable_wq=enable_wq)
82 |
83 | del graph
84 | torch.cuda.empty_cache()
85 | self.video.set_dirty(t_start,t_end)
86 | self.video.update_valid_depth_mask()
87 | return n, n_edges
88 |
89 |
90 |
91 | @torch.no_grad()
92 | def loop_ba(self, t_start, t_end, steps=6, motion_only=False, local_graph=None, enable_wq=True):
93 |         ''' loop closure, add edges with high covisibility'''
94 | radius = self.backend_loop_radius
95 | window = self.backend_loop_window
96 | max_factors = 8 * window
97 | nms = self.backend_loop_nms
98 | thresh = self.backend_loop_thresh
99 | t_start_loop = max(0, t_end - window)
100 |
101 | graph = FactorGraph(self.video, self.update_op, device=self.device, corr_impl='alt', max_factors=max_factors)
102 | if local_graph is not None:
103 | copy_attr = ['ii', 'jj', 'age', 'net', 'target', 'weight']
104 | for key in copy_attr:
105 | val = getattr(local_graph, key)
106 | if val is not None:
107 | setattr(graph, key, deepcopy(val))
108 |
109 | left_factors = max_factors - len(graph.ii)
110 | n_edges = self.backend_ba(t_start, t_end, steps, graph, nms, radius, thresh,
111 | left_factors, t_start_loop=t_start_loop, loop=True,
112 | motion_only=motion_only, enable_wq=enable_wq)
113 | del graph
114 | torch.cuda.empty_cache()
115 | return t_end - t_start_loop, n_edges
116 |
117 |
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The GlORIE-SLAM Authors.
2 |
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 |
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import yaml
16 |
17 |
18 | def load_config(path, default_path=None):
19 | """
20 | Load config file
21 | Args:
22 | path: (str), path to config file
23 |         default_path: (str, optional), path to a fallback config used when 'inherit_from' is absent.
24 |
25 | Returns:
26 | cfg: (dict), config dict
27 |
28 | """
29 | # load configuration from file itself
30 | with open(path, 'r' ) as f:
31 | cfg_special = yaml.full_load(f)
32 |
33 | # check if we should inherit from a config
34 | inherit_from = cfg_special.get('inherit_from')
35 |
36 | # if yes, load this config first as default
37 | # if no, use the default path
38 | if inherit_from is not None:
39 | cfg = load_config(inherit_from, default_path)
40 | elif default_path is not None:
41 | with open(default_path, 'r') as f:
42 | cfg = yaml.full_load(f)
43 | else:
44 | cfg = dict()
45 |
46 | # include main configuration
47 | update_recursive(cfg, cfg_special)
48 |
49 | return cfg
50 |
51 | def save_config(cfg, path):
52 | with open(path, 'w+') as fp:
53 | yaml.dump(cfg, fp)
54 |
55 |
56 | def update_recursive(dict1, dict2):
57 | """
58 | update two config dictionaries recursively
59 | Args:
60 | dict1: (dict), first dictionary to be updated
61 |         dict2: (dict), second dictionary whose entries should be used
62 |
63 | Returns:
64 |
65 | """
66 | for k, v in dict2.items():
67 | if k not in dict1:
68 | dict1[k] = dict()
69 | if isinstance(v, dict):
70 | update_recursive(dict1[k], v)
71 | else:
72 | dict1[k] = v
--------------------------------------------------------------------------------
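`update_recursive` is what keeps the per-scene YAMLs so small: the inherited base config is loaded first, and the child's keys are merged into it, nested dictionaries included. A self-contained demonstration of that merge behaviour (plain dicts, no files, illustrative values):

```python
# Demonstrates update_recursive's merge semantics, mirroring how a per-scene
# YAML overrides only a few keys of its inherited base config.
def update_recursive(dict1, dict2):
    for k, v in dict2.items():
        if k not in dict1:
            dict1[k] = dict()
        if isinstance(v, dict):
            update_recursive(dict1[k], v)
        else:
            dict1[k] = v

base  = {'scene': 'base', 'cam': {'H': 480, 'W': 640, 'fx': 535.4}}
child = {'scene': 'freiburg3_walking_xyz', 'cam': {'fx': 539.0}}
update_recursive(base, child)
print(base)
# {'scene': 'freiburg3_walking_xyz', 'cam': {'H': 480, 'W': 640, 'fx': 539.0}}
```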
/src/geom/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/src/geom/__init__.py
--------------------------------------------------------------------------------
/src/geom/chol.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The GlORIE-SLAM Authors.
2 |
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 |
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch
16 | import torch.nn.functional as F
17 | import src.geom.projective_ops as pops
18 |
19 | # class CholeskySolver(torch.autograd.Function):
20 | class CholeskySolver():
21 | @staticmethod
22 |
23 | def apply(H,b):
24 | try:
25 | U = torch.linalg.cholesky(H)
26 | xs = torch.cholesky_solve(b, U)
27 | except Exception as e:
28 | print(e)
29 | xs = torch.zeros_like(b)
30 |
31 | return xs
32 |
33 | def __call__(ctx, H, b):
34 | # don't crash training if cholesky decomp fails
35 | try:
36 | U = torch.linalg.cholesky(H)
37 | xs = torch.cholesky_solve(b, U)
38 | ctx.save_for_backward(U, xs)
39 | ctx.failed = False
40 | except Exception as e:
41 | print(e)
42 | ctx.failed = True
43 | xs = torch.zeros_like(b)
44 |
45 | return xs
46 |
47 | @staticmethod
48 | def backward(ctx, grad_x):
49 | if ctx.failed:
50 | return None, None
51 |
52 | U, xs = ctx.saved_tensors
53 | dz = torch.cholesky_solve(grad_x, U)
54 | dH = -torch.matmul(xs, dz.transpose(-1,-2))
55 |
56 | return dH, dz
57 |
58 | def block_solve(H, b, ep=0.1, lm=0.0001):
59 | """ solve normal equations """
60 | B, N, _, D, _ = H.shape
61 | I = torch.eye(D).to(H.device)
62 | H = H + (ep + lm*H) * I
63 |
64 | H = H.permute(0,1,3,2,4)
65 | H = H.reshape(B, N*D, N*D)
66 | b = b.reshape(B, N*D, 1)
67 |
68 | x = CholeskySolver.apply(H,b)
69 | return x.reshape(B, N, D)
70 |
71 |
72 | def schur_solve(H, E, C, v, w, ep=0.1, lm=0.0001, sless=False):
73 |     """ solve using Schur complement """
74 |
75 | B, P, M, D, HW = E.shape
76 | H = H.permute(0,1,3,2,4).reshape(B, P*D, P*D)
77 | E = E.permute(0,1,3,2,4).reshape(B, P*D, M*HW)
78 | Q = (1.0 / C).view(B, M*HW, 1)
79 |
80 | # damping
81 | I = torch.eye(P*D).to(H.device)
82 | H = H + (ep + lm*H) * I
83 |
84 | v = v.reshape(B, P*D, 1)
85 | w = w.reshape(B, M*HW, 1)
86 |
87 | Et = E.transpose(1,2)
88 | S = H - torch.matmul(E, Q*Et)
89 | v = v - torch.matmul(E, Q*w)
90 |
91 | dx = CholeskySolver.apply(S, v)
92 | if sless:
93 | return dx.reshape(B, P, D)
94 |
95 | dz = Q * (w - Et @ dx)
96 | dx = dx.reshape(B, P, D)
97 | dz = dz.reshape(B, M, HW)
98 |
99 | return dx, dz
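
For reference, a short sketch of the algebra `schur_solve` implements: the damped normal equations split into a pose block H and a diagonal per-pixel block C, and the depth variables are eliminated first via the Schur complement.

```latex
\[
\begin{bmatrix} H & E \\ E^{\top} & C \end{bmatrix}
\begin{bmatrix} \Delta x \\ \Delta z \end{bmatrix}
=
\begin{bmatrix} v \\ w \end{bmatrix},
\qquad
S = H - E\,Q\,E^{\top}, \quad
\Delta x = S^{-1}\bigl(v - E\,Q\,w\bigr), \quad
\Delta z = Q\bigl(w - E^{\top}\Delta x\bigr), \quad Q = C^{-1}
\]
```

`S` is factored by `CholeskySolver`, and `Q` is cheap to form because `C` is diagonal.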
--------------------------------------------------------------------------------
/src/geom/projective_ops.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The GlORIE-SLAM Authors.
2 |
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 |
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch
16 | import torch.nn.functional as F
17 |
18 | from lietorch import SE3, Sim3
19 |
20 | MIN_DEPTH = 0.2
21 |
22 | def extract_intrinsics(intrinsics):
23 | return intrinsics[...,None,None,:].unbind(dim=-1)
24 |
25 | def coords_grid(ht, wd, device):
26 | y, x = torch.meshgrid(
27 | torch.arange(ht).to(device).float(),
28 | torch.arange(wd).to(device).float(),indexing="ij")
29 |
30 | return torch.stack([x, y], dim=-1)
31 |
32 | def iproj(disps, intrinsics, jacobian=False):
33 | """ pinhole camera inverse projection """
34 | ht, wd = disps.shape[2:]
35 | fx, fy, cx, cy = extract_intrinsics(intrinsics)
36 |
37 | y, x = torch.meshgrid(
38 | torch.arange(ht).to(disps.device).float(),
39 | torch.arange(wd).to(disps.device).float(),indexing="ij")
40 |
41 | i = torch.ones_like(disps)
42 | X = (x - cx) / fx
43 | Y = (y - cy) / fy
44 | pts = torch.stack([X, Y, i, disps], dim=-1)
45 |
46 | if jacobian:
47 | J = torch.zeros_like(pts)
48 | J[...,-1] = 1.0
49 | return pts, J
50 |
51 | return pts, None
52 |
53 | def proj(Xs, intrinsics, jacobian=False, return_depth=False):
54 | """ pinhole camera projection """
55 | fx, fy, cx, cy = extract_intrinsics(intrinsics)
56 | X, Y, Z, D = Xs.unbind(dim=-1)
57 |
58 | Z = torch.where(Z < 0.5*MIN_DEPTH, torch.ones_like(Z), Z)
59 | d = 1.0 / Z
60 |
61 | x = fx * (X * d) + cx
62 | y = fy * (Y * d) + cy
63 | if return_depth:
64 | coords = torch.stack([x, y, D*d], dim=-1)
65 | else:
66 | coords = torch.stack([x, y], dim=-1)
67 |
68 | if jacobian:
69 | B, N, H, W = d.shape
70 | o = torch.zeros_like(d)
71 | proj_jac = torch.stack([
72 | fx*d, o, -fx*X*d*d, o,
73 | o, fy*d, -fy*Y*d*d, o,
74 | # o, o, -D*d*d, d,
75 | ], dim=-1).view(B, N, H, W, 2, 4)
76 |
77 | return coords, proj_jac
78 |
79 | return coords, None
80 |
81 | def actp(Gij, X0, jacobian=False):
82 | """ action on point cloud """
83 | X1 = Gij[:,:,None,None] * X0
84 |
85 | if jacobian:
86 | X, Y, Z, d = X1.unbind(dim=-1)
87 | o = torch.zeros_like(d)
88 | B, N, H, W = d.shape
89 |
90 | if isinstance(Gij, SE3):
91 | Ja = torch.stack([
92 | d, o, o, o, Z, -Y,
93 | o, d, o, -Z, o, X,
94 | o, o, d, Y, -X, o,
95 | o, o, o, o, o, o,
96 | ], dim=-1).view(B, N, H, W, 4, 6)
97 |
98 | elif isinstance(Gij, Sim3):
99 | Ja = torch.stack([
100 | d, o, o, o, Z, -Y, X,
101 | o, d, o, -Z, o, X, Y,
102 | o, o, d, Y, -X, o, Z,
103 | o, o, o, o, o, o, o
104 | ], dim=-1).view(B, N, H, W, 4, 7)
105 |
106 | return X1, Ja
107 |
108 | return X1, None
109 |
110 | def projective_transform(poses, depths, intrinsics, ii, jj, jacobian=False, return_depth=False):
111 | """ map points from ii->jj """
112 |
113 | # inverse project (pinhole)
114 | X0, Jz = iproj(depths[:,ii], intrinsics[:,ii], jacobian=jacobian)
115 |
116 | # transform
117 | Gij = poses[:,jj] * poses[:,ii].inv()
118 |
119 | Gij.data[:,ii==jj] = torch.as_tensor([-0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], device="cuda")
120 | X1, Ja = actp(Gij, X0, jacobian=jacobian)
121 |
122 | # project (pinhole)
123 | x1, Jp = proj(X1, intrinsics[:,jj], jacobian=jacobian, return_depth=return_depth)
124 |
125 | # exclude points too close to camera
126 | valid = ((X1[...,2] > MIN_DEPTH) & (X0[...,2] > MIN_DEPTH)).float()
127 | valid = valid.unsqueeze(-1)
128 |
129 | if jacobian:
130 | # Ji transforms according to dual adjoint
131 | Jj = torch.matmul(Jp, Ja)
132 | Ji = -Gij[:,:,None,None,None].adjT(Jj)
133 |
134 | Jz = Gij[:,:,None,None] * Jz
135 | Jz = torch.matmul(Jp, Jz.unsqueeze(-1))
136 |
137 | return x1, valid, (Ji, Jj, Jz)
138 |
139 | return x1, valid
140 |
141 | def induced_flow(poses, disps, intrinsics, ii, jj):
142 | """ optical flow induced by camera motion """
143 |
144 | ht, wd = disps.shape[2:]
145 | y, x = torch.meshgrid(
146 | torch.arange(ht).to(disps.device).float(),
147 | torch.arange(wd).to(disps.device).float(),indexing="ij")
148 |
149 | coords0 = torch.stack([x, y], dim=-1)
150 | coords1, valid = projective_transform(poses, disps, intrinsics, ii, jj, False)
151 |
152 | return coords1[...,:2] - coords0, valid
153 |
154 |
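
A toy shape check for `induced_flow` (a sketch, not taken from the repo; it assumes a CUDA device because `projective_transform` pins the `ii == jj` case to `"cuda"`): two identity-pose frames with unit inverse depth should induce zero flow.

```python
import torch
from lietorch import SE3
from src.geom.projective_ops import induced_flow

B, N, ht, wd = 1, 2, 30, 40
pose_vec = torch.zeros(B, N, 7, device="cuda")
pose_vec[..., 6] = 1.0                                   # (tx,ty,tz,qx,qy,qz,qw): identity rotation
disps = torch.ones(B, N, ht, wd, device="cuda")          # unit inverse depth
intrinsics = torch.tensor([[40.0, 40.0, 20.0, 15.0]], device="cuda").repeat(B, N, 1)
ii = torch.tensor([0], device="cuda")
jj = torch.tensor([1], device="cuda")

flow, valid = induced_flow(SE3(pose_vec), disps, intrinsics, ii, jj)
print(flow.shape, flow.abs().max())                      # [1, 1, 30, 40, 2], ~0 for identity poses
```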
--------------------------------------------------------------------------------
/src/gui/gl_render/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Li Ma
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/src/gui/gl_render/__init__.py:
--------------------------------------------------------------------------------
1 | from . import render_ogl, util, util_gau
2 |
--------------------------------------------------------------------------------
/src/gui/gl_render/render_ogl.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import torch
5 | from OpenGL import GL as gl
6 |
7 | from . import util, util_gau
8 |
9 | _sort_buffer_xyz = None
10 | _sort_buffer_gausid = None # used to tell whether gaussian is reloaded
11 |
12 |
13 | def _sort_gaussian_torch(gaus, view_mat):
14 | global _sort_buffer_gausid, _sort_buffer_xyz
15 | if _sort_buffer_gausid != id(gaus):
16 | _sort_buffer_xyz = torch.tensor(gaus.xyz).cuda()
17 | _sort_buffer_gausid = id(gaus)
18 |
19 | xyz = torch.tensor(gaus.xyz).cuda()
20 | view_mat = torch.tensor(view_mat).cuda()
21 | xyz_view = view_mat[None, :3, :3] @ xyz[..., None] + view_mat[None, :3, 3, None]
22 | depth = xyz_view[:, 2, 0]
23 | index = torch.argsort(depth)
24 | index = index.type(torch.int32).reshape(-1, 1).cpu().numpy()
25 | return index
26 |
27 |
28 | # Decide which sort to use
29 | _sort_gaussian = None
30 | if not torch.cuda.is_available():
31 |     raise ImportError("CUDA is required for torch-based Gaussian depth sorting")
32 | _sort_gaussian = _sort_gaussian_torch
33 |
34 |
35 | class GaussianRenderBase:
36 | def __init__(self):
37 | self.gaussians = None
38 |
39 | def update_gaussian_data(self, gaus: util_gau.GaussianData):
40 | raise NotImplementedError()
41 |
42 | def sort_and_update(self):
43 | raise NotImplementedError()
44 |
45 | def set_scale_modifier(self, modifier: float):
46 | raise NotImplementedError()
47 |
48 | def set_render_mod(self, mod: int):
49 | raise NotImplementedError()
50 |
51 | def update_camera_pose(self, camera: util.Camera):
52 | raise NotImplementedError()
53 |
54 | def update_camera_intrin(self, camera: util.Camera):
55 | raise NotImplementedError()
56 |
57 | def draw(self):
58 | raise NotImplementedError()
59 |
60 | def set_render_reso(self, w, h):
61 | raise NotImplementedError()
62 |
63 |
64 | class OpenGLRenderer(GaussianRenderBase):
65 | def __init__(self, w, h):
66 | super().__init__()
67 | gl.glViewport(0, 0, w, h)
68 | cur_path = os.path.dirname(os.path.abspath(__file__))
69 | self.program = util.load_shaders(
70 | os.path.join(cur_path, "shaders/gau_vert.glsl"),
71 | os.path.join(cur_path, "shaders/gau_frag.glsl"),
72 | )
73 |
74 | # Vertex data for a quad
75 | self.quad_v = np.array([-1, 1, 1, 1, 1, -1, -1, -1], dtype=np.float32).reshape(
76 | 4, 2
77 | )
78 | self.quad_f = np.array([0, 1, 2, 0, 2, 3], dtype=np.uint32).reshape(2, 3)
79 |
80 | # load quad geometry
81 | vao, buffer_id = util.set_attributes(self.program, ["position"], [self.quad_v])
82 | util.set_faces_tovao(vao, self.quad_f)
83 | self.vao = vao
84 | self.gau_bufferid = None
85 | self.index_bufferid = None
86 |
87 | # opengl settings
88 | gl.glDisable(gl.GL_CULL_FACE)
89 | gl.glEnable(gl.GL_BLEND)
90 | gl.glBlendFunc(gl.GL_SRC_ALPHA, gl.GL_ONE_MINUS_SRC_ALPHA)
91 |
92 | def update_gaussian_data(self, gaus: util_gau.GaussianData):
93 | self.gaussians = gaus
94 | # load gaussian geometry
95 | gaussian_data = gaus.flat()
96 | self.gau_bufferid = util.set_storage_buffer_data(
97 | self.program, "gaussian_data", gaussian_data, bind_idx=0,
98 | buffer_id=self.gau_bufferid
99 | )
100 | util.set_uniform_1int(self.program, gaus.sh_dim, "sh_dim")
101 |
102 | def sort_and_update(self, camera: util.Camera):
103 | index = _sort_gaussian(self.gaussians, camera.get_view_matrix())
104 | self.index_bufferid = util.set_storage_buffer_data(self.program, "gi", index, bind_idx=1,
105 | buffer_id=self.index_bufferid)
106 | return
107 |
108 | def set_scale_modifier(self, modifier):
109 | util.set_uniform_1f(self.program, modifier, "scale_modifier")
110 |
111 | def set_render_mod(self, mod: int):
112 | util.set_uniform_1int(self.program, mod, "render_mod")
113 |
114 | def set_render_reso(self, w, h):
115 | gl.glViewport(0, 0, w, h)
116 |
117 | def update_camera_pose(self, camera: util.Camera):
118 | view_mat = camera.get_view_matrix()
119 | util.set_uniform_mat4(self.program, view_mat, "view_matrix")
120 | util.set_uniform_v3(self.program, camera.position, "cam_pos")
121 |
122 | def update_camera_intrin(self, camera: util.Camera):
123 | proj_mat = camera.get_project_matrix()
124 | util.set_uniform_mat4(self.program, proj_mat, "projection_matrix")
125 | util.set_uniform_v3(self.program, camera.get_htanfovxy_focal(), "hfovxy_focal")
126 |
127 | def draw(self):
128 | gl.glUseProgram(self.program)
129 | gl.glBindVertexArray(self.vao)
130 | num_gau = len(self.gaussians)
131 | gl.glDrawElementsInstanced(
132 | gl.GL_TRIANGLES,
133 | len(self.quad_f.reshape(-1)),
134 | gl.GL_UNSIGNED_INT,
135 | None,
136 | num_gau,
137 | )
138 |
--------------------------------------------------------------------------------
/src/gui/gl_render/shaders/gau_frag.glsl:
--------------------------------------------------------------------------------
1 | #version 430 core
2 |
3 | in vec3 color;
4 | in float alpha;
5 | in vec3 conic;
6 | in vec2 coordxy; // local coordinate in quad, unit in pixel
7 |
8 | uniform int render_mod;  // > 0 render 0-ith SH dim, -1 depth, -2 billboard, -3 flat ball, -4 gaussian ball
9 |
10 | out vec4 FragColor;
11 |
12 | void main()
13 | {
14 | if (render_mod == -2)
15 | {
16 | FragColor = vec4(color, 1.f);
17 | return;
18 | }
19 |
20 | float power = -0.5f * (conic.x * coordxy.x * coordxy.x + conic.z * coordxy.y * coordxy.y) - conic.y * coordxy.x * coordxy.y;
21 | if (power > 0.f)
22 | discard;
23 | float opacity = min(0.99f, alpha * exp(power));
24 | if (opacity < 1.f / 255.f)
25 | discard;
26 | FragColor = vec4(color, opacity);
27 |
28 | // handling special shading effect
29 | if (render_mod == -3)
30 | FragColor.a = FragColor.a > 0.22 ? 1 : 0;
31 | else if (render_mod == -4)
32 | {
33 | FragColor.a = FragColor.a > 0.4 ? 1 : 0;
34 | FragColor.rgb = FragColor.rgb * exp(power);
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/gui/gl_render/util_gau.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | import numpy as np
4 |
5 |
6 | @dataclass
7 | class GaussianData:
8 | xyz: np.ndarray
9 | rot: np.ndarray
10 | scale: np.ndarray
11 | opacity: np.ndarray
12 | sh: np.ndarray
13 |
14 | def flat(self) -> np.ndarray:
15 | ret = np.concatenate(
16 | [self.xyz, self.rot, self.scale, self.opacity, self.sh], axis=-1
17 | )
18 | return np.ascontiguousarray(ret)
19 |
20 | def __len__(self):
21 | return len(self.xyz)
22 |
23 | @property
24 | def sh_dim(self):
25 | return self.sh.shape[-1]
26 |
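
A small construction sketch for `GaussianData` (the wxyz quaternion convention and degree-0 SH layout are assumptions for illustration only):

```python
import numpy as np
from src.gui.gl_render.util_gau import GaussianData

n = 2
gaus = GaussianData(
    xyz=np.zeros((n, 3), dtype=np.float32),
    rot=np.tile(np.array([1, 0, 0, 0], dtype=np.float32), (n, 1)),  # identity rotation (assumed wxyz)
    scale=np.full((n, 3), 0.01, dtype=np.float32),
    opacity=np.ones((n, 1), dtype=np.float32),
    sh=np.zeros((n, 3), dtype=np.float32),                          # degree-0 SH: 3 coefficients
)
print(len(gaus), gaus.sh_dim, gaus.flat().shape)                    # 2 3 (2, 14)
```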
--------------------------------------------------------------------------------
/src/modules/droid_net/__init__.py:
--------------------------------------------------------------------------------
1 | from .clipping import GradientClip
2 | from .gru import ConvGRU
3 | from .extractor import BasicEncoder
4 | from .corr import CorrBlock, AltCorrBlock
5 | from .droid_net import DroidNet, cvx_upsample
--------------------------------------------------------------------------------
/src/modules/droid_net/clipping.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The GlORIE-SLAM Authors.
2 |
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 |
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch
16 | import torch.nn as nn
17 |
18 |
19 | GRAD_CLIP = 0.01
20 |
21 | class GradClip(torch.autograd.Function):
22 | @staticmethod
23 | def forward(ctx, x):
24 | return x
25 |
26 | @staticmethod
27 | def backward(ctx, grad_x):
28 | o = torch.zeros_like(grad_x)
29 | grad_x = torch.where(grad_x.abs() > GRAD_CLIP, o, grad_x)
30 | grad_x = torch.where(torch.isnan(grad_x), o, grad_x)
31 |
32 | return grad_x
33 |
34 |
35 | class GradientClip(nn.Module):
36 | def __init__(self):
37 | super(GradientClip, self).__init__()
38 |
39 | def forward(self, x):
40 | return GradClip.apply(x)
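
A quick sketch of the clipping behaviour: the forward pass is an identity, while gradients whose magnitude exceeds GRAD_CLIP (or that are NaN) are zeroed on the backward pass.

```python
import torch
from src.modules.droid_net.clipping import GradientClip

x = torch.tensor([1.0, 2.0], requires_grad=True)
y = GradientClip()(x)                   # identity in the forward pass
y.backward(torch.tensor([0.005, 0.5]))  # the second gradient exceeds GRAD_CLIP = 0.01
print(x.grad)                           # tensor([0.0050, 0.0000])
```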
--------------------------------------------------------------------------------
/src/modules/droid_net/droid_net.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The GlORIE-SLAM Authors.
2 |
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 |
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch
16 | import torch.nn as nn
17 | import torch.nn.functional as F
18 | from torch_scatter import scatter_mean
19 |
20 | from src.modules.droid_net import ConvGRU, BasicEncoder, GradientClip
21 |
22 |
23 | def cvx_upsample(data, mask):
24 | """ upsample pixel-wise transformation field """
25 | batch, ht, wd, dim = data.shape
26 | data = data.permute(0, 3, 1, 2).contiguous()
27 | mask = mask.view(batch, 1, 9, 8, 8, ht, wd)
28 | mask = torch.softmax(mask, dim=2)
29 |
30 | up_data = F.unfold(data, kernel_size=(3, 3), padding=(1, 1))
31 | up_data = up_data.view(batch, dim, 9, 1, 1, ht, wd)
32 |
33 | up_data = torch.sum(mask * up_data, dim=2, keepdim=False)
34 | up_data = up_data.permute(0, 4, 2, 5, 3, 1).contiguous()
35 | up_data = up_data.reshape(batch, 8*ht, 8*wd, dim)
36 |
37 | return up_data
38 |
39 |
40 | def upsample_disp(disp, mask):
41 | batch, num, ht, wd = disp.shape
42 | disp = disp.view(batch*num, ht, wd, 1)
43 | mask = mask.view(batch*num, -1, ht, wd)
44 |
45 | return cvx_upsample(disp, mask).view(batch, num, 8*ht, 8*wd)
46 |
47 |
48 | class GraphAgg(nn.Module):
49 | def __init__(self):
50 | super(GraphAgg, self).__init__()
51 | self.conv1 = nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1))
52 | self.conv2 = nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1))
53 | self.relu = nn.ReLU(inplace=True)
54 |
55 | self.eta = nn.Sequential(
56 | nn.Conv2d(128, 1, kernel_size=(3, 3), padding=(1, 1)),
57 | GradientClip(),
58 | nn.Softplus(),
59 | )
60 |
61 | self.upmask = nn.Sequential(
62 | nn.Conv2d(128, 8*8*9, kernel_size=(1, 1), padding=(0, 0))
63 | )
64 |
65 | def forward(self, net, ii):
66 | batch, num, ch, ht, wd = net.shape
67 | net = net.view(batch*num, ch, ht, wd)
68 |
69 | _, ix = torch.unique(ii, sorted=True, return_inverse=True)
70 | net = self.relu(self.conv1(net))
71 |         net = net.view(batch, num, 128, ht, wd)
72 |
73 | net = scatter_mean(net, ix, dim=1)
74 | net = net.view(-1, 128, ht, wd)
75 |
76 | net = self.relu(self.conv2(net))
77 | eta = self.eta(net).view(batch, -1, ht, wd)
78 | upmask = self.upmask(net).view(batch, -1, 8*8*9, ht, wd)
79 |
80 | return 0.01 * eta, upmask
81 |
82 |
83 | class UpdateModule(nn.Module):
84 | def __init__(self):
85 | super(UpdateModule, self).__init__()
86 | cor_planes = 4 * (2*3+1)**2
87 |
88 | self.corr_encoder = nn.Sequential(
89 | nn.Conv2d(cor_planes, 128, kernel_size=(1, 1), padding=(0, 0)),
90 | nn.ReLU(inplace=True),
91 | nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1)),
92 | nn.ReLU(inplace=True),
93 | )
94 |
95 | self.flow_encoder = nn.Sequential(
96 | nn.Conv2d(4, 128, kernel_size=(7, 7), padding=(3, 3)),
97 | nn.ReLU(inplace=True),
98 | nn.Conv2d(128, 64, kernel_size=(3, 3), padding=(1, 1)),
99 | nn.ReLU(inplace=True),
100 | )
101 |
102 | self.weight = nn.Sequential(
103 | nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1)),
104 | nn.ReLU(inplace=True),
105 | nn.Conv2d(128, 2, kernel_size=(3, 3), padding=(1, 1)),
106 | GradientClip(),
107 | nn.Sigmoid(),
108 | )
109 |
110 | self.delta = nn.Sequential(
111 | nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1)),
112 | nn.ReLU(inplace=True),
113 | nn.Conv2d(128, 2, kernel_size=(3, 3), padding=(1, 1)),
114 | GradientClip(),
115 | )
116 |
117 | self.gru = ConvGRU(128, 128+128+64)
118 | self.agg = GraphAgg()
119 |
120 | def forward(self, net, inp, corr, flow=None, ii=None, jj=None):
121 | """ update operation """
122 |
123 | batch, num, ch, ht, wd = net.shape
124 | device = net.device
125 |
126 | if flow is None:
127 | flow = torch.zeros(batch, num, 4, ht, wd, device=device)
128 |
129 | out_dim = (batch, num, -1, ht, wd)
130 |
131 | net = net.view(batch*num, -1, ht, wd)
132 | inp = inp.view(batch*num, -1, ht, wd)
133 | corr = corr.view(batch*num, -1, ht, wd)
134 | flow = flow.view(batch*num, -1, ht, wd)
135 |
136 | corr = self.corr_encoder(corr)
137 | flow = self.flow_encoder(flow)
138 | net = self.gru(net, inp, corr, flow)
139 |
140 | ### update variables ###
141 | delta = self.delta(net).view(*out_dim)
142 | weight = self.weight(net).view(*out_dim)
143 |
144 | delta = delta.permute(0, 1, 3, 4, 2)[..., :2].contiguous()
145 | weight = weight.permute(0, 1, 3, 4, 2)[..., :2].contiguous()
146 |
147 | net = net.view(*out_dim)
148 |
149 | if ii is not None:
150 | eta, upmask = self.agg(net, ii.to(device))
151 | return net, delta, weight, eta, upmask
152 | else:
153 | return net, delta, weight
154 |
155 |
156 | class DroidNet(nn.Module):
157 | def __init__(self):
158 | super(DroidNet, self).__init__()
159 | self.fnet = BasicEncoder(out_dim=128, norm_fn='instance')
160 | self.cnet = BasicEncoder(out_dim=256, norm_fn='none')
161 | self.update = UpdateModule()
162 |
163 |
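
A shape sketch for the convex upsampling used above (tensor sizes chosen arbitrarily for illustration): a 1/8-resolution field is upsampled 8x by taking learned convex combinations over 3x3 neighbourhoods.

```python
import torch
from src.modules.droid_net.droid_net import cvx_upsample

data = torch.randn(1, 30, 40, 2)           # [B, ht, wd, dim] low-resolution field
mask = torch.randn(1, 9 * 8 * 8, 30, 40)   # per-pixel logits over 9 neighbours x 8x8 sub-pixels
up = cvx_upsample(data, mask)
print(up.shape)                            # torch.Size([1, 240, 320, 2])
```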
--------------------------------------------------------------------------------
/src/modules/droid_net/extractor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The GlORIE-SLAM Authors.
2 |
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 |
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch.nn as nn
16 |
17 |
18 | class ResidualBlock(nn.Module):
19 | def __init__(self, in_planes, planes, norm_fn='group', stride=1):
20 | super(ResidualBlock, self).__init__()
21 |
22 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
23 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
24 | self.relu = nn.ReLU(inplace=True)
25 |
26 | num_groups = planes // 8
27 | if norm_fn == 'group':
28 | self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
29 | self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
30 | if stride > 1:
31 | self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
32 |
33 | elif norm_fn == 'batch':
34 | self.norm1 = nn.BatchNorm2d(planes)
35 | self.norm2 = nn.BatchNorm2d(planes)
36 | if stride > 1:
37 | self.norm3 = nn.BatchNorm2d(planes)
38 |
39 | elif norm_fn == 'instance':
40 | self.norm1 = nn.InstanceNorm2d(planes)
41 | self.norm2 = nn.InstanceNorm2d(planes)
42 | if stride > 1:
43 | self.norm3 = nn.InstanceNorm2d(planes)
44 |
45 | elif norm_fn == 'none':
46 | self.norm1 = nn.Sequential()
47 | self.norm2 = nn.Sequential()
48 | if stride > 1:
49 | self.norm3 = nn.Sequential()
50 | else:
51 | raise TypeError(norm_fn)
52 |
53 | if stride == 1:
54 | self.downsample = None
55 | else:
56 | self.downsample = nn.Sequential(
57 | nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, padding=0),
58 | self.norm3,
59 | )
60 |
61 | def forward(self, x):
62 | y = x
63 | y = self.relu(self.norm1(self.conv1(y)))
64 | y = self.relu(self.norm2(self.conv2(y)))
65 |
66 | if self.downsample is not None:
67 | x = self.downsample(x)
68 |
69 | return self.relu(x+y)
70 |
71 |
72 | DIM = 32
73 |
74 |
75 | class BasicEncoder(nn.Module):
76 | def __init__(self, out_dim, norm_fn='batch'):
77 | super(BasicEncoder, self).__init__()
78 | self.out_dim = out_dim
79 | self.norm_fn = norm_fn
80 |
81 | if norm_fn == 'group':
82 | self.norm1 = nn.GroupNorm(num_groups=8, num_channels=DIM)
83 |
84 | elif norm_fn == 'batch':
85 | self.norm1 = nn.BatchNorm2d(DIM)
86 |
87 | elif norm_fn == 'instance':
88 | self.norm1 = nn.InstanceNorm2d(DIM)
89 |
90 | elif self.norm_fn == 'none':
91 | self.norm1 = nn.Sequential()
92 |
93 | else:
94 | raise TypeError(self.norm_fn)
95 |
96 | self.conv1 = nn.Conv2d(3, DIM, 7, 2, 3)
97 | self.relu1 = nn.ReLU(inplace=True)
98 |
99 | self.in_planes = DIM
100 | self.layer1 = self._make_layer(DIM, stride=1)
101 | self.layer2 = self._make_layer(2*DIM, stride=2)
102 | self.layer3 = self._make_layer(4*DIM, stride=2)
103 |
104 | self.conv2 = nn.Conv2d(4*DIM, out_dim, kernel_size=(1, 1))
105 |
106 | for m in self.modules():
107 | if isinstance(m, nn.Conv2d):
108 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
109 | elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
110 | if m.weight is not None:
111 | nn.init.constant_(m.weight, 1)
112 | if m.bias is not None:
113 | nn.init.constant_(m.bias, 0)
114 |
115 | def _make_layer(self, dim, stride=1):
116 | layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
117 | layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
118 | layers = [layer1, layer2]
119 |
120 | self.in_planes = dim
121 |
122 | return nn.Sequential(*layers)
123 |
124 | def forward(self, x):
125 | b, n, c1, h1, w1 = x.shape
126 | x = x.view(b*n, c1, h1, w1)
127 |
128 | x = self.conv1(x)
129 | x = self.norm1(x)
130 | x = self.relu1(x)
131 |
132 | x = self.layer1(x)
133 | x = self.layer2(x)
134 | x = self.layer3(x)
135 |
136 | x = self.conv2(x)
137 |
138 | _, c2, h2, w2 = x.shape
139 | x = x.view(b, n, c2, h2, w2)
140 |
141 | return x
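
A quick shape check for the encoder (input size chosen arbitrarily): the stride-2 stem plus two stride-2 residual stages reduce the spatial resolution by a factor of 8.

```python
import torch
from src.modules.droid_net.extractor import BasicEncoder

enc = BasicEncoder(out_dim=128, norm_fn='instance')
x = torch.randn(1, 2, 3, 240, 320)   # [B, N, 3, H, W] image batch
print(enc(x).shape)                  # torch.Size([1, 2, 128, 30, 40])
```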
--------------------------------------------------------------------------------
/src/modules/droid_net/gru.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The GlORIE-SLAM Authors.
2 |
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 |
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch
16 | import torch.nn as nn
17 |
18 |
19 | class ConvGRU(nn.Module):
20 | def __init__(self, h_planes=128, i_planes=128):
21 | super(ConvGRU, self).__init__()
22 | self.do_checkpoint = False
23 |
24 | self.convz = nn.Conv2d(h_planes+i_planes, h_planes, kernel_size=(3, 3), padding=(1, 1))
25 | self.convr = nn.Conv2d(h_planes+i_planes, h_planes, kernel_size=(3, 3), padding=(1, 1))
26 | self.convq = nn.Conv2d(h_planes+i_planes, h_planes, kernel_size=(3, 3), padding=(1, 1))
27 |
28 | self.w = nn.Conv2d(h_planes, h_planes, kernel_size=(1, 1), padding=(0, 0))
29 |
30 | self.convz_glo = nn.Conv2d(h_planes, h_planes, kernel_size=(1, 1), padding=(0, 0))
31 | self.convr_glo = nn.Conv2d(h_planes, h_planes, kernel_size=(1, 1), padding=(0, 0))
32 | self.convq_glo = nn.Conv2d(h_planes, h_planes, kernel_size=(1, 1), padding=(0, 0))
33 |
34 | def forward(self, net, *inputs):
35 | inp = torch.cat(inputs, dim=1)
36 | net_inp = torch.cat([net, inp], dim=1)
37 |
38 | b, c, h, w = net.shape
39 | glo = torch.sigmoid(self.w(net)) * net
40 | glo = glo.view(b, c, h*w).mean(dim=-1, keepdim=True).view(b, c, 1, 1)
41 |
42 | z = torch.sigmoid(self.convz(net_inp) + self.convz_glo(glo))
43 | r = torch.sigmoid(self.convr(net_inp) + self.convr_glo(glo))
44 | q = torch.tanh(self.convq(torch.cat([r*net, inp], dim=1)) + self.convq_glo(glo))
45 |
46 | net = (1 - z) * net + z * q
47 |
48 | return net
--------------------------------------------------------------------------------
/src/tracker.py:
--------------------------------------------------------------------------------
1 | from src.motion_filter import MotionFilter
2 | from src.frontend import Frontend
3 | from src.backend import Backend
4 | import torch
5 | from colorama import Fore, Style
6 | from multiprocessing.connection import Connection
7 | from src.utils.datasets import BaseDataset
8 | from src.utils.Printer import Printer,FontColor
9 | class Tracker:
10 | def __init__(self, slam, pipe:Connection):
11 | self.cfg = slam.cfg
12 | self.device = self.cfg['device']
13 | self.net = slam.droid_net
14 | self.video = slam.video
15 | self.verbose = slam.verbose
16 | self.pipe = pipe
17 | self.output = slam.save_dir
18 |
19 | # filter incoming frames so that there is enough motion
20 | self.frontend_window = self.cfg['tracking']['frontend']['window']
21 | filter_thresh = self.cfg['tracking']['motion_filter']['thresh']
22 | self.motion_filter = MotionFilter(self.net, self.video, self.cfg, thresh=filter_thresh, device=self.device)
23 | self.enable_online_ba = self.cfg['tracking']['frontend']['enable_online_ba']
24 | # frontend process
25 | self.frontend = Frontend(self.net, self.video, self.cfg)
26 | self.online_ba = Backend(self.net,self.video, self.cfg)
27 | self.ba_freq = self.cfg['tracking']['backend']['ba_freq']
28 |
29 | self.printer:Printer = slam.printer
30 |
31 | def run(self, stream:BaseDataset):
32 | '''
33 | Trigger the tracking process.
34 |         1. check whether there is enough motion between the current frame and the last keyframe using motion_filter
35 |         2. use the frontend for local bundle adjustment to estimate camera pose and depth image,
36 |            and delete the current keyframe if it is too close to the previous keyframe after local BA.
37 |         3. run online global BA periodically using the backend
38 |         4. send the estimated pose and depth to the mapper,
39 |            and wait until the mapper finishes its current mapping optimization.
40 | '''
41 | prev_kf_idx = 0
42 | curr_kf_idx = 0
43 | prev_ba_idx = 0
44 |
45 | intrinsic = stream.get_intrinsic()
46 | # for (timestamp, image, _, _) in tqdm(stream):
47 | for i in range(len(stream)):
48 | timestamp, image, _, _ = stream[i]
49 | with torch.no_grad():
50 | starting_count = self.video.counter.value
51 | ### check there is enough motion
52 | force_to_add_keyframe = self.motion_filter.track(timestamp, image, intrinsic)
53 |
54 | # local bundle adjustment
55 | self.frontend(force_to_add_keyframe)
56 |
57 | if (starting_count < self.video.counter.value) and self.cfg['mapping']['full_resolution']:
58 | if self.motion_filter.uncertainty_aware:
59 | img_full = stream.get_color_full_resol(i)
60 | self.motion_filter.get_img_feature(timestamp,img_full,suffix='full')
61 | curr_kf_idx = self.video.counter.value - 1
62 |
63 | if curr_kf_idx != prev_kf_idx and self.frontend.is_initialized:
64 | if self.video.counter.value == self.frontend.warmup:
65 |                     ## We have just finished the initialization
66 | self.pipe.send({"is_keyframe":True, "video_idx":curr_kf_idx,
67 | "timestamp":timestamp, "just_initialized": True,
68 | "end":False})
69 | self.pipe.recv()
70 | self.frontend.initialize_second_stage()
71 | else:
72 | if self.enable_online_ba and curr_kf_idx >= prev_ba_idx + self.ba_freq:
73 | # run online global BA every {self.ba_freq} keyframes
74 | self.printer.print(f"Online BA at {curr_kf_idx}th keyframe, frame index: {timestamp}",FontColor.TRACKER)
75 | self.online_ba.dense_ba(2)
76 | prev_ba_idx = curr_kf_idx
77 | # inform the mapper that the estimation of current pose and depth is finished
78 | self.pipe.send({"is_keyframe":True, "video_idx":curr_kf_idx,
79 | "timestamp":timestamp, "just_initialized": False,
80 | "end":False})
81 | self.pipe.recv()
82 |
83 | prev_kf_idx = curr_kf_idx
84 | self.printer.update_pbar()
85 |
86 | self.pipe.send({"is_keyframe":True, "video_idx":None,
87 | "timestamp":None, "just_initialized": False,
88 | "end":True})
89 |
90 |
91 |
--------------------------------------------------------------------------------
/src/trajectory_filler.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import lietorch
3 | from lietorch import SE3
4 | from src.factor_graph import FactorGraph
5 | from tqdm import tqdm
6 | from src.utils.datasets import BaseDataset
7 | from src.utils.Printer import FontColor
8 | from src.utils.mono_priors.img_feature_extractors import predict_img_features, get_feature_extractor
9 |
10 | class PoseTrajectoryFiller:
11 |     """ This class is used to fill in non-keyframe poses,
12 |         mainly inherited from DROID-SLAM.
13 |     """
14 | def __init__(self, cfg, net, video, printer, device='cuda:0'):
15 | self.cfg = cfg
16 |
17 | # split net modules
18 | self.cnet = net.cnet
19 | self.fnet = net.fnet
20 | self.update = net.update
21 |
22 | self.count = 0
23 | self.video = video
24 | self.device = device
25 | self.printer = printer
26 |
27 | # mean, std for image normalization
28 | self.MEAN = torch.tensor([0.485, 0.456, 0.406], device=device)[:, None, None]
29 | self.STDV = torch.tensor([0.229, 0.224, 0.225], device=device)[:, None, None]
30 |
31 | self.uncertainty_aware = cfg['tracking']["uncertainty_params"]['activate']
32 |
33 | def setup_feature_extractor(self):
34 | if self.uncertainty_aware:
35 | self.feat_extractor = get_feature_extractor(self.cfg)
36 |
37 | @torch.amp.autocast('cuda',enabled=True)
38 | def __feature_encoder(self, image):
39 | """ features for correlation volume """
40 | return self.fnet(image)
41 |
42 | def __fill(self, timestamps, images, depths, intrinsics, dino_features):
43 | """ fill operator """
44 | tt = torch.tensor(timestamps, device=self.device)
45 | images = torch.stack(images, dim=0)
46 | if depths is not None:
47 | depths = torch.stack(depths, dim=0)
48 | intrinsics = torch.stack(intrinsics, 0)
49 | if dino_features is not None:
50 | dino_features = torch.stack(dino_features, dim=0).to(self.device)
51 | inputs = images.to(self.device)
52 |
53 | ### linear pose interpolation ###
54 | N = self.video.counter.value
55 | M = len(timestamps)
56 |
57 | ts = self.video.timestamp[:N]
58 | Ps = SE3(self.video.poses[:N])
59 |
60 |         # find the location of each query timestamp in the keyframe queue
61 | t0 = torch.tensor([ts[ts<=t].shape[0] - 1 for t in timestamps])
62 | t1 = torch.where(t0 < N-1, t0+1, t0)
63 |
64 | # time interval between nearby keyframes
65 | dt = ts[t1] - ts[t0] + 1e-3
66 | dP = Ps[t1] * Ps[t0].inv()
67 |
68 | v = dP.log() / dt.unsqueeze(dim=-1)
69 | w = v * (tt - ts[t0]).unsqueeze(dim=-1)
70 | Gs = SE3.exp(w) * Ps[t0]
71 |
72 | # extract features (no need for context features)
73 | inputs = inputs.sub_(self.MEAN).div_(self.STDV)
74 | fmap = self.__feature_encoder(inputs)
75 |
76 |         # temporarily put the non-keyframes at the end of the keyframe queue
77 | self.video.counter.value += M
78 | self.video[N:N+M] = (tt, images[:, 0], Gs.data, 1, depths, intrinsics / 8.0, fmap, None, None, dino_features)
79 |
80 | if self.uncertainty_aware:
81 | self.video.update_uncertainty_mask_given_index(range(N,N+M))
82 |
83 | graph = FactorGraph(self.video, self.update)
84 | # build edge between current frame and nearby keyframes for optimization
85 | graph.add_factors(t0.cuda(), torch.arange(N, N+M).cuda())
86 | graph.add_factors(t1.cuda(), torch.arange(N, N+M).cuda())
87 |
88 | for _ in range(12):
89 | graph.update(N, N+M, motion_only=True)
90 |
91 | Gs = SE3(self.video.poses[N:N+M].clone())
92 | self.video.counter.value -= M
93 |
94 | return [Gs]
95 |
96 | @torch.no_grad()
97 | def __call__(self, image_stream:BaseDataset):
98 | """ fill in poses of non-keyframe images. """
99 |
100 | # store all camera poses
101 | pose_list = []
102 | dino_feats = None
103 | if self.uncertainty_aware:
104 | dino_feats = []
105 |
106 | timestamps = []
107 | images = []
108 | intrinsics = []
109 | dino_features = []
110 |
111 | self.printer.print("Filling full trajectory ...",FontColor.INFO)
112 | intrinsic = image_stream.get_intrinsic()
113 | for (timestamp, image, _ , _) in tqdm(image_stream):
114 | timestamps.append(timestamp)
115 | images.append(image)
116 | intrinsics.append(intrinsic)
117 | if self.uncertainty_aware:
118 | dino_feature = predict_img_features(self.feat_extractor,
119 | timestamp,image,
120 | self.cfg,
121 | self.device,
122 | save_feat=False)
123 | dino_features.append(dino_feature)
124 | else:
125 | dino_features = None
126 |
127 | if len(timestamps) == 16:
128 | pose_list += self.__fill(timestamps, images, None, intrinsics, dino_features)
129 | if dino_features is not None:
130 | dino_feats += dino_features
131 | timestamps, images, intrinsics, dino_features = [], [], [], []
132 |
133 | if len(timestamps) > 0:
134 | pose_list += self.__fill(timestamps, images, None, intrinsics, dino_features)
135 | if dino_features is not None:
136 | dino_feats += dino_features
137 |
138 | # stitch pose segments together
139 | return lietorch.cat(pose_list, dim=0), dino_feats
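
For reference, the linear pose interpolation in `__fill` initializes each non-keyframe pose from its two bracketing keyframes before the motion-only factor-graph refinement; a sketch of that formula (keyframes (t_0, P_0) and (t_1, P_1), query time t):

```latex
\[
\Delta P = P_{1} P_{0}^{-1}, \qquad
v = \frac{\log(\Delta P)}{t_{1} - t_{0}}, \qquad
G(t) = \exp\bigl((t - t_{0})\, v\bigr)\, P_{0}
\]
```

The interpolated poses are then refined by the 12 `motion_only=True` graph updates above.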
--------------------------------------------------------------------------------
/src/utils/Printer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The Splat-SLAM Authors.
2 | # Licensed under the Apache License, Version 2.0
3 | # available at: https://github.com/google-research/Splat-SLAM/blob/main/LICENSE
4 |
5 | from colorama import Fore, Style
6 | import torch.multiprocessing as mp
7 |
8 |
9 | class FontColor(object):
10 | MAPPER=Fore.CYAN
11 | TRACKER=Fore.BLUE
12 | INFO=Fore.YELLOW
13 | ERROR=Fore.RED
14 | PCL=Fore.GREEN
15 | EVAL=Fore.MAGENTA
16 | MESH="yellow"
17 |
18 |
19 | def get_msg_prefix(color):
20 | if color == FontColor.MAPPER:
21 | msg_prefix = color + "[MAPPER] " + Style.RESET_ALL
22 | elif color == FontColor.TRACKER:
23 | msg_prefix = color + "[TRACKER] " + Style.RESET_ALL
24 | elif color == FontColor.INFO:
25 | msg_prefix = color + "[INFO] " + Style.RESET_ALL
26 | elif color == FontColor.ERROR:
27 | msg_prefix = color + "[ERROR] " + Style.RESET_ALL
28 | elif color == FontColor.PCL:
29 | msg_prefix = color + "[POINTCLOUD] " + Style.RESET_ALL
30 | elif color == FontColor.EVAL:
31 | msg_prefix = color + "[EVALUATION] " + Style.RESET_ALL
32 | elif color == FontColor.MESH:
33 | msg_prefix = FontColor.INFO + "[MESH] " + Style.RESET_ALL
34 | else:
35 | msg_prefix = Style.RESET_ALL
36 | return msg_prefix
37 |
38 | class TrivialPrinter(object):
39 | def print(self,msg:str,color=None):
40 | msg_prefix = get_msg_prefix(color)
41 | msg = msg_prefix + msg + Style.RESET_ALL
42 | print(msg)
43 |
44 | class Printer(TrivialPrinter):
45 | def __init__(self, total_img_num):
46 | self.msg_lock = mp.Lock()
47 | self.msg_queue = mp.Queue()
48 | self.progress_counter = mp.Value('i', 0)
49 | process = mp.Process(target=self.printer_process, args=(total_img_num,))
50 | process.start()
51 | def print(self,msg:str,color=None):
52 | msg_prefix = get_msg_prefix(color)
53 | msg = msg_prefix + msg + Style.RESET_ALL
54 | with self.msg_lock:
55 | self.msg_queue.put(msg)
56 | def update_pbar(self):
57 | with self.msg_lock:
58 | self.progress_counter.value += 1
59 | self.msg_queue.put(f"PROGRESS")
60 | def pbar_ready(self):
61 | with self.msg_lock:
62 | self.msg_queue.put(f"READY")
63 |
64 | def printer_process(self,total_img_num):
65 | from tqdm import tqdm
66 | while True:
67 | message = self.msg_queue.get()
68 | if message == "READY":
69 | break
70 | else:
71 | print(message)
72 | with tqdm(total=total_img_num) as pbar:
73 | while self.progress_counter.value < total_img_num:
74 | message = self.msg_queue.get()
75 | if message == "DONE":
76 | break
77 | elif message.startswith("PROGRESS"):
78 | with self.msg_lock:
79 | completed = self.progress_counter.value
80 | pbar.set_description(FontColor.TRACKER+f"[TRACKER] "+Style.RESET_ALL)
81 | pbar.n = completed
82 | pbar.refresh()
83 | else:
84 | pbar.write(message)
85 | while True:
86 | message = self.msg_queue.get()
87 | if message == "DONE":
88 | break
89 | else:
90 | print(message)
91 |
92 |
93 | def terminate(self):
94 | self.msg_queue.put("DONE")
95 |
96 |
97 |
--------------------------------------------------------------------------------
/src/utils/common.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The Splat-SLAM Authors.
2 | # Licensed under the Apache License, Version 2.0
3 | # available at: https://github.com/google-research/Splat-SLAM/blob/main/LICENSE
4 |
5 | import numpy as np
6 | import random
7 | import torch
8 |
9 |
10 | def setup_seed(seed):
11 | torch.manual_seed(seed)
12 | torch.cuda.manual_seed_all(seed)
13 | np.random.seed(seed)
14 | random.seed(seed)
15 | torch.backends.cudnn.deterministic = True
16 | torch.backends.cudnn.benchmark = False
17 |
18 |
19 | def as_intrinsics_matrix(intrinsics):
20 | """
21 | Get matrix representation of intrinsics.
22 |
23 | """
24 | K = torch.eye(3)
25 | K[0, 0] = intrinsics[0]
26 | K[1, 1] = intrinsics[1]
27 | K[0, 2] = intrinsics[2]
28 | K[1, 2] = intrinsics[3]
29 | return K
30 |
31 |
32 | def update_cam(cfg):
33 | """
34 | Update the camera intrinsics according to the pre-processing config,
35 | such as resize or edge crop
36 | """
37 |     # resize the input images to crop_size (variable name used in lietorch)
38 | H, W = cfg['cam']['H'], cfg['cam']['W']
39 | fx, fy = cfg['cam']['fx'], cfg['cam']['fy']
40 | cx, cy = cfg['cam']['cx'], cfg['cam']['cy']
41 |
42 | h_edge, w_edge = cfg['cam']['H_edge'], cfg['cam']['W_edge']
43 | H_out, W_out = cfg['cam']['H_out'], cfg['cam']['W_out']
44 |
45 | fx = fx * (W_out + w_edge * 2) / W
46 | fy = fy * (H_out + h_edge * 2) / H
47 | cx = cx * (W_out + w_edge * 2) / W
48 | cy = cy * (H_out + h_edge * 2) / H
49 | H, W = H_out, W_out
50 |
51 | cx = cx - w_edge
52 | cy = cy - h_edge
53 | return H,W,fx,fy,cx,cy
54 |
55 |
56 | @torch.no_grad()
57 | def align_scale_and_shift(prediction, target, weights):
58 |
59 | '''
60 | weighted least squares problem to solve scale and shift:
61 | min sum{
62 | weight[i,j] *
63 | (prediction[i,j] * scale + shift - target[i,j])^2
64 | }
65 |
66 | prediction: [B,H,W]
67 | target: [B,H,W]
68 | weights: [B,H,W]
69 | '''
70 |
71 | if weights is None:
72 | weights = torch.ones_like(prediction).to(prediction.device)
73 | if len(prediction.shape)<3:
74 | prediction = prediction.unsqueeze(0)
75 | target = target.unsqueeze(0)
76 | weights = weights.unsqueeze(0)
77 | a_00 = torch.sum(weights * prediction * prediction, dim=[1,2])
78 | a_01 = torch.sum(weights * prediction, dim=[1,2])
79 | a_11 = torch.sum(weights, dim=[1,2])
80 | # right hand side: b = [b_0, b_1]
81 | b_0 = torch.sum(weights * prediction * target, dim=[1,2])
82 | b_1 = torch.sum(weights * target, dim=[1,2])
83 | # solution: x = A^-1 . b = [[a_11, -a_01], [-a_10, a_00]] / (a_00 * a_11 - a_01 * a_10) . b
84 | det = a_00 * a_11 - a_01 * a_01
85 | scale = (a_11 * b_0 - a_01 * b_1) / det
86 | shift = (-a_01 * b_0 + a_00 * b_1) / det
87 | error = (scale[:,None,None]*prediction+shift[:,None,None]-target).abs()
88 | masked_error = error*weights
89 | error_sum = masked_error.sum(dim=[1,2])
90 | error_num = weights.sum(dim=[1,2])
91 | avg_error = error_sum/error_num
92 |
93 | return scale,shift,avg_error
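
The closed-form solution computed by `align_scale_and_shift`, written out (p = prediction, t = target, w = weights, sums taken over all pixels of a batch element):

```latex
\[
\min_{s,\,o} \sum_{ij} w_{ij}\,\bigl(s\, p_{ij} + o - t_{ij}\bigr)^{2}
\;\Longrightarrow\;
\begin{bmatrix} \sum w p^{2} & \sum w p \\ \sum w p & \sum w \end{bmatrix}
\begin{bmatrix} s \\ o \end{bmatrix}
=
\begin{bmatrix} \sum w p\, t \\ \sum w t \end{bmatrix}
\]
\[
s = \frac{a_{11} b_{0} - a_{01} b_{1}}{\det}, \qquad
o = \frac{a_{00} b_{1} - a_{01} b_{0}}{\det}, \qquad
\det = a_{00}\, a_{11} - a_{01}^{2}
\]
```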
--------------------------------------------------------------------------------
/src/utils/dyn_uncertainty/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/src/utils/dyn_uncertainty/__init__.py
--------------------------------------------------------------------------------
/src/utils/dyn_uncertainty/median_filter.py:
--------------------------------------------------------------------------------
1 | # Based on https://gist.github.com/rwightman/f2d3849281624be7c0f11c85c87c1598
2 | import math
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.nn.modules.utils import _pair, _quadruple
7 |
8 |
9 | class MedianPool2d(nn.Module):
10 | """ Median pool module.
11 |
12 | This is used to smooth the thin line in ssim loss.
13 |
14 | Args:
15 | kernel_size: size of pooling kernel, int or 2-tuple
16 | stride: pool stride, int or 2-tuple
17 | padding: pool padding, int or 4-tuple (l, r, t, b) as in pytorch F.pad
18 | same: override padding and enforce same padding, boolean
19 | """
20 | def __init__(self, kernel_size=3, stride=1, padding=0, same=False):
21 | super(MedianPool2d, self).__init__()
22 | self.k = _pair(kernel_size)
23 | self.stride = _pair(stride)
24 | self.padding = _quadruple(padding) # convert to l, r, t, b
25 | self.same = same
26 |
27 | def _padding(self, x):
28 | if self.same:
29 | ih, iw = x.size()[2:]
30 | if ih % self.stride[0] == 0:
31 | ph = max(self.k[0] - self.stride[0], 0)
32 | else:
33 | ph = max(self.k[0] - (ih % self.stride[0]), 0)
34 | if iw % self.stride[1] == 0:
35 | pw = max(self.k[1] - self.stride[1], 0)
36 | else:
37 | pw = max(self.k[1] - (iw % self.stride[1]), 0)
38 | pl = pw // 2
39 | pr = pw - pl
40 | pt = ph // 2
41 | pb = ph - pt
42 | padding = (pl, pr, pt, pb)
43 | else:
44 | padding = self.padding
45 | return padding
46 |
47 | def forward(self, x):
48 | # using existing pytorch functions and tensor ops so that we get autograd,
49 | # would likely be more efficient to implement from scratch at C/Cuda level
50 | x = F.pad(x, self._padding(x), mode='reflect')
51 | x = x.unfold(2, self.k[0], self.stride[0]).unfold(3, self.k[1], self.stride[1])
52 | x = x.contiguous().view(x.size()[:4] + (-1,)).median(dim=-1)[0]
53 | return x
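
A small usage sketch: with `same=True` the output keeps the input resolution, and isolated spikes (such as thin SSIM-loss artifacts) are removed.

```python
import torch
from src.utils.dyn_uncertainty.median_filter import MedianPool2d

pool = MedianPool2d(kernel_size=3, stride=1, same=True)
x = torch.zeros(1, 1, 5, 5)
x[0, 0, 2, 2] = 10.0                  # isolated spike
y = pool(x)
print(y.shape, y[0, 0, 2, 2].item())  # torch.Size([1, 1, 5, 5]) 0.0
```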
--------------------------------------------------------------------------------
/src/utils/dyn_uncertainty/uncertainty_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class MLPNetwork(nn.Module):
6 | def __init__(self, input_dim: int = 384, hidden_dim: int = 64, output_dim: int = 1,
7 | net_depth: int = 2, net_activation=F.relu, weight_init: str = 'he_uniform'):
8 | super(MLPNetwork, self).__init__()
9 |
10 | self.output_layer_input_dim = hidden_dim
11 |
12 | # Initialize MLP layers
13 | self.layers = nn.ModuleList()
14 | for i in range(net_depth):
15 | dense_layer = nn.Linear(input_dim if i == 0 else hidden_dim, hidden_dim)
16 |
17 | # Apply weight initialization
18 | if weight_init == 'he_uniform':
19 | nn.init.kaiming_uniform_(dense_layer.weight, nonlinearity='relu')
20 | elif weight_init == 'xavier_uniform':
21 | nn.init.xavier_uniform_(dense_layer.weight)
22 | else:
23 | raise NotImplementedError(f"Unknown Weight initialization method {weight_init}")
24 |
25 | self.layers.append(dense_layer)
26 |
27 | # Initialize output layer
28 | self.output_layer = nn.Linear(self.output_layer_input_dim, output_dim)
29 | nn.init.kaiming_uniform_(self.output_layer.weight, nonlinearity='relu')
30 |
31 | # Set activation function
32 | self.net_activation = net_activation
33 | self.softplus = nn.Softplus()
34 |
35 | def forward(self, x: torch.Tensor) -> torch.Tensor:
36 | # Get input dimensions
37 | H, W, C = x.shape[-3:]
38 | input_with_batch_dim = True
39 |
40 | # Add batch dimension if not present
41 | if len(x.shape) == 3:
42 | input_with_batch_dim = False
43 | x = x.unsqueeze(0)
44 | batch_size = 1
45 | else:
46 | batch_size = x.shape[0]
47 |
48 | # Flatten input for MLP
49 | x = x.view(-1, x.size()[-1])
50 |
51 | # Pass through MLP layers
52 | for layer in self.layers:
53 | x = layer(x)
54 | x = self.net_activation(x)
55 | x = F.dropout(x, p=0.2)
56 |
57 | # Pass through output layer and apply softplus activation
58 | x = self.output_layer(x)
59 | x = self.softplus(x)
60 |
61 | # Reshape output to original dimensions
62 | if input_with_batch_dim:
63 | x = x.view(batch_size, H, W)
64 | else:
65 | x = x.view(H, W)
66 |
67 | return x
68 |
69 | def generate_uncertainty_mlp(n_features: int) -> MLPNetwork:
70 | # Create and return an MLP network with the specified input dimensions
71 | network = MLPNetwork(input_dim=n_features).cuda()
72 | return network
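
A shape sketch of the uncertainty head (the 384 feature size matches the default `input_dim`; the spatial size is arbitrary): per-pixel features in, a positive per-pixel uncertainty map out via the Softplus head.

```python
import torch
from src.utils.dyn_uncertainty.uncertainty_model import MLPNetwork

net = MLPNetwork(input_dim=384)     # generate_uncertainty_mlp additionally moves this to CUDA
feats = torch.randn(60, 80, 384)    # [H, W, C] per-pixel features
uncertainty = net(feats)
print(uncertainty.shape, bool((uncertainty >= 0).all()))   # torch.Size([60, 80]) True
```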
--------------------------------------------------------------------------------
/src/utils/eval_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/src/utils/eval_utils.py
--------------------------------------------------------------------------------
/src/utils/mono_priors/metric_depth_estimators.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from torchvision import transforms
5 | import torchvision.transforms.functional as TF
6 | from typing import Dict, Tuple, Union
7 |
8 | from thirdparty.depth_anything_v2.metric_depth.depth_anything_v2.dpt import (
9 | DepthAnythingV2,
10 | )
11 |
12 |
13 | def get_metric_depth_estimator(cfg: Dict) -> torch.nn.Module:
14 | """
15 | Get the metric depth estimator model based on the configuration.
16 |
17 | Args:
18 | cfg (Dict): Configuration dictionary.
19 |
20 | Returns:
21 | torch.nn.Module: The metric depth estimator model.
22 | """
23 | device = cfg["device"]
24 | depth_model = cfg["mono_prior"]["depth"]
25 |
26 | if "metric3d_vit" in depth_model:
27 | # Options: metric3d_vit_small, metric3d_vit_large, metric3d_vit_giant2
28 | model = torch.hub.load("yvanyin/metric3d", depth_model, pretrain=True)
29 | elif "dpt2" in depth_model:
30 | model = _create_dpt2_model(depth_model)
31 | else:
32 |         # To use another metric depth estimator as prior, add the code here
33 | raise NotImplementedError("Unsupported depth model")
34 | return model.to(device).eval()
35 |
36 |
37 | def _create_dpt2_model(depth_model: str) -> DepthAnythingV2:
38 | """
39 | Create a DPT2 model based on the depth model string.
40 |
41 | Args:
42 | depth_model (str): Depth model configuration string.
43 |
44 | Returns:
45 | DepthAnythingV2: Configured DPT2 model.
46 | """
47 | model_configs = {
48 | "vits": {"encoder": "vits", "features": 64, "out_channels": [48, 96, 192, 384]},
49 | "vitb": {
50 | "encoder": "vitb",
51 | "features": 128,
52 | "out_channels": [96, 192, 384, 768],
53 | },
54 | "vitl": {
55 | "encoder": "vitl",
56 | "features": 256,
57 | "out_channels": [256, 512, 1024, 1024],
58 | },
59 | }
60 |
61 | encoder, dataset, max_depth = depth_model.split("_")[1:4]
62 | config = {**model_configs[encoder], "max_depth": int(max_depth)}
63 | model = DepthAnythingV2(**config)
64 |
65 | weights_path = f"pretrained/depth_anything_v2_metric_{dataset}_{encoder}.pth"
66 | model.load_state_dict(
67 | torch.load(weights_path, map_location="cpu", weights_only=True)
68 | )
69 |
70 | return model
71 |
72 |
73 | @torch.no_grad()
74 | def predict_metric_depth(
75 | model: torch.nn.Module,
76 | idx: int,
77 | input_tensor: torch.Tensor,
78 | cfg: Dict,
79 | device: str,
80 | save_depth: bool = True,
81 | ) -> torch.Tensor:
82 | """
83 | Predict metric depth using the given model.
84 |
85 | Args:
86 | model (torch.nn.Module): The depth estimation model.
87 | idx (int): Image index.
88 | input_tensor (torch.Tensor): Input image tensor of shape (1, 3, H, W).
89 | cfg (Dict): Configuration dictionary.
90 | device (str): Device to run the model on.
91 | save_depth (bool): Whether to save the depth map.
92 |
93 | Returns:
94 | torch.Tensor: Predicted depth map.
95 | """
96 | depth_model = cfg["mono_prior"]["depth"]
97 | if "metric3d_vit" in depth_model:
98 | output = _predict_metric3d_depth(model, input_tensor, cfg, device)
99 | elif "dpt2" in depth_model:
100 | # dpt2 model takes np.uint8 as the dtype of input
101 |         input_numpy = (255.0 * input_tensor.squeeze().permute(1, 2, 0).cpu().numpy()).astype(
102 | np.uint8
103 | )
104 | depth = model.infer_image(input_numpy, input_size=518)
105 | output = torch.tensor(depth).to(device)
106 | else:
107 |         # To use another metric depth estimator as prior, add the code here
108 | raise NotImplementedError("Unsupported depth model")
109 |
110 | if save_depth:
111 | _save_depth_map(output, cfg, idx)
112 |
113 | return output
114 |
115 |
116 | def _predict_metric3d_depth(
117 | model: torch.nn.Module, input_tensor: torch.Tensor, cfg: Dict, device: str
118 | ) -> torch.Tensor:
119 | # Refer from: https://github.com/YvanYin/Metric3D/blob/34afafe58d9543f13c01b65222255dab53333838/hubconf.py#L181
120 | image_size = (616, 1064)
121 | h, w = input_tensor.shape[-2:]
122 | scale = min(image_size[0] / h, image_size[1] / w)
123 |
124 | trans_totensor = transforms.Compose(
125 | [
126 | transforms.Resize((int(h * scale), int(w * scale))),
127 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
128 | ]
129 | )
130 | img_tensor = trans_totensor(input_tensor).to(device)
131 |
132 | pad_h, pad_w = image_size[0] - int(h * scale), image_size[1] - int(w * scale)
133 | pad_h_half, pad_w_half = pad_h // 2, pad_w // 2
134 | img_tensor = TF.pad(
135 | img_tensor,
136 | (pad_w_half, pad_h_half, pad_w - pad_w_half, pad_h - pad_h_half),
137 | padding_mode="constant",
138 | fill=0.0,
139 | )
140 |
141 | pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
142 | pred_depth, _, _ = model.inference({"input": img_tensor})
143 | pred_depth = pred_depth.squeeze()
144 | pred_depth = pred_depth[
145 | pad_info[0] : pred_depth.shape[0] - pad_info[1],
146 | pad_info[2] : pred_depth.shape[1] - pad_info[3],
147 | ]
148 | pred_depth = F.interpolate(
149 | pred_depth[None, None, :, :], (h, w), mode="bicubic"
150 | ).squeeze()
151 |
152 | canonical_to_real_scale = cfg["cam"]["fx"] / 1000.0
153 | pred_depth = pred_depth * canonical_to_real_scale
154 | return torch.clamp(pred_depth, 0, 300)
155 |
156 |
157 | def _save_depth_map(depth_map: torch.Tensor, cfg: Dict, idx: int) -> None:
158 | output_dir = f"{cfg['data']['output']}/{cfg['scene']}"
159 | output_path = f"{output_dir}/mono_priors/depths/{idx:05d}.npy"
160 | final_depth = depth_map.detach().cpu().float().numpy()
161 | np.save(output_path, final_depth)
162 |
--------------------------------------------------------------------------------
/src/utils/plot_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from PIL import Image
3 | import re
4 |
5 |
6 | def create_gif_from_directory(directory_path, output_filename, duration=100, online=True):
7 | """
8 | Creates a GIF from all PNG images in a given directory.
9 |
10 | :param directory_path: Path to the directory containing PNG images.
11 | :param output_filename: Output filename for the GIF.
12 | :param duration: Duration of each frame in the GIF (in milliseconds).
13 | """
14 | # Function to extract the number from the filename
15 | def extract_number(filename):
16 | # Pattern to find a number followed by '.png'
17 | match = re.search(r'(\d+)\.png$', filename)
18 | if match:
19 | return int(match.group(1))
20 | else:
21 | return None
22 |
23 |
24 | if online:
25 | # Get all PNG files in the directory
26 | image_files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.png')]
27 |
28 | # Sort the files based on the number in the filename
29 | image_files.sort(key=extract_number)
30 | else:
31 | # Get all PNG files in the directory
32 | image_files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.png')]
33 |
34 | # Sort the files based on the number in the filename
35 | image_files.sort()
36 |
37 | # Load images
38 | images = [Image.open(file) for file in image_files]
39 |
40 | # Convert images to the same mode and size for consistency
41 | images = [img.convert('RGBA') for img in images]
42 | base_size = images[0].size
43 | resized_images = [img.resize(base_size, Image.LANCZOS) for img in images]
44 |
45 | # Save as GIF
46 | resized_images[0].save(output_filename, save_all=True, append_images=resized_images[1:], optimize=False, duration=duration, loop=0)
--------------------------------------------------------------------------------
/src/utils/pose_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The MonoGS Authors.
2 |
3 | # Licensed under the License issued by the MonoGS Authors
4 | # available here: https://github.com/muskie82/MonoGS/blob/main/LICENSE.md
5 |
6 | import numpy as np
7 | import torch
8 |
9 |
10 | def rt2mato(R, T): # TODO: remove?
11 | mat = np.eye(4)
12 | mat[0:3, 0:3] = R
13 | mat[0:3, 3] = T
14 | return mat
15 |
16 |
17 | def skew_sym_mat(x):
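    """Return the 3x3 skew-symmetric (hat) matrix of the 3-vector x."""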
18 | device = x.device
19 | dtype = x.dtype
20 | ssm = torch.zeros(3, 3, device=device, dtype=dtype)
21 | ssm[0, 1] = -x[2]
22 | ssm[0, 2] = x[1]
23 | ssm[1, 0] = x[2]
24 | ssm[1, 2] = -x[0]
25 | ssm[2, 0] = -x[1]
26 | ssm[2, 1] = x[0]
27 | return ssm
28 |
29 |
30 | def SO3_exp(theta):
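    """so(3) exponential map via Rodrigues' formula, with a second-order Taylor expansion for small angles."""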
31 | device = theta.device
32 | dtype = theta.dtype
33 |
34 | W = skew_sym_mat(theta)
35 | W2 = W @ W
36 | angle = torch.norm(theta)
37 | I = torch.eye(3, device=device, dtype=dtype)
38 | if angle < 1e-5:
39 | return I + W + 0.5 * W2
40 | else:
41 | return (
42 | I
43 | + (torch.sin(angle) / angle) * W
44 | + ((1 - torch.cos(angle)) / (angle**2)) * W2
45 | )
46 |
47 |
48 | def V(theta):
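    """Left Jacobian of SO(3); maps the translational part of an se(3) increment (see SE3_exp)."""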
49 | dtype = theta.dtype
50 | device = theta.device
51 | I = torch.eye(3, device=device, dtype=dtype)
52 | W = skew_sym_mat(theta)
53 | W2 = W @ W
54 | angle = torch.norm(theta)
55 | if angle < 1e-5:
56 | V = I + 0.5 * W + (1.0 / 6.0) * W2
57 | else:
58 | V = (
59 | I
60 | + W * ((1.0 - torch.cos(angle)) / (angle**2))
61 | + W2 * ((angle - torch.sin(angle)) / (angle**3))
62 | )
63 | return V
64 |
65 |
66 | def SE3_exp(tau):
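    """se(3) exponential map: tau = [rho, theta] -> 4x4 transform with R = SO3_exp(theta) and t = V(theta) @ rho."""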
67 | dtype = tau.dtype
68 | device = tau.device
69 |
70 | rho = tau[:3]
71 | theta = tau[3:]
72 | R = SO3_exp(theta)
73 | t = V(theta) @ rho
74 |
75 | T = torch.eye(4, device=device, dtype=dtype)
76 | T[:3, :3] = R
77 | T[:3, 3] = t
78 | return T
79 |
80 |
81 | def update_pose(camera, converged_threshold=1e-4):
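    """Left-multiply the current world-to-camera pose by SE3_exp of the optimized delta,
    write the result back to the camera, reset the deltas, and report convergence."""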
82 | tau = torch.cat([camera.cam_trans_delta, camera.cam_rot_delta], axis=0)
83 |
84 | T_w2c = torch.eye(4, device=tau.device)
85 | T_w2c[0:3, 0:3] = camera.R
86 | T_w2c[0:3, 3] = camera.T
87 |
88 | new_w2c = SE3_exp(tau) @ T_w2c
89 |
90 | new_R = new_w2c[0:3, 0:3]
91 | new_T = new_w2c[0:3, 3]
92 |
93 | converged = tau.norm() < converged_threshold
94 | camera.update_RT(new_R, new_T)
95 |
96 | camera.cam_rot_delta.data.fill_(0)
97 | camera.cam_trans_delta.data.fill_(0)
98 | return converged
99 |
--------------------------------------------------------------------------------
/thirdparty/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/__init__.py
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/DA-2K.md:
--------------------------------------------------------------------------------
1 | # DA-2K Evaluation Benchmark
2 |
3 | ## Introduction
4 |
5 | ![DA-2K](assets/DA-2K.png)
6 |
7 | DA-2K is proposed in [Depth Anything V2](https://depth-anything-v2.github.io) to evaluate the relative depth estimation capability. It encompasses eight representative scenarios of `indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`. It consists of 1K diverse high-quality images and 2K precise pair-wise relative depth annotations.
8 |
9 | Please refer to our [paper](https://arxiv.org/abs/2406.09414) for details on the construction of this benchmark.
10 |
11 |
12 | ## Usage
13 |
14 | Please first [download the benchmark](https://huggingface.co/datasets/depth-anything/DA-2K/tree/main).
15 |
16 | All annotations are stored in `annotations.json`. The annotation file is a JSON object where each key is the path to an image file, and the value is a list of annotations associated with that image. Each annotation describes two points and identifies which point is closer to the camera. The structure is detailed below:
17 |
18 | ```
19 | {
20 | "image_path": [
21 | {
22 | "point1": [h1, w1], # (vertical position, horizontal position)
23 | "point2": [h2, w2], # (vertical position, horizontal position)
24 | "closer_point": "point1" # we always set "point1" as the closer one
25 | },
26 | ...
27 | ],
28 | ...
29 | }
30 | ```
31 |
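For reference, here is a minimal sketch of reading the annotation file in Python (assuming `annotations.json` has been downloaded into the working directory):

```python
import json

with open("annotations.json", "r") as f:
    annotations = json.load(f)

for image_path, pairs in annotations.items():
    for pair in pairs:
        h1, w1 = pair["point1"]  # closer point (row, column)
        h2, w2 = pair["point2"]  # farther point (row, column)
        assert pair["closer_point"] == "point1"
```
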
32 | To visualize the annotations:
33 | ```bash
34 | python visualize.py [--scene-type <type>]
35 | ```
36 |
37 | **Options**
38 | - `--scene-type <type>` (optional): Specify the scene type (`indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`). Skip this argument or set it as `""` to include all scene types.
39 |
40 | ## Citation
41 |
42 | If you find this benchmark useful, please consider citing:
43 |
44 | ```bibtex
45 | @article{depth_anything_v2,
46 | title={Depth Anything V2},
47 | author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
48 | journal={arXiv:2406.09414},
49 | year={2024}
50 | }
51 | ```
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/app.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import gradio as gr
3 | import matplotlib
4 | import numpy as np
5 | from PIL import Image
6 | import torch
7 | import tempfile
8 | from gradio_imageslider import ImageSlider
9 |
10 | from depth_anything_v2.dpt import DepthAnythingV2
11 |
12 | css = """
13 | #img-display-container {
14 | max-height: 100vh;
15 | }
16 | #img-display-input {
17 | max-height: 80vh;
18 | }
19 | #img-display-output {
20 | max-height: 80vh;
21 | }
22 | #download {
23 | height: 62px;
24 | }
25 | """
26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
27 | model_configs = {
28 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
29 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
30 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
31 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
32 | }
33 | encoder = 'vitl'
34 | model = DepthAnythingV2(**model_configs[encoder])
35 | state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu")
36 | model.load_state_dict(state_dict)
37 | model = model.to(DEVICE).eval()
38 |
39 | title = "# Depth Anything V2"
40 | description = """Official demo for **Depth Anything V2**.
41 | Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), or [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details."""
42 |
43 | def predict_depth(image):
44 | return model.infer_image(image)
45 |
46 | with gr.Blocks(css=css) as demo:
47 | gr.Markdown(title)
48 | gr.Markdown(description)
49 | gr.Markdown("### Depth Prediction demo")
50 |
51 | with gr.Row():
52 | input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
53 | depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5)
54 | submit = gr.Button(value="Compute Depth")
55 | gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download",)
56 | raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download",)
57 |
58 | cmap = matplotlib.colormaps.get_cmap('Spectral_r')
59 |
60 | def on_submit(image):
61 | original_image = image.copy()
62 |
63 | h, w = image.shape[:2]
64 |
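        # gradio supplies RGB; flip the channels to BGR, which infer_image expects (cv2 convention)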
65 | depth = predict_depth(image[:, :, ::-1])
66 |
67 | raw_depth = Image.fromarray(depth.astype('uint16'))
68 | tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
69 | raw_depth.save(tmp_raw_depth.name)
70 |
71 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
72 | depth = depth.astype(np.uint8)
73 | colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)
74 |
75 | gray_depth = Image.fromarray(depth)
76 | tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
77 | gray_depth.save(tmp_gray_depth.name)
78 |
79 | return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name]
80 |
81 | submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file])
82 |
83 | example_files = glob.glob('assets/examples/*')
84 | examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file], fn=on_submit)
85 |
86 |
87 | if __name__ == '__main__':
88 | demo.queue().launch()
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/DA-2K.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/DA-2K.png
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo01.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo02.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo03.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo04.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo05.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo06.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo07.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo08.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo08.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo09.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo09.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo10.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo11.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo11.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo12.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo13.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo13.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo14.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo15.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo15.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo16.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo16.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo17.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo18.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo18.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo19.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo19.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples/demo20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo20.jpg
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples_video/basketball.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples_video/basketball.mp4
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/examples_video/ferris_wheel.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples_video/ferris_wheel.mp4
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/assets/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/teaser.png
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from .mlp import Mlp
8 | from .patch_embed import PatchEmbed
9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10 | from .block import NestedTensorBlock
11 | from .attention import MemEffAttention
12 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/attention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # References:
8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10 |
11 | import logging
12 |
13 | from torch import Tensor
14 | from torch import nn
15 |
16 |
17 | logger = logging.getLogger("dinov2")
18 |
19 |
20 | try:
21 | from xformers.ops import memory_efficient_attention, unbind, fmha
22 |
23 | XFORMERS_AVAILABLE = True
24 | except ImportError:
25 | logger.warning("xFormers not available")
26 | XFORMERS_AVAILABLE = False
27 |
28 |
29 | class Attention(nn.Module):
30 | def __init__(
31 | self,
32 | dim: int,
33 | num_heads: int = 8,
34 | qkv_bias: bool = False,
35 | proj_bias: bool = True,
36 | attn_drop: float = 0.0,
37 | proj_drop: float = 0.0,
38 | ) -> None:
39 | super().__init__()
40 | self.num_heads = num_heads
41 | head_dim = dim // num_heads
42 | self.scale = head_dim**-0.5
43 |
44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45 | self.attn_drop = nn.Dropout(attn_drop)
46 | self.proj = nn.Linear(dim, dim, bias=proj_bias)
47 | self.proj_drop = nn.Dropout(proj_drop)
48 |
49 | def forward(self, x: Tensor) -> Tensor:
50 | B, N, C = x.shape
51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52 |
53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54 | attn = q @ k.transpose(-2, -1)
55 |
56 | attn = attn.softmax(dim=-1)
57 | attn = self.attn_drop(attn)
58 |
59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60 | x = self.proj(x)
61 | x = self.proj_drop(x)
62 | return x
63 |
64 |
65 | class MemEffAttention(Attention):
66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67 | if not XFORMERS_AVAILABLE:
68 | assert attn_bias is None, "xFormers is required for nested tensors usage"
69 | return super().forward(x)
70 |
71 | B, N, C = x.shape
72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73 |
74 | q, k, v = unbind(qkv, 2)
75 |
76 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77 | x = x.reshape([B, N, C])
78 |
79 | x = self.proj(x)
80 | x = self.proj_drop(x)
81 | return x
82 |
83 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/drop_path.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # References:
8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10 |
11 |
12 | from torch import nn
13 |
14 |
15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16 | if drop_prob == 0.0 or not training:
17 | return x
18 | keep_prob = 1 - drop_prob
19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21 | if keep_prob > 0.0:
22 | random_tensor.div_(keep_prob)
23 | output = x * random_tensor
24 | return output
25 |
26 |
27 | class DropPath(nn.Module):
28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29 |
30 | def __init__(self, drop_prob=None):
31 | super(DropPath, self).__init__()
32 | self.drop_prob = drop_prob
33 |
34 | def forward(self, x):
35 | return drop_path(x, self.drop_prob, self.training)
36 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/layer_scale.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
8 |
9 | from typing import Union
10 |
11 | import torch
12 | from torch import Tensor
13 | from torch import nn
14 |
15 |
16 | class LayerScale(nn.Module):
17 | def __init__(
18 | self,
19 | dim: int,
20 | init_values: Union[float, Tensor] = 1e-5,
21 | inplace: bool = False,
22 | ) -> None:
23 | super().__init__()
24 | self.inplace = inplace
25 | self.gamma = nn.Parameter(init_values * torch.ones(dim))
26 |
27 | def forward(self, x: Tensor) -> Tensor:
28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma
29 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/mlp.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # References:
8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10 |
11 |
12 | from typing import Callable, Optional
13 |
14 | from torch import Tensor, nn
15 |
16 |
17 | class Mlp(nn.Module):
18 | def __init__(
19 | self,
20 | in_features: int,
21 | hidden_features: Optional[int] = None,
22 | out_features: Optional[int] = None,
23 | act_layer: Callable[..., nn.Module] = nn.GELU,
24 | drop: float = 0.0,
25 | bias: bool = True,
26 | ) -> None:
27 | super().__init__()
28 | out_features = out_features or in_features
29 | hidden_features = hidden_features or in_features
30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
31 | self.act = act_layer()
32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
33 | self.drop = nn.Dropout(drop)
34 |
35 | def forward(self, x: Tensor) -> Tensor:
36 | x = self.fc1(x)
37 | x = self.act(x)
38 | x = self.drop(x)
39 | x = self.fc2(x)
40 | x = self.drop(x)
41 | return x
42 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/patch_embed.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # References:
8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10 |
11 | from typing import Callable, Optional, Tuple, Union
12 |
13 | from torch import Tensor
14 | import torch.nn as nn
15 |
16 |
17 | def make_2tuple(x):
18 | if isinstance(x, tuple):
19 | assert len(x) == 2
20 | return x
21 |
22 | assert isinstance(x, int)
23 | return (x, x)
24 |
25 |
26 | class PatchEmbed(nn.Module):
27 | """
28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
29 |
30 | Args:
31 | img_size: Image size.
32 | patch_size: Patch token size.
33 | in_chans: Number of input image channels.
34 | embed_dim: Number of linear projection output channels.
35 | norm_layer: Normalization layer.
36 | """
37 |
38 | def __init__(
39 | self,
40 | img_size: Union[int, Tuple[int, int]] = 224,
41 | patch_size: Union[int, Tuple[int, int]] = 16,
42 | in_chans: int = 3,
43 | embed_dim: int = 768,
44 | norm_layer: Optional[Callable] = None,
45 | flatten_embedding: bool = True,
46 | ) -> None:
47 | super().__init__()
48 |
49 | image_HW = make_2tuple(img_size)
50 | patch_HW = make_2tuple(patch_size)
51 | patch_grid_size = (
52 | image_HW[0] // patch_HW[0],
53 | image_HW[1] // patch_HW[1],
54 | )
55 |
56 | self.img_size = image_HW
57 | self.patch_size = patch_HW
58 | self.patches_resolution = patch_grid_size
59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1]
60 |
61 | self.in_chans = in_chans
62 | self.embed_dim = embed_dim
63 |
64 | self.flatten_embedding = flatten_embedding
65 |
66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
68 |
69 | def forward(self, x: Tensor) -> Tensor:
70 | _, _, H, W = x.shape
71 | patch_H, patch_W = self.patch_size
72 |
73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
75 |
76 | x = self.proj(x) # B C H W
77 | H, W = x.size(2), x.size(3)
78 | x = x.flatten(2).transpose(1, 2) # B HW C
79 | x = self.norm(x)
80 | if not self.flatten_embedding:
81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C
82 | return x
83 |
84 | def flops(self) -> float:
85 | Ho, Wo = self.patches_resolution
86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
87 | if self.norm is not None:
88 | flops += Ho * Wo * self.embed_dim
89 | return flops
90 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/swiglu_ffn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from typing import Callable, Optional
8 |
9 | from torch import Tensor, nn
10 | import torch.nn.functional as F
11 |
12 |
13 | class SwiGLUFFN(nn.Module):
14 | def __init__(
15 | self,
16 | in_features: int,
17 | hidden_features: Optional[int] = None,
18 | out_features: Optional[int] = None,
19 | act_layer: Callable[..., nn.Module] = None,
20 | drop: float = 0.0,
21 | bias: bool = True,
22 | ) -> None:
23 | super().__init__()
24 | out_features = out_features or in_features
25 | hidden_features = hidden_features or in_features
26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
28 |
29 | def forward(self, x: Tensor) -> Tensor:
30 | x12 = self.w12(x)
31 | x1, x2 = x12.chunk(2, dim=-1)
32 | hidden = F.silu(x1) * x2
33 | return self.w3(hidden)
34 |
35 |
36 | try:
37 | from xformers.ops import SwiGLU
38 |
39 | XFORMERS_AVAILABLE = True
40 | except ImportError:
41 | SwiGLU = SwiGLUFFN
42 | XFORMERS_AVAILABLE = False
43 |
44 |
45 | class SwiGLUFFNFused(SwiGLU):
46 | def __init__(
47 | self,
48 | in_features: int,
49 | hidden_features: Optional[int] = None,
50 | out_features: Optional[int] = None,
51 | act_layer: Callable[..., nn.Module] = None,
52 | drop: float = 0.0,
53 | bias: bool = True,
54 | ) -> None:
55 | out_features = out_features or in_features
56 | hidden_features = hidden_features or in_features
57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
58 | super().__init__(
59 | in_features=in_features,
60 | hidden_features=hidden_features,
61 | out_features=out_features,
62 | bias=bias,
63 | )
64 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/depth_anything_v2/util/blocks.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 |
4 | def _make_scratch(in_shape, out_shape, groups=1, expand=False):
5 | scratch = nn.Module()
6 |
7 | out_shape1 = out_shape
8 | out_shape2 = out_shape
9 | out_shape3 = out_shape
10 | if len(in_shape) >= 4:
11 | out_shape4 = out_shape
12 |
13 | if expand:
14 | out_shape1 = out_shape
15 | out_shape2 = out_shape * 2
16 | out_shape3 = out_shape * 4
17 | if len(in_shape) >= 4:
18 | out_shape4 = out_shape * 8
19 |
20 | scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
21 | scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
22 | scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
23 | if len(in_shape) >= 4:
24 | scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
25 |
26 | return scratch
27 |
28 |
29 | class ResidualConvUnit(nn.Module):
30 | """Residual convolution module.
31 | """
32 |
33 | def __init__(self, features, activation, bn):
34 | """Init.
35 |
36 | Args:
37 | features (int): number of features
38 | """
39 | super().__init__()
40 |
41 | self.bn = bn
42 |
43 | self.groups=1
44 |
45 | self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
46 |
47 | self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
48 |
49 | if self.bn == True:
50 | self.bn1 = nn.BatchNorm2d(features)
51 | self.bn2 = nn.BatchNorm2d(features)
52 |
53 | self.activation = activation
54 |
55 | self.skip_add = nn.quantized.FloatFunctional()
56 |
57 | def forward(self, x):
58 | """Forward pass.
59 |
60 | Args:
61 | x (tensor): input
62 |
63 | Returns:
64 | tensor: output
65 | """
66 |
67 | out = self.activation(x)
68 | out = self.conv1(out)
69 | if self.bn == True:
70 | out = self.bn1(out)
71 |
72 | out = self.activation(out)
73 | out = self.conv2(out)
74 | if self.bn == True:
75 | out = self.bn2(out)
76 |
77 | if self.groups > 1:
78 | out = self.conv_merge(out)
79 |
80 | return self.skip_add.add(out, x)
81 |
82 |
83 | class FeatureFusionBlock(nn.Module):
84 | """Feature fusion block.
85 | """
86 |
87 | def __init__(
88 | self,
89 | features,
90 | activation,
91 | deconv=False,
92 | bn=False,
93 | expand=False,
94 | align_corners=True,
95 | size=None
96 | ):
97 | """Init.
98 |
99 | Args:
100 | features (int): number of features
101 | """
102 | super(FeatureFusionBlock, self).__init__()
103 |
104 | self.deconv = deconv
105 | self.align_corners = align_corners
106 |
107 | self.groups=1
108 |
109 | self.expand = expand
110 | out_features = features
111 | if self.expand == True:
112 | out_features = features // 2
113 |
114 | self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
115 |
116 | self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
117 | self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
118 |
119 | self.skip_add = nn.quantized.FloatFunctional()
120 |
121 | self.size=size
122 |
123 | def forward(self, *xs, size=None):
124 | """Forward pass.
125 |
126 | Returns:
127 | tensor: output
128 | """
129 | output = xs[0]
130 |
131 | if len(xs) == 2:
132 | res = self.resConfUnit1(xs[1])
133 | output = self.skip_add.add(output, res)
134 |
135 | output = self.resConfUnit2(output)
136 |
137 | if (size is None) and (self.size is None):
138 | modifier = {"scale_factor": 2}
139 | elif size is None:
140 | modifier = {"size": self.size}
141 | else:
142 | modifier = {"size": size}
143 |
144 | output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
145 |
146 | output = self.out_conv(output)
147 |
148 | return output
149 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/README.md:
--------------------------------------------------------------------------------
1 | # Depth Anything V2 for Metric Depth Estimation
2 |
3 | ![comparison](assets/compare_zoedepth.png)
4 |
5 | Here we provide a simple codebase to fine-tune our Depth Anything V2 pre-trained encoder for metric depth estimation. Built on our powerful encoder, we use a simple DPT head to regress the depth. We fine-tune our pre-trained encoder on the synthetic Hypersim / Virtual KITTI datasets for indoor / outdoor metric depth estimation, respectively.
6 |
7 |
8 | # Pre-trained Models
9 |
10 | We provide **six metric depth models** of three scales for indoor and outdoor scenes, respectively.
11 |
12 | | Base Model | Params | Indoor (Hypersim) | Outdoor (Virtual KITTI 2) |
13 | |:-|-:|:-:|:-:|
14 | | Depth-Anything-V2-Small | 24.8M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Small/resolve/main/depth_anything_v2_metric_hypersim_vits.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Small/resolve/main/depth_anything_v2_metric_vkitti_vits.pth?download=true) |
15 | | Depth-Anything-V2-Base | 97.5M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Base/resolve/main/depth_anything_v2_metric_hypersim_vitb.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Base/resolve/main/depth_anything_v2_metric_vkitti_vitb.pth?download=true) |
16 | | Depth-Anything-V2-Large | 335.3M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Large/resolve/main/depth_anything_v2_metric_hypersim_vitl.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Large/resolve/main/depth_anything_v2_metric_vkitti_vitl.pth?download=true) |
17 |
18 | *We recommend first trying our larger models (if the computational cost is affordable) and the indoor version.*
19 |
20 | ## Usage
21 |
22 | ### Preparation
23 |
24 | ```bash
25 | git clone https://github.com/DepthAnything/Depth-Anything-V2
26 | cd Depth-Anything-V2/metric_depth
27 | pip install -r requirements.txt
28 | ```
29 |
30 | Download the checkpoints listed [here](#pre-trained-models) and put them under the `checkpoints` directory.
31 |
32 | ### Use our models
33 | ```python
34 | import cv2
35 | import torch
36 |
37 | from depth_anything_v2.dpt import DepthAnythingV2
38 |
39 | model_configs = {
40 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
41 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
42 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
43 | }
44 |
45 | encoder = 'vitl' # or 'vits', 'vitb'
46 | dataset = 'hypersim' # 'hypersim' for indoor model, 'vkitti' for outdoor model
47 | max_depth = 20 # 20 for indoor model, 80 for outdoor model
48 |
49 | model = DepthAnythingV2(**{**model_configs[encoder], 'max_depth': max_depth})
50 | model.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_metric_{dataset}_{encoder}.pth', map_location='cpu'))
51 | model.eval()
52 |
53 | raw_img = cv2.imread('your/image/path')
54 | depth = model.infer_image(raw_img) # HxW depth map in meters in numpy
55 | ```
56 |
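Because the prediction is metric (in meters), it can be back-projected into a point cloud once the camera intrinsics are known. A minimal sketch continuing from the snippet above; the intrinsics `fx, fy, cx, cy` are placeholder values for illustration, not values shipped with this repo:

```python
import numpy as np

fx, fy, cx, cy = 525.0, 525.0, 319.5, 239.5  # hypothetical pinhole intrinsics

h, w = depth.shape  # `depth` is the HxW metric depth map from infer_image
u, v = np.meshgrid(np.arange(w), np.arange(h))
z = depth
x = (u - cx) / fx * z
y = (v - cy) / fy * z
points = np.stack([x, y, z], axis=-1).reshape(-1, 3)  # Nx3 points in the camera frame
```
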
57 | ### Running script on images
58 |
59 | Here, we take the `vitl` encoder as an example. You can also use `vitb` or `vits` encoders.
60 |
61 | ```bash
62 | # indoor scenes
63 | python run.py \
64 | --encoder vitl \
65 | --load-from checkpoints/depth_anything_v2_metric_hypersim_vitl.pth \
66 | --max-depth 20 \
67 |     --img-path <img-path> --outdir <outdir> [--input-size <size>] [--save-numpy]
68 |
69 | # outdoor scenes
70 | python run.py \
71 | --encoder vitl \
72 | --load-from checkpoints/depth_anything_v2_metric_vkitti_vitl.pth \
73 | --max-depth 80 \
74 |     --img-path <img-path> --outdir <outdir> [--input-size <size>] [--save-numpy]
75 | ```
76 |
77 | ### Project 2D images to point clouds:
78 |
79 | ```bash
80 | python depth_to_pointcloud.py \
81 | --encoder vitl \
82 | --load-from checkpoints/depth_anything_v2_metric_hypersim_vitl.pth \
83 | --max-depth 20 \
84 |     --img-path <img-path> --outdir <outdir>
85 | ```
86 |
87 | ### Reproduce training
88 |
89 | Please first prepare the [Hypersim](https://github.com/apple/ml-hypersim) and [Virtual KITTI 2](https://europe.naverlabs.com/research/computer-vision/proxy-virtual-worlds-vkitti-2/) datasets. Then:
90 |
91 | ```bash
92 | bash dist_train.sh
93 | ```
94 |
95 |
96 | ## Citation
97 |
98 | If you find this project useful, please consider citing:
99 |
100 | ```bibtex
101 | @article{depth_anything_v2,
102 | title={Depth Anything V2},
103 | author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
104 | journal={arXiv:2406.09414},
105 | year={2024}
106 | }
107 |
108 | @inproceedings{depth_anything_v1,
109 | title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data},
110 | author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
111 | booktitle={CVPR},
112 | year={2024}
113 | }
114 | ```
115 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/assets/compare_zoedepth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/metric_depth/assets/compare_zoedepth.png
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/dataset/hypersim.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import h5py
3 | import numpy as np
4 | import torch
5 | from torch.utils.data import Dataset
6 | from torchvision.transforms import Compose
7 |
8 | from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop
9 |
10 |
11 | def hypersim_distance_to_depth(npyDistance):
12 | intWidth, intHeight, fltFocal = 1024, 768, 886.81
13 |
14 | npyImageplaneX = np.linspace((-0.5 * intWidth) + 0.5, (0.5 * intWidth) - 0.5, intWidth).reshape(
15 | 1, intWidth).repeat(intHeight, 0).astype(np.float32)[:, :, None]
16 | npyImageplaneY = np.linspace((-0.5 * intHeight) + 0.5, (0.5 * intHeight) - 0.5,
17 | intHeight).reshape(intHeight, 1).repeat(intWidth, 1).astype(np.float32)[:, :, None]
18 | npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32)
19 | npyImageplane = np.concatenate(
20 | [npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2)
21 |
22 | npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal
23 | return npyDepth
24 |
25 |
26 | class Hypersim(Dataset):
27 | def __init__(self, filelist_path, mode, size=(518, 518)):
28 |
29 | self.mode = mode
30 | self.size = size
31 |
32 | with open(filelist_path, 'r') as f:
33 | self.filelist = f.read().splitlines()
34 |
35 | net_w, net_h = size
36 | self.transform = Compose([
37 | Resize(
38 | width=net_w,
39 | height=net_h,
40 | resize_target=True if mode == 'train' else False,
41 | keep_aspect_ratio=True,
42 | ensure_multiple_of=14,
43 | resize_method='lower_bound',
44 | image_interpolation_method=cv2.INTER_CUBIC,
45 | ),
46 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
47 | PrepareForNet(),
48 | ] + ([Crop(size[0])] if self.mode == 'train' else []))
49 |
50 | def __getitem__(self, item):
51 | img_path = self.filelist[item].split(' ')[0]
52 | depth_path = self.filelist[item].split(' ')[1]
53 |
54 | image = cv2.imread(img_path)
55 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
56 |
57 | depth_fd = h5py.File(depth_path, "r")
58 | distance_meters = np.array(depth_fd['dataset'])
59 | depth = hypersim_distance_to_depth(distance_meters)
60 |
61 | sample = self.transform({'image': image, 'depth': depth})
62 |
63 | sample['image'] = torch.from_numpy(sample['image'])
64 | sample['depth'] = torch.from_numpy(sample['depth'])
65 |
66 | sample['valid_mask'] = (torch.isnan(sample['depth']) == 0)
67 | sample['depth'][sample['valid_mask'] == 0] = 0
68 |
69 | sample['image_path'] = self.filelist[item].split(' ')[0]
70 |
71 | return sample
72 |
73 | def __len__(self):
74 | return len(self.filelist)
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/dataset/kitti.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import torch
3 | from torch.utils.data import Dataset
4 | from torchvision.transforms import Compose
5 |
6 | from dataset.transform import Resize, NormalizeImage, PrepareForNet
7 |
8 |
9 | class KITTI(Dataset):
10 | def __init__(self, filelist_path, mode, size=(518, 518)):
11 | if mode != 'val':
12 | raise NotImplementedError
13 |
14 | self.mode = mode
15 | self.size = size
16 |
17 | with open(filelist_path, 'r') as f:
18 | self.filelist = f.read().splitlines()
19 |
20 | net_w, net_h = size
21 | self.transform = Compose([
22 | Resize(
23 | width=net_w,
24 | height=net_h,
25 | resize_target=True if mode == 'train' else False,
26 | keep_aspect_ratio=True,
27 | ensure_multiple_of=14,
28 | resize_method='lower_bound',
29 | image_interpolation_method=cv2.INTER_CUBIC,
30 | ),
31 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
32 | PrepareForNet(),
33 | ])
34 |
35 | def __getitem__(self, item):
36 | img_path = self.filelist[item].split(' ')[0]
37 | depth_path = self.filelist[item].split(' ')[1]
38 |
39 | image = cv2.imread(img_path)
40 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
41 |
42 | depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype('float32')
43 |
44 | sample = self.transform({'image': image, 'depth': depth})
45 |
46 | sample['image'] = torch.from_numpy(sample['image'])
47 | sample['depth'] = torch.from_numpy(sample['depth'])
48 |         sample['depth'] = sample['depth'] / 256.0  # convert to meters
49 |
50 | sample['valid_mask'] = sample['depth'] > 0
51 |
52 | sample['image_path'] = self.filelist[item].split(' ')[0]
53 |
54 | return sample
55 |
56 | def __len__(self):
57 | return len(self.filelist)
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/dataset/vkitti2.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import torch
3 | from torch.utils.data import Dataset
4 | from torchvision.transforms import Compose
5 |
6 | from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop
7 |
8 |
9 | class VKITTI2(Dataset):
10 | def __init__(self, filelist_path, mode, size=(518, 518)):
11 |
12 | self.mode = mode
13 | self.size = size
14 |
15 | with open(filelist_path, 'r') as f:
16 | self.filelist = f.read().splitlines()
17 |
18 | net_w, net_h = size
19 | self.transform = Compose([
20 | Resize(
21 | width=net_w,
22 | height=net_h,
23 | resize_target=True if mode == 'train' else False,
24 | keep_aspect_ratio=True,
25 | ensure_multiple_of=14,
26 | resize_method='lower_bound',
27 | image_interpolation_method=cv2.INTER_CUBIC,
28 | ),
29 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
30 | PrepareForNet(),
31 | ] + ([Crop(size[0])] if self.mode == 'train' else []))
32 |
33 | def __getitem__(self, item):
34 | img_path = self.filelist[item].split(' ')[0]
35 | depth_path = self.filelist[item].split(' ')[1]
36 |
37 | image = cv2.imread(img_path)
38 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
39 |
40 | depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) / 100.0 # cm to m
41 |
42 | sample = self.transform({'image': image, 'depth': depth})
43 |
44 | sample['image'] = torch.from_numpy(sample['image'])
45 | sample['depth'] = torch.from_numpy(sample['depth'])
46 |
47 | sample['valid_mask'] = (sample['depth'] <= 80)
48 |
49 | sample['image_path'] = self.filelist[item].split(' ')[0]
50 |
51 | return sample
52 |
53 | def __len__(self):
54 | return len(self.filelist)
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from .mlp import Mlp
8 | from .patch_embed import PatchEmbed
9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10 | from .block import NestedTensorBlock
11 | from .attention import MemEffAttention
12 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/attention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # References:
8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10 |
11 | import logging
12 |
13 | from torch import Tensor
14 | from torch import nn
15 |
16 |
17 | logger = logging.getLogger("dinov2")
18 |
19 |
20 | try:
21 | from xformers.ops import memory_efficient_attention, unbind, fmha
22 |
23 | XFORMERS_AVAILABLE = True
24 | except ImportError:
25 | logger.warning("xFormers not available")
26 | XFORMERS_AVAILABLE = False
27 |
28 |
29 | class Attention(nn.Module):
30 | def __init__(
31 | self,
32 | dim: int,
33 | num_heads: int = 8,
34 | qkv_bias: bool = False,
35 | proj_bias: bool = True,
36 | attn_drop: float = 0.0,
37 | proj_drop: float = 0.0,
38 | ) -> None:
39 | super().__init__()
40 | self.num_heads = num_heads
41 | head_dim = dim // num_heads
42 | self.scale = head_dim**-0.5
43 |
44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45 | self.attn_drop = nn.Dropout(attn_drop)
46 | self.proj = nn.Linear(dim, dim, bias=proj_bias)
47 | self.proj_drop = nn.Dropout(proj_drop)
48 |
49 | def forward(self, x: Tensor) -> Tensor:
50 | B, N, C = x.shape
51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52 |
53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54 | attn = q @ k.transpose(-2, -1)
55 |
56 | attn = attn.softmax(dim=-1)
57 | attn = self.attn_drop(attn)
58 |
59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60 | x = self.proj(x)
61 | x = self.proj_drop(x)
62 | return x
63 |
64 |
65 | class MemEffAttention(Attention):
66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67 | if not XFORMERS_AVAILABLE:
68 | assert attn_bias is None, "xFormers is required for nested tensors usage"
69 | return super().forward(x)
70 |
71 | B, N, C = x.shape
72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73 |
74 | q, k, v = unbind(qkv, 2)
75 |
76 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77 | x = x.reshape([B, N, C])
78 |
79 | x = self.proj(x)
80 | x = self.proj_drop(x)
81 | return x
82 |
83 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/drop_path.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # References:
8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10 |
11 |
12 | from torch import nn
13 |
14 |
15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16 | if drop_prob == 0.0 or not training:
17 | return x
18 | keep_prob = 1 - drop_prob
19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21 | if keep_prob > 0.0:
22 | random_tensor.div_(keep_prob)
23 | output = x * random_tensor
24 | return output
25 |
26 |
27 | class DropPath(nn.Module):
28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29 |
30 | def __init__(self, drop_prob=None):
31 | super(DropPath, self).__init__()
32 | self.drop_prob = drop_prob
33 |
34 | def forward(self, x):
35 | return drop_path(x, self.drop_prob, self.training)
36 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/layer_scale.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
8 |
9 | from typing import Union
10 |
11 | import torch
12 | from torch import Tensor
13 | from torch import nn
14 |
15 |
16 | class LayerScale(nn.Module):
17 | def __init__(
18 | self,
19 | dim: int,
20 | init_values: Union[float, Tensor] = 1e-5,
21 | inplace: bool = False,
22 | ) -> None:
23 | super().__init__()
24 | self.inplace = inplace
25 | self.gamma = nn.Parameter(init_values * torch.ones(dim))
26 |
27 | def forward(self, x: Tensor) -> Tensor:
28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma
29 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/mlp.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # References:
8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10 |
11 |
12 | from typing import Callable, Optional
13 |
14 | from torch import Tensor, nn
15 |
16 |
17 | class Mlp(nn.Module):
18 | def __init__(
19 | self,
20 | in_features: int,
21 | hidden_features: Optional[int] = None,
22 | out_features: Optional[int] = None,
23 | act_layer: Callable[..., nn.Module] = nn.GELU,
24 | drop: float = 0.0,
25 | bias: bool = True,
26 | ) -> None:
27 | super().__init__()
28 | out_features = out_features or in_features
29 | hidden_features = hidden_features or in_features
30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
31 | self.act = act_layer()
32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
33 | self.drop = nn.Dropout(drop)
34 |
35 | def forward(self, x: Tensor) -> Tensor:
36 | x = self.fc1(x)
37 | x = self.act(x)
38 | x = self.drop(x)
39 | x = self.fc2(x)
40 | x = self.drop(x)
41 | return x
42 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/patch_embed.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # References:
8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10 |
11 | from typing import Callable, Optional, Tuple, Union
12 |
13 | from torch import Tensor
14 | import torch.nn as nn
15 |
16 |
17 | def make_2tuple(x):
18 | if isinstance(x, tuple):
19 | assert len(x) == 2
20 | return x
21 |
22 | assert isinstance(x, int)
23 | return (x, x)
24 |
25 |
26 | class PatchEmbed(nn.Module):
27 | """
28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
29 |
30 | Args:
31 | img_size: Image size.
32 | patch_size: Patch token size.
33 | in_chans: Number of input image channels.
34 | embed_dim: Number of linear projection output channels.
35 | norm_layer: Normalization layer.
36 | """
37 |
38 | def __init__(
39 | self,
40 | img_size: Union[int, Tuple[int, int]] = 224,
41 | patch_size: Union[int, Tuple[int, int]] = 16,
42 | in_chans: int = 3,
43 | embed_dim: int = 768,
44 | norm_layer: Optional[Callable] = None,
45 | flatten_embedding: bool = True,
46 | ) -> None:
47 | super().__init__()
48 |
49 | image_HW = make_2tuple(img_size)
50 | patch_HW = make_2tuple(patch_size)
51 | patch_grid_size = (
52 | image_HW[0] // patch_HW[0],
53 | image_HW[1] // patch_HW[1],
54 | )
55 |
56 | self.img_size = image_HW
57 | self.patch_size = patch_HW
58 | self.patches_resolution = patch_grid_size
59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1]
60 |
61 | self.in_chans = in_chans
62 | self.embed_dim = embed_dim
63 |
64 | self.flatten_embedding = flatten_embedding
65 |
66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
68 |
69 | def forward(self, x: Tensor) -> Tensor:
70 | _, _, H, W = x.shape
71 | patch_H, patch_W = self.patch_size
72 |
73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
75 |
76 | x = self.proj(x) # B C H W
77 | H, W = x.size(2), x.size(3)
78 | x = x.flatten(2).transpose(1, 2) # B HW C
79 | x = self.norm(x)
80 | if not self.flatten_embedding:
81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C
82 | return x
83 |
84 | def flops(self) -> float:
85 | Ho, Wo = self.patches_resolution
86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
87 | if self.norm is not None:
88 | flops += Ho * Wo * self.embed_dim
89 | return flops
90 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/swiglu_ffn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from typing import Callable, Optional
8 |
9 | from torch import Tensor, nn
10 | import torch.nn.functional as F
11 |
12 |
13 | class SwiGLUFFN(nn.Module):
14 | def __init__(
15 | self,
16 | in_features: int,
17 | hidden_features: Optional[int] = None,
18 | out_features: Optional[int] = None,
19 | act_layer: Callable[..., nn.Module] = None,
20 | drop: float = 0.0,
21 | bias: bool = True,
22 | ) -> None:
23 | super().__init__()
24 | out_features = out_features or in_features
25 | hidden_features = hidden_features or in_features
26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
28 |
29 | def forward(self, x: Tensor) -> Tensor:
30 | x12 = self.w12(x)
31 | x1, x2 = x12.chunk(2, dim=-1)
32 | hidden = F.silu(x1) * x2
33 | return self.w3(hidden)
34 |
35 |
36 | try:
37 | from xformers.ops import SwiGLU
38 |
39 | XFORMERS_AVAILABLE = True
40 | except ImportError:
41 | SwiGLU = SwiGLUFFN
42 | XFORMERS_AVAILABLE = False
43 |
44 |
45 | class SwiGLUFFNFused(SwiGLU):
46 | def __init__(
47 | self,
48 | in_features: int,
49 | hidden_features: Optional[int] = None,
50 | out_features: Optional[int] = None,
51 | act_layer: Callable[..., nn.Module] = None,
52 | drop: float = 0.0,
53 | bias: bool = True,
54 | ) -> None:
55 | out_features = out_features or in_features
56 | hidden_features = hidden_features or in_features
57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
58 | super().__init__(
59 | in_features=in_features,
60 | hidden_features=hidden_features,
61 | out_features=out_features,
62 | bias=bias,
63 | )
64 |
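A short sketch (not part of the upstream file) of the pure-PyTorch SwiGLUFFN fallback above; the fused variant only changes the hidden width, rounding 2/3 of hidden_features up to a multiple of 8:

    # Illustrative only: SwiGLUFFN is the fallback class defined above and is
    # available regardless of whether xformers is installed.
    import torch

    ffn = SwiGLUFFN(in_features=384, hidden_features=1024)
    x = torch.randn(2, 197, 384)
    y = ffn(x)                       # w12 -> chunk -> silu(x1) * x2 -> w3
    assert y.shape == x.shape

    # SwiGLUFFNFused rounds 2/3 of the requested hidden width up to a multiple
    # of 8, e.g. 1024 -> int(1024 * 2 / 3) = 682 -> 688.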
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/util/blocks.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 |
4 | def _make_scratch(in_shape, out_shape, groups=1, expand=False):
5 | scratch = nn.Module()
6 |
7 | out_shape1 = out_shape
8 | out_shape2 = out_shape
9 | out_shape3 = out_shape
10 | if len(in_shape) >= 4:
11 | out_shape4 = out_shape
12 |
13 | if expand:
14 | out_shape1 = out_shape
15 | out_shape2 = out_shape * 2
16 | out_shape3 = out_shape * 4
17 | if len(in_shape) >= 4:
18 | out_shape4 = out_shape * 8
19 |
20 | scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
21 | scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
22 | scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
23 | if len(in_shape) >= 4:
24 | scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
25 |
26 | return scratch
27 |
28 |
29 | class ResidualConvUnit(nn.Module):
30 | """Residual convolution module.
31 | """
32 |
33 | def __init__(self, features, activation, bn):
34 | """Init.
35 |
36 | Args:
37 | features (int): number of features
38 | """
39 | super().__init__()
40 |
41 | self.bn = bn
42 |
43 | self.groups=1
44 |
45 | self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
46 |
47 | self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
48 |
49 | if self.bn == True:
50 | self.bn1 = nn.BatchNorm2d(features)
51 | self.bn2 = nn.BatchNorm2d(features)
52 |
53 | self.activation = activation
54 |
55 | self.skip_add = nn.quantized.FloatFunctional()
56 |
57 | def forward(self, x):
58 | """Forward pass.
59 |
60 | Args:
61 | x (tensor): input
62 |
63 | Returns:
64 | tensor: output
65 | """
66 |
67 | out = self.activation(x)
68 | out = self.conv1(out)
69 | if self.bn == True:
70 | out = self.bn1(out)
71 |
72 | out = self.activation(out)
73 | out = self.conv2(out)
74 | if self.bn == True:
75 | out = self.bn2(out)
76 |
77 | if self.groups > 1:
78 | out = self.conv_merge(out)
79 |
80 | return self.skip_add.add(out, x)
81 |
82 |
83 | class FeatureFusionBlock(nn.Module):
84 | """Feature fusion block.
85 | """
86 |
87 | def __init__(
88 | self,
89 | features,
90 | activation,
91 | deconv=False,
92 | bn=False,
93 | expand=False,
94 | align_corners=True,
95 | size=None
96 | ):
97 | """Init.
98 |
99 | Args:
100 | features (int): number of features
101 | """
102 | super(FeatureFusionBlock, self).__init__()
103 |
104 | self.deconv = deconv
105 | self.align_corners = align_corners
106 |
107 | self.groups=1
108 |
109 | self.expand = expand
110 | out_features = features
111 | if self.expand == True:
112 | out_features = features // 2
113 |
114 | self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
115 |
116 | self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
117 | self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
118 |
119 | self.skip_add = nn.quantized.FloatFunctional()
120 |
121 | self.size=size
122 |
123 | def forward(self, *xs, size=None):
124 | """Forward pass.
125 |
126 | Returns:
127 | tensor: output
128 | """
129 | output = xs[0]
130 |
131 | if len(xs) == 2:
132 | res = self.resConfUnit1(xs[1])
133 | output = self.skip_add.add(output, res)
134 |
135 | output = self.resConfUnit2(output)
136 |
137 | if (size is None) and (self.size is None):
138 | modifier = {"scale_factor": 2}
139 | elif size is None:
140 | modifier = {"size": self.size}
141 | else:
142 | modifier = {"size": size}
143 |
144 | output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
145 |
146 | output = self.out_conv(output)
147 |
148 | return output
149 |
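A hedged sketch (not part of the upstream file) of how _make_scratch and FeatureFusionBlock above are typically wired in a DPT-style decoder, on dummy feature maps; the channel counts mirror the vitl configuration used elsewhere in this repo, and a real decoder would use one fusion block per stage rather than reusing a single module:

    # Illustrative only: _make_scratch and FeatureFusionBlock are defined above.
    import torch
    import torch.nn as nn

    scratch = _make_scratch([256, 512, 1024, 1024], 256, groups=1, expand=False)
    fusion = FeatureFusionBlock(256, nn.ReLU(False), deconv=False, bn=False)

    feat_coarse = torch.randn(1, 1024, 14, 14)   # deepest backbone feature
    feat_fine = torch.randn(1, 1024, 28, 28)     # next (finer) backbone feature

    x_coarse = scratch.layer4_rn(feat_coarse)    # -> (1, 256, 14, 14)
    x_fine = scratch.layer3_rn(feat_fine)        # -> (1, 256, 28, 28)

    out = fusion(x_coarse, size=x_fine.shape[2:])   # upsample coarse path to 28x28
    out = fusion(out, x_fine)                       # fuse skip, then 2x upsample
    print(out.shape)                                # torch.Size([1, 256, 56, 56])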
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/depth_to_pointcloud.py:
--------------------------------------------------------------------------------
1 | """
2 | Born out of Depth Anything V1 Issue 36
3 | Make sure you have the necessary libraries installed.
4 | Code by @1ssb
5 |
6 | This script processes a set of images to generate depth maps and corresponding point clouds.
7 | The resulting point clouds are saved in the specified output directory.
8 |
9 | Usage:
10 | python depth_to_pointcloud.py --encoder vitl --load-from path_to_model --max-depth 20 --img-path path_to_images --outdir output_directory --focal-length-x 470.4 --focal-length-y 470.4
11 |
12 | Arguments:
13 | --encoder: Model encoder to use. Choices are ['vits', 'vitb', 'vitl', 'vitg'].
14 | --load-from: Path to the pre-trained model weights.
15 | --max-depth: Maximum depth value for the depth map.
16 | --img-path: Path to the input image or directory containing images.
17 | --outdir: Directory to save the output point clouds.
18 | --focal-length-x: Focal length along the x-axis.
19 | --focal-length-y: Focal length along the y-axis.
20 | """
21 |
22 | import argparse
23 | import cv2
24 | import glob
25 | import numpy as np
26 | import open3d as o3d
27 | import os
28 | from PIL import Image
29 | import torch
30 |
31 | from depth_anything_v2.dpt import DepthAnythingV2
32 |
33 |
34 | def main():
35 | # Parse command-line arguments
36 | parser = argparse.ArgumentParser(description='Generate depth maps and point clouds from images.')
37 | parser.add_argument('--encoder', default='vitl', type=str, choices=['vits', 'vitb', 'vitl', 'vitg'],
38 | help='Model encoder to use.')
39 | parser.add_argument('--load-from', default='', type=str, required=True,
40 | help='Path to the pre-trained model weights.')
41 | parser.add_argument('--max-depth', default=20, type=float,
42 | help='Maximum depth value for the depth map.')
43 | parser.add_argument('--img-path', type=str, required=True,
44 | help='Path to the input image or directory containing images.')
45 | parser.add_argument('--outdir', type=str, default='./vis_pointcloud',
46 | help='Directory to save the output point clouds.')
47 | parser.add_argument('--focal-length-x', default=470.4, type=float,
48 | help='Focal length along the x-axis.')
49 | parser.add_argument('--focal-length-y', default=470.4, type=float,
50 | help='Focal length along the y-axis.')
51 |
52 | args = parser.parse_args()
53 |
54 | # Determine the device to use (CUDA, MPS, or CPU)
55 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
56 |
57 | # Model configuration based on the chosen encoder
58 | model_configs = {
59 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
60 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
61 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
62 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
63 | }
64 |
65 | # Initialize the DepthAnythingV2 model with the specified configuration
66 | depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
67 | depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'))
68 | depth_anything = depth_anything.to(DEVICE).eval()
69 |
70 | # Get the list of image files to process
71 | if os.path.isfile(args.img_path):
72 | if args.img_path.endswith('txt'):
73 | with open(args.img_path, 'r') as f:
74 | filenames = f.read().splitlines()
75 | else:
76 | filenames = [args.img_path]
77 | else:
78 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True)
79 |
80 | # Create the output directory if it doesn't exist
81 | os.makedirs(args.outdir, exist_ok=True)
82 |
83 | # Process each image file
84 | for k, filename in enumerate(filenames):
85 | print(f'Processing {k+1}/{len(filenames)}: {filename}')
86 |
87 | # Load the image
88 | color_image = Image.open(filename).convert('RGB')
89 | width, height = color_image.size
90 |
91 | # Read the image using OpenCV
92 | image = cv2.imread(filename)
93 | pred = depth_anything.infer_image(image, height)
94 |
95 | # Resize depth prediction to match the original image size
96 | resized_pred = Image.fromarray(pred).resize((width, height), Image.NEAREST)
97 |
98 | # Generate mesh grid and calculate point cloud coordinates
99 | x, y = np.meshgrid(np.arange(width), np.arange(height))
100 | x = (x - width / 2) / args.focal_length_x
101 | y = (y - height / 2) / args.focal_length_y
102 | z = np.array(resized_pred)
103 | points = np.stack((np.multiply(x, z), np.multiply(y, z), z), axis=-1).reshape(-1, 3)
104 | colors = np.array(color_image).reshape(-1, 3) / 255.0
105 |
106 | # Create the point cloud and save it to the output directory
107 | pcd = o3d.geometry.PointCloud()
108 | pcd.points = o3d.utility.Vector3dVector(points)
109 | pcd.colors = o3d.utility.Vector3dVector(colors)
110 | o3d.io.write_point_cloud(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + ".ply"), pcd)
111 |
112 |
113 | if __name__ == '__main__':
114 | main()
115 |
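The back-projection inside the loop above is a plain pinhole model with the principal point assumed at the image centre. A small sketch (not part of the upstream file) isolates it on a synthetic depth map so the geometry can be checked without a checkpoint or Open3D:

    # Illustrative only: the same back-projection as in the loop above, applied
    # to a synthetic constant-depth map.
    import numpy as np

    width, height = 640, 480
    fx = fy = 470.4                               # the --focal-length-* defaults
    depth = np.full((height, width), 2.0)         # pretend every pixel is 2 m away

    x, y = np.meshgrid(np.arange(width), np.arange(height))
    x = (x - width / 2) / fx                      # normalized image coordinates
    y = (y - height / 2) / fy
    points = np.stack((x * depth, y * depth, depth), axis=-1).reshape(-1, 3)

    print(points.shape)                           # (307200, 3)
    print(points[:, 2].min(), points[:, 2].max()) # all z values equal 2.0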
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/dist_train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | now=$(date +"%Y%m%d_%H%M%S")
3 |
4 | epoch=120
5 | bs=4
6 | gpus=8
7 | lr=0.000005
8 | encoder=vitl
9 | dataset=hypersim # vkitti
10 | img_size=518
11 | min_depth=0.001
12 | max_depth=20 # 80 for virtual kitti
13 | pretrained_from=../checkpoints/depth_anything_v2_${encoder}.pth
14 | save_path=exp/hypersim # exp/vkitti
15 |
16 | mkdir -p $save_path
17 |
18 | python3 -m torch.distributed.launch \
19 | --nproc_per_node=$gpus \
20 | --nnodes 1 \
21 | --node_rank=0 \
22 | --master_addr=localhost \
23 | --master_port=20596 \
24 | train.py --epoch $epoch --encoder $encoder --bs $bs --lr $lr --save-path $save_path --dataset $dataset \
25 | --img-size $img_size --min-depth $min_depth --max-depth $max_depth --pretrained-from $pretrained_from \
26 | --port 20596 2>&1 | tee -a $save_path/$now.log
27 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | opencv-python
3 | open3d
4 | torch
5 | torchvision
6 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import cv2
3 | import glob
4 | import matplotlib
5 | import numpy as np
6 | import os
7 | import torch
8 |
9 | from depth_anything_v2.dpt import DepthAnythingV2
10 |
11 |
12 | if __name__ == '__main__':
13 | parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation')
14 |
15 | parser.add_argument('--img-path', type=str)
16 | parser.add_argument('--input-size', type=int, default=518)
17 | parser.add_argument('--outdir', type=str, default='./vis_depth')
18 |
19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg'])
20 | parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth')
21 | parser.add_argument('--max-depth', type=float, default=20)
22 |
23 | parser.add_argument('--save-numpy', dest='save_numpy', action='store_true', help='save the model raw output')
24 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
25 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')
26 |
27 | args = parser.parse_args()
28 |
29 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
30 |
31 | model_configs = {
32 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
33 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
34 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
35 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
36 | }
37 |
38 | depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
39 | depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'))
40 | depth_anything = depth_anything.to(DEVICE).eval()
41 |
42 | if os.path.isfile(args.img_path):
43 | if args.img_path.endswith('txt'):
44 | with open(args.img_path, 'r') as f:
45 | filenames = f.read().splitlines()
46 | else:
47 | filenames = [args.img_path]
48 | else:
49 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True)
50 |
51 | os.makedirs(args.outdir, exist_ok=True)
52 |
53 | cmap = matplotlib.colormaps.get_cmap('Spectral')
54 |
55 | for k, filename in enumerate(filenames):
56 | print(f'Progress {k+1}/{len(filenames)}: {filename}')
57 |
58 | raw_image = cv2.imread(filename)
59 |
60 | depth = depth_anything.infer_image(raw_image, args.input_size)
61 |
62 | if args.save_numpy:
63 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '_raw_depth_meter.npy')
64 | np.save(output_path, depth)
65 |
66 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
67 | depth = depth.astype(np.uint8)
68 |
69 | if args.grayscale:
70 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
71 | else:
72 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)
73 |
74 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png')
75 | if args.pred_only:
76 | cv2.imwrite(output_path, depth)
77 | else:
78 | split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255
79 | combined_result = cv2.hconcat([raw_image, split_region, depth])
80 |
81 | cv2.imwrite(output_path, combined_result)
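The per-image normalization and Spectral colormapping above can be checked in isolation; a small sketch (not part of the upstream file) on a toy depth array, with no checkpoint required:

    # Illustrative only: the normalization + Spectral colormapping used above.
    import matplotlib
    import numpy as np

    depth = np.linspace(0.5, 20.0, 12).reshape(3, 4)          # fake metric depths
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth = depth.astype(np.uint8)                            # per-image 0..255

    cmap = matplotlib.colormaps.get_cmap('Spectral')
    colored = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)  # RGB -> BGR
    print(colored.shape)                                      # (3, 4, 3)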
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/util/dist_helper.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 |
4 | import torch
5 | import torch.distributed as dist
6 |
7 |
8 | def setup_distributed(backend="nccl", port=None):
9 | """AdaHessian Optimizer
10 | Lifted from https://github.com/BIGBALLON/distribuuuu/blob/master/distribuuuu/utils.py
11 | Originally licensed MIT, Copyright (c) 2020 Wei Li
12 | """
13 | num_gpus = torch.cuda.device_count()
14 |
15 | if "SLURM_JOB_ID" in os.environ:
16 | rank = int(os.environ["SLURM_PROCID"])
17 | world_size = int(os.environ["SLURM_NTASKS"])
18 | node_list = os.environ["SLURM_NODELIST"]
19 | addr = subprocess.getoutput(f"scontrol show hostname {node_list} | head -n1")
20 | # specify master port
21 | if port is not None:
22 | os.environ["MASTER_PORT"] = str(port)
23 | elif "MASTER_PORT" not in os.environ:
24 | os.environ["MASTER_PORT"] = "10685"
25 | if "MASTER_ADDR" not in os.environ:
26 | os.environ["MASTER_ADDR"] = addr
27 | os.environ["WORLD_SIZE"] = str(world_size)
28 | os.environ["LOCAL_RANK"] = str(rank % num_gpus)
29 | os.environ["RANK"] = str(rank)
30 | else:
31 | rank = int(os.environ["RANK"])
32 | world_size = int(os.environ["WORLD_SIZE"])
33 |
34 | torch.cuda.set_device(rank % num_gpus)
35 |
36 | dist.init_process_group(
37 | backend=backend,
38 | world_size=world_size,
39 | rank=rank,
40 | )
41 | return rank, world_size
42 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/util/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | class SiLogLoss(nn.Module):
6 | def __init__(self, lambd=0.5):
7 | super().__init__()
8 | self.lambd = lambd
9 |
10 | def forward(self, pred, target, valid_mask):
11 | valid_mask = valid_mask.detach()
12 | diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask])
13 | loss = torch.sqrt(torch.pow(diff_log, 2).mean() -
14 | self.lambd * torch.pow(diff_log.mean(), 2))
15 |
16 | return loss
17 |
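A small sketch (not part of the upstream file) exercising SiLogLoss above on random positive depths with a boolean validity mask:

    # Illustrative only: SiLogLoss is the module defined above.
    import torch

    criterion = SiLogLoss(lambd=0.5)
    pred = torch.rand(2, 1, 8, 8) * 10 + 0.1      # strictly positive, log() is taken
    target = torch.rand(2, 1, 8, 8) * 10 + 0.1
    valid_mask = target > 0.5                     # boolean mask of supervised pixels

    loss = criterion(pred, target, valid_mask)
    print(loss.item())                            # scalar scale-invariant log loss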
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/util/metric.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def eval_depth(pred, target):
5 | assert pred.shape == target.shape
6 |
7 | thresh = torch.max((target / pred), (pred / target))
8 |
9 | d1 = torch.sum(thresh < 1.25).float() / len(thresh)
10 | d2 = torch.sum(thresh < 1.25 ** 2).float() / len(thresh)
11 | d3 = torch.sum(thresh < 1.25 ** 3).float() / len(thresh)
12 |
13 | diff = pred - target
14 | diff_log = torch.log(pred) - torch.log(target)
15 |
16 | abs_rel = torch.mean(torch.abs(diff) / target)
17 | sq_rel = torch.mean(torch.pow(diff, 2) / target)
18 |
19 | rmse = torch.sqrt(torch.mean(torch.pow(diff, 2)))
20 | rmse_log = torch.sqrt(torch.mean(torch.pow(diff_log , 2)))
21 |
22 | log10 = torch.mean(torch.abs(torch.log10(pred) - torch.log10(target)))
23 | silog = torch.sqrt(torch.pow(diff_log, 2).mean() - 0.5 * torch.pow(diff_log.mean(), 2))
24 |
25 | return {'d1': d1.item(), 'd2': d2.item(), 'd3': d3.item(), 'abs_rel': abs_rel.item(), 'sq_rel': sq_rel.item(),
26 | 'rmse': rmse.item(), 'rmse_log': rmse_log.item(), 'log10':log10.item(), 'silog':silog.item()}
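eval_depth above expects flattened, strictly positive tensors of equal shape (typically the masked valid pixels). A toy sketch (not part of the upstream file) with predictions uniformly about 5% below the target:

    # Illustrative only: eval_depth is the function defined above.
    import torch

    pred = torch.rand(1000) * 10 + 0.1            # flattened valid-pixel depths
    target = pred * 1.05                          # predictions ~5% below target
    metrics = eval_depth(pred, target)
    print(metrics['abs_rel'])                     # ~= 0.05 / 1.05 ~= 0.048
    print(metrics['d1'])                          # 1.0, since max ratio 1.05 < 1.25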
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/metric_depth/util/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import numpy as np
4 | import logging
5 |
6 | logs = set()
7 |
8 |
9 | def init_log(name, level=logging.INFO):
10 | if (name, level) in logs:
11 | return
12 | logs.add((name, level))
13 | logger = logging.getLogger(name)
14 | logger.setLevel(level)
15 | ch = logging.StreamHandler()
16 | ch.setLevel(level)
17 | if "SLURM_PROCID" in os.environ:
18 | rank = int(os.environ["SLURM_PROCID"])
19 | logger.addFilter(lambda record: rank == 0)
20 | else:
21 | rank = 0
22 | format_str = "[%(asctime)s][%(levelname)8s] %(message)s"
23 | formatter = logging.Formatter(format_str)
24 | ch.setFormatter(formatter)
25 | logger.addHandler(ch)
26 | return logger
27 |
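A short usage sketch (not part of the upstream file) for init_log above; note that a repeated call with the same (name, level) pair returns None by design:

    # Illustrative only: init_log is the helper defined above; outside SLURM it
    # logs from every process.
    import logging

    logger = init_log('depth_anything_v2', logging.INFO)
    logger.info('training started')               # [timestamp][    INFO] training started

    # A second call with the same (name, level) pair returns None by design,
    # so keep a reference to the first logger.
    assert init_log('depth_anything_v2', logging.INFO) is None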
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/requirements.txt:
--------------------------------------------------------------------------------
1 | gradio_imageslider
2 | gradio==4.29.0
3 | matplotlib
4 | opencv-python
5 | torch
6 | torchvision
7 |
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import cv2
3 | import glob
4 | import matplotlib
5 | import numpy as np
6 | import os
7 | import torch
8 |
9 | from depth_anything_v2.dpt import DepthAnythingV2
10 |
11 |
12 | if __name__ == '__main__':
13 | parser = argparse.ArgumentParser(description='Depth Anything V2')
14 |
15 | parser.add_argument('--img-path', type=str)
16 | parser.add_argument('--input-size', type=int, default=518)
17 | parser.add_argument('--outdir', type=str, default='./vis_depth')
18 |
19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg'])
20 |
21 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
22 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')
23 |
24 | args = parser.parse_args()
25 |
26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
27 |
28 | model_configs = {
29 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
30 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
31 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
32 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
33 | }
34 |
35 | depth_anything = DepthAnythingV2(**model_configs[args.encoder])
36 | depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu'))
37 | depth_anything = depth_anything.to(DEVICE).eval()
38 |
39 | if os.path.isfile(args.img_path):
40 | if args.img_path.endswith('txt'):
41 | with open(args.img_path, 'r') as f:
42 | filenames = f.read().splitlines()
43 | else:
44 | filenames = [args.img_path]
45 | else:
46 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True)
47 |
48 | os.makedirs(args.outdir, exist_ok=True)
49 |
50 | cmap = matplotlib.colormaps.get_cmap('Spectral_r')
51 |
52 | for k, filename in enumerate(filenames):
53 | print(f'Progress {k+1}/{len(filenames)}: {filename}')
54 |
55 | raw_image = cv2.imread(filename)
56 |
57 | depth = depth_anything.infer_image(raw_image, args.input_size)
58 |
59 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
60 | depth = depth.astype(np.uint8)
61 |
62 | if args.grayscale:
63 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
64 | else:
65 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)
66 |
67 | if args.pred_only:
68 | cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), depth)
69 | else:
70 | split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255
71 | combined_result = cv2.hconcat([raw_image, split_region, depth])
72 |
73 | cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), combined_result)
--------------------------------------------------------------------------------
/thirdparty/depth_anything_v2/run_video.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import cv2
3 | import glob
4 | import matplotlib
5 | import numpy as np
6 | import os
7 | import torch
8 |
9 | from depth_anything_v2.dpt import DepthAnythingV2
10 |
11 |
12 | if __name__ == '__main__':
13 | parser = argparse.ArgumentParser(description='Depth Anything V2')
14 |
15 | parser.add_argument('--video-path', type=str)
16 | parser.add_argument('--input-size', type=int, default=518)
17 | parser.add_argument('--outdir', type=str, default='./vis_video_depth')
18 |
19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg'])
20 |
21 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
22 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')
23 |
24 | args = parser.parse_args()
25 |
26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
27 |
28 | model_configs = {
29 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
30 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
31 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
32 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
33 | }
34 |
35 | depth_anything = DepthAnythingV2(**model_configs[args.encoder])
36 | depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu'))
37 | depth_anything = depth_anything.to(DEVICE).eval()
38 |
39 | if os.path.isfile(args.video_path):
40 | if args.video_path.endswith('txt'):
41 | with open(args.video_path, 'r') as f:
42 | filenames = f.read().splitlines()
43 | else:
44 | filenames = [args.video_path]
45 | else:
46 | filenames = glob.glob(os.path.join(args.video_path, '**/*'), recursive=True)
47 |
48 | os.makedirs(args.outdir, exist_ok=True)
49 |
50 | margin_width = 50
51 | cmap = matplotlib.colormaps.get_cmap('Spectral_r')
52 |
53 | for k, filename in enumerate(filenames):
54 | print(f'Progress {k+1}/{len(filenames)}: {filename}')
55 |
56 | raw_video = cv2.VideoCapture(filename)
57 | frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
58 | frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS))
59 |
60 | if args.pred_only:
61 | output_width = frame_width
62 | else:
63 | output_width = frame_width * 2 + margin_width
64 |
65 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.mp4')
66 | out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (output_width, frame_height))
67 |
68 | while raw_video.isOpened():
69 | ret, raw_frame = raw_video.read()
70 | if not ret:
71 | break
72 |
73 | depth = depth_anything.infer_image(raw_frame, args.input_size)
74 |
75 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
76 | depth = depth.astype(np.uint8)
77 |
78 | if args.grayscale:
79 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
80 | else:
81 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)
82 |
83 | if args.pred_only:
84 | out.write(depth)
85 | else:
86 | split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255
87 | combined_frame = cv2.hconcat([raw_frame, split_region, depth])
88 |
89 | out.write(combined_frame)
90 |
91 | raw_video.release()
92 | out.release()
93 |
--------------------------------------------------------------------------------
/thirdparty/gaussian_splatting/LICENSE.md:
--------------------------------------------------------------------------------
1 | Gaussian-Splatting License
2 | ===========================
3 |
4 | **Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**.
5 | The *Software* is in the process of being registered with the Agence pour la Protection des
6 | Programmes (APP).
7 |
8 | The *Software* is still being developed by the *Licensor*.
9 |
10 | *Licensor*'s goal is to allow the research community to use, test and evaluate
11 | the *Software*.
12 |
13 | ## 1. Definitions
14 |
15 | *Licensee* means any person or entity that uses the *Software* and distributes
16 | its *Work*.
17 |
18 | *Licensor* means the owners of the *Software*, i.e Inria and MPII
19 |
20 | *Software* means the original work of authorship made available under this
21 | License ie gaussian-splatting.
22 |
23 | *Work* means the *Software* and any additions to or derivative works of the
24 | *Software* that are made available under this License.
25 |
26 |
27 | ## 2. Purpose
28 | This license is intended to define the rights granted to the *Licensee* by
29 | Licensors under the *Software*.
30 |
31 | ## 3. Rights granted
32 |
33 | For the above reasons Licensors have decided to distribute the *Software*.
34 | Licensors grant non-exclusive rights to use the *Software* for research purposes
35 | to research users (both academic and industrial), free of charge, without right
36 | to sublicense.. The *Software* may be used "non-commercially", i.e., for research
37 | and/or evaluation purposes only.
38 |
39 | Subject to the terms and conditions of this License, you are granted a
40 | non-exclusive, royalty-free, license to reproduce, prepare derivative works of,
41 | publicly display, publicly perform and distribute its *Work* and any resulting
42 | derivative works in any form.
43 |
44 | ## 4. Limitations
45 |
46 | **4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do
47 | so under this License, (b) you include a complete copy of this License with
48 | your distribution, and (c) you retain without modification any copyright,
49 | patent, trademark, or attribution notices that are present in the *Work*.
50 |
51 | **4.2 Derivative Works.** You may specify that additional or different terms apply
52 | to the use, reproduction, and distribution of your derivative works of the *Work*
53 | ("Your Terms") only if (a) Your Terms provide that the use limitation in
54 | Section 2 applies to your derivative works, and (b) you identify the specific
55 | derivative works that are subject to Your Terms. Notwithstanding Your Terms,
56 | this License (including the redistribution requirements in Section 3.1) will
57 | continue to apply to the *Work* itself.
58 |
59 | **4.3** Any other use without of prior consent of Licensors is prohibited. Research
60 | users explicitly acknowledge having received from Licensors all information
61 | allowing to appreciate the adequacy between of the *Software* and their needs and
62 | to undertake all necessary precautions for its execution and use.
63 |
64 | **4.4** The *Software* is provided both as a compiled library file and as source
65 | code. In case of using the *Software* for a publication or other results obtained
66 | through the use of the *Software*, users are strongly encouraged to cite the
67 | corresponding publications as explained in the documentation of the *Software*.
68 |
69 | ## 5. Disclaimer
70 |
71 | THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES
72 | WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY
73 | UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL
74 | CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES
75 | OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL
76 | USE, PROFESSIONNAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR
77 | ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE
78 | AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
79 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
80 | GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION)
81 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
82 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR
83 | IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*.
84 |
85 | ## 6. Files subject to permissive licenses
86 | The contents of the file ```utils/loss_utils.py``` are based on publicly available code authored by Evan Su, which falls under the permissive MIT license.
87 |
88 | Title: pytorch-ssim\
89 | Project code: https://github.com/Po-Hsun-Su/pytorch-ssim\
90 | Copyright Evan Su, 2017\
91 | License: https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/LICENSE.txt (MIT)
--------------------------------------------------------------------------------
/thirdparty/gaussian_splatting/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/gaussian_splatting/__init__.py
--------------------------------------------------------------------------------
/thirdparty/gaussian_splatting/gaussian_renderer/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2023, Inria
3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | # All rights reserved.
5 | #
6 | # This software is free for non-commercial, research and evaluation use
7 | # under the terms of the LICENSE.md file.
8 | #
9 | # For inquiries contact george.drettakis@inria.fr
10 | #
11 |
12 | import math
13 |
14 | import torch
15 | from diff_gaussian_rasterization import (
16 | GaussianRasterizationSettings,
17 | GaussianRasterizer,
18 | )
19 |
20 | from thirdparty.gaussian_splatting.scene.gaussian_model import GaussianModel
21 | from thirdparty.gaussian_splatting.utils.sh_utils import eval_sh
22 |
23 |
24 | def render(
25 | viewpoint_camera,
26 | pc: GaussianModel,
27 | pipe,
28 | bg_color: torch.Tensor,
29 | scaling_modifier=1.0,
30 | override_color=None,
31 | mask=None,
32 | ):
33 | """
34 | Render the scene.
35 |
36 | Background tensor (bg_color) must be on GPU!
37 | """
38 |
39 | # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
40 | if pc.get_xyz.shape[0] == 0:
41 | return None
42 |
43 | screenspace_points = (
44 | torch.zeros_like(
45 | pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda"
46 | )
47 | + 0
48 | )
49 | try:
50 | screenspace_points.retain_grad()
51 | except Exception:
52 | pass
53 |
54 | # Set up rasterization configuration
55 | tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
56 | tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
57 |
58 | raster_settings = GaussianRasterizationSettings(
59 | image_height=int(viewpoint_camera.image_height),
60 | image_width=int(viewpoint_camera.image_width),
61 | tanfovx=tanfovx,
62 | tanfovy=tanfovy,
63 | bg=bg_color,
64 | scale_modifier=scaling_modifier,
65 | viewmatrix=viewpoint_camera.world_view_transform,
66 | projmatrix=viewpoint_camera.full_proj_transform,
67 | projmatrix_raw=viewpoint_camera.projection_matrix,
68 | sh_degree=pc.active_sh_degree,
69 | campos=viewpoint_camera.camera_center,
70 | prefiltered=False,
71 | debug=False,
72 | )
73 |
74 | rasterizer = GaussianRasterizer(raster_settings=raster_settings)
75 |
76 | means3D = pc.get_xyz
77 | means2D = screenspace_points
78 | opacity = pc.get_opacity
79 |
80 | # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
81 | # scaling / rotation by the rasterizer.
82 | scales = None
83 | rotations = None
84 | cov3D_precomp = None
85 | if pipe.compute_cov3D_python:
86 | cov3D_precomp = pc.get_covariance(scaling_modifier)
87 | else:
88 | # check if the covariance is isotropic
89 | if pc.get_scaling.shape[-1] == 1:
90 | scales = pc.get_scaling.repeat(1, 3)
91 | else:
92 | scales = pc.get_scaling
93 | rotations = pc.get_rotation
94 |
95 | # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
96 | # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
97 | shs = None
98 | colors_precomp = None
99 | if colors_precomp is None:
100 | if pipe.convert_SHs_python:
101 | shs_view = pc.get_features.transpose(1, 2).view(
102 | -1, 3, (pc.max_sh_degree + 1) ** 2
103 | )
104 | dir_pp = pc.get_xyz - viewpoint_camera.camera_center.repeat(
105 | pc.get_features.shape[0], 1
106 | )
107 | dir_pp_normalized = dir_pp / dir_pp.norm(dim=1, keepdim=True)
108 | sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
109 | colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
110 | else:
111 | shs = pc.get_features
112 | else:
113 | colors_precomp = override_color
114 |
115 | # Rasterize visible Gaussians to image, obtain their radii (on screen).
116 | if mask is not None:
117 | rendered_image, radii, depth, opacity = rasterizer(
118 | means3D=means3D[mask],
119 | means2D=means2D[mask],
120 | shs=shs[mask],
121 | colors_precomp=colors_precomp[mask] if colors_precomp is not None else None,
122 | opacities=opacity[mask],
123 | scales=scales[mask],
124 | rotations=rotations[mask],
125 | cov3D_precomp=cov3D_precomp[mask] if cov3D_precomp is not None else None,
126 | theta=viewpoint_camera.cam_rot_delta,
127 | rho=viewpoint_camera.cam_trans_delta,
128 | )
129 | else:
130 | rendered_image, radii, depth, opacity, n_touched = rasterizer(
131 | means3D=means3D,
132 | means2D=means2D,
133 | shs=shs,
134 | colors_precomp=colors_precomp,
135 | opacities=opacity,
136 | scales=scales,
137 | rotations=rotations,
138 | cov3D_precomp=cov3D_precomp,
139 | theta=viewpoint_camera.cam_rot_delta,
140 | rho=viewpoint_camera.cam_trans_delta,
141 | )
142 |
143 | # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
144 | # They will be excluded from value updates used in the splitting criteria.
145 | return {
146 | "render": rendered_image,
147 | "viewspace_points": screenspace_points,
148 | "visibility_filter": radii > 0,
149 | "radii": radii,
150 | "depth": depth,
151 | "opacity": opacity,
152 | "n_touched": n_touched,
153 | }
154 |
--------------------------------------------------------------------------------
/thirdparty/gaussian_splatting/utils/graphics_utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2023, Inria
3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | # All rights reserved.
5 | #
6 | # This software is free for non-commercial, research and evaluation use
7 | # under the terms of the LICENSE.md file.
8 | #
9 | # For inquiries contact george.drettakis@inria.fr
10 | #
11 |
12 | import math
13 | from typing import NamedTuple
14 |
15 | import numpy as np
16 | import torch
17 |
18 |
19 | class BasicPointCloud(NamedTuple):
20 | points: np.array
21 | colors: np.array
22 | normals: np.array
23 |
24 |
25 | def getWorld2View(R, t):
26 | Rt = np.zeros((4, 4))
27 | Rt[:3, :3] = R.transpose()
28 | Rt[:3, 3] = t
29 | Rt[3, 3] = 1.0
30 | return np.float32(Rt)
31 |
32 |
33 | def getWorld2View2(R, t, translate=torch.tensor([0.0, 0.0, 0.0]), scale=1.0):
34 | translate = translate.to(R.device)
35 | Rt = torch.zeros((4, 4), device=R.device)
36 | # Rt[:3, :3] = R.transpose()
37 | Rt[:3, :3] = R
38 | Rt[:3, 3] = t
39 | Rt[3, 3] = 1.0
40 |
41 | C2W = torch.linalg.inv(Rt)
42 | cam_center = C2W[:3, 3]
43 | cam_center = (cam_center + translate) * scale
44 | C2W[:3, 3] = cam_center
45 | Rt = torch.linalg.inv(C2W)
46 | return Rt
47 |
48 |
49 | def getProjectionMatrix(znear, zfar, fovX, fovY):
50 | tanHalfFovY = math.tan((fovY / 2))
51 | tanHalfFovX = math.tan((fovX / 2))
52 |
53 | top = tanHalfFovY * znear
54 | bottom = -top
55 | right = tanHalfFovX * znear
56 | left = -right
57 |
58 | P = torch.zeros(4, 4)
59 |
60 | z_sign = 1.0
61 |
62 | P[0, 0] = 2.0 * znear / (right - left)
63 | P[1, 1] = 2.0 * znear / (top - bottom)
64 | P[0, 2] = (right + left) / (right - left)
65 | P[1, 2] = (top + bottom) / (top - bottom)
66 | P[3, 2] = z_sign
67 | P[2, 2] = -(zfar + znear) / (zfar - znear)
68 | P[2, 3] = -2 * (zfar * znear) / (zfar - znear)
69 | return P
70 |
71 |
72 | def getProjectionMatrix2(znear, zfar, cx, cy, fx, fy, W, H):
73 | left = ((2 * cx - W) / W - 1.0) * W / 2.0
74 | right = ((2 * cx - W) / W + 1.0) * W / 2.0
75 | top = ((2 * cy - H) / H + 1.0) * H / 2.0
76 | bottom = ((2 * cy - H) / H - 1.0) * H / 2.0
77 | left = znear / fx * left
78 | right = znear / fx * right
79 | top = znear / fy * top
80 | bottom = znear / fy * bottom
81 | P = torch.zeros(4, 4)
82 |
83 | z_sign = 1.0
84 |
85 | P[0, 0] = 2.0 * znear / (right - left)
86 | P[1, 1] = 2.0 * znear / (top - bottom)
87 | P[0, 2] = (right + left) / (right - left)
88 | P[1, 2] = (top + bottom) / (top - bottom)
89 | P[3, 2] = z_sign
90 | P[2, 2] = z_sign * zfar / (zfar - znear)
91 | P[2, 3] = -(zfar * znear) / (zfar - znear)
92 |
93 | return P
94 |
95 |
96 | def fov2focal(fov, pixels):
97 | return pixels / (2 * math.tan(fov / 2))
98 |
99 |
100 | def focal2fov(focal, pixels):
101 | return 2 * math.atan(pixels / (2 * focal))
102 |
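A small sketch (not part of the upstream file) of the focal-length/FoV helpers and the two projection-matrix builders above; the round trip fov2focal(focal2fov(f, p), p) recovers the focal length:

    # Illustrative only: exercising the helpers defined above.
    import math

    W, H = 640, 480
    fx = fy = 525.0
    fovx = focal2fov(fx, W)                       # horizontal FoV in radians
    fovy = focal2fov(fy, H)
    assert math.isclose(fov2focal(fovx, W), fx, rel_tol=1e-6)   # round trip

    P = getProjectionMatrix(znear=0.01, zfar=100.0, fovX=fovx, fovY=fovy)
    P2 = getProjectionMatrix2(znear=0.01, zfar=100.0, cx=W / 2, cy=H / 2,
                              fx=fx, fy=fy, W=W, H=H)
    print(P.shape, P2.shape)                      # two 4x4 torch tensors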
--------------------------------------------------------------------------------
/thirdparty/gaussian_splatting/utils/image_utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2023, Inria
3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | # All rights reserved.
5 | #
6 | # This software is free for non-commercial, research and evaluation use
7 | # under the terms of the LICENSE.md file.
8 | #
9 | # For inquiries contact george.drettakis@inria.fr
10 | #
11 |
12 | import torch
13 |
14 |
15 | def mse(img1, img2):
16 | return ((img1 - img2) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
17 |
18 |
19 | def psnr(img1, img2):
20 | mse = ((img1 - img2) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
21 | return 20 * torch.log10(1.0 / torch.sqrt(mse))
22 |
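A toy sketch (not part of the upstream file) for mse and psnr above, which assume images in [0, 1] with a leading image dimension:

    # Illustrative only: mse and psnr are the functions defined above.
    import torch

    img = torch.rand(1, 3, 64, 64)
    noisy = (img + 0.01 * torch.randn_like(img)).clamp(0.0, 1.0)

    print(mse(img, noisy).shape)                  # torch.Size([1, 1]), per-image MSE
    print(psnr(img, noisy))                       # roughly 40 dB for sigma = 0.01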
--------------------------------------------------------------------------------
/thirdparty/gaussian_splatting/utils/loss_utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2023, Inria
3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | # All rights reserved.
5 | #
6 | # This software is free for non-commercial, research and evaluation use
7 | # under the terms of the LICENSE.md file.
8 | #
9 | # For inquiries contact george.drettakis@inria.fr
10 | #
11 |
12 | from math import exp
13 |
14 | import cv2
15 | import numpy as np
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Variable
19 |
20 |
21 | def l1_loss(network_output, gt):
22 | return torch.abs((network_output - gt)).mean()
23 |
24 |
25 | def l1_loss_weight(network_output, gt):
26 | image = gt.detach().cpu().numpy().transpose((1, 2, 0))
27 | rgb_raw_gray = np.dot(image[..., :3], [0.2989, 0.5870, 0.1140])
28 | sobelx = cv2.Sobel(rgb_raw_gray, cv2.CV_64F, 1, 0, ksize=5)
29 | sobely = cv2.Sobel(rgb_raw_gray, cv2.CV_64F, 0, 1, ksize=5)
30 | sobel_merge = np.sqrt(sobelx * sobelx + sobely * sobely) + 1e-10
31 | sobel_merge = np.exp(sobel_merge)
32 | sobel_merge /= np.max(sobel_merge)
33 | sobel_merge = torch.from_numpy(sobel_merge)[None, ...].to(gt.device)
34 |
35 | return torch.abs((network_output - gt) * sobel_merge).mean()
36 |
37 |
38 | def l2_loss(network_output, gt):
39 | return ((network_output - gt) ** 2).mean()
40 |
41 |
42 | def gaussian(window_size, sigma):
43 | gauss = torch.Tensor(
44 | [
45 | exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2))
46 | for x in range(window_size)
47 | ]
48 | )
49 | return gauss / gauss.sum()
50 |
51 |
52 | def create_window(window_size, channel):
53 | _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
54 | _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
55 | window = Variable(
56 | _2D_window.expand(channel, 1, window_size, window_size).contiguous()
57 | )
58 | return window
59 |
60 |
61 | def ssim(img1, img2, window_size=11, size_average=True):
62 | channel = img1.size(-3)
63 | window = create_window(window_size, channel)
64 |
65 | if img1.is_cuda:
66 | window = window.cuda(img1.get_device())
67 | window = window.type_as(img1)
68 |
69 | return _ssim(img1, img2, window, window_size, channel, size_average)
70 |
71 |
72 | def _ssim(img1, img2, window, window_size, channel, size_average=True):
73 | mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
74 | mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
75 |
76 | mu1_sq = mu1.pow(2)
77 | mu2_sq = mu2.pow(2)
78 | mu1_mu2 = mu1 * mu2
79 |
80 | sigma1_sq = (
81 | F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
82 | )
83 | sigma2_sq = (
84 | F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
85 | )
86 | sigma12 = (
87 | F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel)
88 | - mu1_mu2
89 | )
90 |
91 | C1 = 0.01**2
92 | C2 = 0.03**2
93 |
94 | ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
95 | (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
96 | )
97 |
98 | if size_average:
99 | return ssim_map.mean()
100 | else:
101 | return ssim_map.mean(1).mean(1).mean(1)
102 |
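A toy sketch (not part of the upstream file) for the ssim helper above on CPU tensors; identical inputs give a value of (numerically) 1.0:

    # Illustrative only: ssim is the helper defined above.
    import torch

    img1 = torch.rand(1, 3, 64, 64)               # (B, C, H, W), values in [0, 1]
    img2 = img1.clone()

    print(ssim(img1, img2).item())                # ~1.0 for identical images
    print(ssim(img1, torch.rand_like(img1)).item())   # much lower for unrelated noise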
--------------------------------------------------------------------------------
/thirdparty/gaussian_splatting/utils/sh_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The PlenOctree Authors.
2 | # Redistribution and use in source and binary forms, with or without
3 | # modification, are permitted provided that the following conditions are met:
4 | #
5 | # 1. Redistributions of source code must retain the above copyright notice,
6 | # this list of conditions and the following disclaimer.
7 | #
8 | # 2. Redistributions in binary form must reproduce the above copyright notice,
9 | # this list of conditions and the following disclaimer in the documentation
10 | # and/or other materials provided with the distribution.
11 | #
12 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
16 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22 | # POSSIBILITY OF SUCH DAMAGE.
23 |
24 | C0 = 0.28209479177387814
25 | C1 = 0.4886025119029199
26 | C2 = [
27 | 1.0925484305920792,
28 | -1.0925484305920792,
29 | 0.31539156525252005,
30 | -1.0925484305920792,
31 | 0.5462742152960396,
32 | ]
33 | C3 = [
34 | -0.5900435899266435,
35 | 2.890611442640554,
36 | -0.4570457994644658,
37 | 0.3731763325901154,
38 | -0.4570457994644658,
39 | 1.445305721320277,
40 | -0.5900435899266435,
41 | ]
42 | C4 = [
43 | 2.5033429417967046,
44 | -1.7701307697799304,
45 | 0.9461746957575601,
46 | -0.6690465435572892,
47 | 0.10578554691520431,
48 | -0.6690465435572892,
49 | 0.47308734787878004,
50 | -1.7701307697799304,
51 | 0.6258357354491761,
52 | ]
53 |
54 |
55 | def eval_sh(deg, sh, dirs):
56 | """
57 | Evaluate spherical harmonics at unit directions
58 | using hardcoded SH polynomials.
59 | Works with torch/np/jnp.
60 | ... Can be 0 or more batch dimensions.
61 | Args:
62 | deg: int SH deg. Currently, 0-3 supported
63 | sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
64 | dirs: jnp.ndarray unit directions [..., 3]
65 | Returns:
66 | [..., C]
67 | """
68 | assert deg <= 4 and deg >= 0
69 | coeff = (deg + 1) ** 2
70 | assert sh.shape[-1] >= coeff
71 |
72 | result = C0 * sh[..., 0]
73 | if deg > 0:
74 | x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
75 | result = (
76 | result - C1 * y * sh[..., 1] + C1 * z * sh[..., 2] - C1 * x * sh[..., 3]
77 | )
78 |
79 | if deg > 1:
80 | xx, yy, zz = x * x, y * y, z * z
81 | xy, yz, xz = x * y, y * z, x * z
82 | result = (
83 | result
84 | + C2[0] * xy * sh[..., 4]
85 | + C2[1] * yz * sh[..., 5]
86 | + C2[2] * (2.0 * zz - xx - yy) * sh[..., 6]
87 | + C2[3] * xz * sh[..., 7]
88 | + C2[4] * (xx - yy) * sh[..., 8]
89 | )
90 |
91 | if deg > 2:
92 | result = (
93 | result
94 | + C3[0] * y * (3 * xx - yy) * sh[..., 9]
95 | + C3[1] * xy * z * sh[..., 10]
96 | + C3[2] * y * (4 * zz - xx - yy) * sh[..., 11]
97 | + C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12]
98 | + C3[4] * x * (4 * zz - xx - yy) * sh[..., 13]
99 | + C3[5] * z * (xx - yy) * sh[..., 14]
100 | + C3[6] * x * (xx - 3 * yy) * sh[..., 15]
101 | )
102 |
103 | if deg > 3:
104 | result = (
105 | result
106 | + C4[0] * xy * (xx - yy) * sh[..., 16]
107 | + C4[1] * yz * (3 * xx - yy) * sh[..., 17]
108 | + C4[2] * xy * (7 * zz - 1) * sh[..., 18]
109 | + C4[3] * yz * (7 * zz - 3) * sh[..., 19]
110 | + C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20]
111 | + C4[5] * xz * (7 * zz - 3) * sh[..., 21]
112 | + C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22]
113 | + C4[7] * xz * (xx - 3 * yy) * sh[..., 23]
114 | + C4[8]
115 | * (xx * (xx - 3 * yy) - yy * (3 * xx - yy))
116 | * sh[..., 24]
117 | )
118 | return result
119 |
120 |
121 | def RGB2SH(rgb):
122 | return (rgb - 0.5) / C0
123 |
124 |
125 | def SH2RGB(sh):
126 | return sh * C0 + 0.5
127 |
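A small sketch (not part of the upstream file) of the RGB2SH/SH2RGB round trip and a degree-0 eval_sh call, where the view directions are ignored and the result is just C0 * sh[..., 0]:

    # Illustrative only: RGB2SH, SH2RGB, eval_sh and C0 are defined above.
    import torch

    rgb = torch.rand(5, 3)
    assert torch.allclose(SH2RGB(RGB2SH(rgb)), rgb)   # inverse up to float error

    sh = torch.rand(5, 3, 1)                      # [..., C, (deg + 1) ** 2], deg = 0
    dirs = torch.randn(5, 3)
    dirs = dirs / dirs.norm(dim=-1, keepdim=True) # unit view directions
    out = eval_sh(0, sh, dirs)                    # degree 0 ignores the directions
    assert torch.allclose(out, C0 * sh[..., 0])   # shape (5, 3)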
--------------------------------------------------------------------------------
/thirdparty/gaussian_splatting/utils/system_utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2023, Inria
3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | # All rights reserved.
5 | #
6 | # This software is free for non-commercial, research and evaluation use
7 | # under the terms of the LICENSE.md file.
8 | #
9 | # For inquiries contact george.drettakis@inria.fr
10 | #
11 |
12 | import os
13 | from errno import EEXIST
14 | from os import makedirs, path
15 |
16 |
17 | def mkdir_p(folder_path):
18 | # Creates a directory. equivalent to using mkdir -p on the command line
19 | try:
20 | makedirs(folder_path)
21 | except OSError as exc: # Python >2.5
22 | if exc.errno == EEXIST and path.isdir(folder_path):
23 | pass
24 | else:
25 | raise
26 |
27 |
28 | def searchForMaxIteration(folder):
29 | saved_iters = [int(fname.split("_")[-1]) for fname in os.listdir(folder)]
30 | return max(saved_iters)
31 |
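A short sketch (not part of the upstream file) exercising mkdir_p and searchForMaxIteration above in a temporary directory; the latter parses the trailing "_<iteration>" suffix of each entry name:

    # Illustrative only: mkdir_p and searchForMaxIteration are defined above.
    import os
    import tempfile

    root = tempfile.mkdtemp()
    ckpt_dir = os.path.join(root, "point_cloud")
    mkdir_p(ckpt_dir)
    mkdir_p(ckpt_dir)                             # idempotent: EEXIST is swallowed

    for it in (1000, 7000, 30000):                # names end in "_<iteration>"
        mkdir_p(os.path.join(ckpt_dir, f"iteration_{it}"))
    print(searchForMaxIteration(ckpt_dir))        # 30000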
--------------------------------------------------------------------------------