├── .gitignore ├── .gitmodules ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── configs ├── Custom │ └── custom_template.yaml ├── Dynamic │ ├── Bonn │ │ ├── bonn_balloon.yaml │ │ ├── bonn_balloon2.yaml │ │ ├── bonn_crowd.yaml │ │ ├── bonn_crowd2.yaml │ │ ├── bonn_dynamic.yaml │ │ ├── bonn_moving_nonobstructing_box.yaml │ │ ├── bonn_moving_nonobstructing_box2.yaml │ │ ├── bonn_person_tracking.yaml │ │ └── bonn_person_tracking2.yaml │ ├── TUM_RGBD │ │ ├── freiburg2_desk_with_person.yaml │ │ ├── freiburg3_sitting_halfsphere.yaml │ │ ├── freiburg3_sitting_halfsphere_static.yaml │ │ ├── freiburg3_sitting_rpy.yaml │ │ ├── freiburg3_sitting_xyz.yaml │ │ ├── freiburg3_walking_halfsphere.yaml │ │ ├── freiburg3_walking_halfsphere_static.yaml │ │ ├── freiburg3_walking_rpy.yaml │ │ ├── freiburg3_walking_xyz.yaml │ │ └── tum_dynamic.yaml │ ├── Wild_SLAM_Mocap │ │ ├── ANYmal1.yaml │ │ ├── ANYmal2.yaml │ │ ├── ball.yaml │ │ ├── crowd.yaml │ │ ├── crowd_demo.yaml │ │ ├── person_tracking.yaml │ │ ├── racket.yaml │ │ ├── stones.yaml │ │ ├── table_tracking1.yaml │ │ ├── table_tracking2.yaml │ │ ├── umbrella.yaml │ │ └── wild_slam_mocap.yaml │ └── Wild_SLAM_iPhone │ │ ├── horse.yaml │ │ ├── parking.yaml │ │ ├── piano.yaml │ │ ├── shopping.yaml │ │ ├── street.yaml │ │ ├── tower.yaml │ │ └── wild_slam_iphone.yaml ├── Static │ └── TUM_RGBD │ │ ├── freiburg1_desk.yaml │ │ ├── freiburg2_xyz.yaml │ │ ├── freiburg3_office.yaml │ │ └── tum.yaml └── wildgs_slam.yaml ├── media └── teaser.png ├── requirements.txt ├── run.py ├── scripts_downloading ├── download_bonn.sh ├── download_demo_data.sh ├── download_tum.sh ├── download_wild_slam_iphone.sh ├── download_wild_slam_mocap_scene1.sh └── download_wild_slam_mocap_scene2.sh ├── scripts_run ├── run_bonn_all.sh ├── run_tum_dynamic_all.sh ├── run_wild_slam_mocap_all.sh └── summarize_pose_eval.py ├── setup.py ├── src ├── __init__.py ├── backend.py ├── config.py ├── depth_video.py ├── factor_graph.py ├── frontend.py ├── geom │ ├── __init__.py │ ├── ba.py │ ├── chol.py │ └── projective_ops.py ├── gui │ ├── gl_render │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── render_ogl.py │ │ ├── shaders │ │ │ ├── gau_frag.glsl │ │ │ └── gau_vert.glsl │ │ ├── util.py │ │ └── util_gau.py │ ├── gui_utils.py │ └── slam_gui.py ├── lib │ ├── altcorr_kernel.cu │ ├── correlation_kernels.cu │ ├── droid.cpp │ └── droid_kernels.cu ├── mapper.py ├── modules │ └── droid_net │ │ ├── __init__.py │ │ ├── clipping.py │ │ ├── corr.py │ │ ├── droid_net.py │ │ ├── extractor.py │ │ └── gru.py ├── motion_filter.py ├── slam.py ├── tracker.py ├── trajectory_filler.py └── utils │ ├── Printer.py │ ├── camera_utils.py │ ├── common.py │ ├── datasets.py │ ├── dyn_uncertainty │ ├── __init__.py │ ├── mapping_utils.py │ ├── median_filter.py │ └── uncertainty_model.py │ ├── eval_traj.py │ ├── eval_utils.py │ ├── mono_priors │ ├── img_feature_extractors.py │ └── metric_depth_estimators.py │ ├── plot_utils.py │ ├── pose_utils.py │ └── slam_utils.py └── thirdparty ├── __init__.py ├── depth_anything_v2 ├── DA-2K.md ├── LICENSE ├── README.md ├── app.py ├── assets │ ├── DA-2K.png │ ├── examples │ │ ├── demo01.jpg │ │ ├── demo02.jpg │ │ ├── demo03.jpg │ │ ├── demo04.jpg │ │ ├── demo05.jpg │ │ ├── demo06.jpg │ │ ├── demo07.jpg │ │ ├── demo08.jpg │ │ ├── demo09.jpg │ │ ├── demo10.jpg │ │ ├── demo11.jpg │ │ ├── demo12.jpg │ │ ├── demo13.jpg │ │ ├── demo14.jpg │ │ ├── demo15.jpg │ │ ├── demo16.jpg │ │ ├── demo17.jpg │ │ ├── demo18.jpg │ │ ├── demo19.jpg │ │ └── demo20.jpg │ ├── examples_video │ │ ├── basketball.mp4 │ │ └── 
ferris_wheel.mp4 │ └── teaser.png ├── depth_anything_v2 │ ├── dinov2.py │ ├── dinov2_layers │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── block.py │ │ ├── drop_path.py │ │ ├── layer_scale.py │ │ ├── mlp.py │ │ ├── patch_embed.py │ │ └── swiglu_ffn.py │ ├── dpt.py │ └── util │ │ ├── blocks.py │ │ └── transform.py ├── metric_depth │ ├── README.md │ ├── assets │ │ └── compare_zoedepth.png │ ├── dataset │ │ ├── hypersim.py │ │ ├── kitti.py │ │ ├── splits │ │ │ ├── hypersim │ │ │ │ ├── train.txt │ │ │ │ └── val.txt │ │ │ ├── kitti │ │ │ │ └── val.txt │ │ │ └── vkitti2 │ │ │ │ └── train.txt │ │ ├── transform.py │ │ └── vkitti2.py │ ├── depth_anything_v2 │ │ ├── dinov2.py │ │ ├── dinov2_layers │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── drop_path.py │ │ │ ├── layer_scale.py │ │ │ ├── mlp.py │ │ │ ├── patch_embed.py │ │ │ └── swiglu_ffn.py │ │ ├── dpt.py │ │ └── util │ │ │ ├── blocks.py │ │ │ └── transform.py │ ├── depth_to_pointcloud.py │ ├── dist_train.sh │ ├── requirements.txt │ ├── run.py │ ├── train.py │ └── util │ │ ├── dist_helper.py │ │ ├── loss.py │ │ ├── metric.py │ │ └── utils.py ├── requirements.txt ├── run.py └── run_video.py └── gaussian_splatting ├── LICENSE.md ├── __init__.py ├── gaussian_renderer └── __init__.py ├── scene └── gaussian_model.py └── utils ├── general_utils.py ├── graphics_utils.py ├── image_utils.py ├── loss_utils.py ├── sh_utils.py └── system_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | build/ 3 | *.egg-info/ 4 | 5 | __pycache__/ 6 | *.pyc 7 | *.so 8 | 9 | 10 | pretrained/ 11 | 12 | output*/ 13 | 14 | .vscode/ 15 | 16 | temp/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/evaluate_3d_reconstruction_lib"] 2 | path = thirdparty/evaluate_3d_reconstruction_lib 3 | url = https://github.com/eriksandstroem/evaluate_3d_reconstruction_lib.git 4 | [submodule "thirdparty/lietorch"] 5 | path = thirdparty/lietorch 6 | url = https://github.com/princeton-vl/lietorch.git 7 | [submodule "thirdparty/diff-gaussian-rasterization-w-pose"] 8 | path = thirdparty/diff-gaussian-rasterization-w-pose 9 | url = https://github.com/rmurai0610/diff-gaussian-rasterization-w-pose.git 10 | [submodule "thirdparty/simple-knn"] 11 | path = thirdparty/simple-knn 12 | url = https://github.com/camenduru/simple-knn.git 13 | [submodule "thirdparty/eigen"] 14 | path = thirdparty/eigen 15 | url = https://gitlab.com/libeigen/eigen.git 16 | [submodule "thirdparty/fit3d"] 17 | path = thirdparty/fit3d 18 | url = git@github.com:ywyue/FiT3D.git 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We'd love to accept your patches and contributions to this project. 4 | 5 | ## Before you begin 6 | 7 | ### Sign our Contributor License Agreement 8 | 9 | Contributions to this project must be accompanied by a 10 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA). 11 | You (or your employer) retain the copyright to your contribution; this simply 12 | gives us permission to use and redistribute your contributions as part of the 13 | project. 
14 | 15 | If you or your current employer have already signed the Google CLA (even if it 16 | was for a different project), you probably don't need to do it again. 17 | 18 | Visit <https://cla.developers.google.com/> to see your current agreements or to 19 | sign a new one. 20 | 21 | ### Review our community guidelines 22 | 23 | This project follows 24 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 25 | 26 | ## Contribution process 27 | 28 | ### Code reviews 29 | 30 | All submissions, including submissions by project members, require review. We 31 | use GitHub pull requests for this purpose. Consult 32 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 33 | information on using pull requests. -------------------------------------------------------------------------------- /configs/Custom/custom_template.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/wildgs_slam.yaml 2 | scene: custom_scene # Replace with your scene name 3 | 4 | dataset: 'wild_slam_iphone' 5 | data: 6 | input_folder: ./datasets/{Path_to_your_data} 7 | output: ./output/Custom 8 | 9 | cam: 10 | H: 1242 11 | W: 2208 12 | H_out: 360 13 | W_out: 480 14 | fx: 1974.4219 15 | fy: 1974.4219 16 | cx: 1134.8486 17 | cy: 655.6515 18 | # H_edge: 0 # Uncomment this and the following line if you have edge cropping like in TUM datasets 19 | # W_edge: 0 20 | # distortion: [0.0, 0.0, 0.0, 0.0, 0.0] # Uncomment if you have distortion coefficients 21 | 22 | mapping: 23 | Training: 24 | alpha: 0.8 # Increase this value to make the rendering loss weight rgb more heavily than depth 25 | uncertainty_params: 26 | # For outdoor datasets where the metric depth estimation is unstable, 27 | # I recommend setting this value to 0.1 or even 0.
28 | uncer_depth_mult: 0.2 29 | 30 | 31 | # # Uncomment the following lines to enable fast mode and GUI 32 | # fast_mode: True 33 | # gui: True 34 | 35 | # # Uncomment the following lines to save online plotting data 36 | # mapping: 37 | # online_plotting: True -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_balloon.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml 2 | scene: bonn_balloon 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_balloon -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_balloon2.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml 2 | scene: bonn_balloon2 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_balloon2 -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_crowd.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml 2 | scene: bonn_crowd 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_crowd -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_crowd2.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml 2 | scene: bonn_crowd2 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_crowd2 -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_dynamic.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/wildgs_slam.yaml 2 | 3 | dataset: 'bonn_dynamic' 4 | 5 | data: 6 | root_folder: ./datasets/Bonn 7 | output: ./output/Bonn 8 | 9 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 10 | H: 480 11 | W: 640 12 | fx: 542.822841 13 | fy: 542.576870 14 | cx: 315.593520 15 | cy: 237.756098 16 | distortion: [0.039903, -0.099343, -0.000730, -0.000144, 0.000000] 17 | H_out: 384 18 | W_out: 512 -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_moving_nonobstructing_box.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml 2 | scene: bonn_moving_nonobstructing_box 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_moving_nonobstructing_box 6 | -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_moving_nonobstructing_box2.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml 2 | scene: bonn_moving_nonobstructing_box2 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_moving_nonobstructing_box2 -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_person_tracking.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml 2 | scene: bonn_person_tracking 3 | 4 | data: 5 | input_folder: 
ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_person_tracking 6 | -------------------------------------------------------------------------------- /configs/Dynamic/Bonn/bonn_person_tracking2.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Bonn/bonn_dynamic.yaml 2 | scene: bonn_person_tracking2 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_bonn_person_tracking2 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg2_desk_with_person.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg2_desk_with_person 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg2_desk_with_person 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 520.9 10 | fy: 521.0 11 | cx: 325.1 12 | cy: 249.7 13 | distortion: [0.2312, -0.7849, -0.0033, -0.0001, 0.9172] 14 | H_edge: 8 15 | W_edge: 8 16 | H_out: 240 17 | W_out: 320 18 | 19 | tracking: 20 | # This sequence is too long 21 | force_keyframe_every_n_frames: -1 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg3_sitting_halfsphere.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg3_sitting_halfsphere 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_sitting_halfsphere 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 535.4 10 | fy: 539.2 11 | cx: 320.1 12 | cy: 247.6 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg3_sitting_halfsphere_static.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg3_sitting_static 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_sitting_static 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 535.4 10 | fy: 539.2 11 | cx: 320.1 12 | cy: 247.6 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg3_sitting_rpy.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg3_sitting_rpy 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_sitting_rpy 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 535.4 10 | fy: 539.2 11 | cx: 320.1 12 | cy: 247.6 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg3_sitting_xyz.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg3_sitting_xyz 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_sitting_xyz 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 535.4 10 | fy: 539.2 11 | cx: 320.1 12 | cy: 247.6 
-------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg3_walking_halfsphere.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg3_walking_halfsphere 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_walking_halfsphere 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 535.4 10 | fy: 539.2 11 | cx: 320.1 12 | cy: 247.6 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg3_walking_halfsphere_static.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg3_walking_static 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_walking_static 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 535.4 10 | fy: 539.2 11 | cx: 320.1 12 | cy: 247.6 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg3_walking_rpy.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg3_walking_rpy 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_walking_rpy 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 535.4 10 | fy: 539.2 11 | cx: 320.1 12 | cy: 247.6 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/freiburg3_walking_xyz.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/TUM_RGBD/tum_dynamic.yaml 2 | scene: freiburg3_walking_xyz 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_walking_xyz 6 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 7 | H: 480 8 | W: 640 9 | fx: 535.4 10 | fy: 539.2 11 | cx: 320.1 12 | cy: 247.6 -------------------------------------------------------------------------------- /configs/Dynamic/TUM_RGBD/tum_dynamic.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/wildgs_slam.yaml 2 | 3 | dataset: 'tumrgbd' 4 | 5 | tracking: 6 | buffer: 350 7 | 8 | # Less weight on the depth loss for TUM 9 | mapping: 10 | Training: 11 | alpha: 0.8 12 | 13 | data: 14 | root_folder: ./datasets/TUM_RGBD 15 | output: ./output/TUM_RGBD 16 | 17 | cam: #NOTE: intrinsic is different per scene in TUM 18 | # refer to https://vision.in.tum.de/data/datasets/rgbd-dataset/file_formats#intrinsic_camera_calibration_of_the_kinect 19 | png_depth_scale: 5000.0 #for depth image in png format 20 | ### target/output camera settings, camera_size -> resize -> crop -> target_size 21 | H: 480 22 | W: 640 23 | fx: 535.4 24 | fy: 539.2 25 | cx: 320.1 26 | cy: 247.6 27 | H_edge: 8 28 | W_edge: 8 29 | H_out: 384 30 | W_out: 512 31 | -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/ANYmal1.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: 
./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: ANYmal1 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene2/ANYmal1 6 | 7 | cam: 8 | fx: 647.7445068359375 9 | fy: 646.9425659179688 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/ANYmal2.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: ANYmal2 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene2/ANYmal2 6 | 7 | cam: 8 | fx: 647.7445068359375 9 | fy: 646.9425659179688 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/ball.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: basketball 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/ball -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/crowd.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: crowd 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/crowd -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/crowd_demo.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: crowd_demo 3 | 4 | fast_mode: True 5 | gui: True 6 | mapping: 7 | online_plotting: True 8 | 9 | data: 10 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/crowd 11 | output: ./output/Wild_SLAM_Mocap_demo -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/person_tracking.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: person_tracking 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/person_tracking 6 | 7 | cam: 8 | fx: 647.5684814453125 9 | fy: 646.766845703125 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/racket.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: racket 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/racket 6 | 7 | cam: 8 | fx: 647.3926391601562 9 | fy: 646.5911254882812 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/stones.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: stones 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/stones 6 | 7 | cam: 8 | fx: 647.7445068359375 9 | fy: 646.9425659179688 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/table_tracking1.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: table_tracking1 3 | 4 | data: 5 | 
input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/table_tracking1 6 | 7 | cam: 8 | fx: 647.9204711914062 9 | fy: 647.1183471679688 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/table_tracking2.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: table_tracking2 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/table_tracking2 6 | 7 | cam: 8 | fx: 647.5684814453125 9 | fy: 646.766845703125 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/umbrella.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml 2 | scene: umbrella 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/scene1/umbrella 6 | 7 | cam: 8 | fx: 647.7445068359375 9 | fy: 646.9425659179688 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_Mocap/wild_slam_mocap.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/wildgs_slam.yaml 2 | 3 | dataset: 'wild_slam_mocap' 4 | 5 | data: 6 | root_folder: ./datasets/Wild_SLAM_Mocap 7 | output: ./output/Wild_SLAM_Mocap 8 | 9 | cam: #intrinsic is slightly different per seq 10 | H: 720 11 | W: 1280 12 | fx: 647.2167358398438 13 | fy: 646.4154663085938 14 | cx: 643.1209716796875 15 | cy: 365.55963134765625 16 | distortion: [-0.0550149604678154, 0.06560786068439484,-0.0005061274860054255,0.0004771310486830771,-0.021717390045523643] 17 | H_out: 360 18 | W_out: 640 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_iPhone/horse.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml 2 | scene: iphone_horse 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/horse 6 | 7 | cam: 8 | fx: 1341.1414794921875 9 | fy: 1341.1414794921875 10 | cx: 960.2431640625 11 | cy: 729.904052734375 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_iPhone/parking.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml 2 | scene: iphone_parking 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/parking 6 | 7 | cam: 8 | fx: 1336.74609375 9 | fy: 1336.74609375 10 | cx: 957.005859375 11 | cy: 726.88409423828125 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_iPhone/piano.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml 2 | scene: iphone_piano 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/piano 6 | 7 | cam: 8 | fx: 1351.06982421875 9 | fy: 1351.06982421875 10 | cx: 961.050537109375 11 | cy: 730.18597412109375 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_iPhone/shopping.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml 2 | scene: iphone_shopping 3 | 4 | data: 5 | 
input_folder: ROOT_FOLDER_PLACEHOLDER/shopping 6 | 7 | cam: 8 | fx: 1340.6441650390625 9 | fy: 1340.6441650390625 10 | cx: 960.7640380859375 11 | cy: 730.26397705078125 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_iPhone/street.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml 2 | scene: iphone_street 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/street 6 | 7 | cam: 8 | fx: 1331.6123046875 9 | fy: 1331.6123046875 10 | cx: 956.61676025390625 11 | cy: 727.839599609375 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_iPhone/tower.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml 2 | scene: iphone_tower 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/tower 6 | 7 | cam: 8 | fx: 1338.494140625 9 | fy: 1338.494140625 10 | cx: 960.17327880859375 11 | cy: 730.55328369140625 -------------------------------------------------------------------------------- /configs/Dynamic/Wild_SLAM_iPhone/wild_slam_iphone.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/wildgs_slam.yaml 2 | 3 | dataset: 'wild_slam_iphone' 4 | 5 | data: 6 | root_folder: ./datasets/Wild_SLAM_iPhone 7 | output: ./output/Wild_SLAM_iPhone 8 | 9 | mapping: 10 | Training: 11 | alpha: 0.8 # Increase this value to make the rendering loss weight rgb more heavily than depth 12 | uncertainty_params: 13 | # This parameter weights the depth loss when training the uncertainty MLP. 14 | # It's lambda_1 in equation 4 in the paper. 15 | # We set it to 0 here as the metric depth is not reliable in the iPhone dataset. 16 | # However, feel free to fine-tune this parameter when running on your own dataset.
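# (Added illustrative note, not from the original config — a rough reading of the comment above:
#  the uncertainty-MLP training loss combines an RGB term with uncer_depth_mult times a depth term,
#  so e.g. 0.5 would weight depth residuals at roughly half the RGB term, while the 0.0 used below
#  disables the depth term entirely because monocular metric depth is unreliable on these handheld
#  iPhone sequences.)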
17 | uncer_depth_mult: 0.0 18 | 19 | cam: 20 | H: 1440 21 | W: 1920 22 | H_out: 360 23 | W_out: 480 -------------------------------------------------------------------------------- /configs/Static/TUM_RGBD/freiburg1_desk.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Static/TUM_RGBD/tum.yaml 2 | scene: freiburg1_desk 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg1_desk 6 | cam: 7 | H: 480 8 | W: 640 9 | fx: 517.3 10 | fy: 516.5 11 | cx: 318.6 12 | cy: 255.3 13 | distortion: [0.2624, -0.9531, -0.0054, 0.0026, 1.1633] 14 | -------------------------------------------------------------------------------- /configs/Static/TUM_RGBD/freiburg2_xyz.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Static/TUM_RGBD/tum.yaml 2 | scene: freiburg2_xyz 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg2_xyz 6 | cam: #intrinsic is different per scene in TUM 7 | H: 480 8 | W: 640 9 | fx: 520.9 10 | fy: 521.0 11 | cx: 325.1 12 | cy: 249.7 13 | distortion: [0.2312, -0.7849, -0.0033, -0.0001, 0.9172] 14 | H_out: 240 15 | W_out: 320 16 | 17 | -------------------------------------------------------------------------------- /configs/Static/TUM_RGBD/freiburg3_office.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/Static/TUM_RGBD/tum.yaml 2 | scene: freiburg3_long_office_household 3 | 4 | data: 5 | input_folder: ROOT_FOLDER_PLACEHOLDER/rgbd_dataset_freiburg3_long_office_household 6 | 7 | cam: #intrinsic is different per scene in TUM, this cam does not have distortion 8 | H: 480 9 | W: 640 10 | fx: 535.4 11 | fy: 539.2 12 | cx: 320.1 13 | cy: 247.6 -------------------------------------------------------------------------------- /configs/Static/TUM_RGBD/tum.yaml: -------------------------------------------------------------------------------- 1 | inherit_from: ./configs/wildgs_slam.yaml 2 | 3 | dataset: 'tumrgbd' 4 | 5 | mapping: 6 | Calibration: 7 | depth_scale: 5000.0 8 | 9 | tracking: 10 | buffer: 500 11 | warmup: 12 12 | multiview_filter: 13 | visible_num: 2 14 | frontend: 15 | keyframe_thresh: 3.0 16 | radius: 2 17 | backend: 18 | loop_nms: 10 19 | 20 | cam: #NOTE: intrinsic is different per scene in TUM 21 | # refer to https://vision.in.tum.de/data/datasets/rgbd-dataset/file_formats#intrinsic_camera_calibration_of_the_kinect 22 | png_depth_scale: 5000.0 #for depth image in png format 23 | ### target/output camera settings, camera_size -> resize -> crop -> target_size 24 | H_edge: 8 25 | W_edge: 8 26 | H_out: 384 27 | W_out: 512 28 | 29 | data: 30 | root_folder: ./datasets/TUM_RGBD 31 | output: ./output/TUM_RGBD 32 | -------------------------------------------------------------------------------- /media/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/media/teaser.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pillow 2 | imageio 3 | joblib 4 | pandas 5 | scikit-image 6 | scikit-learn 7 | scipy 8 | seaborn 9 | PyOpenGL-accelerate 10 | pyrender 11 | ninja 12 | setuptools 13 | timm==0.9.10 14 | plyfile==0.8.1 15 | tqdm 16 | 
opencv-python==4.8.1.78 17 | munch 18 | evo 19 | open3d==0.17.0 20 | torchmetrics 21 | imgviz 22 | lpips 23 | rich 24 | kornia 25 | PyQt5 26 | glfw 27 | PyGLM 28 | mmengine -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import argparse 4 | import os 5 | 6 | from src import config 7 | from src.slam import SLAM 8 | from src.utils.datasets import get_dataset 9 | from time import gmtime, strftime 10 | from colorama import Fore,Style 11 | 12 | import random 13 | def setup_seed(seed): 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | np.random.seed(seed) 17 | random.seed(seed) 18 | torch.backends.cudnn.deterministic = True 19 | 20 | if __name__ == '__main__': 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('config', type=str, help='Path to config file.') 23 | args = parser.parse_args() 24 | 25 | torch.multiprocessing.set_start_method('spawn') 26 | 27 | cfg = config.load_config(args.config) 28 | setup_seed(cfg['setup_seed']) 29 | if cfg['fast_mode']: 30 | # Force the final refine iterations to be 3000 if in fast mode 31 | cfg['mapping']['final_refine_iters'] = 3000 32 | 33 | output_dir = cfg['data']['output'] 34 | output_dir = output_dir+f"/{cfg['scene']}" 35 | 36 | start_time = strftime("%Y-%m-%d %H:%M:%S", gmtime()) 37 | start_info = "-"*30+Fore.LIGHTRED_EX+\ 38 | f"\nStart WildGS-SLAM at {start_time},\n"+Style.RESET_ALL+ \ 39 | f" scene: {cfg['dataset']}-{cfg['scene']},\n" \ 40 | f" output: {output_dir}\n"+ \ 41 | "-"*30 42 | print(start_info) 43 | 44 | if not os.path.exists(output_dir): 45 | os.makedirs(output_dir) 46 | 47 | config.save_config(cfg, f'{output_dir}/cfg.yaml') 48 | 49 | dataset = get_dataset(cfg) 50 | 51 | slam = SLAM(cfg,dataset) 52 | slam.run() 53 | 54 | end_time = strftime("%Y-%m-%d %H:%M:%S", gmtime()) 55 | print("-"*30+Fore.LIGHTRED_EX+f"\nWildGS-SLAM finishes!\n"+Style.RESET_ALL+f"{end_time}\n"+"-"*30) 56 | 57 | -------------------------------------------------------------------------------- /scripts_downloading/download_bonn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p datasets/Bonn 4 | cd datasets/Bonn 5 | 6 | scenes=( 7 | "balloon" 8 | "balloon2" 9 | "crowd" 10 | "crowd2" 11 | "person_tracking" 12 | "person_tracking2" 13 | "moving_nonobstructing_box" 14 | "moving_nonobstructing_box2" 15 | ) 16 | 17 | for scene in "${scenes[@]}" 18 | do 19 | echo "Processing scene: $scene" 20 | 21 | # Check if the folder already exists 22 | if [ -d "$scene" ]; then 23 | echo "Folder $scene already exists, skipping download" 24 | else 25 | zip_file="rgbd_bonn_${scene}.zip" 26 | wget "https://www.ipb.uni-bonn.de/html/projects/rgbd_dynamic2019/${zip_file}" 27 | 28 | if [ $? -eq 0 ]; then 29 | echo "Successfully downloaded ${zip_file}" 30 | unzip -q "${zip_file}" 31 | if [ $? 
-eq 0 ]; then 32 | echo "Successfully extracted ${zip_file}" 33 | rm "${zip_file}" 34 | echo "Removed ${zip_file}" 35 | else 36 | echo "Failed to extract ${zip_file}" 37 | fi 38 | else 39 | echo "Failed to download ${zip_file}" 40 | fi 41 | fi 42 | 43 | echo "Finished processing ${scene}" 44 | echo "-----------------------------" 45 | done -------------------------------------------------------------------------------- /scripts_downloading/download_demo_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p datasets/Wild_SLAM_Mocap/scene1 4 | cd datasets/Wild_SLAM_Mocap/scene1 5 | 6 | scenes=( 7 | "crowd" 8 | ) 9 | 10 | for scene in "${scenes[@]}" 11 | do 12 | echo "Processing scene: $scene" 13 | 14 | # Check if the folder already exists 15 | if [ -d "$scene" ]; then 16 | echo "Folder $scene already exists, skipping download" 17 | else 18 | zip_file="${scene}.zip" 19 | wget "https://huggingface.co/datasets/gradient-spaces/Wild-SLAM/resolve/main/Mocap/scene1/${zip_file}" 20 | 21 | if [ $? -eq 0 ]; then 22 | echo "Successfully downloaded ${zip_file}" 23 | unzip -q "${zip_file}" 24 | if [ $? -eq 0 ]; then 25 | echo "Successfully extracted ${zip_file}" 26 | rm "${zip_file}" 27 | echo "Removed ${zip_file}" 28 | else 29 | echo "Failed to extract ${zip_file}" 30 | fi 31 | else 32 | echo "Failed to download ${zip_file}" 33 | fi 34 | fi 35 | 36 | echo "Finished processing ${scene}" 37 | echo "-----------------------------" 38 | done -------------------------------------------------------------------------------- /scripts_downloading/download_tum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p datasets/TUM_RGBD 4 | cd datasets/TUM_RGBD 5 | 6 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg2/rgbd_dataset_freiburg2_desk_with_person.tgz 7 | tar -xvzf rgbd_dataset_freiburg2_desk_with_person.tgz 8 | rm rgbd_dataset_freiburg2_desk_with_person.tgz 9 | 10 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_sitting_static.tgz 11 | tar -xvzf rgbd_dataset_freiburg3_sitting_static.tgz 12 | rm rgbd_dataset_freiburg3_sitting_static.tgz 13 | 14 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_sitting_xyz.tgz 15 | tar -xvzf rgbd_dataset_freiburg3_sitting_xyz.tgz 16 | rm rgbd_dataset_freiburg3_sitting_xyz.tgz 17 | 18 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_sitting_halfsphere.tgz 19 | tar -xvzf rgbd_dataset_freiburg3_sitting_halfsphere.tgz 20 | rm rgbd_dataset_freiburg3_sitting_halfsphere.tgz 21 | 22 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_sitting_rpy.tgz 23 | tar -xvzf rgbd_dataset_freiburg3_sitting_rpy.tgz 24 | rm rgbd_dataset_freiburg3_sitting_rpy.tgz 25 | 26 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_walking_static.tgz 27 | tar -xvzf rgbd_dataset_freiburg3_walking_static.tgz 28 | rm rgbd_dataset_freiburg3_walking_static.tgz 29 | 30 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_walking_xyz.tgz 31 | tar -xvzf rgbd_dataset_freiburg3_walking_xyz.tgz 32 | rm rgbd_dataset_freiburg3_walking_xyz.tgz 33 | 34 | wget https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_walking_halfsphere.tgz 35 | tar -xvzf rgbd_dataset_freiburg3_walking_halfsphere.tgz 36 | rm rgbd_dataset_freiburg3_walking_halfsphere.tgz 37 | 38 | wget 
https://cvg.cit.tum.de/rgbd/dataset/freiburg3/rgbd_dataset_freiburg3_walking_rpy.tgz 39 | tar -xvzf rgbd_dataset_freiburg3_walking_rpy.tgz 40 | rm rgbd_dataset_freiburg3_walking_rpy.tgz 41 | 42 | -------------------------------------------------------------------------------- /scripts_downloading/download_wild_slam_iphone.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p datasets/Wild_SLAM_iPhone 4 | cd datasets/Wild_SLAM_iPhone 5 | 6 | scenes=( 7 | "parking" 8 | "piano" 9 | "shopping" 10 | "street" 11 | "tower" 12 | "wall" 13 | "wandering" 14 | ) 15 | 16 | for scene in "${scenes[@]}" 17 | do 18 | echo "Processing scene: $scene" 19 | 20 | # Check if the folder already exists 21 | if [ -d "$scene" ]; then 22 | echo "Folder $scene already exists, skipping download" 23 | else 24 | zip_file="${scene}.zip" 25 | wget "https://huggingface.co/datasets/gradient-spaces/Wild-SLAM/resolve/main/iPhone/${zip_file}" 26 | 27 | if [ $? -eq 0 ]; then 28 | echo "Successfully downloaded ${zip_file}" 29 | unzip -q "${zip_file}" 30 | if [ $? -eq 0 ]; then 31 | echo "Successfully extracted ${zip_file}" 32 | rm "${zip_file}" 33 | echo "Removed ${zip_file}" 34 | else 35 | echo "Failed to extract ${zip_file}" 36 | fi 37 | else 38 | echo "Failed to download ${zip_file}" 39 | fi 40 | fi 41 | 42 | echo "Finished processing ${scene}" 43 | echo "-----------------------------" 44 | done -------------------------------------------------------------------------------- /scripts_downloading/download_wild_slam_mocap_scene1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p datasets/Wild_SLAM_Mocap/scene1 4 | cd datasets/Wild_SLAM_Mocap/scene1 5 | 6 | scenes=( 7 | "ball" 8 | "crowd" 9 | "person_tracking" 10 | "racket" 11 | "stones" 12 | "table_tracking1" 13 | "table_tracking2" 14 | "umbrella" 15 | ) 16 | 17 | for scene in "${scenes[@]}" 18 | do 19 | echo "Processing scene: $scene" 20 | 21 | # Check if the folder already exists 22 | if [ -d "$scene" ]; then 23 | echo "Folder $scene already exists, skipping download" 24 | else 25 | zip_file="${scene}.zip" 26 | wget "https://huggingface.co/datasets/gradient-spaces/Wild-SLAM/resolve/main/Mocap/scene1/${zip_file}" 27 | 28 | if [ $? -eq 0 ]; then 29 | echo "Successfully downloaded ${zip_file}" 30 | unzip -q "${zip_file}" 31 | if [ $? -eq 0 ]; then 32 | echo "Successfully extracted ${zip_file}" 33 | rm "${zip_file}" 34 | echo "Removed ${zip_file}" 35 | else 36 | echo "Failed to extract ${zip_file}" 37 | fi 38 | else 39 | echo "Failed to download ${zip_file}" 40 | fi 41 | fi 42 | 43 | echo "Finished processing ${scene}" 44 | echo "-----------------------------" 45 | done -------------------------------------------------------------------------------- /scripts_downloading/download_wild_slam_mocap_scene2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p datasets/Wild_SLAM_Mocap/scene2 4 | cd datasets/Wild_SLAM_Mocap/scene2 5 | 6 | scenes=( 7 | "ANYmal1" 8 | "ANYmal2" 9 | ) 10 | 11 | for scene in "${scenes[@]}" 12 | do 13 | echo "Processing scene: $scene" 14 | 15 | # Check if the folder already exists 16 | if [ -d "$scene" ]; then 17 | echo "Folder $scene already exists, skipping download" 18 | else 19 | zip_file="${scene}.zip" 20 | wget "https://huggingface.co/datasets/gradient-spaces/Wild-SLAM/resolve/main/Mocap/scene2/${zip_file}" 21 | 22 | if [ $? 
-eq 0 ]; then 23 | echo "Successfully downloaded ${zip_file}" 24 | unzip -q "${zip_file}" 25 | if [ $? -eq 0 ]; then 26 | echo "Successfully extracted ${zip_file}" 27 | rm "${zip_file}" 28 | echo "Removed ${zip_file}" 29 | else 30 | echo "Failed to extract ${zip_file}" 31 | fi 32 | else 33 | echo "Failed to download ${zip_file}" 34 | fi 35 | fi 36 | 37 | echo "Finished processing ${scene}" 38 | echo "-----------------------------" 39 | done -------------------------------------------------------------------------------- /scripts_run/run_bonn_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run.py ./configs/Dynamic/Bonn/bonn_balloon.yaml 4 | python run.py ./configs/Dynamic/Bonn/bonn_balloon2.yaml 5 | python run.py ./configs/Dynamic/Bonn/bonn_crowd.yaml 6 | python run.py ./configs/Dynamic/Bonn/bonn_crowd2.yaml 7 | python run.py ./configs/Dynamic/Bonn/bonn_moving_nonobstructing_box.yaml 8 | python run.py ./configs/Dynamic/Bonn/bonn_moving_nonobstructing_box2.yaml 9 | python run.py ./configs/Dynamic/Bonn/bonn_person_tracking.yaml 10 | python run.py ./configs/Dynamic/Bonn/bonn_person_tracking2.yaml 11 | -------------------------------------------------------------------------------- /scripts_run/run_tum_dynamic_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg2_desk_with_person.yaml 4 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_sitting_halfsphere_static.yaml 5 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_sitting_halfsphere.yaml 6 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_sitting_rpy.yaml 7 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_sitting_xyz.yaml 8 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_walking_halfsphere_static.yaml 9 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_walking_halfsphere.yaml 10 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_walking_rpy.yaml 11 | python run.py ./configs/Dynamic/TUM_RGBD/freiburg3_walking_xyz.yaml -------------------------------------------------------------------------------- /scripts_run/run_wild_slam_mocap_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/ball.yaml 4 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/crowd.yaml 5 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/person_tracking.yaml 6 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/racket.yaml 7 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/stones.yaml 8 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/table_tracking1.yaml 9 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/table_tracking2.yaml 10 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/umbrella.yaml 11 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/ANYmal1.yaml 12 | python run.py ./configs/Dynamic/Wild_SLAM_Mocap/ANYmal2.yaml 13 | -------------------------------------------------------------------------------- /scripts_run/summarize_pose_eval.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import pandas as pd 4 | import os 5 | 6 | datasets = os.listdir('./output') 7 | for dataset in datasets: 8 | if not os.path.isdir(os.path.join('output', dataset)): 9 | continue 10 | dataset_path = os.path.join('output', dataset) 11 | scenes = sorted(os.listdir(dataset_path)) 12 | 13 | data = {scene: [] for 
scene in scenes} 14 | averages = [] 15 | 16 | row_data = [] 17 | rmses = [] 18 | for scene in scenes: 19 | exp_folder = os.path.join(dataset_path, scene) 20 | # metrics_full_traj, metrics_kf_traj, metrics_kf_traj_before_ba 21 | result_file = os.path.join(exp_folder, "traj/metrics_full_traj.txt") 22 | if os.path.exists(result_file): 23 | # Load the JSON file 24 | with open(result_file, "r") as f: 25 | output = f.readlines() 26 | 27 | rmse = float(output[8].split(',')[0].replace("{'rmse': ",'')) 28 | 29 | # Add metrics to the row 30 | row_data.append(f"{rmse*1e2:.2f}") 31 | rmses.append(rmse) 32 | else: 33 | row_data.append("N/A") # If file doesn't exist, mark it as N/A 34 | avg_rmse = np.nanmean(rmses) 35 | averages.append(f"{avg_rmse*1e2:.2f}") 36 | for scene, value in zip(scenes, row_data): 37 | data[scene].append(value) 38 | 39 | data['Average'] = averages 40 | 41 | # Convert the data to a Pandas DataFrame 42 | df = pd.DataFrame(data, index=['wildgs-slam']) 43 | 44 | # Save the DataFrame as a CSV file 45 | csv_path = f"./output/{dataset}_eval.csv" 46 | df.to_csv(csv_path) 47 | 48 | # Output the CSV file path 49 | print(f"Results saved to {csv_path}") 50 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | import os.path as osp 5 | ROOT = osp.dirname(osp.abspath(__file__)) 6 | 7 | setup( 8 | name='droid_backends', 9 | ext_modules=[ 10 | CUDAExtension('droid_backends', 11 | include_dirs=[osp.join(ROOT, 'thirdparty/lietorch/eigen')], 12 | sources=[ 13 | 'src/lib/droid.cpp', 14 | 'src/lib/droid_kernels.cu', 15 | 'src/lib/correlation_kernels.cu', 16 | 'src/lib/altcorr_kernel.cu', 17 | ], 18 | extra_compile_args={ 19 | 'cxx': ['-O3'], 20 | 'nvcc': ['-O3', 21 | '-gencode=arch=compute_60,code=sm_60', 22 | '-gencode=arch=compute_61,code=sm_61', 23 | '-gencode=arch=compute_70,code=sm_70', 24 | '-gencode=arch=compute_75,code=sm_75', 25 | '-gencode=arch=compute_80,code=sm_80', 26 | '-gencode=arch=compute_86,code=sm_86', 27 | ] 28 | }), 29 | ], 30 | cmdclass={ 'build_ext' : BuildExtension } 31 | ) 32 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/src/__init__.py -------------------------------------------------------------------------------- /src/backend.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GlORIE-SLAM Authors. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
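# Module overview (comment added for orientation, not in the original source):
# Backend runs the global and loop-closure bundle adjustment over the keyframe
# factor graph (see dense_ba / loop_ba below), reusing the DROID-style update
# operator passed in as net.update.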
14 | 15 | import torch 16 | from src.factor_graph import FactorGraph 17 | from copy import deepcopy 18 | 19 | class Backend: 20 | def __init__(self, net, video, cfg): 21 | self.cfg = cfg 22 | self.video = video 23 | self.update_op = net.update 24 | self.device = cfg['device'] 25 | # global optimization window 26 | self.t0 = 0 27 | self.t1 = 0 28 | 29 | self.beta = cfg['tracking']['beta'] 30 | self.backend_thresh = cfg['tracking']['backend']['thresh'] 31 | self.backend_radius = cfg['tracking']['backend']['radius'] 32 | self.backend_nms = cfg['tracking']['backend']['nms'] 33 | self.backend_normalize = cfg['tracking']['backend']['normalize'] 34 | self.output = f"{cfg['data']['output']}/{cfg['scene']}" 35 | 36 | self.backend_loop_window = cfg['tracking']['backend']['loop_window'] 37 | self.backend_loop_thresh = cfg['tracking']['backend']['loop_thresh'] 38 | self.backend_loop_radius = cfg['tracking']['backend']['loop_radius'] 39 | self.backend_loop_nms = cfg['tracking']['backend']['loop_nms'] 40 | 41 | @torch.no_grad() 42 | def backend_ba(self, t_start, t_end, steps, graph, nms, radius, thresh, max_factors, t_start_loop=None, loop=False, motion_only=False, enable_wq=True): 43 | """ main update """ 44 | if self.cfg['tracking']["uncertainty_params"]['activate']: 45 | self.video.update_all_uncertainty_mask() 46 | 47 | if t_start_loop is None or not loop: 48 | t_start_loop = t_start 49 | assert t_start_loop >= t_start, f'short: {t_start_loop}, long: {t_start}.' 50 | edge_num = graph.add_backend_proximity_factors(t_start,t_end,nms,radius,thresh,max_factors,self.beta, t_start_loop,loop) 51 | if edge_num == 0: 52 | graph.clear_edges() 53 | return 0 54 | 55 | graph.update_lowmem( 56 | t0=t_start_loop+1, # fix the start point to avoid drift, be sure to use t_start_loop rather than t_start here. 
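# (Added clarification, inferred from the comment above: poses/depths with index
#  below t0 are kept fixed during this optimization, which anchors the already
#  estimated trajectory so the loop-closure update cannot drift the earlier keyframes.)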
57 | t1=t_end, 58 | itrs=2, 59 | use_inactive=False, 60 | steps=steps, 61 | enable_wq = enable_wq 62 | ) 63 | 64 | graph.clear_edges() 65 | return edge_num 66 | 67 | @torch.no_grad() 68 | def dense_ba(self, steps=6, enable_wq=True): 69 | t_start = 0 70 | t_end = self.video.counter.value 71 | nms = self.backend_nms 72 | radius = self.backend_radius 73 | thresh = self.backend_thresh 74 | n = t_end - t_start 75 | max_factors = ((radius + 2) * 2) * n 76 | if self.backend_normalize: 77 | self.video.normalize() 78 | graph = FactorGraph(self.video, self.update_op, device=self.device, 79 | corr_impl='alt', max_factors=max_factors) 80 | n_edges = self.backend_ba(t_start, t_end, steps, graph, nms, radius, 81 | thresh, max_factors, motion_only=False, enable_wq=enable_wq) 82 | 83 | del graph 84 | torch.cuda.empty_cache() 85 | self.video.set_dirty(t_start,t_end) 86 | self.video.update_valid_depth_mask() 87 | return n, n_edges 88 | 89 | 90 | 91 | @torch.no_grad() 92 | def loop_ba(self, t_start, t_end, steps=6, motion_only=False, local_graph=None, enable_wq=True): 93 | ''' loop closure, add edges with high-covisiablity''' 94 | radius = self.backend_loop_radius 95 | window = self.backend_loop_window 96 | max_factors = 8 * window 97 | nms = self.backend_loop_nms 98 | thresh = self.backend_loop_thresh 99 | t_start_loop = max(0, t_end - window) 100 | 101 | graph = FactorGraph(self.video, self.update_op, device=self.device, corr_impl='alt', max_factors=max_factors) 102 | if local_graph is not None: 103 | copy_attr = ['ii', 'jj', 'age', 'net', 'target', 'weight'] 104 | for key in copy_attr: 105 | val = getattr(local_graph, key) 106 | if val is not None: 107 | setattr(graph, key, deepcopy(val)) 108 | 109 | left_factors = max_factors - len(graph.ii) 110 | n_edges = self.backend_ba(t_start, t_end, steps, graph, nms, radius, thresh, 111 | left_factors, t_start_loop=t_start_loop, loop=True, 112 | motion_only=motion_only, enable_wq=enable_wq) 113 | del graph 114 | torch.cuda.empty_cache() 115 | return t_end - t_start_loop, n_edges 116 | 117 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GlORIE-SLAM Authors. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import yaml 16 | 17 | 18 | def load_config(path, default_path=None): 19 | """ 20 | Load config file 21 | Args: 22 | path: (str), path to config file 23 | default_path: (str, optional), whether to use default path. 
24 | 25 | Returns: 26 | cfg: (dict), config dict 27 | 28 | """ 29 | # load configuration from file itself 30 | with open(path, 'r' ) as f: 31 | cfg_special = yaml.full_load(f) 32 | 33 | # check if we should inherit from a config 34 | inherit_from = cfg_special.get('inherit_from') 35 | 36 | # if yes, load this config first as default 37 | # if no, use the default path 38 | if inherit_from is not None: 39 | cfg = load_config(inherit_from, default_path) 40 | elif default_path is not None: 41 | with open(default_path, 'r') as f: 42 | cfg = yaml.full_load(f) 43 | else: 44 | cfg = dict() 45 | 46 | # include main configuration 47 | update_recursive(cfg, cfg_special) 48 | 49 | return cfg 50 | 51 | def save_config(cfg, path): 52 | with open(path, 'w+') as fp: 53 | yaml.dump(cfg, fp) 54 | 55 | 56 | def update_recursive(dict1, dict2): 57 | """ 58 | update two config dictionaries recursively 59 | Args: 60 | dict1: (dict), first dictionary to be updated 61 | dict2: (dict), second dictionary whose entries should be used 62 | 63 | Returns: 64 | 65 | """ 66 | for k, v in dict2.items(): 67 | if k not in dict1: 68 | dict1[k] = dict() 69 | if isinstance(v, dict): 70 | update_recursive(dict1[k], v) 71 | else: 72 | dict1[k] = v -------------------------------------------------------------------------------- /src/geom/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/src/geom/__init__.py -------------------------------------------------------------------------------- /src/geom/chol.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GlORIE-SLAM Authors. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
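# Module overview (comment added for orientation, not in the original source):
# Cholesky-based linear solvers used by the bundle adjustment: block_solve for the
# plain damped normal equations and schur_solve for the Schur-complement system
# over camera poses and dense per-pixel disparity variables.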
14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import src.geom.projective_ops as pops 18 | 19 | # class CholeskySolver(torch.autograd.Function): 20 | class CholeskySolver(): 21 | @staticmethod 22 | 23 | def apply(H,b): 24 | try: 25 | U = torch.linalg.cholesky(H) 26 | xs = torch.cholesky_solve(b, U) 27 | except Exception as e: 28 | print(e) 29 | xs = torch.zeros_like(b) 30 | 31 | return xs 32 | 33 | def __call__(ctx, H, b): 34 | # don't crash training if cholesky decomp fails 35 | try: 36 | U = torch.linalg.cholesky(H) 37 | xs = torch.cholesky_solve(b, U) 38 | ctx.save_for_backward(U, xs) 39 | ctx.failed = False 40 | except Exception as e: 41 | print(e) 42 | ctx.failed = True 43 | xs = torch.zeros_like(b) 44 | 45 | return xs 46 | 47 | @staticmethod 48 | def backward(ctx, grad_x): 49 | if ctx.failed: 50 | return None, None 51 | 52 | U, xs = ctx.saved_tensors 53 | dz = torch.cholesky_solve(grad_x, U) 54 | dH = -torch.matmul(xs, dz.transpose(-1,-2)) 55 | 56 | return dH, dz 57 | 58 | def block_solve(H, b, ep=0.1, lm=0.0001): 59 | """ solve normal equations """ 60 | B, N, _, D, _ = H.shape 61 | I = torch.eye(D).to(H.device) 62 | H = H + (ep + lm*H) * I 63 | 64 | H = H.permute(0,1,3,2,4) 65 | H = H.reshape(B, N*D, N*D) 66 | b = b.reshape(B, N*D, 1) 67 | 68 | x = CholeskySolver.apply(H,b) 69 | return x.reshape(B, N, D) 70 | 71 | 72 | def schur_solve(H, E, C, v, w, ep=0.1, lm=0.0001, sless=False): 73 | """ solve using shur complement """ 74 | 75 | B, P, M, D, HW = E.shape 76 | H = H.permute(0,1,3,2,4).reshape(B, P*D, P*D) 77 | E = E.permute(0,1,3,2,4).reshape(B, P*D, M*HW) 78 | Q = (1.0 / C).view(B, M*HW, 1) 79 | 80 | # damping 81 | I = torch.eye(P*D).to(H.device) 82 | H = H + (ep + lm*H) * I 83 | 84 | v = v.reshape(B, P*D, 1) 85 | w = w.reshape(B, M*HW, 1) 86 | 87 | Et = E.transpose(1,2) 88 | S = H - torch.matmul(E, Q*Et) 89 | v = v - torch.matmul(E, Q*w) 90 | 91 | dx = CholeskySolver.apply(S, v) 92 | if sless: 93 | return dx.reshape(B, P, D) 94 | 95 | dz = Q * (w - Et @ dx) 96 | dx = dx.reshape(B, P, D) 97 | dz = dz.reshape(B, M, HW) 98 | 99 | return dx, dz -------------------------------------------------------------------------------- /src/geom/projective_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GlORIE-SLAM Authors. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
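# Module overview (comment added for orientation, not in the original source):
# Pinhole projection utilities (iproj / proj / actp) and projective_transform, which
# maps points between keyframe pairs (ii -> jj) with optional Jacobians for the
# bundle adjustment; induced_flow returns the optical flow induced by camera motion.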
14 | 15 | import torch 16 | import torch.nn.functional as F 17 | 18 | from lietorch import SE3, Sim3 19 | 20 | MIN_DEPTH = 0.2 21 | 22 | def extract_intrinsics(intrinsics): 23 | return intrinsics[...,None,None,:].unbind(dim=-1) 24 | 25 | def coords_grid(ht, wd, device): 26 | y, x = torch.meshgrid( 27 | torch.arange(ht).to(device).float(), 28 | torch.arange(wd).to(device).float(),indexing="ij") 29 | 30 | return torch.stack([x, y], dim=-1) 31 | 32 | def iproj(disps, intrinsics, jacobian=False): 33 | """ pinhole camera inverse projection """ 34 | ht, wd = disps.shape[2:] 35 | fx, fy, cx, cy = extract_intrinsics(intrinsics) 36 | 37 | y, x = torch.meshgrid( 38 | torch.arange(ht).to(disps.device).float(), 39 | torch.arange(wd).to(disps.device).float(),indexing="ij") 40 | 41 | i = torch.ones_like(disps) 42 | X = (x - cx) / fx 43 | Y = (y - cy) / fy 44 | pts = torch.stack([X, Y, i, disps], dim=-1) 45 | 46 | if jacobian: 47 | J = torch.zeros_like(pts) 48 | J[...,-1] = 1.0 49 | return pts, J 50 | 51 | return pts, None 52 | 53 | def proj(Xs, intrinsics, jacobian=False, return_depth=False): 54 | """ pinhole camera projection """ 55 | fx, fy, cx, cy = extract_intrinsics(intrinsics) 56 | X, Y, Z, D = Xs.unbind(dim=-1) 57 | 58 | Z = torch.where(Z < 0.5*MIN_DEPTH, torch.ones_like(Z), Z) 59 | d = 1.0 / Z 60 | 61 | x = fx * (X * d) + cx 62 | y = fy * (Y * d) + cy 63 | if return_depth: 64 | coords = torch.stack([x, y, D*d], dim=-1) 65 | else: 66 | coords = torch.stack([x, y], dim=-1) 67 | 68 | if jacobian: 69 | B, N, H, W = d.shape 70 | o = torch.zeros_like(d) 71 | proj_jac = torch.stack([ 72 | fx*d, o, -fx*X*d*d, o, 73 | o, fy*d, -fy*Y*d*d, o, 74 | # o, o, -D*d*d, d, 75 | ], dim=-1).view(B, N, H, W, 2, 4) 76 | 77 | return coords, proj_jac 78 | 79 | return coords, None 80 | 81 | def actp(Gij, X0, jacobian=False): 82 | """ action on point cloud """ 83 | X1 = Gij[:,:,None,None] * X0 84 | 85 | if jacobian: 86 | X, Y, Z, d = X1.unbind(dim=-1) 87 | o = torch.zeros_like(d) 88 | B, N, H, W = d.shape 89 | 90 | if isinstance(Gij, SE3): 91 | Ja = torch.stack([ 92 | d, o, o, o, Z, -Y, 93 | o, d, o, -Z, o, X, 94 | o, o, d, Y, -X, o, 95 | o, o, o, o, o, o, 96 | ], dim=-1).view(B, N, H, W, 4, 6) 97 | 98 | elif isinstance(Gij, Sim3): 99 | Ja = torch.stack([ 100 | d, o, o, o, Z, -Y, X, 101 | o, d, o, -Z, o, X, Y, 102 | o, o, d, Y, -X, o, Z, 103 | o, o, o, o, o, o, o 104 | ], dim=-1).view(B, N, H, W, 4, 7) 105 | 106 | return X1, Ja 107 | 108 | return X1, None 109 | 110 | def projective_transform(poses, depths, intrinsics, ii, jj, jacobian=False, return_depth=False): 111 | """ map points from ii->jj """ 112 | 113 | # inverse project (pinhole) 114 | X0, Jz = iproj(depths[:,ii], intrinsics[:,ii], jacobian=jacobian) 115 | 116 | # transform 117 | Gij = poses[:,jj] * poses[:,ii].inv() 118 | 119 | Gij.data[:,ii==jj] = torch.as_tensor([-0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], device="cuda") 120 | X1, Ja = actp(Gij, X0, jacobian=jacobian) 121 | 122 | # project (pinhole) 123 | x1, Jp = proj(X1, intrinsics[:,jj], jacobian=jacobian, return_depth=return_depth) 124 | 125 | # exclude points too close to camera 126 | valid = ((X1[...,2] > MIN_DEPTH) & (X0[...,2] > MIN_DEPTH)).float() 127 | valid = valid.unsqueeze(-1) 128 | 129 | if jacobian: 130 | # Ji transforms according to dual adjoint 131 | Jj = torch.matmul(Jp, Ja) 132 | Ji = -Gij[:,:,None,None,None].adjT(Jj) 133 | 134 | Jz = Gij[:,:,None,None] * Jz 135 | Jz = torch.matmul(Jp, Jz.unsqueeze(-1)) 136 | 137 | return x1, valid, (Ji, Jj, Jz) 138 | 139 | return x1, valid 140 | 141 | def 
induced_flow(poses, disps, intrinsics, ii, jj): 142 | """ optical flow induced by camera motion """ 143 | 144 | ht, wd = disps.shape[2:] 145 | y, x = torch.meshgrid( 146 | torch.arange(ht).to(disps.device).float(), 147 | torch.arange(wd).to(disps.device).float(),indexing="ij") 148 | 149 | coords0 = torch.stack([x, y], dim=-1) 150 | coords1, valid = projective_transform(poses, disps, intrinsics, ii, jj, False) 151 | 152 | return coords1[...,:2] - coords0, valid 153 | 154 | -------------------------------------------------------------------------------- /src/gui/gl_render/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Li Ma 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /src/gui/gl_render/__init__.py: -------------------------------------------------------------------------------- 1 | from . import render_ogl, util, util_gau 2 | -------------------------------------------------------------------------------- /src/gui/gl_render/render_ogl.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | from OpenGL import GL as gl 6 | 7 | from . 
import util, util_gau 8 | 9 | _sort_buffer_xyz = None 10 | _sort_buffer_gausid = None # used to tell whether gaussian is reloaded 11 | 12 | 13 | def _sort_gaussian_torch(gaus, view_mat): 14 | global _sort_buffer_gausid, _sort_buffer_xyz 15 | if _sort_buffer_gausid != id(gaus): 16 | _sort_buffer_xyz = torch.tensor(gaus.xyz).cuda() 17 | _sort_buffer_gausid = id(gaus) 18 | 19 | xyz = torch.tensor(gaus.xyz).cuda() 20 | view_mat = torch.tensor(view_mat).cuda() 21 | xyz_view = view_mat[None, :3, :3] @ xyz[..., None] + view_mat[None, :3, 3, None] 22 | depth = xyz_view[:, 2, 0] 23 | index = torch.argsort(depth) 24 | index = index.type(torch.int32).reshape(-1, 1).cpu().numpy() 25 | return index 26 | 27 | 28 | # Decide which sort to use 29 | _sort_gaussian = None 30 | if not torch.cuda.is_available(): 31 | raise ImportError 32 | _sort_gaussian = _sort_gaussian_torch 33 | 34 | 35 | class GaussianRenderBase: 36 | def __init__(self): 37 | self.gaussians = None 38 | 39 | def update_gaussian_data(self, gaus: util_gau.GaussianData): 40 | raise NotImplementedError() 41 | 42 | def sort_and_update(self): 43 | raise NotImplementedError() 44 | 45 | def set_scale_modifier(self, modifier: float): 46 | raise NotImplementedError() 47 | 48 | def set_render_mod(self, mod: int): 49 | raise NotImplementedError() 50 | 51 | def update_camera_pose(self, camera: util.Camera): 52 | raise NotImplementedError() 53 | 54 | def update_camera_intrin(self, camera: util.Camera): 55 | raise NotImplementedError() 56 | 57 | def draw(self): 58 | raise NotImplementedError() 59 | 60 | def set_render_reso(self, w, h): 61 | raise NotImplementedError() 62 | 63 | 64 | class OpenGLRenderer(GaussianRenderBase): 65 | def __init__(self, w, h): 66 | super().__init__() 67 | gl.glViewport(0, 0, w, h) 68 | cur_path = os.path.dirname(os.path.abspath(__file__)) 69 | self.program = util.load_shaders( 70 | os.path.join(cur_path, "shaders/gau_vert.glsl"), 71 | os.path.join(cur_path, "shaders/gau_frag.glsl"), 72 | ) 73 | 74 | # Vertex data for a quad 75 | self.quad_v = np.array([-1, 1, 1, 1, 1, -1, -1, -1], dtype=np.float32).reshape( 76 | 4, 2 77 | ) 78 | self.quad_f = np.array([0, 1, 2, 0, 2, 3], dtype=np.uint32).reshape(2, 3) 79 | 80 | # load quad geometry 81 | vao, buffer_id = util.set_attributes(self.program, ["position"], [self.quad_v]) 82 | util.set_faces_tovao(vao, self.quad_f) 83 | self.vao = vao 84 | self.gau_bufferid = None 85 | self.index_bufferid = None 86 | 87 | # opengl settings 88 | gl.glDisable(gl.GL_CULL_FACE) 89 | gl.glEnable(gl.GL_BLEND) 90 | gl.glBlendFunc(gl.GL_SRC_ALPHA, gl.GL_ONE_MINUS_SRC_ALPHA) 91 | 92 | def update_gaussian_data(self, gaus: util_gau.GaussianData): 93 | self.gaussians = gaus 94 | # load gaussian geometry 95 | gaussian_data = gaus.flat() 96 | self.gau_bufferid = util.set_storage_buffer_data( 97 | self.program, "gaussian_data", gaussian_data, bind_idx=0, 98 | buffer_id=self.gau_bufferid 99 | ) 100 | util.set_uniform_1int(self.program, gaus.sh_dim, "sh_dim") 101 | 102 | def sort_and_update(self, camera: util.Camera): 103 | index = _sort_gaussian(self.gaussians, camera.get_view_matrix()) 104 | self.index_bufferid = util.set_storage_buffer_data(self.program, "gi", index, bind_idx=1, 105 | buffer_id=self.index_bufferid) 106 | return 107 | 108 | def set_scale_modifier(self, modifier): 109 | util.set_uniform_1f(self.program, modifier, "scale_modifier") 110 | 111 | def set_render_mod(self, mod: int): 112 | util.set_uniform_1int(self.program, mod, "render_mod") 113 | 114 | def set_render_reso(self, w, h): 115 | 
gl.glViewport(0, 0, w, h) 116 | 117 | def update_camera_pose(self, camera: util.Camera): 118 | view_mat = camera.get_view_matrix() 119 | util.set_uniform_mat4(self.program, view_mat, "view_matrix") 120 | util.set_uniform_v3(self.program, camera.position, "cam_pos") 121 | 122 | def update_camera_intrin(self, camera: util.Camera): 123 | proj_mat = camera.get_project_matrix() 124 | util.set_uniform_mat4(self.program, proj_mat, "projection_matrix") 125 | util.set_uniform_v3(self.program, camera.get_htanfovxy_focal(), "hfovxy_focal") 126 | 127 | def draw(self): 128 | gl.glUseProgram(self.program) 129 | gl.glBindVertexArray(self.vao) 130 | num_gau = len(self.gaussians) 131 | gl.glDrawElementsInstanced( 132 | gl.GL_TRIANGLES, 133 | len(self.quad_f.reshape(-1)), 134 | gl.GL_UNSIGNED_INT, 135 | None, 136 | num_gau, 137 | ) 138 | -------------------------------------------------------------------------------- /src/gui/gl_render/shaders/gau_frag.glsl: -------------------------------------------------------------------------------- 1 | #version 430 core 2 | 3 | in vec3 color; 4 | in float alpha; 5 | in vec3 conic; 6 | in vec2 coordxy; // local coordinate in quad, unit in pixel 7 | 8 | uniform int render_mod; // > 0 render 0-ith SH dim, -1 depth, -2 bill board, -3 flat ball, -4 gaussian ball 9 | 10 | out vec4 FragColor; 11 | 12 | void main() 13 | { 14 | if (render_mod == -2) 15 | { 16 | FragColor = vec4(color, 1.f); 17 | return; 18 | } 19 | 20 | float power = -0.5f * (conic.x * coordxy.x * coordxy.x + conic.z * coordxy.y * coordxy.y) - conic.y * coordxy.x * coordxy.y; 21 | if (power > 0.f) 22 | discard; 23 | float opacity = min(0.99f, alpha * exp(power)); 24 | if (opacity < 1.f / 255.f) 25 | discard; 26 | FragColor = vec4(color, opacity); 27 | 28 | // handling special shading effect 29 | if (render_mod == -3) 30 | FragColor.a = FragColor.a > 0.22 ? 1 : 0; 31 | else if (render_mod == -4) 32 | { 33 | FragColor.a = FragColor.a > 0.4 ? 1 : 0; 34 | FragColor.rgb = FragColor.rgb * exp(power); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/gui/gl_render/util_gau.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import numpy as np 4 | 5 | 6 | @dataclass 7 | class GaussianData: 8 | xyz: np.ndarray 9 | rot: np.ndarray 10 | scale: np.ndarray 11 | opacity: np.ndarray 12 | sh: np.ndarray 13 | 14 | def flat(self) -> np.ndarray: 15 | ret = np.concatenate( 16 | [self.xyz, self.rot, self.scale, self.opacity, self.sh], axis=-1 17 | ) 18 | return np.ascontiguousarray(ret) 19 | 20 | def __len__(self): 21 | return len(self.xyz) 22 | 23 | @property 24 | def sh_dim(self): 25 | return self.sh.shape[-1] 26 | -------------------------------------------------------------------------------- /src/modules/droid_net/__init__.py: -------------------------------------------------------------------------------- 1 | from .clipping import GradientClip 2 | from .gru import ConvGRU 3 | from .extractor import BasicEncoder 4 | from .corr import CorrBlock, AltCorrBlock 5 | from .droid_net import DroidNet, cvx_upsample -------------------------------------------------------------------------------- /src/modules/droid_net/clipping.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GlORIE-SLAM Authors. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | 19 | GRAD_CLIP = 0.01 20 | 21 | class GradClip(torch.autograd.Function): 22 | @staticmethod 23 | def forward(ctx, x): 24 | return x 25 | 26 | @staticmethod 27 | def backward(ctx, grad_x): 28 | o = torch.zeros_like(grad_x) 29 | grad_x = torch.where(grad_x.abs() > GRAD_CLIP, o, grad_x) 30 | grad_x = torch.where(torch.isnan(grad_x), o, grad_x) 31 | 32 | return grad_x 33 | 34 | 35 | class GradientClip(nn.Module): 36 | def __init__(self): 37 | super(GradientClip, self).__init__() 38 | 39 | def forward(self, x): 40 | return GradClip.apply(x) -------------------------------------------------------------------------------- /src/modules/droid_net/droid_net.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GlORIE-SLAM Authors. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
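# Illustrative sketch for GradientClip from src/modules/droid_net/clipping.py
# above: the forward pass is the identity, while the backward pass zeroes any
# gradient entry whose magnitude exceeds GRAD_CLIP (0.01) or that is NaN.
# The concrete numbers below are demonstration values only.
import torch
from src.modules.droid_net.clipping import GradientClip

clip = GradientClip()
x = torch.ones(3, requires_grad=True)
y = clip(x)
y.backward(torch.tensor([0.005, 0.5, float("nan")]))
print(x.grad)   # tensor([0.0050, 0.0000, 0.0000])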
14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch_scatter import scatter_mean 19 | 20 | from src.modules.droid_net import ConvGRU, BasicEncoder, GradientClip 21 | 22 | 23 | def cvx_upsample(data, mask): 24 | """ upsample pixel-wise transformation field """ 25 | batch, ht, wd, dim = data.shape 26 | data = data.permute(0, 3, 1, 2).contiguous() 27 | mask = mask.view(batch, 1, 9, 8, 8, ht, wd) 28 | mask = torch.softmax(mask, dim=2) 29 | 30 | up_data = F.unfold(data, kernel_size=(3, 3), padding=(1, 1)) 31 | up_data = up_data.view(batch, dim, 9, 1, 1, ht, wd) 32 | 33 | up_data = torch.sum(mask * up_data, dim=2, keepdim=False) 34 | up_data = up_data.permute(0, 4, 2, 5, 3, 1).contiguous() 35 | up_data = up_data.reshape(batch, 8*ht, 8*wd, dim) 36 | 37 | return up_data 38 | 39 | 40 | def upsample_disp(disp, mask): 41 | batch, num, ht, wd = disp.shape 42 | disp = disp.view(batch*num, ht, wd, 1) 43 | mask = mask.view(batch*num, -1, ht, wd) 44 | 45 | return cvx_upsample(disp, mask).view(batch, num, 8*ht, 8*wd) 46 | 47 | 48 | class GraphAgg(nn.Module): 49 | def __init__(self): 50 | super(GraphAgg, self).__init__() 51 | self.conv1 = nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1)) 52 | self.conv2 = nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1)) 53 | self.relu = nn.ReLU(inplace=True) 54 | 55 | self.eta = nn.Sequential( 56 | nn.Conv2d(128, 1, kernel_size=(3, 3), padding=(1, 1)), 57 | GradientClip(), 58 | nn.Softplus(), 59 | ) 60 | 61 | self.upmask = nn.Sequential( 62 | nn.Conv2d(128, 8*8*9, kernel_size=(1, 1), padding=(0, 0)) 63 | ) 64 | 65 | def forward(self, net, ii): 66 | batch, num, ch, ht, wd = net.shape 67 | net = net.view(batch*num, ch, ht, wd) 68 | 69 | _, ix = torch.unique(ii, sorted=True, return_inverse=True) 70 | net = self.relu(self.conv1(net)) 71 | net =net.view(batch, num, 128, ht, wd) 72 | 73 | net = scatter_mean(net, ix, dim=1) 74 | net = net.view(-1, 128, ht, wd) 75 | 76 | net = self.relu(self.conv2(net)) 77 | eta = self.eta(net).view(batch, -1, ht, wd) 78 | upmask = self.upmask(net).view(batch, -1, 8*8*9, ht, wd) 79 | 80 | return 0.01 * eta, upmask 81 | 82 | 83 | class UpdateModule(nn.Module): 84 | def __init__(self): 85 | super(UpdateModule, self).__init__() 86 | cor_planes = 4 * (2*3+1)**2 87 | 88 | self.corr_encoder = nn.Sequential( 89 | nn.Conv2d(cor_planes, 128, kernel_size=(1, 1), padding=(0, 0)), 90 | nn.ReLU(inplace=True), 91 | nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1)), 92 | nn.ReLU(inplace=True), 93 | ) 94 | 95 | self.flow_encoder = nn.Sequential( 96 | nn.Conv2d(4, 128, kernel_size=(7, 7), padding=(3, 3)), 97 | nn.ReLU(inplace=True), 98 | nn.Conv2d(128, 64, kernel_size=(3, 3), padding=(1, 1)), 99 | nn.ReLU(inplace=True), 100 | ) 101 | 102 | self.weight = nn.Sequential( 103 | nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1)), 104 | nn.ReLU(inplace=True), 105 | nn.Conv2d(128, 2, kernel_size=(3, 3), padding=(1, 1)), 106 | GradientClip(), 107 | nn.Sigmoid(), 108 | ) 109 | 110 | self.delta = nn.Sequential( 111 | nn.Conv2d(128, 128, kernel_size=(3, 3), padding=(1, 1)), 112 | nn.ReLU(inplace=True), 113 | nn.Conv2d(128, 2, kernel_size=(3, 3), padding=(1, 1)), 114 | GradientClip(), 115 | ) 116 | 117 | self.gru = ConvGRU(128, 128+128+64) 118 | self.agg = GraphAgg() 119 | 120 | def forward(self, net, inp, corr, flow=None, ii=None, jj=None): 121 | """ update operation """ 122 | 123 | batch, num, ch, ht, wd = net.shape 124 | device = net.device 125 | 126 | if flow is None: 127 | flow = torch.zeros(batch, 
num, 4, ht, wd, device=device) 128 | 129 | out_dim = (batch, num, -1, ht, wd) 130 | 131 | net = net.view(batch*num, -1, ht, wd) 132 | inp = inp.view(batch*num, -1, ht, wd) 133 | corr = corr.view(batch*num, -1, ht, wd) 134 | flow = flow.view(batch*num, -1, ht, wd) 135 | 136 | corr = self.corr_encoder(corr) 137 | flow = self.flow_encoder(flow) 138 | net = self.gru(net, inp, corr, flow) 139 | 140 | ### update variables ### 141 | delta = self.delta(net).view(*out_dim) 142 | weight = self.weight(net).view(*out_dim) 143 | 144 | delta = delta.permute(0, 1, 3, 4, 2)[..., :2].contiguous() 145 | weight = weight.permute(0, 1, 3, 4, 2)[..., :2].contiguous() 146 | 147 | net = net.view(*out_dim) 148 | 149 | if ii is not None: 150 | eta, upmask = self.agg(net, ii.to(device)) 151 | return net, delta, weight, eta, upmask 152 | else: 153 | return net, delta, weight 154 | 155 | 156 | class DroidNet(nn.Module): 157 | def __init__(self): 158 | super(DroidNet, self).__init__() 159 | self.fnet = BasicEncoder(out_dim=128, norm_fn='instance') 160 | self.cnet = BasicEncoder(out_dim=256, norm_fn='none') 161 | self.update = UpdateModule() 162 | 163 | -------------------------------------------------------------------------------- /src/modules/droid_net/extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GlORIE-SLAM Authors. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
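# Illustrative sketch for the convex upsampling in droid_net.py above:
# upsample_disp lifts a 1/8-resolution disparity map to full resolution using a
# predicted mask with 9 * 8 * 8 weights per coarse pixel (softmaxed over the 9
# neighbours inside cvx_upsample). Shapes below are arbitrary demonstration values.
import torch
from src.modules.droid_net.droid_net import upsample_disp

B, N, ht, wd = 1, 2, 30, 40                     # two low-resolution disparity maps
disp = torch.rand(B, N, ht, wd)
mask = torch.randn(B, N, 9 * 8 * 8, ht, wd)     # per-pixel convex combination weights
up = upsample_disp(disp, mask)
print(up.shape)                                 # torch.Size([1, 2, 240, 320])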
14 | 15 | import torch.nn as nn 16 | 17 | 18 | class ResidualBlock(nn.Module): 19 | def __init__(self, in_planes, planes, norm_fn='group', stride=1): 20 | super(ResidualBlock, self).__init__() 21 | 22 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride) 23 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) 24 | self.relu = nn.ReLU(inplace=True) 25 | 26 | num_groups = planes // 8 27 | if norm_fn == 'group': 28 | self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) 29 | self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) 30 | if stride > 1: 31 | self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) 32 | 33 | elif norm_fn == 'batch': 34 | self.norm1 = nn.BatchNorm2d(planes) 35 | self.norm2 = nn.BatchNorm2d(planes) 36 | if stride > 1: 37 | self.norm3 = nn.BatchNorm2d(planes) 38 | 39 | elif norm_fn == 'instance': 40 | self.norm1 = nn.InstanceNorm2d(planes) 41 | self.norm2 = nn.InstanceNorm2d(planes) 42 | if stride > 1: 43 | self.norm3 = nn.InstanceNorm2d(planes) 44 | 45 | elif norm_fn == 'none': 46 | self.norm1 = nn.Sequential() 47 | self.norm2 = nn.Sequential() 48 | if stride > 1: 49 | self.norm3 = nn.Sequential() 50 | else: 51 | raise TypeError(norm_fn) 52 | 53 | if stride == 1: 54 | self.downsample = None 55 | else: 56 | self.downsample = nn.Sequential( 57 | nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, padding=0), 58 | self.norm3, 59 | ) 60 | 61 | def forward(self, x): 62 | y = x 63 | y = self.relu(self.norm1(self.conv1(y))) 64 | y = self.relu(self.norm2(self.conv2(y))) 65 | 66 | if self.downsample is not None: 67 | x = self.downsample(x) 68 | 69 | return self.relu(x+y) 70 | 71 | 72 | DIM = 32 73 | 74 | 75 | class BasicEncoder(nn.Module): 76 | def __init__(self, out_dim, norm_fn='batch'): 77 | super(BasicEncoder, self).__init__() 78 | self.out_dim = out_dim 79 | self.norm_fn = norm_fn 80 | 81 | if norm_fn == 'group': 82 | self.norm1 = nn.GroupNorm(num_groups=8, num_channels=DIM) 83 | 84 | elif norm_fn == 'batch': 85 | self.norm1 = nn.BatchNorm2d(DIM) 86 | 87 | elif norm_fn == 'instance': 88 | self.norm1 = nn.InstanceNorm2d(DIM) 89 | 90 | elif self.norm_fn == 'none': 91 | self.norm1 = nn.Sequential() 92 | 93 | else: 94 | raise TypeError(self.norm_fn) 95 | 96 | self.conv1 = nn.Conv2d(3, DIM, 7, 2, 3) 97 | self.relu1 = nn.ReLU(inplace=True) 98 | 99 | self.in_planes = DIM 100 | self.layer1 = self._make_layer(DIM, stride=1) 101 | self.layer2 = self._make_layer(2*DIM, stride=2) 102 | self.layer3 = self._make_layer(4*DIM, stride=2) 103 | 104 | self.conv2 = nn.Conv2d(4*DIM, out_dim, kernel_size=(1, 1)) 105 | 106 | for m in self.modules(): 107 | if isinstance(m, nn.Conv2d): 108 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 109 | elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): 110 | if m.weight is not None: 111 | nn.init.constant_(m.weight, 1) 112 | if m.bias is not None: 113 | nn.init.constant_(m.bias, 0) 114 | 115 | def _make_layer(self, dim, stride=1): 116 | layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) 117 | layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) 118 | layers = [layer1, layer2] 119 | 120 | self.in_planes = dim 121 | 122 | return nn.Sequential(*layers) 123 | 124 | def forward(self, x): 125 | b, n, c1, h1, w1 = x.shape 126 | x = x.view(b*n, c1, h1, w1) 127 | 128 | x = self.conv1(x) 129 | x = self.norm1(x) 130 | x = self.relu1(x) 131 | 132 | x = self.layer1(x) 133 | x = self.layer2(x) 134 | x = 
self.layer3(x) 135 | 136 | x = self.conv2(x) 137 | 138 | _, c2, h2, w2 = x.shape 139 | x = x.view(b, n, c2, h2, w2) 140 | 141 | return x -------------------------------------------------------------------------------- /src/modules/droid_net/gru.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GlORIE-SLAM Authors. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | 19 | class ConvGRU(nn.Module): 20 | def __init__(self, h_planes=128, i_planes=128): 21 | super(ConvGRU, self).__init__() 22 | self.do_checkpoint = False 23 | 24 | self.convz = nn.Conv2d(h_planes+i_planes, h_planes, kernel_size=(3, 3), padding=(1, 1)) 25 | self.convr = nn.Conv2d(h_planes+i_planes, h_planes, kernel_size=(3, 3), padding=(1, 1)) 26 | self.convq = nn.Conv2d(h_planes+i_planes, h_planes, kernel_size=(3, 3), padding=(1, 1)) 27 | 28 | self.w = nn.Conv2d(h_planes, h_planes, kernel_size=(1, 1), padding=(0, 0)) 29 | 30 | self.convz_glo = nn.Conv2d(h_planes, h_planes, kernel_size=(1, 1), padding=(0, 0)) 31 | self.convr_glo = nn.Conv2d(h_planes, h_planes, kernel_size=(1, 1), padding=(0, 0)) 32 | self.convq_glo = nn.Conv2d(h_planes, h_planes, kernel_size=(1, 1), padding=(0, 0)) 33 | 34 | def forward(self, net, *inputs): 35 | inp = torch.cat(inputs, dim=1) 36 | net_inp = torch.cat([net, inp], dim=1) 37 | 38 | b, c, h, w = net.shape 39 | glo = torch.sigmoid(self.w(net)) * net 40 | glo = glo.view(b, c, h*w).mean(dim=-1, keepdim=True).view(b, c, 1, 1) 41 | 42 | z = torch.sigmoid(self.convz(net_inp) + self.convz_glo(glo)) 43 | r = torch.sigmoid(self.convr(net_inp) + self.convr_glo(glo)) 44 | q = torch.tanh(self.convq(torch.cat([r*net, inp], dim=1)) + self.convq_glo(glo)) 45 | 46 | net = (1 - z) * net + z * q 47 | 48 | return net -------------------------------------------------------------------------------- /src/tracker.py: -------------------------------------------------------------------------------- 1 | from src.motion_filter import MotionFilter 2 | from src.frontend import Frontend 3 | from src.backend import Backend 4 | import torch 5 | from colorama import Fore, Style 6 | from multiprocessing.connection import Connection 7 | from src.utils.datasets import BaseDataset 8 | from src.utils.Printer import Printer,FontColor 9 | class Tracker: 10 | def __init__(self, slam, pipe:Connection): 11 | self.cfg = slam.cfg 12 | self.device = self.cfg['device'] 13 | self.net = slam.droid_net 14 | self.video = slam.video 15 | self.verbose = slam.verbose 16 | self.pipe = pipe 17 | self.output = slam.save_dir 18 | 19 | # filter incoming frames so that there is enough motion 20 | self.frontend_window = self.cfg['tracking']['frontend']['window'] 21 | filter_thresh = self.cfg['tracking']['motion_filter']['thresh'] 22 | self.motion_filter = MotionFilter(self.net, self.video, self.cfg, thresh=filter_thresh, device=self.device) 23 | self.enable_online_ba = 
self.cfg['tracking']['frontend']['enable_online_ba'] 24 | # frontend process 25 | self.frontend = Frontend(self.net, self.video, self.cfg) 26 | self.online_ba = Backend(self.net,self.video, self.cfg) 27 | self.ba_freq = self.cfg['tracking']['backend']['ba_freq'] 28 | 29 | self.printer:Printer = slam.printer 30 | 31 | def run(self, stream:BaseDataset): 32 | ''' 33 | Trigger the tracking process. 34 | 1. check whether there is enough motion between the current frame and last keyframe by motion_filter 35 | 2. use frontend to do local bundle adjustment, to estimate camera pose and depth image, 36 | also delete the current keyframe if it is too close to the previous keyframe after local BA. 37 | 3. run online global BA periodically by backend 38 | 4. send the estimated pose and depth to mapper, 39 | and wait until the mapper finish its current mapping optimization. 40 | ''' 41 | prev_kf_idx = 0 42 | curr_kf_idx = 0 43 | prev_ba_idx = 0 44 | 45 | intrinsic = stream.get_intrinsic() 46 | # for (timestamp, image, _, _) in tqdm(stream): 47 | for i in range(len(stream)): 48 | timestamp, image, _, _ = stream[i] 49 | with torch.no_grad(): 50 | starting_count = self.video.counter.value 51 | ### check there is enough motion 52 | force_to_add_keyframe = self.motion_filter.track(timestamp, image, intrinsic) 53 | 54 | # local bundle adjustment 55 | self.frontend(force_to_add_keyframe) 56 | 57 | if (starting_count < self.video.counter.value) and self.cfg['mapping']['full_resolution']: 58 | if self.motion_filter.uncertainty_aware: 59 | img_full = stream.get_color_full_resol(i) 60 | self.motion_filter.get_img_feature(timestamp,img_full,suffix='full') 61 | curr_kf_idx = self.video.counter.value - 1 62 | 63 | if curr_kf_idx != prev_kf_idx and self.frontend.is_initialized: 64 | if self.video.counter.value == self.frontend.warmup: 65 | ## We just finish the initialization 66 | self.pipe.send({"is_keyframe":True, "video_idx":curr_kf_idx, 67 | "timestamp":timestamp, "just_initialized": True, 68 | "end":False}) 69 | self.pipe.recv() 70 | self.frontend.initialize_second_stage() 71 | else: 72 | if self.enable_online_ba and curr_kf_idx >= prev_ba_idx + self.ba_freq: 73 | # run online global BA every {self.ba_freq} keyframes 74 | self.printer.print(f"Online BA at {curr_kf_idx}th keyframe, frame index: {timestamp}",FontColor.TRACKER) 75 | self.online_ba.dense_ba(2) 76 | prev_ba_idx = curr_kf_idx 77 | # inform the mapper that the estimation of current pose and depth is finished 78 | self.pipe.send({"is_keyframe":True, "video_idx":curr_kf_idx, 79 | "timestamp":timestamp, "just_initialized": False, 80 | "end":False}) 81 | self.pipe.recv() 82 | 83 | prev_kf_idx = curr_kf_idx 84 | self.printer.update_pbar() 85 | 86 | self.pipe.send({"is_keyframe":True, "video_idx":None, 87 | "timestamp":None, "just_initialized": False, 88 | "end":True}) 89 | 90 | 91 | -------------------------------------------------------------------------------- /src/trajectory_filler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import lietorch 3 | from lietorch import SE3 4 | from src.factor_graph import FactorGraph 5 | from tqdm import tqdm 6 | from src.utils.datasets import BaseDataset 7 | from src.utils.Printer import FontColor 8 | from src.utils.mono_priors.img_feature_extractors import predict_img_features, get_feature_extractor 9 | 10 | class PoseTrajectoryFiller: 11 | """ This class is used to fill in non-keyframe poses 12 | mainly inherited from DROID-SLAM 13 | """ 14 | def __init__(self, 
cfg, net, video, printer, device='cuda:0'): 15 | self.cfg = cfg 16 | 17 | # split net modules 18 | self.cnet = net.cnet 19 | self.fnet = net.fnet 20 | self.update = net.update 21 | 22 | self.count = 0 23 | self.video = video 24 | self.device = device 25 | self.printer = printer 26 | 27 | # mean, std for image normalization 28 | self.MEAN = torch.tensor([0.485, 0.456, 0.406], device=device)[:, None, None] 29 | self.STDV = torch.tensor([0.229, 0.224, 0.225], device=device)[:, None, None] 30 | 31 | self.uncertainty_aware = cfg['tracking']["uncertainty_params"]['activate'] 32 | 33 | def setup_feature_extractor(self): 34 | if self.uncertainty_aware: 35 | self.feat_extractor = get_feature_extractor(self.cfg) 36 | 37 | @torch.amp.autocast('cuda',enabled=True) 38 | def __feature_encoder(self, image): 39 | """ features for correlation volume """ 40 | return self.fnet(image) 41 | 42 | def __fill(self, timestamps, images, depths, intrinsics, dino_features): 43 | """ fill operator """ 44 | tt = torch.tensor(timestamps, device=self.device) 45 | images = torch.stack(images, dim=0) 46 | if depths is not None: 47 | depths = torch.stack(depths, dim=0) 48 | intrinsics = torch.stack(intrinsics, 0) 49 | if dino_features is not None: 50 | dino_features = torch.stack(dino_features, dim=0).to(self.device) 51 | inputs = images.to(self.device) 52 | 53 | ### linear pose interpolation ### 54 | N = self.video.counter.value 55 | M = len(timestamps) 56 | 57 | ts = self.video.timestamp[:N] 58 | Ps = SE3(self.video.poses[:N]) 59 | 60 | # found the location of current timestamp in keyframe queue 61 | t0 = torch.tensor([ts[ts<=t].shape[0] - 1 for t in timestamps]) 62 | t1 = torch.where(t0 < N-1, t0+1, t0) 63 | 64 | # time interval between nearby keyframes 65 | dt = ts[t1] - ts[t0] + 1e-3 66 | dP = Ps[t1] * Ps[t0].inv() 67 | 68 | v = dP.log() / dt.unsqueeze(dim=-1) 69 | w = v * (tt - ts[t0]).unsqueeze(dim=-1) 70 | Gs = SE3.exp(w) * Ps[t0] 71 | 72 | # extract features (no need for context features) 73 | inputs = inputs.sub_(self.MEAN).div_(self.STDV) 74 | fmap = self.__feature_encoder(inputs) 75 | 76 | # temporally put the non-keyframe at the end of keyframe queue 77 | self.video.counter.value += M 78 | self.video[N:N+M] = (tt, images[:, 0], Gs.data, 1, depths, intrinsics / 8.0, fmap, None, None, dino_features) 79 | 80 | if self.uncertainty_aware: 81 | self.video.update_uncertainty_mask_given_index(range(N,N+M)) 82 | 83 | graph = FactorGraph(self.video, self.update) 84 | # build edge between current frame and nearby keyframes for optimization 85 | graph.add_factors(t0.cuda(), torch.arange(N, N+M).cuda()) 86 | graph.add_factors(t1.cuda(), torch.arange(N, N+M).cuda()) 87 | 88 | for _ in range(12): 89 | graph.update(N, N+M, motion_only=True) 90 | 91 | Gs = SE3(self.video.poses[N:N+M].clone()) 92 | self.video.counter.value -= M 93 | 94 | return [Gs] 95 | 96 | @torch.no_grad() 97 | def __call__(self, image_stream:BaseDataset): 98 | """ fill in poses of non-keyframe images. 
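        Non-keyframe poses are filled in chunks of 16 frames. Each pose is first
        initialised by linear interpolation on SE(3) between its two neighbouring
        keyframes (t0, P0) and (t1, P1),

            G(t) = exp( (t - t0) / (t1 - t0 + 1e-3) * log(P1 * P0^-1) ) * P0,

        and then refined by 12 motion-only factor-graph updates against those two
        keyframes (see __fill above). When uncertainty-aware tracking is enabled,
        per-frame DINO features are also extracted and returned alongside the poses.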
""" 99 | 100 | # store all camera poses 101 | pose_list = [] 102 | dino_feats = None 103 | if self.uncertainty_aware: 104 | dino_feats = [] 105 | 106 | timestamps = [] 107 | images = [] 108 | intrinsics = [] 109 | dino_features = [] 110 | 111 | self.printer.print("Filling full trajectory ...",FontColor.INFO) 112 | intrinsic = image_stream.get_intrinsic() 113 | for (timestamp, image, _ , _) in tqdm(image_stream): 114 | timestamps.append(timestamp) 115 | images.append(image) 116 | intrinsics.append(intrinsic) 117 | if self.uncertainty_aware: 118 | dino_feature = predict_img_features(self.feat_extractor, 119 | timestamp,image, 120 | self.cfg, 121 | self.device, 122 | save_feat=False) 123 | dino_features.append(dino_feature) 124 | else: 125 | dino_features = None 126 | 127 | if len(timestamps) == 16: 128 | pose_list += self.__fill(timestamps, images, None, intrinsics, dino_features) 129 | if dino_features is not None: 130 | dino_feats += dino_features 131 | timestamps, images, intrinsics, dino_features = [], [], [], [] 132 | 133 | if len(timestamps) > 0: 134 | pose_list += self.__fill(timestamps, images, None, intrinsics, dino_features) 135 | if dino_features is not None: 136 | dino_feats += dino_features 137 | 138 | # stitch pose segments together 139 | return lietorch.cat(pose_list, dim=0), dino_feats -------------------------------------------------------------------------------- /src/utils/Printer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The Splat-SLAM Authors. 2 | # Licensed under the Apache License, Version 2.0 3 | # available at: https://github.com/google-research/Splat-SLAM/blob/main/LICENSE 4 | 5 | from colorama import Fore, Style 6 | import torch.multiprocessing as mp 7 | 8 | 9 | class FontColor(object): 10 | MAPPER=Fore.CYAN 11 | TRACKER=Fore.BLUE 12 | INFO=Fore.YELLOW 13 | ERROR=Fore.RED 14 | PCL=Fore.GREEN 15 | EVAL=Fore.MAGENTA 16 | MESH="yellow" 17 | 18 | 19 | def get_msg_prefix(color): 20 | if color == FontColor.MAPPER: 21 | msg_prefix = color + "[MAPPER] " + Style.RESET_ALL 22 | elif color == FontColor.TRACKER: 23 | msg_prefix = color + "[TRACKER] " + Style.RESET_ALL 24 | elif color == FontColor.INFO: 25 | msg_prefix = color + "[INFO] " + Style.RESET_ALL 26 | elif color == FontColor.ERROR: 27 | msg_prefix = color + "[ERROR] " + Style.RESET_ALL 28 | elif color == FontColor.PCL: 29 | msg_prefix = color + "[POINTCLOUD] " + Style.RESET_ALL 30 | elif color == FontColor.EVAL: 31 | msg_prefix = color + "[EVALUATION] " + Style.RESET_ALL 32 | elif color == FontColor.MESH: 33 | msg_prefix = FontColor.INFO + "[MESH] " + Style.RESET_ALL 34 | else: 35 | msg_prefix = Style.RESET_ALL 36 | return msg_prefix 37 | 38 | class TrivialPrinter(object): 39 | def print(self,msg:str,color=None): 40 | msg_prefix = get_msg_prefix(color) 41 | msg = msg_prefix + msg + Style.RESET_ALL 42 | print(msg) 43 | 44 | class Printer(TrivialPrinter): 45 | def __init__(self, total_img_num): 46 | self.msg_lock = mp.Lock() 47 | self.msg_queue = mp.Queue() 48 | self.progress_counter = mp.Value('i', 0) 49 | process = mp.Process(target=self.printer_process, args=(total_img_num,)) 50 | process.start() 51 | def print(self,msg:str,color=None): 52 | msg_prefix = get_msg_prefix(color) 53 | msg = msg_prefix + msg + Style.RESET_ALL 54 | with self.msg_lock: 55 | self.msg_queue.put(msg) 56 | def update_pbar(self): 57 | with self.msg_lock: 58 | self.progress_counter.value += 1 59 | self.msg_queue.put(f"PROGRESS") 60 | def pbar_ready(self): 61 | with self.msg_lock: 62 
| self.msg_queue.put(f"READY") 63 | 64 | def printer_process(self,total_img_num): 65 | from tqdm import tqdm 66 | while True: 67 | message = self.msg_queue.get() 68 | if message == "READY": 69 | break 70 | else: 71 | print(message) 72 | with tqdm(total=total_img_num) as pbar: 73 | while self.progress_counter.value < total_img_num: 74 | message = self.msg_queue.get() 75 | if message == "DONE": 76 | break 77 | elif message.startswith("PROGRESS"): 78 | with self.msg_lock: 79 | completed = self.progress_counter.value 80 | pbar.set_description(FontColor.TRACKER+f"[TRACKER] "+Style.RESET_ALL) 81 | pbar.n = completed 82 | pbar.refresh() 83 | else: 84 | pbar.write(message) 85 | while True: 86 | message = self.msg_queue.get() 87 | if message == "DONE": 88 | break 89 | else: 90 | print(message) 91 | 92 | 93 | def terminate(self): 94 | self.msg_queue.put("DONE") 95 | 96 | 97 | -------------------------------------------------------------------------------- /src/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The Splat-SLAM Authors. 2 | # Licensed under the Apache License, Version 2.0 3 | # available at: https://github.com/google-research/Splat-SLAM/blob/main/LICENSE 4 | 5 | import numpy as np 6 | import random 7 | import torch 8 | 9 | 10 | def setup_seed(seed): 11 | torch.manual_seed(seed) 12 | torch.cuda.manual_seed_all(seed) 13 | np.random.seed(seed) 14 | random.seed(seed) 15 | torch.backends.cudnn.deterministic = True 16 | torch.backends.cudnn.benchmark = False 17 | 18 | 19 | def as_intrinsics_matrix(intrinsics): 20 | """ 21 | Get matrix representation of intrinsics. 22 | 23 | """ 24 | K = torch.eye(3) 25 | K[0, 0] = intrinsics[0] 26 | K[1, 1] = intrinsics[1] 27 | K[0, 2] = intrinsics[2] 28 | K[1, 2] = intrinsics[3] 29 | return K 30 | 31 | 32 | def update_cam(cfg): 33 | """ 34 | Update the camera intrinsics according to the pre-processing config, 35 | such as resize or edge crop 36 | """ 37 | # resize the input images to crop_size(variable name used in lietorch) 38 | H, W = cfg['cam']['H'], cfg['cam']['W'] 39 | fx, fy = cfg['cam']['fx'], cfg['cam']['fy'] 40 | cx, cy = cfg['cam']['cx'], cfg['cam']['cy'] 41 | 42 | h_edge, w_edge = cfg['cam']['H_edge'], cfg['cam']['W_edge'] 43 | H_out, W_out = cfg['cam']['H_out'], cfg['cam']['W_out'] 44 | 45 | fx = fx * (W_out + w_edge * 2) / W 46 | fy = fy * (H_out + h_edge * 2) / H 47 | cx = cx * (W_out + w_edge * 2) / W 48 | cy = cy * (H_out + h_edge * 2) / H 49 | H, W = H_out, W_out 50 | 51 | cx = cx - w_edge 52 | cy = cy - h_edge 53 | return H,W,fx,fy,cx,cy 54 | 55 | 56 | @torch.no_grad() 57 | def align_scale_and_shift(prediction, target, weights): 58 | 59 | ''' 60 | weighted least squares problem to solve scale and shift: 61 | min sum{ 62 | weight[i,j] * 63 | (prediction[i,j] * scale + shift - target[i,j])^2 64 | } 65 | 66 | prediction: [B,H,W] 67 | target: [B,H,W] 68 | weights: [B,H,W] 69 | ''' 70 | 71 | if weights is None: 72 | weights = torch.ones_like(prediction).to(prediction.device) 73 | if len(prediction.shape)<3: 74 | prediction = prediction.unsqueeze(0) 75 | target = target.unsqueeze(0) 76 | weights = weights.unsqueeze(0) 77 | a_00 = torch.sum(weights * prediction * prediction, dim=[1,2]) 78 | a_01 = torch.sum(weights * prediction, dim=[1,2]) 79 | a_11 = torch.sum(weights, dim=[1,2]) 80 | # right hand side: b = [b_0, b_1] 81 | b_0 = torch.sum(weights * prediction * target, dim=[1,2]) 82 | b_1 = torch.sum(weights * target, dim=[1,2]) 83 | # solution: x = A^-1 . 
b = [[a_11, -a_01], [-a_10, a_00]] / (a_00 * a_11 - a_01 * a_10) . b 84 | det = a_00 * a_11 - a_01 * a_01 85 | scale = (a_11 * b_0 - a_01 * b_1) / det 86 | shift = (-a_01 * b_0 + a_00 * b_1) / det 87 | error = (scale[:,None,None]*prediction+shift[:,None,None]-target).abs() 88 | masked_error = error*weights 89 | error_sum = masked_error.sum(dim=[1,2]) 90 | error_num = weights.sum(dim=[1,2]) 91 | avg_error = error_sum/error_num 92 | 93 | return scale,shift,avg_error -------------------------------------------------------------------------------- /src/utils/dyn_uncertainty/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/src/utils/dyn_uncertainty/__init__.py -------------------------------------------------------------------------------- /src/utils/dyn_uncertainty/median_filter.py: -------------------------------------------------------------------------------- 1 | # Based on https://gist.github.com/rwightman/f2d3849281624be7c0f11c85c87c1598 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.nn.modules.utils import _pair, _quadruple 7 | 8 | 9 | class MedianPool2d(nn.Module): 10 | """ Median pool module. 11 | 12 | This is used to smooth the thin line in ssim loss. 13 | 14 | Args: 15 | kernel_size: size of pooling kernel, int or 2-tuple 16 | stride: pool stride, int or 2-tuple 17 | padding: pool padding, int or 4-tuple (l, r, t, b) as in pytorch F.pad 18 | same: override padding and enforce same padding, boolean 19 | """ 20 | def __init__(self, kernel_size=3, stride=1, padding=0, same=False): 21 | super(MedianPool2d, self).__init__() 22 | self.k = _pair(kernel_size) 23 | self.stride = _pair(stride) 24 | self.padding = _quadruple(padding) # convert to l, r, t, b 25 | self.same = same 26 | 27 | def _padding(self, x): 28 | if self.same: 29 | ih, iw = x.size()[2:] 30 | if ih % self.stride[0] == 0: 31 | ph = max(self.k[0] - self.stride[0], 0) 32 | else: 33 | ph = max(self.k[0] - (ih % self.stride[0]), 0) 34 | if iw % self.stride[1] == 0: 35 | pw = max(self.k[1] - self.stride[1], 0) 36 | else: 37 | pw = max(self.k[1] - (iw % self.stride[1]), 0) 38 | pl = pw // 2 39 | pr = pw - pl 40 | pt = ph // 2 41 | pb = ph - pt 42 | padding = (pl, pr, pt, pb) 43 | else: 44 | padding = self.padding 45 | return padding 46 | 47 | def forward(self, x): 48 | # using existing pytorch functions and tensor ops so that we get autograd, 49 | # would likely be more efficient to implement from scratch at C/Cuda level 50 | x = F.pad(x, self._padding(x), mode='reflect') 51 | x = x.unfold(2, self.k[0], self.stride[0]).unfold(3, self.k[1], self.stride[1]) 52 | x = x.contiguous().view(x.size()[:4] + (-1,)).median(dim=-1)[0] 53 | return x -------------------------------------------------------------------------------- /src/utils/dyn_uncertainty/uncertainty_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class MLPNetwork(nn.Module): 6 | def __init__(self, input_dim: int = 384, hidden_dim: int = 64, output_dim: int = 1, 7 | net_depth: int = 2, net_activation=F.relu, weight_init: str = 'he_uniform'): 8 | super(MLPNetwork, self).__init__() 9 | 10 | self.output_layer_input_dim = hidden_dim 11 | 12 | # Initialize MLP layers 13 | self.layers = nn.ModuleList() 14 | for i in range(net_depth): 15 | dense_layer 
= nn.Linear(input_dim if i == 0 else hidden_dim, hidden_dim) 16 | 17 | # Apply weight initialization 18 | if weight_init == 'he_uniform': 19 | nn.init.kaiming_uniform_(dense_layer.weight, nonlinearity='relu') 20 | elif weight_init == 'xavier_uniform': 21 | nn.init.xavier_uniform_(dense_layer.weight) 22 | else: 23 | raise NotImplementedError(f"Unknown Weight initialization method {weight_init}") 24 | 25 | self.layers.append(dense_layer) 26 | 27 | # Initialize output layer 28 | self.output_layer = nn.Linear(self.output_layer_input_dim, output_dim) 29 | nn.init.kaiming_uniform_(self.output_layer.weight, nonlinearity='relu') 30 | 31 | # Set activation function 32 | self.net_activation = net_activation 33 | self.softplus = nn.Softplus() 34 | 35 | def forward(self, x: torch.Tensor) -> torch.Tensor: 36 | # Get input dimensions 37 | H, W, C = x.shape[-3:] 38 | input_with_batch_dim = True 39 | 40 | # Add batch dimension if not present 41 | if len(x.shape) == 3: 42 | input_with_batch_dim = False 43 | x = x.unsqueeze(0) 44 | batch_size = 1 45 | else: 46 | batch_size = x.shape[0] 47 | 48 | # Flatten input for MLP 49 | x = x.view(-1, x.size()[-1]) 50 | 51 | # Pass through MLP layers 52 | for layer in self.layers: 53 | x = layer(x) 54 | x = self.net_activation(x) 55 | x = F.dropout(x, p=0.2) 56 | 57 | # Pass through output layer and apply softplus activation 58 | x = self.output_layer(x) 59 | x = self.softplus(x) 60 | 61 | # Reshape output to original dimensions 62 | if input_with_batch_dim: 63 | x = x.view(batch_size, H, W) 64 | else: 65 | x = x.view(H, W) 66 | 67 | return x 68 | 69 | def generate_uncertainty_mlp(n_features: int) -> MLPNetwork: 70 | # Create and return an MLP network with the specified input dimensions 71 | network = MLPNetwork(input_dim=n_features).cuda() 72 | return network -------------------------------------------------------------------------------- /src/utils/eval_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/src/utils/eval_utils.py -------------------------------------------------------------------------------- /src/utils/mono_priors/metric_depth_estimators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torchvision import transforms 5 | import torchvision.transforms.functional as TF 6 | from typing import Dict, Tuple, Union 7 | 8 | from thirdparty.depth_anything_v2.metric_depth.depth_anything_v2.dpt import ( 9 | DepthAnythingV2, 10 | ) 11 | 12 | 13 | def get_metric_depth_estimator(cfg: Dict) -> torch.nn.Module: 14 | """ 15 | Get the metric depth estimator model based on the configuration. 16 | 17 | Args: 18 | cfg (Dict): Configuration dictionary. 19 | 20 | Returns: 21 | torch.nn.Module: The metric depth estimator model. 
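        Example (illustrative; the exact model string and device are assumptions):
            cfg = {"device": "cuda:0",
                   "mono_prior": {"depth": "dpt2_vitl_hypersim_20"}}
            model = get_metric_depth_estimator(cfg)
            # A "dpt2_<encoder>_<dataset>_<max_depth>" string selects a
            # Depth-Anything-V2 metric head and loads
            # pretrained/depth_anything_v2_metric_<dataset>_<encoder>.pth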
22 | """ 23 | device = cfg["device"] 24 | depth_model = cfg["mono_prior"]["depth"] 25 | 26 | if "metric3d_vit" in depth_model: 27 | # Options: metric3d_vit_small, metric3d_vit_large, metric3d_vit_giant2 28 | model = torch.hub.load("yvanyin/metric3d", depth_model, pretrain=True) 29 | elif "dpt2" in depth_model: 30 | model = _create_dpt2_model(depth_model) 31 | else: 32 | # If use other metric depth estimator as prior, write the code here 33 | raise NotImplementedError("Unsupported depth model") 34 | return model.to(device).eval() 35 | 36 | 37 | def _create_dpt2_model(depth_model: str) -> DepthAnythingV2: 38 | """ 39 | Create a DPT2 model based on the depth model string. 40 | 41 | Args: 42 | depth_model (str): Depth model configuration string. 43 | 44 | Returns: 45 | DepthAnythingV2: Configured DPT2 model. 46 | """ 47 | model_configs = { 48 | "vits": {"encoder": "vits", "features": 64, "out_channels": [48, 96, 192, 384]}, 49 | "vitb": { 50 | "encoder": "vitb", 51 | "features": 128, 52 | "out_channels": [96, 192, 384, 768], 53 | }, 54 | "vitl": { 55 | "encoder": "vitl", 56 | "features": 256, 57 | "out_channels": [256, 512, 1024, 1024], 58 | }, 59 | } 60 | 61 | encoder, dataset, max_depth = depth_model.split("_")[1:4] 62 | config = {**model_configs[encoder], "max_depth": int(max_depth)} 63 | model = DepthAnythingV2(**config) 64 | 65 | weights_path = f"pretrained/depth_anything_v2_metric_{dataset}_{encoder}.pth" 66 | model.load_state_dict( 67 | torch.load(weights_path, map_location="cpu", weights_only=True) 68 | ) 69 | 70 | return model 71 | 72 | 73 | @torch.no_grad() 74 | def predict_metric_depth( 75 | model: torch.nn.Module, 76 | idx: int, 77 | input_tensor: torch.Tensor, 78 | cfg: Dict, 79 | device: str, 80 | save_depth: bool = True, 81 | ) -> torch.Tensor: 82 | """ 83 | Predict metric depth using the given model. 84 | 85 | Args: 86 | model (torch.nn.Module): The depth estimation model. 87 | idx (int): Image index. 88 | input_tensor (torch.Tensor): Input image tensor of shape (1, 3, H, W). 89 | cfg (Dict): Configuration dictionary. 90 | device (str): Device to run the model on. 91 | save_depth (bool): Whether to save the depth map. 92 | 93 | Returns: 94 | torch.Tensor: Predicted depth map. 
95 | """ 96 | depth_model = cfg["mono_prior"]["depth"] 97 | if "metric3d_vit" in depth_model: 98 | output = _predict_metric3d_depth(model, input_tensor, cfg, device) 99 | elif "dpt2" in depth_model: 100 | # dpt2 model takes np.uint8 as the dtype of input 101 | input_numpy = (255.0 * input.squeeze().permute(1, 2, 0).cpu().numpy()).astype( 102 | np.uint8 103 | ) 104 | depth = model.infer_image(input_numpy, input_size=518) 105 | output = torch.tensor(depth).to(device) 106 | else: 107 | # If use other metric depth estimator as prior, write the code here 108 | raise NotImplementedError("Unsupported depth model") 109 | 110 | if save_depth: 111 | _save_depth_map(output, cfg, idx) 112 | 113 | return output 114 | 115 | 116 | def _predict_metric3d_depth( 117 | model: torch.nn.Module, input_tensor: torch.Tensor, cfg: Dict, device: str 118 | ) -> torch.Tensor: 119 | # Refer from: https://github.com/YvanYin/Metric3D/blob/34afafe58d9543f13c01b65222255dab53333838/hubconf.py#L181 120 | image_size = (616, 1064) 121 | h, w = input_tensor.shape[-2:] 122 | scale = min(image_size[0] / h, image_size[1] / w) 123 | 124 | trans_totensor = transforms.Compose( 125 | [ 126 | transforms.Resize((int(h * scale), int(w * scale))), 127 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 128 | ] 129 | ) 130 | img_tensor = trans_totensor(input_tensor).to(device) 131 | 132 | pad_h, pad_w = image_size[0] - int(h * scale), image_size[1] - int(w * scale) 133 | pad_h_half, pad_w_half = pad_h // 2, pad_w // 2 134 | img_tensor = TF.pad( 135 | img_tensor, 136 | (pad_w_half, pad_h_half, pad_w - pad_w_half, pad_h - pad_h_half), 137 | padding_mode="constant", 138 | fill=0.0, 139 | ) 140 | 141 | pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half] 142 | pred_depth, _, _ = model.inference({"input": img_tensor}) 143 | pred_depth = pred_depth.squeeze() 144 | pred_depth = pred_depth[ 145 | pad_info[0] : pred_depth.shape[0] - pad_info[1], 146 | pad_info[2] : pred_depth.shape[1] - pad_info[3], 147 | ] 148 | pred_depth = F.interpolate( 149 | pred_depth[None, None, :, :], (h, w), mode="bicubic" 150 | ).squeeze() 151 | 152 | canonical_to_real_scale = cfg["cam"]["fx"] / 1000.0 153 | pred_depth = pred_depth * canonical_to_real_scale 154 | return torch.clamp(pred_depth, 0, 300) 155 | 156 | 157 | def _save_depth_map(depth_map: torch.Tensor, cfg: Dict, idx: int) -> None: 158 | output_dir = f"{cfg['data']['output']}/{cfg['scene']}" 159 | output_path = f"{output_dir}/mono_priors/depths/{idx:05d}.npy" 160 | final_depth = depth_map.detach().cpu().float().numpy() 161 | np.save(output_path, final_depth) 162 | -------------------------------------------------------------------------------- /src/utils/plot_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import re 4 | 5 | 6 | def create_gif_from_directory(directory_path, output_filename, duration=100, online=True): 7 | """ 8 | Creates a GIF from all PNG images in a given directory. 9 | 10 | :param directory_path: Path to the directory containing PNG images. 11 | :param output_filename: Output filename for the GIF. 12 | :param duration: Duration of each frame in the GIF (in milliseconds). 
13 | """ 14 | # Function to extract the number from the filename 15 | def extract_number(filename): 16 | # Pattern to find a number followed by '.png' 17 | match = re.search(r'(\d+)\.png$', filename) 18 | if match: 19 | return int(match.group(1)) 20 | else: 21 | return None 22 | 23 | 24 | if online: 25 | # Get all PNG files in the directory 26 | image_files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.png')] 27 | 28 | # Sort the files based on the number in the filename 29 | image_files.sort(key=extract_number) 30 | else: 31 | # Get all PNG files in the directory 32 | image_files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.png')] 33 | 34 | # Sort the files based on the number in the filename 35 | image_files.sort() 36 | 37 | # Load images 38 | images = [Image.open(file) for file in image_files] 39 | 40 | # Convert images to the same mode and size for consistency 41 | images = [img.convert('RGBA') for img in images] 42 | base_size = images[0].size 43 | resized_images = [img.resize(base_size, Image.LANCZOS) for img in images] 44 | 45 | # Save as GIF 46 | resized_images[0].save(output_filename, save_all=True, append_images=resized_images[1:], optimize=False, duration=duration, loop=0) -------------------------------------------------------------------------------- /src/utils/pose_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The MonoGS Authors. 2 | 3 | # Licensed under the License issued by the MonoGS Authors 4 | # available here: https://github.com/muskie82/MonoGS/blob/main/LICENSE.md 5 | 6 | import numpy as np 7 | import torch 8 | 9 | 10 | def rt2mato(R, T): # TODO: remove? 11 | mat = np.eye(4) 12 | mat[0:3, 0:3] = R 13 | mat[0:3, 3] = T 14 | return mat 15 | 16 | 17 | def skew_sym_mat(x): 18 | device = x.device 19 | dtype = x.dtype 20 | ssm = torch.zeros(3, 3, device=device, dtype=dtype) 21 | ssm[0, 1] = -x[2] 22 | ssm[0, 2] = x[1] 23 | ssm[1, 0] = x[2] 24 | ssm[1, 2] = -x[0] 25 | ssm[2, 0] = -x[1] 26 | ssm[2, 1] = x[0] 27 | return ssm 28 | 29 | 30 | def SO3_exp(theta): 31 | device = theta.device 32 | dtype = theta.dtype 33 | 34 | W = skew_sym_mat(theta) 35 | W2 = W @ W 36 | angle = torch.norm(theta) 37 | I = torch.eye(3, device=device, dtype=dtype) 38 | if angle < 1e-5: 39 | return I + W + 0.5 * W2 40 | else: 41 | return ( 42 | I 43 | + (torch.sin(angle) / angle) * W 44 | + ((1 - torch.cos(angle)) / (angle**2)) * W2 45 | ) 46 | 47 | 48 | def V(theta): 49 | dtype = theta.dtype 50 | device = theta.device 51 | I = torch.eye(3, device=device, dtype=dtype) 52 | W = skew_sym_mat(theta) 53 | W2 = W @ W 54 | angle = torch.norm(theta) 55 | if angle < 1e-5: 56 | V = I + 0.5 * W + (1.0 / 6.0) * W2 57 | else: 58 | V = ( 59 | I 60 | + W * ((1.0 - torch.cos(angle)) / (angle**2)) 61 | + W2 * ((angle - torch.sin(angle)) / (angle**3)) 62 | ) 63 | return V 64 | 65 | 66 | def SE3_exp(tau): 67 | dtype = tau.dtype 68 | device = tau.device 69 | 70 | rho = tau[:3] 71 | theta = tau[3:] 72 | R = SO3_exp(theta) 73 | t = V(theta) @ rho 74 | 75 | T = torch.eye(4, device=device, dtype=dtype) 76 | T[:3, :3] = R 77 | T[:3, 3] = t 78 | return T 79 | 80 | 81 | def update_pose(camera, converged_threshold=1e-4): 82 | tau = torch.cat([camera.cam_trans_delta, camera.cam_rot_delta], axis=0) 83 | 84 | T_w2c = torch.eye(4, device=tau.device) 85 | T_w2c[0:3, 0:3] = camera.R 86 | T_w2c[0:3, 3] = camera.T 87 | 88 | new_w2c = SE3_exp(tau) @ T_w2c 89 | 90 | new_R = 
new_w2c[0:3, 0:3] 91 | new_T = new_w2c[0:3, 3] 92 | 93 | converged = tau.norm() < converged_threshold 94 | camera.update_RT(new_R, new_T) 95 | 96 | camera.cam_rot_delta.data.fill_(0) 97 | camera.cam_trans_delta.data.fill_(0) 98 | return converged 99 | -------------------------------------------------------------------------------- /thirdparty/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/__init__.py -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/DA-2K.md: -------------------------------------------------------------------------------- 1 | # DA-2K Evaluation Benchmark 2 | 3 | ## Introduction 4 | 5 | ![DA-2K](assets/DA-2K.png) 6 | 7 | DA-2K is proposed in [Depth Anything V2](https://depth-anything-v2.github.io) to evaluate the relative depth estimation capability. It encompasses eight representative scenarios of `indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`. It consists of 1K diverse high-quality images and 2K precise pair-wise relative depth annotations. 8 | 9 | Please refer to our [paper](https://arxiv.org/abs/2406.09414) for details in constructing this benchmark. 10 | 11 | 12 | ## Usage 13 | 14 | Please first [download the benchmark](https://huggingface.co/datasets/depth-anything/DA-2K/tree/main). 15 | 16 | All annotations are stored in `annotations.json`. The annotation file is a JSON object where each key is the path to an image file, and the value is a list of annotations associated with that image. Each annotation describes two points and identifies which point is closer to the camera. The structure is detailed below: 17 | 18 | ``` 19 | { 20 | "image_path": [ 21 | { 22 | "point1": [h1, w1], # (vertical position, horizontal position) 23 | "point2": [h2, w2], # (vertical position, horizontal position) 24 | "closer_point": "point1" # we always set "point1" as the closer one 25 | }, 26 | ... 27 | ], 28 | ... 29 | } 30 | ``` 31 | 32 | To visualize the annotations: 33 | ```bash 34 | python visualize.py [--scene-type ] 35 | ``` 36 | 37 | **Options** 38 | - `--scene-type ` (optional): Specify the scene type (`indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`). Skip this argument or set as `""` to include all scene types. 
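To consume the annotations programmatically, a minimal sketch (assuming `annotations.json` sits in the benchmark root next to the image folders) is:

```python
import json

with open("annotations.json") as f:
    annotations = json.load(f)

for image_path, pairs in annotations.items():
    for pair in pairs:
        h1, w1 = pair["point1"]            # point1 is always the closer point
        h2, w2 = pair["point2"]
        assert pair["closer_point"] == "point1"
```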
39 | 40 | ## Citation 41 | 42 | If you find this benchmark useful, please consider citing: 43 | 44 | ```bibtex 45 | @article{depth_anything_v2, 46 | title={Depth Anything V2}, 47 | author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, 48 | journal={arXiv:2406.09414}, 49 | year={2024} 50 | } 51 | ``` -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/app.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gradio as gr 3 | import matplotlib 4 | import numpy as np 5 | from PIL import Image 6 | import torch 7 | import tempfile 8 | from gradio_imageslider import ImageSlider 9 | 10 | from depth_anything_v2.dpt import DepthAnythingV2 11 | 12 | css = """ 13 | #img-display-container { 14 | max-height: 100vh; 15 | } 16 | #img-display-input { 17 | max-height: 80vh; 18 | } 19 | #img-display-output { 20 | max-height: 80vh; 21 | } 22 | #download { 23 | height: 62px; 24 | } 25 | """ 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | model_configs = { 28 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 29 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 30 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 31 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 32 | } 33 | encoder = 'vitl' 34 | model = DepthAnythingV2(**model_configs[encoder]) 35 | state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu") 36 | model.load_state_dict(state_dict) 37 | model = model.to(DEVICE).eval() 38 | 39 | title = "# Depth Anything V2" 40 | description = """Official demo for **Depth Anything V2**. 
41 | Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), or [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details.""" 42 | 43 | def predict_depth(image): 44 | return model.infer_image(image) 45 | 46 | with gr.Blocks(css=css) as demo: 47 | gr.Markdown(title) 48 | gr.Markdown(description) 49 | gr.Markdown("### Depth Prediction demo") 50 | 51 | with gr.Row(): 52 | input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input') 53 | depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5) 54 | submit = gr.Button(value="Compute Depth") 55 | gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download",) 56 | raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download",) 57 | 58 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 59 | 60 | def on_submit(image): 61 | original_image = image.copy() 62 | 63 | h, w = image.shape[:2] 64 | 65 | depth = predict_depth(image[:, :, ::-1]) 66 | 67 | raw_depth = Image.fromarray(depth.astype('uint16')) 68 | tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False) 69 | raw_depth.save(tmp_raw_depth.name) 70 | 71 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 72 | depth = depth.astype(np.uint8) 73 | colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8) 74 | 75 | gray_depth = Image.fromarray(depth) 76 | tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False) 77 | gray_depth.save(tmp_gray_depth.name) 78 | 79 | return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name] 80 | 81 | submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file]) 82 | 83 | example_files = glob.glob('assets/examples/*') 84 | examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file], fn=on_submit) 85 | 86 | 87 | if __name__ == '__main__': 88 | demo.queue().launch() -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/DA-2K.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/DA-2K.png -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo01.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo02.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo03.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo03.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo04.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo05.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo06.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo07.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo08.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo08.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo09.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo10.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo11.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo12.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo12.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo13.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo14.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo14.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo15.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo15.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo16.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo16.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo17.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo17.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo18.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo18.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo19.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo19.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples/demo20.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples/demo20.jpg -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples_video/basketball.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples_video/basketball.mp4 -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/examples_video/ferris_wheel.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/examples_video/ferris_wheel.mp4 -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/assets/teaser.png -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | from torch import Tensor 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | try: 21 | from xformers.ops import memory_efficient_attention, unbind, fmha 22 | 23 | XFORMERS_AVAILABLE = True 24 | except ImportError: 25 | logger.warning("xFormers not available") 26 | XFORMERS_AVAILABLE = False 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__( 31 | self, 32 | dim: int, 33 | num_heads: int = 8, 34 | qkv_bias: bool = False, 35 | proj_bias: bool = True, 36 | attn_drop: float = 0.0, 37 | proj_drop: float = 0.0, 38 | ) -> None: 39 | super().__init__() 40 | self.num_heads = num_heads 41 | head_dim = dim // num_heads 42 | self.scale = head_dim**-0.5 43 | 44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | B, N, C = x.shape 51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 52 | 53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 54 | attn = q @ k.transpose(-2, -1) 55 | 56 | attn = attn.softmax(dim=-1) 57 | attn = self.attn_drop(attn) 58 | 59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 60 | x = self.proj(x) 61 | x = self.proj_drop(x) 62 | return x 63 | 64 | 65 | class MemEffAttention(Attention): 66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 67 | if not XFORMERS_AVAILABLE: 68 | assert attn_bias is None, "xFormers is required for nested tensors usage" 69 | return super().forward(x) 70 | 71 | B, N, C = x.shape 72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 73 | 74 | q, k, v = unbind(qkv, 2) 75 | 76 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 77 | x = x.reshape([B, N, C]) 78 | 79 | x = self.proj(x) 80 | x = self.proj_drop(x) 81 | return x 82 | 83 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | from torch import Tensor 14 | import torch.nn as nn 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 68 | 69 | def forward(self, x: Tensor) -> Tensor: 70 | _, _, H, W = x.shape 71 | patch_H, patch_W = self.patch_size 72 | 73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 75 | 76 | x = self.proj(x) # B C H W 77 | H, W = x.size(2), x.size(3) 78 | x = x.flatten(2).transpose(1, 2) # B HW C 79 | x = self.norm(x) 80 | if not self.flatten_embedding: 81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 82 | return x 83 | 84 | def flops(self) -> float: 85 | Ho, Wo = self.patches_resolution 86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 87 | if self.norm is not None: 88 | flops += Ho * Wo * self.embed_dim 89 | return flops 90 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/depth_anything_v2/dinov2_layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/depth_anything_v2/util/blocks.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def _make_scratch(in_shape, out_shape, groups=1, expand=False): 5 | scratch = nn.Module() 6 | 7 | out_shape1 = out_shape 8 | out_shape2 = out_shape 9 | out_shape3 = out_shape 10 | if len(in_shape) >= 4: 11 | out_shape4 = out_shape 12 | 13 | if expand: 14 | out_shape1 = out_shape 15 | out_shape2 = out_shape * 2 16 | out_shape3 = out_shape * 4 17 | if len(in_shape) >= 4: 18 | out_shape4 = out_shape * 8 19 | 20 | scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 21 | scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 22 | scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 23 | if len(in_shape) >= 4: 24 | scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 25 | 26 | return scratch 27 | 28 | 29 | class ResidualConvUnit(nn.Module): 30 | """Residual convolution module. 31 | """ 32 | 33 | def __init__(self, features, activation, bn): 34 | """Init. 
35 | 36 | Args: 37 | features (int): number of features 38 | """ 39 | super().__init__() 40 | 41 | self.bn = bn 42 | 43 | self.groups=1 44 | 45 | self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) 46 | 47 | self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) 48 | 49 | if self.bn == True: 50 | self.bn1 = nn.BatchNorm2d(features) 51 | self.bn2 = nn.BatchNorm2d(features) 52 | 53 | self.activation = activation 54 | 55 | self.skip_add = nn.quantized.FloatFunctional() 56 | 57 | def forward(self, x): 58 | """Forward pass. 59 | 60 | Args: 61 | x (tensor): input 62 | 63 | Returns: 64 | tensor: output 65 | """ 66 | 67 | out = self.activation(x) 68 | out = self.conv1(out) 69 | if self.bn == True: 70 | out = self.bn1(out) 71 | 72 | out = self.activation(out) 73 | out = self.conv2(out) 74 | if self.bn == True: 75 | out = self.bn2(out) 76 | 77 | if self.groups > 1: 78 | out = self.conv_merge(out) 79 | 80 | return self.skip_add.add(out, x) 81 | 82 | 83 | class FeatureFusionBlock(nn.Module): 84 | """Feature fusion block. 85 | """ 86 | 87 | def __init__( 88 | self, 89 | features, 90 | activation, 91 | deconv=False, 92 | bn=False, 93 | expand=False, 94 | align_corners=True, 95 | size=None 96 | ): 97 | """Init. 98 | 99 | Args: 100 | features (int): number of features 101 | """ 102 | super(FeatureFusionBlock, self).__init__() 103 | 104 | self.deconv = deconv 105 | self.align_corners = align_corners 106 | 107 | self.groups=1 108 | 109 | self.expand = expand 110 | out_features = features 111 | if self.expand == True: 112 | out_features = features // 2 113 | 114 | self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) 115 | 116 | self.resConfUnit1 = ResidualConvUnit(features, activation, bn) 117 | self.resConfUnit2 = ResidualConvUnit(features, activation, bn) 118 | 119 | self.skip_add = nn.quantized.FloatFunctional() 120 | 121 | self.size=size 122 | 123 | def forward(self, *xs, size=None): 124 | """Forward pass. 125 | 126 | Returns: 127 | tensor: output 128 | """ 129 | output = xs[0] 130 | 131 | if len(xs) == 2: 132 | res = self.resConfUnit1(xs[1]) 133 | output = self.skip_add.add(output, res) 134 | 135 | output = self.resConfUnit2(output) 136 | 137 | if (size is None) and (self.size is None): 138 | modifier = {"scale_factor": 2} 139 | elif size is None: 140 | modifier = {"size": self.size} 141 | else: 142 | modifier = {"size": size} 143 | 144 | output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners) 145 | 146 | output = self.out_conv(output) 147 | 148 | return output 149 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/README.md: -------------------------------------------------------------------------------- 1 | # Depth Anything V2 for Metric Depth Estimation 2 | 3 | ![teaser](./assets/compare_zoedepth.png) 4 | 5 | We here provide a simple codebase to fine-tune our Depth Anything V2 pre-trained encoder for metric depth estimation. Built on our powerful encoder, we use a simple DPT head to regress the depth. We fine-tune our pre-trained encoder on synthetic Hypersim / Virtual KITTI datasets for indoor / outdoor metric depth estimation, respectively. 6 | 7 | 8 | # Pre-trained Models 9 | 10 | We provide **six metric depth models** of three scales for indoor and outdoor scenes, respectively. 
11 | 12 | | Base Model | Params | Indoor (Hypersim) | Outdoor (Virtual KITTI 2) | 13 | |:-|-:|:-:|:-:| 14 | | Depth-Anything-V2-Small | 24.8M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Small/resolve/main/depth_anything_v2_metric_hypersim_vits.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Small/resolve/main/depth_anything_v2_metric_vkitti_vits.pth?download=true) | 15 | | Depth-Anything-V2-Base | 97.5M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Base/resolve/main/depth_anything_v2_metric_hypersim_vitb.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Base/resolve/main/depth_anything_v2_metric_vkitti_vitb.pth?download=true) | 16 | | Depth-Anything-V2-Large | 335.3M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Large/resolve/main/depth_anything_v2_metric_hypersim_vitl.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Large/resolve/main/depth_anything_v2_metric_vkitti_vitl.pth?download=true) | 17 | 18 | *We recommend first trying our larger models (if the computational cost is affordable) and the indoor version.* 19 | 20 | ## Usage 21 | 22 | ### Preparation 23 | 24 | ```bash 25 | git clone https://github.com/DepthAnything/Depth-Anything-V2 26 | cd Depth-Anything-V2/metric_depth 27 | pip install -r requirements.txt 28 | ``` 29 | 30 | Download the checkpoints listed [here](#pre-trained-models) and put them under the `checkpoints` directory. 31 | 32 | ### Use our models 33 | ```python 34 | import cv2 35 | import torch 36 | 37 | from depth_anything_v2.dpt import DepthAnythingV2 38 | 39 | model_configs = { 40 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 41 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 42 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]} 43 | } 44 | 45 | encoder = 'vitl' # or 'vits', 'vitb' 46 | dataset = 'hypersim' # 'hypersim' for indoor model, 'vkitti' for outdoor model 47 | max_depth = 20 # 20 for indoor model, 80 for outdoor model 48 | 49 | model = DepthAnythingV2(**{**model_configs[encoder], 'max_depth': max_depth}) 50 | model.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_metric_{dataset}_{encoder}.pth', map_location='cpu')) 51 | model.eval() 52 | 53 | raw_img = cv2.imread('your/image/path') 54 | depth = model.infer_image(raw_img) # HxW depth map in meters (numpy array) 55 | ``` 56 | 57 | ### Running script on images 58 | 59 | Here, we take the `vitl` encoder as an example. You can also use `vitb` or `vits` encoders. 
60 | 61 | ```bash 62 | # indoor scenes 63 | python run.py \ 64 | --encoder vitl \ 65 | --load-from checkpoints/depth_anything_v2_metric_hypersim_vitl.pth \ 66 | --max-depth 20 \ 67 | --img-path --outdir [--input-size ] [--save-numpy] 68 | 69 | # outdoor scenes 70 | python run.py \ 71 | --encoder vitl \ 72 | --load-from checkpoints/depth_anything_v2_metric_vkitti_vitl.pth \ 73 | --max-depth 80 \ 74 | --img-path --outdir [--input-size ] [--save-numpy] 75 | ``` 76 | 77 | ### Project 2D images to point clouds: 78 | 79 | ```bash 80 | python depth_to_pointcloud.py \ 81 | --encoder vitl \ 82 | --load-from checkpoints/depth_anything_v2_metric_hypersim_vitl.pth \ 83 | --max-depth 20 \ 84 | --img-path --outdir 85 | ``` 86 | 87 | ### Reproduce training 88 | 89 | Please first prepare the [Hypersim](https://github.com/apple/ml-hypersim) and [Virtual KITTI 2](https://europe.naverlabs.com/research/computer-vision/proxy-virtual-worlds-vkitti-2/) datasets. Then: 90 | 91 | ```bash 92 | bash dist_train.sh 93 | ``` 94 | 95 | 96 | ## Citation 97 | 98 | If you find this project useful, please consider citing: 99 | 100 | ```bibtex 101 | @article{depth_anything_v2, 102 | title={Depth Anything V2}, 103 | author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, 104 | journal={arXiv:2406.09414}, 105 | year={2024} 106 | } 107 | 108 | @inproceedings{depth_anything_v1, 109 | title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data}, 110 | author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, 111 | booktitle={CVPR}, 112 | year={2024} 113 | } 114 | ``` 115 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/assets/compare_zoedepth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/depth_anything_v2/metric_depth/assets/compare_zoedepth.png -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/dataset/hypersim.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import h5py 3 | import numpy as np 4 | import torch 5 | from torch.utils.data import Dataset 6 | from torchvision.transforms import Compose 7 | 8 | from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop 9 | 10 | 11 | def hypersim_distance_to_depth(npyDistance): 12 | intWidth, intHeight, fltFocal = 1024, 768, 886.81 13 | 14 | npyImageplaneX = np.linspace((-0.5 * intWidth) + 0.5, (0.5 * intWidth) - 0.5, intWidth).reshape( 15 | 1, intWidth).repeat(intHeight, 0).astype(np.float32)[:, :, None] 16 | npyImageplaneY = np.linspace((-0.5 * intHeight) + 0.5, (0.5 * intHeight) - 0.5, 17 | intHeight).reshape(intHeight, 1).repeat(intWidth, 1).astype(np.float32)[:, :, None] 18 | npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32) 19 | npyImageplane = np.concatenate( 20 | [npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2) 21 | 22 | npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal 23 | return npyDepth 24 | 25 | 26 | class Hypersim(Dataset): 27 | def __init__(self, filelist_path, mode, size=(518, 518)): 28 | 29 | self.mode = mode 30 | self.size = size 31 | 32 | with open(filelist_path, 'r') as f: 33 | self.filelist = f.read().splitlines() 34 
| 35 | net_w, net_h = size 36 | self.transform = Compose([ 37 | Resize( 38 | width=net_w, 39 | height=net_h, 40 | resize_target=True if mode == 'train' else False, 41 | keep_aspect_ratio=True, 42 | ensure_multiple_of=14, 43 | resize_method='lower_bound', 44 | image_interpolation_method=cv2.INTER_CUBIC, 45 | ), 46 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 47 | PrepareForNet(), 48 | ] + ([Crop(size[0])] if self.mode == 'train' else [])) 49 | 50 | def __getitem__(self, item): 51 | img_path = self.filelist[item].split(' ')[0] 52 | depth_path = self.filelist[item].split(' ')[1] 53 | 54 | image = cv2.imread(img_path) 55 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 56 | 57 | depth_fd = h5py.File(depth_path, "r") 58 | distance_meters = np.array(depth_fd['dataset']) 59 | depth = hypersim_distance_to_depth(distance_meters) 60 | 61 | sample = self.transform({'image': image, 'depth': depth}) 62 | 63 | sample['image'] = torch.from_numpy(sample['image']) 64 | sample['depth'] = torch.from_numpy(sample['depth']) 65 | 66 | sample['valid_mask'] = (torch.isnan(sample['depth']) == 0) 67 | sample['depth'][sample['valid_mask'] == 0] = 0 68 | 69 | sample['image_path'] = self.filelist[item].split(' ')[0] 70 | 71 | return sample 72 | 73 | def __len__(self): 74 | return len(self.filelist) -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/dataset/kitti.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torch.utils.data import Dataset 4 | from torchvision.transforms import Compose 5 | 6 | from dataset.transform import Resize, NormalizeImage, PrepareForNet 7 | 8 | 9 | class KITTI(Dataset): 10 | def __init__(self, filelist_path, mode, size=(518, 518)): 11 | if mode != 'val': 12 | raise NotImplementedError 13 | 14 | self.mode = mode 15 | self.size = size 16 | 17 | with open(filelist_path, 'r') as f: 18 | self.filelist = f.read().splitlines() 19 | 20 | net_w, net_h = size 21 | self.transform = Compose([ 22 | Resize( 23 | width=net_w, 24 | height=net_h, 25 | resize_target=True if mode == 'train' else False, 26 | keep_aspect_ratio=True, 27 | ensure_multiple_of=14, 28 | resize_method='lower_bound', 29 | image_interpolation_method=cv2.INTER_CUBIC, 30 | ), 31 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 32 | PrepareForNet(), 33 | ]) 34 | 35 | def __getitem__(self, item): 36 | img_path = self.filelist[item].split(' ')[0] 37 | depth_path = self.filelist[item].split(' ')[1] 38 | 39 | image = cv2.imread(img_path) 40 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 41 | 42 | depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype('float32') 43 | 44 | sample = self.transform({'image': image, 'depth': depth}) 45 | 46 | sample['image'] = torch.from_numpy(sample['image']) 47 | sample['depth'] = torch.from_numpy(sample['depth']) 48 | sample['depth'] = sample['depth'] / 256.0 # convert in meters 49 | 50 | sample['valid_mask'] = sample['depth'] > 0 51 | 52 | sample['image_path'] = self.filelist[item].split(' ')[0] 53 | 54 | return sample 55 | 56 | def __len__(self): 57 | return len(self.filelist) -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/dataset/vkitti2.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torch.utils.data import Dataset 4 | from 
torchvision.transforms import Compose 5 | 6 | from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop 7 | 8 | 9 | class VKITTI2(Dataset): 10 | def __init__(self, filelist_path, mode, size=(518, 518)): 11 | 12 | self.mode = mode 13 | self.size = size 14 | 15 | with open(filelist_path, 'r') as f: 16 | self.filelist = f.read().splitlines() 17 | 18 | net_w, net_h = size 19 | self.transform = Compose([ 20 | Resize( 21 | width=net_w, 22 | height=net_h, 23 | resize_target=True if mode == 'train' else False, 24 | keep_aspect_ratio=True, 25 | ensure_multiple_of=14, 26 | resize_method='lower_bound', 27 | image_interpolation_method=cv2.INTER_CUBIC, 28 | ), 29 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 30 | PrepareForNet(), 31 | ] + ([Crop(size[0])] if self.mode == 'train' else [])) 32 | 33 | def __getitem__(self, item): 34 | img_path = self.filelist[item].split(' ')[0] 35 | depth_path = self.filelist[item].split(' ')[1] 36 | 37 | image = cv2.imread(img_path) 38 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 39 | 40 | depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) / 100.0 # cm to m 41 | 42 | sample = self.transform({'image': image, 'depth': depth}) 43 | 44 | sample['image'] = torch.from_numpy(sample['image']) 45 | sample['depth'] = torch.from_numpy(sample['depth']) 46 | 47 | sample['valid_mask'] = (sample['depth'] <= 80) 48 | 49 | sample['image_path'] = self.filelist[item].split(' ')[0] 50 | 51 | return sample 52 | 53 | def __len__(self): 54 | return len(self.filelist) -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | from torch import Tensor 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | try: 21 | from xformers.ops import memory_efficient_attention, unbind, fmha 22 | 23 | XFORMERS_AVAILABLE = True 24 | except ImportError: 25 | logger.warning("xFormers not available") 26 | XFORMERS_AVAILABLE = False 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__( 31 | self, 32 | dim: int, 33 | num_heads: int = 8, 34 | qkv_bias: bool = False, 35 | proj_bias: bool = True, 36 | attn_drop: float = 0.0, 37 | proj_drop: float = 0.0, 38 | ) -> None: 39 | super().__init__() 40 | self.num_heads = num_heads 41 | head_dim = dim // num_heads 42 | self.scale = head_dim**-0.5 43 | 44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | B, N, C = x.shape 51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 52 | 53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 54 | attn = q @ k.transpose(-2, -1) 55 | 56 | attn = attn.softmax(dim=-1) 57 | attn = self.attn_drop(attn) 58 | 59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 60 | x = self.proj(x) 61 | x = self.proj_drop(x) 62 | return x 63 | 64 | 65 | class MemEffAttention(Attention): 66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 67 | if not XFORMERS_AVAILABLE: 68 | assert attn_bias is None, "xFormers is required for nested tensors usage" 69 | return super().forward(x) 70 | 71 | B, N, C = x.shape 72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 73 | 74 | q, k, v = unbind(qkv, 2) 75 | 76 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 77 | x = x.reshape([B, N, C]) 78 | 79 | x = self.proj(x) 80 | x = self.proj_drop(x) 81 | return x 82 | 83 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | from torch import Tensor 14 | import torch.nn as nn 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 68 | 69 | def forward(self, x: Tensor) -> Tensor: 70 | _, _, H, W = x.shape 71 | patch_H, patch_W = self.patch_size 72 | 73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 75 | 76 | x = self.proj(x) # B C H W 77 | H, W = x.size(2), x.size(3) 78 | x = x.flatten(2).transpose(1, 2) # B HW C 79 | x = self.norm(x) 80 | if not self.flatten_embedding: 81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 82 | return x 83 | 84 | def flops(self) -> float: 85 | Ho, Wo = self.patches_resolution 86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 87 | if self.norm is not None: 88 | flops += Ho * Wo * self.embed_dim 89 | return flops 90 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_anything_v2/util/blocks.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def _make_scratch(in_shape, out_shape, groups=1, expand=False): 5 | scratch = nn.Module() 6 | 7 | out_shape1 = out_shape 8 | out_shape2 = out_shape 9 | out_shape3 = out_shape 10 | if len(in_shape) >= 4: 11 | out_shape4 = out_shape 12 | 13 | if expand: 14 | out_shape1 = out_shape 15 | out_shape2 = out_shape * 2 16 | out_shape3 = out_shape * 4 17 | if len(in_shape) >= 4: 18 | out_shape4 = out_shape * 8 19 | 20 | scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 21 | scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 22 | scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 23 | if len(in_shape) >= 4: 24 | scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 25 | 26 | return scratch 27 | 28 | 29 | class ResidualConvUnit(nn.Module): 30 | """Residual convolution module. 31 | """ 32 | 33 | def __init__(self, features, activation, bn): 34 | """Init. 
35 | 36 | Args: 37 | features (int): number of features 38 | """ 39 | super().__init__() 40 | 41 | self.bn = bn 42 | 43 | self.groups=1 44 | 45 | self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) 46 | 47 | self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) 48 | 49 | if self.bn == True: 50 | self.bn1 = nn.BatchNorm2d(features) 51 | self.bn2 = nn.BatchNorm2d(features) 52 | 53 | self.activation = activation 54 | 55 | self.skip_add = nn.quantized.FloatFunctional() 56 | 57 | def forward(self, x): 58 | """Forward pass. 59 | 60 | Args: 61 | x (tensor): input 62 | 63 | Returns: 64 | tensor: output 65 | """ 66 | 67 | out = self.activation(x) 68 | out = self.conv1(out) 69 | if self.bn == True: 70 | out = self.bn1(out) 71 | 72 | out = self.activation(out) 73 | out = self.conv2(out) 74 | if self.bn == True: 75 | out = self.bn2(out) 76 | 77 | if self.groups > 1: 78 | out = self.conv_merge(out) 79 | 80 | return self.skip_add.add(out, x) 81 | 82 | 83 | class FeatureFusionBlock(nn.Module): 84 | """Feature fusion block. 85 | """ 86 | 87 | def __init__( 88 | self, 89 | features, 90 | activation, 91 | deconv=False, 92 | bn=False, 93 | expand=False, 94 | align_corners=True, 95 | size=None 96 | ): 97 | """Init. 98 | 99 | Args: 100 | features (int): number of features 101 | """ 102 | super(FeatureFusionBlock, self).__init__() 103 | 104 | self.deconv = deconv 105 | self.align_corners = align_corners 106 | 107 | self.groups=1 108 | 109 | self.expand = expand 110 | out_features = features 111 | if self.expand == True: 112 | out_features = features // 2 113 | 114 | self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) 115 | 116 | self.resConfUnit1 = ResidualConvUnit(features, activation, bn) 117 | self.resConfUnit2 = ResidualConvUnit(features, activation, bn) 118 | 119 | self.skip_add = nn.quantized.FloatFunctional() 120 | 121 | self.size=size 122 | 123 | def forward(self, *xs, size=None): 124 | """Forward pass. 125 | 126 | Returns: 127 | tensor: output 128 | """ 129 | output = xs[0] 130 | 131 | if len(xs) == 2: 132 | res = self.resConfUnit1(xs[1]) 133 | output = self.skip_add.add(output, res) 134 | 135 | output = self.resConfUnit2(output) 136 | 137 | if (size is None) and (self.size is None): 138 | modifier = {"scale_factor": 2} 139 | elif size is None: 140 | modifier = {"size": self.size} 141 | else: 142 | modifier = {"size": size} 143 | 144 | output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners) 145 | 146 | output = self.out_conv(output) 147 | 148 | return output 149 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/depth_to_pointcloud.py: -------------------------------------------------------------------------------- 1 | """ 2 | Born out of Depth Anything V1 Issue 36 3 | Make sure you have the necessary libraries installed. 4 | Code by @1ssb 5 | 6 | This script processes a set of images to generate depth maps and corresponding point clouds. 7 | The resulting point clouds are saved in the specified output directory. 8 | 9 | Usage: 10 | python script.py --encoder vitl --load-from path_to_model --max-depth 20 --img-path path_to_images --outdir output_directory --focal-length-x 470.4 --focal-length-y 470.4 11 | 12 | Arguments: 13 | --encoder: Model encoder to use. Choices are ['vits', 'vitb', 'vitl', 'vitg']. 
14 | --load-from: Path to the pre-trained model weights. 15 | --max-depth: Maximum depth value for the depth map. 16 | --img-path: Path to the input image or directory containing images. 17 | --outdir: Directory to save the output point clouds. 18 | --focal-length-x: Focal length along the x-axis. 19 | --focal-length-y: Focal length along the y-axis. 20 | """ 21 | 22 | import argparse 23 | import cv2 24 | import glob 25 | import numpy as np 26 | import open3d as o3d 27 | import os 28 | from PIL import Image 29 | import torch 30 | 31 | from depth_anything_v2.dpt import DepthAnythingV2 32 | 33 | 34 | def main(): 35 | # Parse command-line arguments 36 | parser = argparse.ArgumentParser(description='Generate depth maps and point clouds from images.') 37 | parser.add_argument('--encoder', default='vitl', type=str, choices=['vits', 'vitb', 'vitl', 'vitg'], 38 | help='Model encoder to use.') 39 | parser.add_argument('--load-from', default='', type=str, required=True, 40 | help='Path to the pre-trained model weights.') 41 | parser.add_argument('--max-depth', default=20, type=float, 42 | help='Maximum depth value for the depth map.') 43 | parser.add_argument('--img-path', type=str, required=True, 44 | help='Path to the input image or directory containing images.') 45 | parser.add_argument('--outdir', type=str, default='./vis_pointcloud', 46 | help='Directory to save the output point clouds.') 47 | parser.add_argument('--focal-length-x', default=470.4, type=float, 48 | help='Focal length along the x-axis.') 49 | parser.add_argument('--focal-length-y', default=470.4, type=float, 50 | help='Focal length along the y-axis.') 51 | 52 | args = parser.parse_args() 53 | 54 | # Determine the device to use (CUDA, MPS, or CPU) 55 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 56 | 57 | # Model configuration based on the chosen encoder 58 | model_configs = { 59 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 60 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 61 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 62 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 63 | } 64 | 65 | # Initialize the DepthAnythingV2 model with the specified configuration 66 | depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth}) 67 | depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu')) 68 | depth_anything = depth_anything.to(DEVICE).eval() 69 | 70 | # Get the list of image files to process 71 | if os.path.isfile(args.img_path): 72 | if args.img_path.endswith('txt'): 73 | with open(args.img_path, 'r') as f: 74 | filenames = f.read().splitlines() 75 | else: 76 | filenames = [args.img_path] 77 | else: 78 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) 79 | 80 | # Create the output directory if it doesn't exist 81 | os.makedirs(args.outdir, exist_ok=True) 82 | 83 | # Process each image file 84 | for k, filename in enumerate(filenames): 85 | print(f'Processing {k+1}/{len(filenames)}: {filename}') 86 | 87 | # Load the image 88 | color_image = Image.open(filename).convert('RGB') 89 | width, height = color_image.size 90 | 91 | # Read the image using OpenCV 92 | image = cv2.imread(filename) 93 | pred = depth_anything.infer_image(image, height) 94 | 95 | # Resize depth prediction to match the original image size 96 | resized_pred = 
Image.fromarray(pred).resize((width, height), Image.NEAREST) 97 | 98 | # Generate mesh grid and calculate point cloud coordinates 99 | x, y = np.meshgrid(np.arange(width), np.arange(height)) 100 | x = (x - width / 2) / args.focal_length_x 101 | y = (y - height / 2) / args.focal_length_y 102 | z = np.array(resized_pred) 103 | points = np.stack((np.multiply(x, z), np.multiply(y, z), z), axis=-1).reshape(-1, 3) 104 | colors = np.array(color_image).reshape(-1, 3) / 255.0 105 | 106 | # Create the point cloud and save it to the output directory 107 | pcd = o3d.geometry.PointCloud() 108 | pcd.points = o3d.utility.Vector3dVector(points) 109 | pcd.colors = o3d.utility.Vector3dVector(colors) 110 | o3d.io.write_point_cloud(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + ".ply"), pcd) 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | now=$(date +"%Y%m%d_%H%M%S") 3 | 4 | epoch=120 5 | bs=4 6 | gpus=8 7 | lr=0.000005 8 | encoder=vitl 9 | dataset=hypersim # vkitti 10 | img_size=518 11 | min_depth=0.001 12 | max_depth=20 # 80 for virtual kitti 13 | pretrained_from=../checkpoints/depth_anything_v2_${encoder}.pth 14 | save_path=exp/hypersim # exp/vkitti 15 | 16 | mkdir -p $save_path 17 | 18 | python3 -m torch.distributed.launch \ 19 | --nproc_per_node=$gpus \ 20 | --nnodes 1 \ 21 | --node_rank=0 \ 22 | --master_addr=localhost \ 23 | --master_port=20596 \ 24 | train.py --epoch $epoch --encoder $encoder --bs $bs --lr $lr --save-path $save_path --dataset $dataset \ 25 | --img-size $img_size --min-depth $min_depth --max-depth $max_depth --pretrained-from $pretrained_from \ 26 | --port 20596 2>&1 | tee -a $save_path/$now.log 27 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | opencv-python 3 | open3d 4 | torch 5 | torchvision 6 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation') 14 | 15 | parser.add_argument('--img-path', type=str) 16 | parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth') 21 | parser.add_argument('--max-depth', type=float, default=20) 22 | 23 | parser.add_argument('--save-numpy', dest='save_numpy', action='store_true', help='save the model raw output') 24 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 25 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not 
apply colorful palette') 26 | 27 | args = parser.parse_args() 28 | 29 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 30 | 31 | model_configs = { 32 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 33 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 34 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 35 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 36 | } 37 | 38 | depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth}) 39 | depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu')) 40 | depth_anything = depth_anything.to(DEVICE).eval() 41 | 42 | if os.path.isfile(args.img_path): 43 | if args.img_path.endswith('txt'): 44 | with open(args.img_path, 'r') as f: 45 | filenames = f.read().splitlines() 46 | else: 47 | filenames = [args.img_path] 48 | else: 49 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) 50 | 51 | os.makedirs(args.outdir, exist_ok=True) 52 | 53 | cmap = matplotlib.colormaps.get_cmap('Spectral') 54 | 55 | for k, filename in enumerate(filenames): 56 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 57 | 58 | raw_image = cv2.imread(filename) 59 | 60 | depth = depth_anything.infer_image(raw_image, args.input_size) 61 | 62 | if args.save_numpy: 63 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '_raw_depth_meter.npy') 64 | np.save(output_path, depth) 65 | 66 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 67 | depth = depth.astype(np.uint8) 68 | 69 | if args.grayscale: 70 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) 71 | else: 72 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 73 | 74 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png') 75 | if args.pred_only: 76 | cv2.imwrite(output_path, depth) 77 | else: 78 | split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255 79 | combined_result = cv2.hconcat([raw_image, split_region, depth]) 80 | 81 | cv2.imwrite(output_path, combined_result) -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/util/dist_helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | import torch 5 | import torch.distributed as dist 6 | 7 | 8 | def setup_distributed(backend="nccl", port=None): 9 | """AdaHessian Optimizer 10 | Lifted from https://github.com/BIGBALLON/distribuuuu/blob/master/distribuuuu/utils.py 11 | Originally licensed MIT, Copyright (c) 2020 Wei Li 12 | """ 13 | num_gpus = torch.cuda.device_count() 14 | 15 | if "SLURM_JOB_ID" in os.environ: 16 | rank = int(os.environ["SLURM_PROCID"]) 17 | world_size = int(os.environ["SLURM_NTASKS"]) 18 | node_list = os.environ["SLURM_NODELIST"] 19 | addr = subprocess.getoutput(f"scontrol show hostname {node_list} | head -n1") 20 | # specify master port 21 | if port is not None: 22 | os.environ["MASTER_PORT"] = str(port) 23 | elif "MASTER_PORT" not in os.environ: 24 | os.environ["MASTER_PORT"] = "10685" 25 | if "MASTER_ADDR" not in os.environ: 26 | os.environ["MASTER_ADDR"] = addr 27 | os.environ["WORLD_SIZE"] = str(world_size) 28 | os.environ["LOCAL_RANK"] = str(rank % num_gpus) 29 | os.environ["RANK"] = 
str(rank) 30 | else: 31 | rank = int(os.environ["RANK"]) 32 | world_size = int(os.environ["WORLD_SIZE"]) 33 | 34 | torch.cuda.set_device(rank % num_gpus) 35 | 36 | dist.init_process_group( 37 | backend=backend, 38 | world_size=world_size, 39 | rank=rank, 40 | ) 41 | return rank, world_size 42 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/util/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class SiLogLoss(nn.Module): 6 | def __init__(self, lambd=0.5): 7 | super().__init__() 8 | self.lambd = lambd 9 | 10 | def forward(self, pred, target, valid_mask): 11 | valid_mask = valid_mask.detach() 12 | diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask]) 13 | loss = torch.sqrt(torch.pow(diff_log, 2).mean() - 14 | self.lambd * torch.pow(diff_log.mean(), 2)) 15 | 16 | return loss 17 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/util/metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def eval_depth(pred, target): 5 | assert pred.shape == target.shape 6 | 7 | thresh = torch.max((target / pred), (pred / target)) 8 | 9 | d1 = torch.sum(thresh < 1.25).float() / len(thresh) 10 | d2 = torch.sum(thresh < 1.25 ** 2).float() / len(thresh) 11 | d3 = torch.sum(thresh < 1.25 ** 3).float() / len(thresh) 12 | 13 | diff = pred - target 14 | diff_log = torch.log(pred) - torch.log(target) 15 | 16 | abs_rel = torch.mean(torch.abs(diff) / target) 17 | sq_rel = torch.mean(torch.pow(diff, 2) / target) 18 | 19 | rmse = torch.sqrt(torch.mean(torch.pow(diff, 2))) 20 | rmse_log = torch.sqrt(torch.mean(torch.pow(diff_log , 2))) 21 | 22 | log10 = torch.mean(torch.abs(torch.log10(pred) - torch.log10(target))) 23 | silog = torch.sqrt(torch.pow(diff_log, 2).mean() - 0.5 * torch.pow(diff_log.mean(), 2)) 24 | 25 | return {'d1': d1.item(), 'd2': d2.item(), 'd3': d3.item(), 'abs_rel': abs_rel.item(), 'sq_rel': sq_rel.item(), 26 | 'rmse': rmse.item(), 'rmse_log': rmse_log.item(), 'log10':log10.item(), 'silog':silog.item()} -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/metric_depth/util/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | import logging 5 | 6 | logs = set() 7 | 8 | 9 | def init_log(name, level=logging.INFO): 10 | if (name, level) in logs: 11 | return 12 | logs.add((name, level)) 13 | logger = logging.getLogger(name) 14 | logger.setLevel(level) 15 | ch = logging.StreamHandler() 16 | ch.setLevel(level) 17 | if "SLURM_PROCID" in os.environ: 18 | rank = int(os.environ["SLURM_PROCID"]) 19 | logger.addFilter(lambda record: rank == 0) 20 | else: 21 | rank = 0 22 | format_str = "[%(asctime)s][%(levelname)8s] %(message)s" 23 | formatter = logging.Formatter(format_str) 24 | ch.setFormatter(formatter) 25 | logger.addHandler(ch) 26 | return logger 27 | -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio_imageslider 2 | gradio==4.29.0 3 | matplotlib 4 | opencv-python 5 | torch 6 | torchvision 7 | -------------------------------------------------------------------------------- 
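The `SiLogLoss` and `eval_depth` helpers shown above (in `metric_depth/util/loss.py` and `metric_depth/util/metric.py`) are small enough to exercise on their own. A minimal sketch, assuming `metric_depth` is the working directory so that the `util` package is importable; the random tensors are illustrative stand-ins for real depth maps:

```python
import torch

from util.loss import SiLogLoss     # scale-invariant log loss defined above
from util.metric import eval_depth  # depth-accuracy metrics defined above

# Fake positive depth maps in metres; real usage would pass network output and GT depth.
pred = torch.rand(2, 480, 640) * 10 + 0.1
target = torch.rand(2, 480, 640) * 10 + 0.1
valid_mask = target > 0.001  # boolean mask of pixels with usable ground truth

criterion = SiLogLoss(lambd=0.5)
loss = criterion(pred, target, valid_mask)  # scalar tensor

metrics = eval_depth(pred[valid_mask], target[valid_mask])  # dict of floats
print(f"silog {loss.item():.4f}  abs_rel {metrics['abs_rel']:.4f}  d1 {metrics['d1']:.4f}")
```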
/thirdparty/depth_anything_v2/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Depth Anything V2') 14 | 15 | parser.add_argument('--img-path', type=str) 16 | parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | 21 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 22 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') 23 | 24 | args = parser.parse_args() 25 | 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | 28 | model_configs = { 29 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 30 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 31 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 32 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 33 | } 34 | 35 | depth_anything = DepthAnythingV2(**model_configs[args.encoder]) 36 | depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu')) 37 | depth_anything = depth_anything.to(DEVICE).eval() 38 | 39 | if os.path.isfile(args.img_path): 40 | if args.img_path.endswith('txt'): 41 | with open(args.img_path, 'r') as f: 42 | filenames = f.read().splitlines() 43 | else: 44 | filenames = [args.img_path] 45 | else: 46 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) 47 | 48 | os.makedirs(args.outdir, exist_ok=True) 49 | 50 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 51 | 52 | for k, filename in enumerate(filenames): 53 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 54 | 55 | raw_image = cv2.imread(filename) 56 | 57 | depth = depth_anything.infer_image(raw_image, args.input_size) 58 | 59 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 60 | depth = depth.astype(np.uint8) 61 | 62 | if args.grayscale: 63 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) 64 | else: 65 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 66 | 67 | if args.pred_only: 68 | cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), depth) 69 | else: 70 | split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255 71 | combined_result = cv2.hconcat([raw_image, split_region, depth]) 72 | 73 | cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), combined_result) -------------------------------------------------------------------------------- /thirdparty/depth_anything_v2/run_video.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = 
argparse.ArgumentParser(description='Depth Anything V2') 14 | 15 | parser.add_argument('--video-path', type=str) 16 | parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_video_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | 21 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 22 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') 23 | 24 | args = parser.parse_args() 25 | 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | 28 | model_configs = { 29 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 30 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 31 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 32 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 33 | } 34 | 35 | depth_anything = DepthAnythingV2(**model_configs[args.encoder]) 36 | depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu')) 37 | depth_anything = depth_anything.to(DEVICE).eval() 38 | 39 | if os.path.isfile(args.video_path): 40 | if args.video_path.endswith('txt'): 41 | with open(args.video_path, 'r') as f: 42 | lines = f.read().splitlines() 43 | else: 44 | filenames = [args.video_path] 45 | else: 46 | filenames = glob.glob(os.path.join(args.video_path, '**/*'), recursive=True) 47 | 48 | os.makedirs(args.outdir, exist_ok=True) 49 | 50 | margin_width = 50 51 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 52 | 53 | for k, filename in enumerate(filenames): 54 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 55 | 56 | raw_video = cv2.VideoCapture(filename) 57 | frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 58 | frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS)) 59 | 60 | if args.pred_only: 61 | output_width = frame_width 62 | else: 63 | output_width = frame_width * 2 + margin_width 64 | 65 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.mp4') 66 | out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (output_width, frame_height)) 67 | 68 | while raw_video.isOpened(): 69 | ret, raw_frame = raw_video.read() 70 | if not ret: 71 | break 72 | 73 | depth = depth_anything.infer_image(raw_frame, args.input_size) 74 | 75 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 76 | depth = depth.astype(np.uint8) 77 | 78 | if args.grayscale: 79 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) 80 | else: 81 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 82 | 83 | if args.pred_only: 84 | out.write(depth) 85 | else: 86 | split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255 87 | combined_frame = cv2.hconcat([raw_frame, split_region, depth]) 88 | 89 | out.write(combined_frame) 90 | 91 | raw_video.release() 92 | out.release() 93 | -------------------------------------------------------------------------------- /thirdparty/gaussian_splatting/LICENSE.md: -------------------------------------------------------------------------------- 1 | Gaussian-Splatting License 2 | =========================== 3 | 
4 | **Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**. 5 | The *Software* is in the process of being registered with the Agence pour la Protection des 6 | Programmes (APP). 7 | 8 | The *Software* is still being developed by the *Licensor*. 9 | 10 | *Licensor*'s goal is to allow the research community to use, test and evaluate 11 | the *Software*. 12 | 13 | ## 1. Definitions 14 | 15 | *Licensee* means any person or entity that uses the *Software* and distributes 16 | its *Work*. 17 | 18 | *Licensor* means the owners of the *Software*, i.e Inria and MPII 19 | 20 | *Software* means the original work of authorship made available under this 21 | License ie gaussian-splatting. 22 | 23 | *Work* means the *Software* and any additions to or derivative works of the 24 | *Software* that are made available under this License. 25 | 26 | 27 | ## 2. Purpose 28 | This license is intended to define the rights granted to the *Licensee* by 29 | Licensors under the *Software*. 30 | 31 | ## 3. Rights granted 32 | 33 | For the above reasons Licensors have decided to distribute the *Software*. 34 | Licensors grant non-exclusive rights to use the *Software* for research purposes 35 | to research users (both academic and industrial), free of charge, without right 36 | to sublicense.. The *Software* may be used "non-commercially", i.e., for research 37 | and/or evaluation purposes only. 38 | 39 | Subject to the terms and conditions of this License, you are granted a 40 | non-exclusive, royalty-free, license to reproduce, prepare derivative works of, 41 | publicly display, publicly perform and distribute its *Work* and any resulting 42 | derivative works in any form. 43 | 44 | ## 4. Limitations 45 | 46 | **4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do 47 | so under this License, (b) you include a complete copy of this License with 48 | your distribution, and (c) you retain without modification any copyright, 49 | patent, trademark, or attribution notices that are present in the *Work*. 50 | 51 | **4.2 Derivative Works.** You may specify that additional or different terms apply 52 | to the use, reproduction, and distribution of your derivative works of the *Work* 53 | ("Your Terms") only if (a) Your Terms provide that the use limitation in 54 | Section 2 applies to your derivative works, and (b) you identify the specific 55 | derivative works that are subject to Your Terms. Notwithstanding Your Terms, 56 | this License (including the redistribution requirements in Section 3.1) will 57 | continue to apply to the *Work* itself. 58 | 59 | **4.3** Any other use without of prior consent of Licensors is prohibited. Research 60 | users explicitly acknowledge having received from Licensors all information 61 | allowing to appreciate the adequacy between of the *Software* and their needs and 62 | to undertake all necessary precautions for its execution and use. 63 | 64 | **4.4** The *Software* is provided both as a compiled library file and as source 65 | code. In case of using the *Software* for a publication or other results obtained 66 | through the use of the *Software*, users are strongly encouraged to cite the 67 | corresponding publications as explained in the documentation of the *Software*. 68 | 69 | ## 5. Disclaimer 70 | 71 | THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES 72 | WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. 
YOU MUST CONTACT INRIA FOR ANY 73 | UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL 74 | CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES 75 | OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL 76 | USE, PROFESSIONNAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR 77 | ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE 78 | AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 79 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 80 | GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION) 81 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 82 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR 83 | IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*. 84 | 85 | ## 6. Files subject to permissive licenses 86 | The contents of the file ```utils/loss_utils.py``` are based on publicly available code authored by Evan Su, which falls under the permissive MIT license. 87 | 88 | Title: pytorch-ssim\ 89 | Project code: https://github.com/Po-Hsun-Su/pytorch-ssim\ 90 | Copyright Evan Su, 2017\ 91 | License: https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/LICENSE.txt (MIT) -------------------------------------------------------------------------------- /thirdparty/gaussian_splatting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GradientSpaces/WildGS-SLAM/24e6abf400d978955e2b26b3c451817aa6a6a11a/thirdparty/gaussian_splatting/__init__.py -------------------------------------------------------------------------------- /thirdparty/gaussian_splatting/gaussian_renderer/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | import math 13 | 14 | import torch 15 | from diff_gaussian_rasterization import ( 16 | GaussianRasterizationSettings, 17 | GaussianRasterizer, 18 | ) 19 | 20 | from thirdparty.gaussian_splatting.scene.gaussian_model import GaussianModel 21 | from thirdparty.gaussian_splatting.utils.sh_utils import eval_sh 22 | 23 | 24 | def render( 25 | viewpoint_camera, 26 | pc: GaussianModel, 27 | pipe, 28 | bg_color: torch.Tensor, 29 | scaling_modifier=1.0, 30 | override_color=None, 31 | mask=None, 32 | ): 33 | """ 34 | Render the scene. 35 | 36 | Background tensor (bg_color) must be on GPU! 37 | """ 38 | 39 | # Create zero tensor. 
We will use it to make pytorch return gradients of the 2D (screen-space) means 40 | if pc.get_xyz.shape[0] == 0: 41 | return None 42 | 43 | screenspace_points = ( 44 | torch.zeros_like( 45 | pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" 46 | ) 47 | + 0 48 | ) 49 | try: 50 | screenspace_points.retain_grad() 51 | except Exception: 52 | pass 53 | 54 | # Set up rasterization configuration 55 | tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) 56 | tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) 57 | 58 | raster_settings = GaussianRasterizationSettings( 59 | image_height=int(viewpoint_camera.image_height), 60 | image_width=int(viewpoint_camera.image_width), 61 | tanfovx=tanfovx, 62 | tanfovy=tanfovy, 63 | bg=bg_color, 64 | scale_modifier=scaling_modifier, 65 | viewmatrix=viewpoint_camera.world_view_transform, 66 | projmatrix=viewpoint_camera.full_proj_transform, 67 | projmatrix_raw=viewpoint_camera.projection_matrix, 68 | sh_degree=pc.active_sh_degree, 69 | campos=viewpoint_camera.camera_center, 70 | prefiltered=False, 71 | debug=False, 72 | ) 73 | 74 | rasterizer = GaussianRasterizer(raster_settings=raster_settings) 75 | 76 | means3D = pc.get_xyz 77 | means2D = screenspace_points 78 | opacity = pc.get_opacity 79 | 80 | # If precomputed 3d covariance is provided, use it. If not, then it will be computed from 81 | # scaling / rotation by the rasterizer. 82 | scales = None 83 | rotations = None 84 | cov3D_precomp = None 85 | if pipe.compute_cov3D_python: 86 | cov3D_precomp = pc.get_covariance(scaling_modifier) 87 | else: 88 | # check if the covariance is isotropic 89 | if pc.get_scaling.shape[-1] == 1: 90 | scales = pc.get_scaling.repeat(1, 3) 91 | else: 92 | scales = pc.get_scaling 93 | rotations = pc.get_rotation 94 | 95 | # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors 96 | # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 97 | shs = None 98 | colors_precomp = None 99 | if colors_precomp is None: 100 | if pipe.convert_SHs_python: 101 | shs_view = pc.get_features.transpose(1, 2).view( 102 | -1, 3, (pc.max_sh_degree + 1) ** 2 103 | ) 104 | dir_pp = pc.get_xyz - viewpoint_camera.camera_center.repeat( 105 | pc.get_features.shape[0], 1 106 | ) 107 | dir_pp_normalized = dir_pp / dir_pp.norm(dim=1, keepdim=True) 108 | sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized) 109 | colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0) 110 | else: 111 | shs = pc.get_features 112 | else: 113 | colors_precomp = override_color 114 | 115 | # Rasterize visible Gaussians to image, obtain their radii (on screen). 
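    # Note: both branches below feed the Gaussian parameters to the pose-aware rasterizer;
    # `theta`/`rho` forward the camera's `cam_rot_delta`/`cam_trans_delta`, presumably so
    # that pose corrections can receive gradients through rasterization. As written, the
    # masked branch unpacks four return values and never binds `n_touched` (the unmasked
    # branch unpacks five), yet the dictionary returned at the end of render() references
    # `n_touched`; the masked path therefore appears to be unused, or would fail if taken.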
116 | if mask is not None: 117 | rendered_image, radii, depth, opacity = rasterizer( 118 | means3D=means3D[mask], 119 | means2D=means2D[mask], 120 | shs=shs[mask], 121 | colors_precomp=colors_precomp[mask] if colors_precomp is not None else None, 122 | opacities=opacity[mask], 123 | scales=scales[mask], 124 | rotations=rotations[mask], 125 | cov3D_precomp=cov3D_precomp[mask] if cov3D_precomp is not None else None, 126 | theta=viewpoint_camera.cam_rot_delta, 127 | rho=viewpoint_camera.cam_trans_delta, 128 | ) 129 | else: 130 | rendered_image, radii, depth, opacity, n_touched = rasterizer( 131 | means3D=means3D, 132 | means2D=means2D, 133 | shs=shs, 134 | colors_precomp=colors_precomp, 135 | opacities=opacity, 136 | scales=scales, 137 | rotations=rotations, 138 | cov3D_precomp=cov3D_precomp, 139 | theta=viewpoint_camera.cam_rot_delta, 140 | rho=viewpoint_camera.cam_trans_delta, 141 | ) 142 | 143 | # Those Gaussians that were frustum culled or had a radius of 0 were not visible. 144 | # They will be excluded from value updates used in the splitting criteria. 145 | return { 146 | "render": rendered_image, 147 | "viewspace_points": screenspace_points, 148 | "visibility_filter": radii > 0, 149 | "radii": radii, 150 | "depth": depth, 151 | "opacity": opacity, 152 | "n_touched": n_touched, 153 | } 154 | -------------------------------------------------------------------------------- /thirdparty/gaussian_splatting/utils/graphics_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 
8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | import math 13 | from typing import NamedTuple 14 | 15 | import numpy as np 16 | import torch 17 | 18 | 19 | class BasicPointCloud(NamedTuple): 20 | points: np.array 21 | colors: np.array 22 | normals: np.array 23 | 24 | 25 | def getWorld2View(R, t): 26 | Rt = np.zeros((4, 4)) 27 | Rt[:3, :3] = R.transpose() 28 | Rt[:3, 3] = t 29 | Rt[3, 3] = 1.0 30 | return np.float32(Rt) 31 | 32 | 33 | def getWorld2View2(R, t, translate=torch.tensor([0.0, 0.0, 0.0]), scale=1.0): 34 | translate = translate.to(R.device) 35 | Rt = torch.zeros((4, 4), device=R.device) 36 | # Rt[:3, :3] = R.transpose() 37 | Rt[:3, :3] = R 38 | Rt[:3, 3] = t 39 | Rt[3, 3] = 1.0 40 | 41 | C2W = torch.linalg.inv(Rt) 42 | cam_center = C2W[:3, 3] 43 | cam_center = (cam_center + translate) * scale 44 | C2W[:3, 3] = cam_center 45 | Rt = torch.linalg.inv(C2W) 46 | return Rt 47 | 48 | 49 | def getProjectionMatrix(znear, zfar, fovX, fovY): 50 | tanHalfFovY = math.tan((fovY / 2)) 51 | tanHalfFovX = math.tan((fovX / 2)) 52 | 53 | top = tanHalfFovY * znear 54 | bottom = -top 55 | right = tanHalfFovX * znear 56 | left = -right 57 | 58 | P = torch.zeros(4, 4) 59 | 60 | z_sign = 1.0 61 | 62 | P[0, 0] = 2.0 * znear / (right - left) 63 | P[1, 1] = 2.0 * znear / (top - bottom) 64 | P[0, 2] = (right + left) / (right - left) 65 | P[1, 2] = (top + bottom) / (top - bottom) 66 | P[3, 2] = z_sign 67 | P[2, 2] = -(zfar + znear) / (zfar - znear) 68 | P[2, 3] = -2 * (zfar * znear) / (zfar - znear) 69 | return P 70 | 71 | 72 | def getProjectionMatrix2(znear, zfar, cx, cy, fx, fy, W, H): 73 | left = ((2 * cx - W) / W - 1.0) * W / 2.0 74 | right = ((2 * cx - W) / W + 1.0) * W / 2.0 75 | top = ((2 * cy - H) / H + 1.0) * H / 2.0 76 | bottom = ((2 * cy - H) / H - 1.0) * H / 2.0 77 | left = znear / fx * left 78 | right = znear / fx * right 79 | top = znear / fy * top 80 | bottom = znear / fy * bottom 81 | P = torch.zeros(4, 4) 82 | 83 | z_sign = 1.0 84 | 85 | P[0, 0] = 2.0 * znear / (right - left) 86 | P[1, 1] = 2.0 * znear / (top - bottom) 87 | P[0, 2] = (right + left) / (right - left) 88 | P[1, 2] = (top + bottom) / (top - bottom) 89 | P[3, 2] = z_sign 90 | P[2, 2] = z_sign * zfar / (zfar - znear) 91 | P[2, 3] = -(zfar * znear) / (zfar - znear) 92 | 93 | return P 94 | 95 | 96 | def fov2focal(fov, pixels): 97 | return pixels / (2 * math.tan(fov / 2)) 98 | 99 | 100 | def focal2fov(focal, pixels): 101 | return 2 * math.atan(pixels / (2 * focal)) 102 | -------------------------------------------------------------------------------- /thirdparty/gaussian_splatting/utils/image_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 
8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | import torch 13 | 14 | 15 | def mse(img1, img2): 16 | return ((img1 - img2) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True) 17 | 18 | 19 | def psnr(img1, img2): 20 | mse = ((img1 - img2) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True) 21 | return 20 * torch.log10(1.0 / torch.sqrt(mse)) 22 | -------------------------------------------------------------------------------- /thirdparty/gaussian_splatting/utils/loss_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from math import exp 13 | 14 | import cv2 15 | import numpy as np 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Variable 19 | 20 | 21 | def l1_loss(network_output, gt): 22 | return torch.abs((network_output - gt)).mean() 23 | 24 | 25 | def l1_loss_weight(network_output, gt): 26 | image = gt.detach().cpu().numpy().transpose((1, 2, 0)) 27 | rgb_raw_gray = np.dot(image[..., :3], [0.2989, 0.5870, 0.1140]) 28 | sobelx = cv2.Sobel(rgb_raw_gray, cv2.CV_64F, 1, 0, ksize=5) 29 | sobely = cv2.Sobel(rgb_raw_gray, cv2.CV_64F, 0, 1, ksize=5) 30 | sobel_merge = np.sqrt(sobelx * sobelx + sobely * sobely) + 1e-10 31 | sobel_merge = np.exp(sobel_merge) 32 | sobel_merge /= np.max(sobel_merge) 33 | sobel_merge = torch.from_numpy(sobel_merge)[None, ...].to(gt.device) 34 | 35 | return torch.abs((network_output - gt) * sobel_merge).mean() 36 | 37 | 38 | def l2_loss(network_output, gt): 39 | return ((network_output - gt) ** 2).mean() 40 | 41 | 42 | def gaussian(window_size, sigma): 43 | gauss = torch.Tensor( 44 | [ 45 | exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2)) 46 | for x in range(window_size) 47 | ] 48 | ) 49 | return gauss / gauss.sum() 50 | 51 | 52 | def create_window(window_size, channel): 53 | _1D_window = gaussian(window_size, 1.5).unsqueeze(1) 54 | _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) 55 | window = Variable( 56 | _2D_window.expand(channel, 1, window_size, window_size).contiguous() 57 | ) 58 | return window 59 | 60 | 61 | def ssim(img1, img2, window_size=11, size_average=True): 62 | channel = img1.size(-3) 63 | window = create_window(window_size, channel) 64 | 65 | if img1.is_cuda: 66 | window = window.cuda(img1.get_device()) 67 | window = window.type_as(img1) 68 | 69 | return _ssim(img1, img2, window, window_size, channel, size_average) 70 | 71 | 72 | def _ssim(img1, img2, window, window_size, channel, size_average=True): 73 | mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) 74 | mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) 75 | 76 | mu1_sq = mu1.pow(2) 77 | mu2_sq = mu2.pow(2) 78 | mu1_mu2 = mu1 * mu2 79 | 80 | sigma1_sq = ( 81 | F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq 82 | ) 83 | sigma2_sq = ( 84 | F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq 85 | ) 86 | sigma12 = ( 87 | F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) 88 | - mu1_mu2 89 | ) 90 | 91 | C1 = 0.01**2 92 | C2 = 0.03**2 93 | 94 | ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( 95 | 
(mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2) 96 | ) 97 | 98 | if size_average: 99 | return ssim_map.mean() 100 | else: 101 | return ssim_map.mean(1).mean(1).mean(1) 102 | -------------------------------------------------------------------------------- /thirdparty/gaussian_splatting/utils/sh_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The PlenOctree Authors. 2 | # Redistribution and use in source and binary forms, with or without 3 | # modification, are permitted provided that the following conditions are met: 4 | # 5 | # 1. Redistributions of source code must retain the above copyright notice, 6 | # this list of conditions and the following disclaimer. 7 | # 8 | # 2. Redistributions in binary form must reproduce the above copyright notice, 9 | # this list of conditions and the following disclaimer in the documentation 10 | # and/or other materials provided with the distribution. 11 | # 12 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 13 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 14 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 15 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 16 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 17 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 18 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 19 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 20 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 21 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 22 | # POSSIBILITY OF SUCH DAMAGE. 23 | 24 | C0 = 0.28209479177387814 25 | C1 = 0.4886025119029199 26 | C2 = [ 27 | 1.0925484305920792, 28 | -1.0925484305920792, 29 | 0.31539156525252005, 30 | -1.0925484305920792, 31 | 0.5462742152960396, 32 | ] 33 | C3 = [ 34 | -0.5900435899266435, 35 | 2.890611442640554, 36 | -0.4570457994644658, 37 | 0.3731763325901154, 38 | -0.4570457994644658, 39 | 1.445305721320277, 40 | -0.5900435899266435, 41 | ] 42 | C4 = [ 43 | 2.5033429417967046, 44 | -1.7701307697799304, 45 | 0.9461746957575601, 46 | -0.6690465435572892, 47 | 0.10578554691520431, 48 | -0.6690465435572892, 49 | 0.47308734787878004, 50 | -1.7701307697799304, 51 | 0.6258357354491761, 52 | ] 53 | 54 | 55 | def eval_sh(deg, sh, dirs): 56 | """ 57 | Evaluate spherical harmonics at unit directions 58 | using hardcoded SH polynomials. 59 | Works with torch/np/jnp. 60 | ... Can be 0 or more batch dimensions. 61 | Args: 62 | deg: int SH deg. 
Currently, 0-3 supported 63 | sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2] 64 | dirs: jnp.ndarray unit directions [..., 3] 65 | Returns: 66 | [..., C] 67 | """ 68 | assert deg <= 4 and deg >= 0 69 | coeff = (deg + 1) ** 2 70 | assert sh.shape[-1] >= coeff 71 | 72 | result = C0 * sh[..., 0] 73 | if deg > 0: 74 | x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3] 75 | result = ( 76 | result - C1 * y * sh[..., 1] + C1 * z * sh[..., 2] - C1 * x * sh[..., 3] 77 | ) 78 | 79 | if deg > 1: 80 | xx, yy, zz = x * x, y * y, z * z 81 | xy, yz, xz = x * y, y * z, x * z 82 | result = ( 83 | result 84 | + C2[0] * xy * sh[..., 4] 85 | + C2[1] * yz * sh[..., 5] 86 | + C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] 87 | + C2[3] * xz * sh[..., 7] 88 | + C2[4] * (xx - yy) * sh[..., 8] 89 | ) 90 | 91 | if deg > 2: 92 | result = ( 93 | result 94 | + C3[0] * y * (3 * xx - yy) * sh[..., 9] 95 | + C3[1] * xy * z * sh[..., 10] 96 | + C3[2] * y * (4 * zz - xx - yy) * sh[..., 11] 97 | + C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] 98 | + C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] 99 | + C3[5] * z * (xx - yy) * sh[..., 14] 100 | + C3[6] * x * (xx - 3 * yy) * sh[..., 15] 101 | ) 102 | 103 | if deg > 3: 104 | result = ( 105 | result 106 | + C4[0] * xy * (xx - yy) * sh[..., 16] 107 | + C4[1] * yz * (3 * xx - yy) * sh[..., 17] 108 | + C4[2] * xy * (7 * zz - 1) * sh[..., 18] 109 | + C4[3] * yz * (7 * zz - 3) * sh[..., 19] 110 | + C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] 111 | + C4[5] * xz * (7 * zz - 3) * sh[..., 21] 112 | + C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] 113 | + C4[7] * xz * (xx - 3 * yy) * sh[..., 23] 114 | + C4[8] 115 | * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) 116 | * sh[..., 24] 117 | ) 118 | return result 119 | 120 | 121 | def RGB2SH(rgb): 122 | return (rgb - 0.5) / C0 123 | 124 | 125 | def SH2RGB(sh): 126 | return sh * C0 + 0.5 127 | -------------------------------------------------------------------------------- /thirdparty/gaussian_splatting/utils/system_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | import os 13 | from errno import EEXIST 14 | from os import makedirs, path 15 | 16 | 17 | def mkdir_p(folder_path): 18 | # Creates a directory. equivalent to using mkdir -p on the command line 19 | try: 20 | makedirs(folder_path) 21 | except OSError as exc: # Python >2.5 22 | if exc.errno == EEXIST and path.isdir(folder_path): 23 | pass 24 | else: 25 | raise 26 | 27 | 28 | def searchForMaxIteration(folder): 29 | saved_iters = [int(fname.split("_")[-1]) for fname in os.listdir(folder)] 30 | return max(saved_iters) 31 | --------------------------------------------------------------------------------
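The spherical-harmonics helpers in `sh_utils.py` above are easiest to sanity-check at degree 0, where `eval_sh` reduces to `C0 * sh[..., 0]` and where `RGB2SH`/`SH2RGB` are exact inverses. A minimal sketch; the import path and tensors are illustrative assumptions, and the `+ 0.5` mirrors the `sh2rgb + 0.5` step used by the renderer above:

```python
import torch

# Import path assumed for a checkout of this repository; adjust to your layout.
from thirdparty.gaussian_splatting.utils.sh_utils import eval_sh, RGB2SH, SH2RGB

rgb = torch.tensor([[0.8, 0.2, 0.1]])   # one point with an RGB colour in [0, 1]
sh_dc = RGB2SH(rgb)                     # degree-0 (DC) SH coefficient per channel
sh = sh_dc.unsqueeze(-1)                # shape [N, 3, (deg + 1) ** 2] with deg = 0

dirs = torch.tensor([[0.0, 0.0, 1.0]])  # unit view direction (unused at degree 0)
out = eval_sh(0, sh, dirs)              # = C0 * sh[..., 0] = rgb - 0.5

assert torch.allclose(out + 0.5, rgb)            # matches the renderer's sh2rgb + 0.5
assert torch.allclose(SH2RGB(RGB2SH(rgb)), rgb)  # RGB2SH and SH2RGB invert each other
```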