├── .gitignore ├── LICENSE ├── README.md ├── assets ├── arkitscenes_41069025.mp4 ├── eval_VSIbench.png ├── eval_scanqa_sqa3d.png ├── pipeline-spatialmllm.png └── teaser-spatialmllm.png ├── evaluate ├── annotation │ └── eval_vsibench.json ├── eval_vsibench.py ├── model.py └── utils.py ├── scripts ├── evaluate_vsibench.sh └── inference.py └── src └── models ├── __init__.py ├── configuration_qwen2_5_vl.py ├── modeling_qwen2_5_vl.py ├── modular_qwen2_5_vl.py ├── processing_qwen2_5_vl.py └── vggt ├── heads ├── camera_head.py ├── dpt_head.py ├── head_act.py ├── track_head.py ├── track_modules │ ├── __init__.py │ ├── base_track_predictor.py │ ├── blocks.py │ ├── modules.py │ └── utils.py └── utils.py ├── layers ├── __init__.py ├── attention.py ├── block.py ├── drop_path.py ├── layer_scale.py ├── mlp.py ├── patch_embed.py ├── rope.py ├── swiglu_ffn.py └── vision_transformer.py ├── models ├── aggregator.py └── vggt.py └── utils ├── geometry.py ├── load_fn.py ├── pose_enc.py ├── rotation.py └── visual_track.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/README.md -------------------------------------------------------------------------------- /assets/arkitscenes_41069025.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/assets/arkitscenes_41069025.mp4 -------------------------------------------------------------------------------- /assets/eval_VSIbench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/assets/eval_VSIbench.png -------------------------------------------------------------------------------- /assets/eval_scanqa_sqa3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/assets/eval_scanqa_sqa3d.png -------------------------------------------------------------------------------- /assets/pipeline-spatialmllm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/assets/pipeline-spatialmllm.png -------------------------------------------------------------------------------- /assets/teaser-spatialmllm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/assets/teaser-spatialmllm.png -------------------------------------------------------------------------------- /evaluate/annotation/eval_vsibench.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/evaluate/annotation/eval_vsibench.json -------------------------------------------------------------------------------- /evaluate/eval_vsibench.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/evaluate/eval_vsibench.py -------------------------------------------------------------------------------- /evaluate/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/evaluate/model.py -------------------------------------------------------------------------------- /evaluate/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/evaluate/utils.py -------------------------------------------------------------------------------- /scripts/evaluate_vsibench.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/scripts/evaluate_vsibench.sh -------------------------------------------------------------------------------- /scripts/inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/scripts/inference.py -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/configuration_qwen2_5_vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/configuration_qwen2_5_vl.py -------------------------------------------------------------------------------- /src/models/modeling_qwen2_5_vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/modeling_qwen2_5_vl.py -------------------------------------------------------------------------------- /src/models/modular_qwen2_5_vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/modular_qwen2_5_vl.py -------------------------------------------------------------------------------- /src/models/processing_qwen2_5_vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/processing_qwen2_5_vl.py -------------------------------------------------------------------------------- /src/models/vggt/heads/camera_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/camera_head.py -------------------------------------------------------------------------------- /src/models/vggt/heads/dpt_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/dpt_head.py -------------------------------------------------------------------------------- /src/models/vggt/heads/head_act.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/head_act.py -------------------------------------------------------------------------------- /src/models/vggt/heads/track_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/track_head.py -------------------------------------------------------------------------------- /src/models/vggt/heads/track_modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/track_modules/__init__.py -------------------------------------------------------------------------------- /src/models/vggt/heads/track_modules/base_track_predictor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/track_modules/base_track_predictor.py -------------------------------------------------------------------------------- /src/models/vggt/heads/track_modules/blocks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/track_modules/blocks.py -------------------------------------------------------------------------------- /src/models/vggt/heads/track_modules/modules.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/track_modules/modules.py -------------------------------------------------------------------------------- /src/models/vggt/heads/track_modules/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/track_modules/utils.py -------------------------------------------------------------------------------- /src/models/vggt/heads/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/heads/utils.py -------------------------------------------------------------------------------- /src/models/vggt/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/__init__.py -------------------------------------------------------------------------------- /src/models/vggt/layers/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/attention.py -------------------------------------------------------------------------------- /src/models/vggt/layers/block.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/block.py -------------------------------------------------------------------------------- /src/models/vggt/layers/drop_path.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/drop_path.py -------------------------------------------------------------------------------- /src/models/vggt/layers/layer_scale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/layer_scale.py -------------------------------------------------------------------------------- /src/models/vggt/layers/mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/mlp.py -------------------------------------------------------------------------------- /src/models/vggt/layers/patch_embed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/patch_embed.py -------------------------------------------------------------------------------- /src/models/vggt/layers/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/rope.py -------------------------------------------------------------------------------- /src/models/vggt/layers/swiglu_ffn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/swiglu_ffn.py -------------------------------------------------------------------------------- /src/models/vggt/layers/vision_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/layers/vision_transformer.py -------------------------------------------------------------------------------- /src/models/vggt/models/aggregator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/models/aggregator.py -------------------------------------------------------------------------------- /src/models/vggt/models/vggt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/models/vggt.py -------------------------------------------------------------------------------- /src/models/vggt/utils/geometry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/utils/geometry.py -------------------------------------------------------------------------------- /src/models/vggt/utils/load_fn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/utils/load_fn.py -------------------------------------------------------------------------------- /src/models/vggt/utils/pose_enc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/utils/pose_enc.py -------------------------------------------------------------------------------- /src/models/vggt/utils/rotation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/utils/rotation.py -------------------------------------------------------------------------------- /src/models/vggt/utils/visual_track.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diankun-wu/Spatial-MLLM/HEAD/src/models/vggt/utils/visual_track.py --------------------------------------------------------------------------------