├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets ├── airplanes.png ├── company_name.png ├── crowd.png ├── crowd_output_1.png ├── crowd_output_2.png ├── dog.png ├── donuts.png ├── donuts_output.png ├── overview.png ├── pipeline.png ├── stand_higher.png └── stand_higher_output.png ├── evaluation ├── calculate_acc_auc.py ├── calculate_coco_ap.py ├── calculate_counting.py ├── calculate_gui_acc.py ├── calculate_iou.py ├── calculate_iou_with_bbox.py ├── calculate_iou_with_bbox_nonobj.py ├── calculate_math_acc.py ├── coco_gt │ └── instances_val2017.json ├── eval_coco.sh ├── eval_count.sh ├── eval_segmentation.sh ├── eval_segmentation_qwen25vl.sh ├── eval_segmentation_with_nonobj.sh ├── evaluation_anomaly.py ├── evaluation_coco.py ├── evaluation_count.py ├── evaluation_gui.py ├── evaluation_math.py ├── evaluation_segmentation.py └── visualization.py ├── requirements.txt ├── task_categorization.md └── vision_reasoner ├── __init__.py ├── inference.py ├── models ├── __init__.py ├── base_model.py ├── qwen_vl.py ├── task_router.py ├── vggt │ ├── __init__.py │ ├── dependency │ │ ├── __init__.py │ │ ├── distortion.py │ │ ├── np_to_pycolmap.py │ │ ├── projection.py │ │ ├── track_modules │ │ │ ├── __init__.py │ │ │ ├── base_track_predictor.py │ │ │ ├── blocks.py │ │ │ ├── modules.py │ │ │ ├── track_refine.py │ │ │ └── utils.py │ │ ├── track_predict.py │ │ ├── vggsfm_tracker.py │ │ └── vggsfm_utils.py │ ├── heads │ │ ├── camera_head.py │ │ ├── dpt_head.py │ │ ├── head_act.py │ │ ├── track_head.py │ │ ├── track_modules │ │ │ ├── __init__.py │ │ │ ├── base_track_predictor.py │ │ │ ├── blocks.py │ │ │ ├── modules.py │ │ │ └── utils.py │ │ └── utils.py │ ├── layers │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── block.py │ │ ├── drop_path.py │ │ ├── layer_scale.py │ │ ├── mlp.py │ │ ├── patch_embed.py │ │ ├── rope.py │ │ ├── swiglu_ffn.py │ │ └── vision_transformer.py │ ├── models │ │ ├── aggregator.py │ │ └── vggt.py │ └── utils │ │ ├── geometry.py │ │ ├── helper.py │ │ ├── load_fn.py │ │ ├── pose_enc.py │ │ ├── rotation.py │ │ └── visual_track.py ├── vision_reasoner_model.py └── visurf_model.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/README.md -------------------------------------------------------------------------------- /assets/airplanes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/airplanes.png -------------------------------------------------------------------------------- /assets/company_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/company_name.png -------------------------------------------------------------------------------- /assets/crowd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/crowd.png -------------------------------------------------------------------------------- /assets/crowd_output_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/crowd_output_1.png -------------------------------------------------------------------------------- /assets/crowd_output_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/crowd_output_2.png -------------------------------------------------------------------------------- /assets/dog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/dog.png -------------------------------------------------------------------------------- /assets/donuts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/donuts.png -------------------------------------------------------------------------------- /assets/donuts_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/donuts_output.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/overview.png -------------------------------------------------------------------------------- /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/pipeline.png -------------------------------------------------------------------------------- /assets/stand_higher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/stand_higher.png -------------------------------------------------------------------------------- /assets/stand_higher_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/assets/stand_higher_output.png -------------------------------------------------------------------------------- /evaluation/calculate_acc_auc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/calculate_acc_auc.py -------------------------------------------------------------------------------- /evaluation/calculate_coco_ap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/calculate_coco_ap.py -------------------------------------------------------------------------------- /evaluation/calculate_counting.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/calculate_counting.py -------------------------------------------------------------------------------- /evaluation/calculate_gui_acc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/calculate_gui_acc.py -------------------------------------------------------------------------------- /evaluation/calculate_iou.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/calculate_iou.py -------------------------------------------------------------------------------- /evaluation/calculate_iou_with_bbox.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/calculate_iou_with_bbox.py -------------------------------------------------------------------------------- /evaluation/calculate_iou_with_bbox_nonobj.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/calculate_iou_with_bbox_nonobj.py -------------------------------------------------------------------------------- /evaluation/calculate_math_acc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/calculate_math_acc.py -------------------------------------------------------------------------------- /evaluation/coco_gt/instances_val2017.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/coco_gt/instances_val2017.json -------------------------------------------------------------------------------- /evaluation/eval_coco.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/eval_coco.sh -------------------------------------------------------------------------------- /evaluation/eval_count.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/eval_count.sh -------------------------------------------------------------------------------- /evaluation/eval_segmentation.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/eval_segmentation.sh -------------------------------------------------------------------------------- /evaluation/eval_segmentation_qwen25vl.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/eval_segmentation_qwen25vl.sh -------------------------------------------------------------------------------- /evaluation/eval_segmentation_with_nonobj.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/eval_segmentation_with_nonobj.sh -------------------------------------------------------------------------------- /evaluation/evaluation_anomaly.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/evaluation_anomaly.py -------------------------------------------------------------------------------- /evaluation/evaluation_coco.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/evaluation_coco.py -------------------------------------------------------------------------------- /evaluation/evaluation_count.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/evaluation_count.py -------------------------------------------------------------------------------- /evaluation/evaluation_gui.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/evaluation_gui.py -------------------------------------------------------------------------------- /evaluation/evaluation_math.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/evaluation_math.py -------------------------------------------------------------------------------- /evaluation/evaluation_segmentation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/evaluation_segmentation.py -------------------------------------------------------------------------------- /evaluation/visualization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/evaluation/visualization.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/requirements.txt -------------------------------------------------------------------------------- /task_categorization.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/task_categorization.md -------------------------------------------------------------------------------- /vision_reasoner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vision_reasoner/inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/inference.py -------------------------------------------------------------------------------- /vision_reasoner/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vision_reasoner/models/base_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/base_model.py -------------------------------------------------------------------------------- /vision_reasoner/models/qwen_vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/qwen_vl.py -------------------------------------------------------------------------------- /vision_reasoner/models/task_router.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/task_router.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/__init__.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/distortion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/distortion.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/np_to_pycolmap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/np_to_pycolmap.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/projection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/projection.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/track_modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/track_modules/base_track_predictor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/track_modules/base_track_predictor.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/track_modules/blocks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/track_modules/blocks.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/track_modules/modules.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/track_modules/modules.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/track_modules/track_refine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/track_modules/track_refine.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/track_modules/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/track_modules/utils.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/track_predict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/track_predict.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/vggsfm_tracker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/vggsfm_tracker.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/dependency/vggsfm_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/dependency/vggsfm_utils.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/camera_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/camera_head.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/dpt_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/dpt_head.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/head_act.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/head_act.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/track_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/track_head.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/track_modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/track_modules/__init__.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/track_modules/base_track_predictor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/track_modules/base_track_predictor.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/track_modules/blocks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/track_modules/blocks.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/track_modules/modules.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/track_modules/modules.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/track_modules/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/track_modules/utils.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/heads/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/heads/utils.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/__init__.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/attention.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/block.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/block.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/drop_path.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/drop_path.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/layer_scale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/layer_scale.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/mlp.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/patch_embed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/patch_embed.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/rope.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/swiglu_ffn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/swiglu_ffn.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/layers/vision_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/layers/vision_transformer.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/models/aggregator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/models/aggregator.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/models/vggt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/models/vggt.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/utils/geometry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/utils/geometry.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/utils/helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/utils/helper.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/utils/load_fn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/utils/load_fn.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/utils/pose_enc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/utils/pose_enc.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/utils/rotation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/utils/rotation.py -------------------------------------------------------------------------------- /vision_reasoner/models/vggt/utils/visual_track.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vggt/utils/visual_track.py -------------------------------------------------------------------------------- /vision_reasoner/models/vision_reasoner_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/vision_reasoner_model.py -------------------------------------------------------------------------------- /vision_reasoner/models/visurf_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/models/visurf_model.py -------------------------------------------------------------------------------- /vision_reasoner/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/VisionReasoner/HEAD/vision_reasoner/utils.py --------------------------------------------------------------------------------