├── Method ├── gym │ ├── __init__.py │ └── vlm_utils.py ├── position │ ├── __init__.py │ ├── .gitignore │ └── vlm_utils.py ├── vision │ ├── __init__.py │ ├── .gitignore │ ├── GroundedSAM │ │ ├── segment_anything │ │ │ ├── notebooks │ │ │ │ └── images │ │ │ │ │ ├── dog.jpg │ │ │ │ │ ├── truck.jpg │ │ │ │ │ └── groceries.jpg │ │ │ ├── .flake8 │ │ │ ├── segment_anything │ │ │ │ ├── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── transforms.py │ │ │ │ │ └── onnx.py │ │ │ │ ├── modeling │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── common.py │ │ │ │ │ ├── mask_decoder.py │ │ │ │ │ └── sam.py │ │ │ │ ├── __init__.py │ │ │ │ ├── build_sam.py │ │ │ │ └── build_sam_hq.py │ │ │ ├── setup.cfg │ │ │ ├── setup.py │ │ │ ├── linter.sh │ │ │ ├── CONTRIBUTING.md │ │ │ ├── CODE_OF_CONDUCT.md │ │ │ ├── README.md │ │ │ └── scripts │ │ │ │ ├── export_onnx_model.py │ │ │ │ └── amg.py │ │ ├── .gitmodules │ │ ├── requirements.txt │ │ ├── .gitignore │ │ └── grounded_sam_demo.py │ ├── tranformation.py │ └── test_sam.py ├── isaacgym0 │ ├── .gitignore │ ├── config.yaml │ ├── utils.py │ └── asset_info.py ├── tasks ├── mask.png ├── test_image.png ├── utils │ ├── mesh.py │ ├── task_stat.py │ ├── vlm_utils.py │ └── get_assets.py ├── run_multiple.py ├── method_cfg.yaml ├── README.md ├── test_gym.py └── open6dor_gpt.py ├── assets ├── blender │ └── .gitignore ├── ckpts │ └── .gitignore ├── .gitignore ├── objects │ └── .gitignore ├── tasks │ ├── .gitignore │ └── task_refine_6dof_example │ │ └── behind │ │ └── 20240824-165044_no_interaction │ │ ├── isaac_render-rgb-0-0.png │ │ ├── isaac_render-rgb-0-1.png │ │ ├── isaac_render-rgb-0-2.png │ │ ├── isaac_render-rgb-0-3.png │ │ ├── isaac_render-rgb-0-4.png │ │ ├── gsam-gsam-mask-apple-0.npy │ │ ├── gsam-gsam-mask-apple-0.ply │ │ ├── gsam-gsam-mask-apple-0.png │ │ ├── gsam-gsam-mask-apple-1.npy │ │ ├── gsam-gsam-mask-apple-1.ply │ │ ├── gsam-gsam-mask-apple-1.png │ │ ├── gsam-gsam-mask-bottle-0.npy │ │ ├── gsam-gsam-mask-bottle-0.ply │ │ ├── gsam-gsam-mask-bottle-0.png │ │ ├── gsam-gsam-mask-bottle-1.npy │ │ ├── gsam-gsam-mask-bottle-1.ply │ │ ├── gsam-gsam-mask-bottle-1.png │ │ ├── isaac_render-depth-0-0.npy │ │ ├── isaac_render-depth-0-0.png │ │ ├── isaac_render-depth-0-1.npy │ │ ├── isaac_render-depth-0-1.png │ │ ├── isaac_render-depth-0-2.npy │ │ ├── isaac_render-depth-0-2.png │ │ ├── isaac_render-depth-0-3.npy │ │ ├── isaac_render-depth-0-3.png │ │ ├── isaac_render-depth-0-4.npy │ │ ├── isaac_render-depth-0-4.png │ │ ├── task_config_test.json │ │ └── task_config_new5.json └── robot │ └── franka_description │ ├── meshes │ ├── collision │ │ ├── finger.stl │ │ ├── hand.stl │ │ ├── link0.stl │ │ ├── link1.stl │ │ ├── link2.stl │ │ ├── link3.stl │ │ ├── link4.stl │ │ ├── link5.stl │ │ ├── link6.stl │ │ ├── link7.stl │ │ ├── stltoobj.bat │ │ ├── stltoobj.mlx │ │ └── finger.obj │ └── visual │ │ ├── daetoobj.mlx │ │ ├── daetoobj.bat │ │ ├── link1.mtl │ │ ├── link2.mtl │ │ ├── finger.mtl │ │ ├── link5.mtl │ │ ├── link4.mtl │ │ ├── link3.mtl │ │ ├── hand.mtl │ │ ├── link7.mtl │ │ ├── link0.mtl │ │ └── link6.mtl │ └── robots │ └── franka_panda.urdf ├── requirements.txt ├── images ├── teaser_final1.jpg ├── teaser_final1.pdf ├── overall_pipeline_final1.jpg └── overall_pipeline_final1.pdf ├── Benchmark ├── renderer │ ├── texture │ │ └── texture0.jpg │ └── run_Open6DOR_render.sh ├── .gitignore ├── bench_config.yaml ├── task_examples │ ├── rotation │ │ └── None │ │ │ └── mug_handle_left │ │ │ ├── 20240717-075819_no_interaction │ │ │ ├── before-rgb-0-0.png │ │ │ ├── before-rgb-0-1.png │ │ │ ├── 
before-rgb-0-2.png │ │ │ ├── before-rgb-0-3.png │ │ │ ├── task_config.json │ │ │ └── task_config_new.json │ │ │ └── 20240717-075911_no_interaction │ │ │ ├── before-rgb-0-0.png │ │ │ ├── before-rgb-0-1.png │ │ │ ├── before-rgb-0-2.png │ │ │ ├── before-rgb-0-3.png │ │ │ ├── task_config.json │ │ │ └── task_config_new.json │ ├── 6DoF │ │ └── behind │ │ │ └── Place_the_apple_behind_the_box_on_the_table.__upright │ │ │ └── 20240704-145831_no_interaction │ │ │ ├── before-rgb-0-0.png │ │ │ ├── before-rgb-0-1.png │ │ │ ├── before-rgb-0-2.png │ │ │ ├── before-rgb-0-3.png │ │ │ ├── task_config_new.json │ │ │ └── task_config.json │ └── position │ │ └── left │ │ └── Place_the_hammer_to_the_left_of_the_USB_on_the_table._ │ │ ├── 20240717-090658_no_interaction │ │ ├── before-rgb-0-0.png │ │ ├── before-rgb-0-1.png │ │ ├── before-rgb-0-2.png │ │ ├── before-rgb-0-3.png │ │ └── task_config.json │ │ └── 20240717-094704_no_interaction │ │ ├── before-rgb-0-0.png │ │ ├── before-rgb-0-1.png │ │ ├── before-rgb-0-2.png │ │ ├── before-rgb-0-3.png │ │ └── task_config.json ├── benchmark_catalogue │ └── error.txt ├── dataset │ └── objects │ │ └── scale.py ├── evaluation │ └── evaluator.py └── bench.py ├── .gitignore └── README.md /Method/gym/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Method/position/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Method/vision/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/blender/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/ckpts/.gitignore: -------------------------------------------------------------------------------- 1 | *pth -------------------------------------------------------------------------------- /assets/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | isaacgym/* -------------------------------------------------------------------------------- /Method/position/.gitignore: -------------------------------------------------------------------------------- 1 | openai_api.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | imageio 2 | bpy 3 | scipy 4 | -------------------------------------------------------------------------------- /Method/isaacgym0/.gitignore: -------------------------------------------------------------------------------- 1 | *.mp4 2 | *.png 3 | *.jpg -------------------------------------------------------------------------------- /Method/vision/.gitignore: -------------------------------------------------------------------------------- 1 | outputs/ 2 | segment-anything/ -------------------------------------------------------------------------------- /assets/objects/.gitignore: -------------------------------------------------------------------------------- 1 | objaverse_rescale/ 2 | ycb_16k_backup/ -------------------------------------------------------------------------------- /Method/tasks: 
-------------------------------------------------------------------------------- 1 | /home/haoran/Projects/Rearrangement/Open6DOR/Benchmark/tasks -------------------------------------------------------------------------------- /Method/mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Method/mask.png -------------------------------------------------------------------------------- /assets/tasks/.gitignore: -------------------------------------------------------------------------------- 1 | task_refine_6dof 2 | task_refine_rot_only 3 | task_refine_pos -------------------------------------------------------------------------------- /Method/test_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Method/test_image.png -------------------------------------------------------------------------------- /images/teaser_final1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/images/teaser_final1.jpg -------------------------------------------------------------------------------- /images/teaser_final1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/images/teaser_final1.pdf -------------------------------------------------------------------------------- /images/overall_pipeline_final1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/images/overall_pipeline_final1.jpg -------------------------------------------------------------------------------- /images/overall_pipeline_final1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/images/overall_pipeline_final1.pdf -------------------------------------------------------------------------------- /Benchmark/renderer/texture/texture0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/renderer/texture/texture0.jpg -------------------------------------------------------------------------------- /Benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | dataset/objects/* 2 | *run_renderer.sh 3 | *.DS_Store 4 | tasks/ 5 | 6 | *error.txt 7 | evaluation/format.py 8 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/finger.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/finger.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/hand.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/hand.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/link0.stl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/link0.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/link1.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/link1.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/link2.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/link2.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/link3.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/link3.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/link4.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/link4.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/link5.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/link5.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/link6.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/link6.stl -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/link7.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/robot/franka_description/meshes/collision/link7.stl -------------------------------------------------------------------------------- /Benchmark/bench_config.yaml: -------------------------------------------------------------------------------- 1 | render: 2 | cam_quaternion: [0.0, 0.0, 0.0, 1.0] 3 | cam_translation: [0.0, 0.0, 1.0] 4 | background_material_id: 44 5 | env_map_id: 25 6 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/daetoobj.mlx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/notebooks/images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Method/vision/GroundedSAM/segment_anything/notebooks/images/dog.jpg 
-------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/notebooks/images/truck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Method/vision/GroundedSAM/segment_anything/notebooks/images/truck.jpg -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/notebooks/images/groceries.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Method/vision/GroundedSAM/segment_anything/notebooks/images/groceries.jpg -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/.gitmodules: -------------------------------------------------------------------------------- 1 | 2 | [submodule "grounded-sam-osx"] 3 | path = grounded-sam-osx 4 | url = https://github.com/linjing7/grounded-sam-osx.git 5 | [submodule "VISAM"] 6 | path = VISAM 7 | url = https://github.com/BingfengYan/VISAM 8 | -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-0.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-1.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-2.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-3.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-rgb-0-4.png -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/before-rgb-0-0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/before-rgb-0-0.png -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/before-rgb-0-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/before-rgb-0-1.png -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/before-rgb-0-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/before-rgb-0-2.png -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/before-rgb-0-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/before-rgb-0-3.png -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/before-rgb-0-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/before-rgb-0-0.png -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/before-rgb-0-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/before-rgb-0-1.png -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/before-rgb-0-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/before-rgb-0-2.png -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/before-rgb-0-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/before-rgb-0-3.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-0.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-0.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-0.ply: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-0.ply -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-0.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-1.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-1.ply: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-1.ply -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-apple-1.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-0.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-0.ply: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-0.ply -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-0.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-1.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-1.ply: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-1.ply -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/gsam-gsam-mask-bottle-1.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-0.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-0.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-1.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-1.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-2.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-2.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-2.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-3.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-3.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-3.png -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-4.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-4.npy -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/isaac_render-depth-0-4.png -------------------------------------------------------------------------------- /Benchmark/renderer/run_Open6DOR_render.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mycount=0; 4 | while (( $mycount < 1)); do 5 | ./blender-2.93.3-linux-x64/blender material_lib_v2.blend --background --python open6dor_renderer.py -- $mycount; 6 | ((mycount=$mycount+1)); 7 | done; 8 | 9 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = W503, E203, E221, C901, C408, E741, C407, B017, F811, C101, EXE001, EXE002 3 | max-line-length = 100 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | per-file-ignores = 7 | **/__init__.py:F401,F403,E402 8 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /Method/isaacgym0/config.yaml: -------------------------------------------------------------------------------- 1 | SAVE_VIDEO: True 2 | SEED: 42 3 | STEPS: 1000 4 | num_envs: 256 5 | controller: ik 6 | 7 | # asset 8 | asset_root: ../assets 9 | asset_file: urdf/ycb/025_mug/025_mug_new.urdf 10 | 11 | # robot 12 | franka_asset_file: urdf/franka_description/robots/franka_panda.urdf -------------------------------------------------------------------------------- /Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/before-rgb-0-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/before-rgb-0-0.png -------------------------------------------------------------------------------- /Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/before-rgb-0-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/before-rgb-0-1.png -------------------------------------------------------------------------------- /Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/before-rgb-0-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/before-rgb-0-2.png -------------------------------------------------------------------------------- /Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/before-rgb-0-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/before-rgb-0-3.png -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/daetoobj.bat: -------------------------------------------------------------------------------- 1 | SET PATH=%PATH%;C:/Tools/Assimp/bin/x64/ 2 | forfiles /m *.dae /c "cmd /c assimp export @file @fname.obj --verbose --show-log -ptv" 3 | 4 | REM SET PATH=%PATH%;C:/Program Files/VCG/MeshLab/ 5 | REM forfiles /m *.dae /c "cmd /c meshlabserver -i @file -o @fname.obj -m vn vt -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/before-rgb-0-0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/before-rgb-0-0.png -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/before-rgb-0-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/before-rgb-0-1.png -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/before-rgb-0-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/before-rgb-0-2.png -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/before-rgb-0-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/before-rgb-0-3.png -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/before-rgb-0-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/before-rgb-0-0.png -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/before-rgb-0-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/before-rgb-0-1.png -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/before-rgb-0-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/before-rgb-0-2.png -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/before-rgb-0-3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Selina2023/Open6DOR/HEAD/Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/before-rgb-0-3.png -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/stltoobj.bat: -------------------------------------------------------------------------------- 1 | REM SET PATH=%PATH%;C:/Tools/Assimp/bin/x64/ 2 | REM forfiles /m *.dae /c "cmd /c assimp export @file @fname.obj --verbose --show-log -ptv" 3 | 4 | SET PATH=%PATH%;C:/Program Files/VCG/MeshLab/ 5 | forfiles /m *.stl /c "cmd /c meshlabserver -i @file -o @fname.obj -m vn -s stltoobj.mlx" -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/link1.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 1 3 | 4 | newmtl Part__Feature_001 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 1.000000 1.000000 1.000000 8 | Ks 0.062500 0.062500 0.062500 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/link2.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 1 3 | 4 | newmtl Part__Feature024 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 1.000000 1.000000 1.000000 8 | Ks 0.125000 0.125000 0.125000 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | diffusers 3 | gradio 4 | huggingface_hub 5 | matplotlib 6 | numpy 7 | onnxruntime 8 | opencv_python 9 | Pillow 10 | pycocotools 11 | PyYAML 12 | requests 13 | setuptools 14 | supervision 15 | termcolor 16 | timm 17 | torch 18 | torchvision 19 | transformers 20 | yapf 21 | nltk 22 | fairscale 23 | litellm 24 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools 6 | skip_glob=*/__init__.py 7 | known_myself=segment_anything 8 | known_third_party=matplotlib,cv2,torch,torchvision,pycocotools,onnx,black,isort 9 | no_lines_before=STDLIB,THIRDPARTY 10 | sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER 11 | default_section=FIRSTPARTY 12 | -------------------------------------------------------------------------------- /Method/utils/mesh.py: -------------------------------------------------------------------------------- 1 | import trimesh 2 | 3 | # Load a mesh from OBJ file 4 | mesh = trimesh.load('/home/haoran/Projects/Rearrangement/Open6DOR/Method/assets/objaverse_final_norm/69511a7fad2f42ee8c4b0579bbc8fec6/material.obj') 5 | 6 | # Translate mesh to its centroid 7 | mesh.apply_translation(-mesh.centroid) 8 | 9 | import pdb; pdb.set_trace() 10 | # Scale the mesh (1 unit here) 11 | 
scale_factor = 1.0 / mesh.bounding_box.extents.max() 12 | mesh.apply_scale(scale_factor) 13 | 14 | # save the new mesh to OBJ file 15 | mesh.export('output.obj') -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder_hq import MaskDecoderHQ 10 | from .mask_decoder import MaskDecoder 11 | from .prompt_encoder import PromptEncoder 12 | from .transformer import TwoWayTransformer 13 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/finger.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 2 3 | 4 | newmtl Part__Feature001_006 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 0.901961 0.921569 0.929412 8 | Ks 0.250000 0.250000 0.250000 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | 14 | newmtl Part__Feature_007 15 | Ns -1.960784 16 | Ka 1.000000 1.000000 1.000000 17 | Kd 0.250980 0.250980 0.250980 18 | Ks 0.250000 0.250000 0.250000 19 | Ke 0.000000 0.000000 0.000000 20 | Ni 1.000000 21 | d 1.000000 22 | illum 2 23 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from setuptools import find_packages, setup 8 | 9 | setup( 10 | name="segment_anything", 11 | version="1.0", 12 | install_requires=[], 13 | packages=find_packages(exclude="notebooks"), 14 | extras_require={ 15 | "all": ["matplotlib", "pycocotools", "opencv-python", "onnx", "onnxruntime"], 16 | "dev": ["flake8", "isort", "black", "mypy"], 17 | }, 18 | ) 19 | -------------------------------------------------------------------------------- /Method/run_multiple.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import argparse 3 | 4 | # add args 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--n', type=int, default=100) 7 | #parser.add_argument('--f', type=str, default="python reconstruction/mesh_reconstruction.py") 8 | parser.add_argument('--f', type=str, default="python interaction.py --mode gen_task --task_root rot_banch_0717 ") 9 | # parser.add_argument('--f', type=str, default="python interaction.py --mode gen_task_pure_rot --task_root rot_banch_0717_pure_rot ") 10 | #parser.add_argument('--f', type=str, default="python overall_clip.py") 11 | 12 | 13 | 14 | args = parser.parse_args() 15 | 16 | for i in range(args.n): 17 | os.system(args.f) 18 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .build_sam_hq import ( 15 | build_sam_hq, 16 | build_sam_hq_vit_h, 17 | build_sam_hq_vit_l, 18 | build_sam_hq_vit_b, 19 | sam_hq_model_registry, 20 | ) 21 | from .predictor import SamPredictor 22 | from .automatic_mask_generator import SamAutomaticMaskGenerator 23 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | { 5 | black --version | grep -E "23\." > /dev/null 6 | } || { 7 | echo "Linter requires 'black==23.*' !" 8 | exit 1 9 | } 10 | 11 | ISORT_VERSION=$(isort --version-number) 12 | if [[ "$ISORT_VERSION" != 5.12* ]]; then 13 | echo "Linter requires isort==5.12.0 !" 14 | exit 1 15 | fi 16 | 17 | echo "Running isort ..." 18 | isort . --atomic 19 | 20 | echo "Running black ..." 21 | black -l 100 . 22 | 23 | echo "Running flake8 ..." 24 | if [ -x "$(command -v flake8)" ]; then 25 | flake8 . 26 | else 27 | python3 -m flake8 . 28 | fi 29 | 30 | echo "Running mypy..." 31 | 32 | mypy --exclude 'setup.py|notebooks' . 
33 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/link5.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 3 3 | 4 | newmtl Part__Feature_002_004_003 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 1.000000 1.000000 1.000000 8 | Ks 0.015625 0.015625 0.015625 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | 14 | newmtl Shell001_001_001_003 15 | Ns -1.960784 16 | Ka 1.000000 1.000000 1.000000 17 | Kd 0.250000 0.250000 0.250000 18 | Ks 0.015625 0.015625 0.015625 19 | Ke 0.000000 0.000000 0.000000 20 | Ni 1.000000 21 | d 1.000000 22 | illum 2 23 | 24 | newmtl Shell_001_001_003 25 | Ns -1.960784 26 | Ka 1.000000 1.000000 1.000000 27 | Kd 1.000000 1.000000 1.000000 28 | Ks 0.015625 0.015625 0.015625 29 | Ke 0.000000 0.000000 0.000000 30 | Ni 1.000000 31 | d 1.000000 32 | illum 2 33 | -------------------------------------------------------------------------------- /Method/method_cfg.yaml: -------------------------------------------------------------------------------- 1 | DEVICE: cuda:0 2 | INFERENCE_GSAM: True 3 | SAVE_RENDER: True 4 | VISUALIZE: True 5 | 6 | position: 7 | 8 | rotation: 9 | 10 | vision: 11 | sam_checkpoint_path: ../assets/ckpts/sam_vit_h_4b8939.pth 12 | grounded_checkpoint_path: ../assets/ckpts/groundingdino_swint_ogc.pth 13 | config_path: ./vision/GroundedSAM/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py 14 | box_threshold: 0.3 15 | text_threshold: 0.25 16 | sam_version: vit_h 17 | 18 | cam: 19 | vinv: [[ 0. , 1. , 0. , 0. ], 20 | [-0.9028605 , -0. , 0.42993355, -0. ], 21 | [ 0.42993355, -0. , 0.9028605 , -0. ], 22 | [ 1. , 0. , 1.2 , 1. ]] 23 | proj: [[ 1.7320507, 0. , 0. , 0. ], 24 | [ 0. , 2.5980759, 0. , 0. ], 25 | [ 0. , 0. , 0. , -1. ], 26 | [ 0. , 0. , 0.05 , 0. ]] -------------------------------------------------------------------------------- /Benchmark/benchmark_catalogue/error.txt: -------------------------------------------------------------------------------- 1 | "ae7142127dd84ebbbe7762368ace452c": { shoe->mug } 2 | 072-b no upright, wrong category(toy->glue gun) 3 | 019 trans 4 | 024 trans 5 | 040 trans 6 | 065-a trans 7 | 065-b trans 8 | 065-c trans 9 | 065-d trans 10 | 065-f trans 11 | 065-g trans 12 | 065-j trans 13 | d5a5f0a954f94bcea3168329d1605fe9: shoe->mu 14 | 048 hammer trans 15 | 033 trans 16 | 8a6cb4f7b0004f53830e270dc6e1ff1d handle_left/right xx(no handle) 17 | 025 trans 18 | rewrite "tip_left" and "tip_right"'s prompt 19 | f47fdcf9615d4e94a71e6731242a4c94 wierd mesh 20 | dbb07d13a33546f09ac8ca98b1ddef20 wallet has no clasp (instruction) 21 | 032 trans 22 | d9675ab05c39447baf27e19ea07d484e lighter pointing forth(facing the viewer)-instruction 23 | note! "forth" rotation equivalence 24 | note! "spout left" needs to be upright 25 | note! "cap" forth rotation equivalence 26 | note! 
"cap" forth rotation equivalence 27 | 022 wierd mesh 28 | 29 | blender no texture: 9660e0c0326b4f7386014e27717231ae, ycb 04 08 09, 5de830b2cccf4fe7a2e6b400abf26ca7 -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/link4.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 4 3 | 4 | newmtl Part__Feature001_001_003_001 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 1.000000 1.000000 1.000000 8 | Ks 0.007812 0.007812 0.007812 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | 14 | newmtl Part__Feature002_001_003_001 15 | Ns -1.960784 16 | Ka 1.000000 1.000000 1.000000 17 | Kd 0.250980 0.250980 0.250980 18 | Ks 0.007812 0.007812 0.007812 19 | Ke 0.000000 0.000000 0.000000 20 | Ni 1.000000 21 | d 1.000000 22 | illum 2 23 | 24 | newmtl Part__Feature003_001_003_001 25 | Ns -1.960784 26 | Ka 1.000000 1.000000 1.000000 27 | Kd 1.000000 1.000000 1.000000 28 | Ks 0.007812 0.007812 0.007812 29 | Ke 0.000000 0.000000 0.000000 30 | Ni 1.000000 31 | d 1.000000 32 | illum 2 33 | 34 | newmtl Part__Feature_002_003_001 35 | Ns -1.960784 36 | Ka 1.000000 1.000000 1.000000 37 | Kd 1.000000 1.000000 1.000000 38 | Ks 0.007812 0.007812 0.007812 39 | Ke 0.000000 0.000000 0.000000 40 | Ni 1.000000 41 | d 1.000000 42 | illum 2 43 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/link3.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 4 3 | 4 | newmtl Part__Feature001_010_001_002.001 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 1.000000 1.000000 1.000000 8 | Ks 0.007812 0.007812 0.007812 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | 14 | newmtl Part__Feature002_007_001_002.001 15 | Ns -1.960784 16 | Ka 1.000000 1.000000 1.000000 17 | Kd 1.000000 1.000000 1.000000 18 | Ks 0.007812 0.007812 0.007812 19 | Ke 0.000000 0.000000 0.000000 20 | Ni 1.000000 21 | d 1.000000 22 | illum 2 23 | 24 | newmtl Part__Feature003_004_001_002.001 25 | Ns -1.960784 26 | Ka 1.000000 1.000000 1.000000 27 | Kd 1.000000 1.000000 1.000000 28 | Ks 0.007812 0.007812 0.007812 29 | Ke 0.000000 0.000000 0.000000 30 | Ni 1.000000 31 | d 1.000000 32 | illum 2 33 | 34 | newmtl Part__Feature_001_001_001_002.001 35 | Ns -1.960784 36 | Ka 1.000000 1.000000 1.000000 37 | Kd 0.250980 0.250980 0.250980 38 | Ks 0.007812 0.007812 0.007812 39 | Ke 0.000000 0.000000 0.000000 40 | Ni 1.000000 41 | d 1.000000 42 | illum 2 43 | -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-090658_no_interaction/task_config.json: -------------------------------------------------------------------------------- 1 | {"orientation": "left", "rotation": "None", "selected_obj_names": ["USB", "hammer"], "selected_urdfs": ["objaverse_final_norm/0a51815f3c0941ae8312fc6917173ed6/material_2.urdf", "objaverse_final_norm/8ed38a92668a425eb16da938622d9ace/material_2.urdf"], "target_obj_name": "hammer", "instruction": "Place the hammer to the left of the USB on the table. 
", "init_obj_pos": [[0.5523672103881836, -0.1767720878124237, 0.30958184599876404, -0.16768784821033478, -0.42019906640052795, 0.01495102047920227, 0.8916782140731812, 0.00046477484283968806, 0.0010078288614749908, -0.00030404693097807467, -0.10503458976745605, 0.03628098964691162, -0.002049945993348956], [0.5076466798782349, -0.05766259878873825, 0.30820930004119873, -0.5712552666664124, 0.4136405289173126, -0.41678178310394287, 0.5734648108482361, 0.001841548248194158, 0.003947087097913027, 0.005498047918081284, 0.7908462882041931, -0.034841056913137436, 0.027878539636731148]], "position_instruction": "Place the hammer to the left of the USB on the table. "} -------------------------------------------------------------------------------- /Benchmark/task_examples/position/left/Place_the_hammer_to_the_left_of_the_USB_on_the_table._/20240717-094704_no_interaction/task_config.json: -------------------------------------------------------------------------------- 1 | {"orientation": "left", "rotation": "None", "selected_obj_names": ["USB", "hammer"], "selected_urdfs": ["objaverse_final_norm/0a51815f3c0941ae8312fc6917173ed6/material_2.urdf", "objaverse_final_norm/35a76a67ea1c45edabbd5013de70d68d/material_2.urdf"], "target_obj_name": "hammer", "instruction": "Place the hammer to the left of the USB on the table. ", "init_obj_pos": [[0.5709131360054016, 0.2073042243719101, 0.3095809519290924, -0.17370298504829407, -0.4178505837917328, 0.0022908926475793123, 0.8917526602745056, -0.0003591739514376968, 0.0003141180204693228, -0.0003524061758071184, -0.03348350524902344, -0.04323001950979233, -0.00611852714791894], [0.4233412742614746, -0.10578499734401703, 0.32568830251693726, 0.0025873545091599226, 0.0003954840067308396, 0.12344525009393692, 0.9923479557037354, 0.0007402655319310725, -0.003524358617141843, -0.002587254624813795, 0.10105752944946289, 0.06055070459842682, 0.00236650463193655]], "position_instruction": "Place the hammer to the left of the USB on the table. 
"} -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/task_config.json: -------------------------------------------------------------------------------- 1 | {"orientation": "None", "rotation": "None", "selected_obj_names": ["mug"], "selected_urdfs": ["objaverse_final_norm/ca4f9a92cc2f4ee98fe9332db41bf7f7/material_2.urdf"], "target_obj_name": "mug", "instruction": "Please pick up the object and place it to specify the rotation of the object after placement: the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).", "init_obj_pos": [[0.6550417542457581, 0.05568762868642807, 0.3321579694747925, 0.07643917948007584, 0.21541181206703186, -0.12756481766700745, 0.9651331901550293, -0.004337493795901537, 0.004771982319653034, -0.0002449209277983755, -0.10857345163822174, -0.09869785606861115, -0.002580456668511033]], "position_instruction": "", "rotation_instruction": "Please pick up the object and place it to specify the rotation of the object after placement: the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).", "rotation_instruction_label": "handle_left"} -------------------------------------------------------------------------------- /Method/utils/task_stat.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | paths = glob.glob('Method/output/rot_banch_0704/*/*/*/task_config.json') 4 | print("total tasks", len(paths)) 5 | position_tags = set([paths[i].split('/')[-4] for i in range(len(paths))]) 6 | print(position_tags) 7 | for position_tag in position_tags: 8 | print(position_tag, len([paths[i] for i in range(len(paths)) if paths[i].split('/')[-4] == position_tag])) 9 | paths = glob.glob('Method/output/rot_banch_0717/*/*/*/task_config.json') 10 | print("total tasks", len(paths)) 11 | position_tags = set([paths[i].split('/')[-4] for i in range(len(paths))]) 12 | print(position_tags) 13 | for position_tag in position_tags: 14 | print(position_tag, len([paths[i] for i in range(len(paths)) if paths[i].split('/')[-4] == position_tag])) 15 | paths = glob.glob('Method/output/rot_banch_0717_pure_rot/*/*/*/task_config.json') 16 | 17 | print("total tasks", len(paths)) 18 | position_tags = set([paths[i].split('/')[-4] for i in range(len(paths))]) 19 | print(position_tags) 20 | for position_tag in position_tags: 21 | print(position_tag, len([paths[i] for i in range(len(paths)) if paths[i].split('/')[-4] == position_tag])) -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/task_config.json: -------------------------------------------------------------------------------- 1 | {"orientation": "None", "rotation": "None", "selected_obj_names": ["mug"], "selected_urdfs": ["objaverse_final_norm/db9345f568e8499a9eac2577302b5f51/material_2.urdf"], "target_obj_name": "mug", "instruction": "Please pick up the object and place it to specify the rotation of the object after placement: the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).", "init_obj_pos": [[0.6686422824859619, 0.11716754734516144, 0.34889549016952515, -0.006926149129867554, 0.25072675943374634, 
0.026660921052098274, 0.9676658511161804, -0.001081045251339674, 0.0014700093306601048, -0.0009055532282218337, -0.03115496225655079, -0.024703728035092354, 0.0006507631042040884]], "position_instruction": "", "rotation_instruction": "Please pick up the object and place it to specify the rotation of the object after placement: the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).", "rotation_instruction_label": "handle_left"} -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/hand.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 5 3 | 4 | newmtl Part__Feature001_008_005 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 0.250980 0.250980 0.250980 8 | Ks 0.007812 0.007812 0.007812 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | 14 | newmtl Part__Feature002_005_005 15 | Ns -1.960784 16 | Ka 1.000000 1.000000 1.000000 17 | Kd 0.901961 0.921569 0.929412 18 | Ks 0.015625 0.015625 0.015625 19 | Ke 0.000000 0.000000 0.000000 20 | Ni 1.000000 21 | d 1.000000 22 | illum 2 23 | 24 | newmtl Part__Feature005_001_005 25 | Ns -1.960784 26 | Ka 1.000000 1.000000 1.000000 27 | Kd 1.000000 1.000000 1.000000 28 | Ks 0.015625 0.015625 0.015625 29 | Ke 0.000000 0.000000 0.000000 30 | Ni 1.000000 31 | d 1.000000 32 | illum 2 33 | 34 | newmtl Part__Feature005_001_005_001 35 | Ns -1.960784 36 | Ka 1.000000 1.000000 1.000000 37 | Kd 0.901961 0.921569 0.929412 38 | Ks 0.015625 0.015625 0.015625 39 | Ke 0.000000 0.000000 0.000000 40 | Ni 1.000000 41 | d 1.000000 42 | illum 2 43 | 44 | newmtl Part__Feature_009_005 45 | Ns -1.960784 46 | Ka 1.000000 1.000000 1.000000 47 | Kd 0.250980 0.250980 0.250980 48 | Ks 0.015625 0.015625 0.015625 49 | Ke 0.000000 0.000000 0.000000 50 | Ni 1.000000 51 | d 1.000000 52 | illum 2 53 | -------------------------------------------------------------------------------- /Method/vision/tranformation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | 4 | 5 | 6 | def quaternion_to_matrix(q): 7 | """ 8 | Convert a quaternion into a 3x3 rotation matrix. 9 | """ 10 | qw, qx, qy, qz = q 11 | return np.array([ 12 | [1 - 2*qy*qy - 2*qz*qz, 2*qx*qy - 2*qz*qw, 2*qx*qz + 2*qy*qw], 13 | [2*qx*qy + 2*qz*qw, 1 - 2*qx*qx - 2*qz*qz, 2*qy*qz - 2*qx*qw], 14 | [2*qx*qz - 2*qy*qw, 2*qy*qz + 2*qx*qw, 1 - 2*qx*qx - 2*qy*qy] 15 | ]) 16 | 17 | def create_transformation_matrix(position, quaternion): 18 | """ 19 | Create a 4x4 transformation matrix from position and quaternion. 
20 | """ 21 | x, y, z = position 22 | q = quaternion 23 | 24 | rotation_matrix = quaternion_to_matrix(q) 25 | 26 | transformation_matrix = np.identity(4) 27 | transformation_matrix[:3, :3] = rotation_matrix 28 | transformation_matrix[:3, 3] = [x, y, z] 29 | 30 | return transformation_matrix 31 | 32 | config_path = "output/gym_outputs_task_gen_obja_0304_rot/center/Place_the_mouse_at_the_center_of_all_the_objects_on_the_table.__upright/20240630-202931_no_interaction/task_config.json" 33 | 34 | config = json.load(open(config_path, "r")) 35 | pos_s = config["init_obj_pos"] 36 | for pos in pos_s: 37 | position = pos[:3] 38 | quaternion = pos[3:7] # Example quaternion 39 | transformation_matrix = create_transformation_matrix(position, quaternion) 40 | 41 | print(transformation_matrix) 42 | -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075911_no_interaction/task_config_new.json: -------------------------------------------------------------------------------- 1 | {"orientation": "None", "rotation": "None", "selected_obj_names": ["mug"], "selected_urdfs": ["objaverse_final_norm/ca4f9a92cc2f4ee98fe9332db41bf7f7/material_2.urdf"], "target_obj_name": "mug", "instruction": "Please pick up the object and place it to specify the rotation of the object after placement: the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).", "init_obj_pos": [[0.6550417542457581, 0.05568762868642807, 0.3321579694747925, 0.07643917948007584, 0.21541181206703186, -0.12756481766700745, 0.9651331901550293, -0.004337493795901537, 0.004771982319653034, -0.0002449209277983755, -0.10857345163822174, -0.09869785606861115, -0.002580456668511033]], "position_instruction": "", "rotation_instruction": "Please pick up the object and place it to specify the rotation of the object after placement: the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).", "rotation_instruction_label": "handle_left", "obj_codes": ["ca4f9a92cc2f4ee98fe9332db41bf7f7"], "target_obj_code": "ca4f9a92cc2f4ee98fe9332db41bf7f7", "anno_target": {"category": "mug", "annotation": {" the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).": {"quat": [[0.5, -0.5, -0.5, 0.4999999701976776]], "stage": 1}}}} -------------------------------------------------------------------------------- /Benchmark/task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/task_config_new.json: -------------------------------------------------------------------------------- 1 | {"orientation": "None", "rotation": "None", "selected_obj_names": ["mug"], "selected_urdfs": ["objaverse_final_norm/db9345f568e8499a9eac2577302b5f51/material_2.urdf"], "target_obj_name": "mug", "instruction": "Please pick up the object and place it to specify the rotation of the object after placement: the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).", "init_obj_pos": [[0.6686422824859619, 0.11716754734516144, 0.34889549016952515, -0.006926149129867554, 0.25072675943374634, 0.026660921052098274, 0.9676658511161804, -0.001081045251339674, 0.0014700093306601048, -0.0009055532282218337, -0.03115496225655079, -0.024703728035092354, 
0.0006507631042040884]], "position_instruction": "", "rotation_instruction": "Please pick up the object and place it to specify the rotation of the object after placement: the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).", "rotation_instruction_label": "handle_left", "obj_codes": ["db9345f568e8499a9eac2577302b5f51"], "target_obj_code": "db9345f568e8499a9eac2577302b5f51", "anno_target": {"category": "mug", "annotation": {" the position of the object is reasonable and accords with commonsense, and that the handle of the object is on the left(pointing towards left).": {"quat": [[0.5, -0.5, -0.5, 0.4999999701976776]], "stage": 1}}}} -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to segment-anything 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints, using the `linter.sh` script in the project's root directory. Linting requires `black==23.*`, `isort==5.12.0`, `flake8`, and `mypy`. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to segment-anything, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 
32 | -------------------------------------------------------------------------------- /Method/position/vlm_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | 4 | # OpenAI API Key 5 | import os 6 | API_KEY = os.getenv("API_KEY") 7 | if API_KEY is None: 8 | raise ValueError("please set API_KEY environment variable by running `export API_KEY=XXXX`") 9 | # Function to encode the image 10 | def encode_image(image_path): 11 | with open(image_path, "rb") as image_file: 12 | return base64.b64encode(image_file.read()).decode('utf-8') 13 | 14 | def infer_path(prompt, path): 15 | # Getting the base64 string 16 | base64_image = encode_image(path) 17 | 18 | headers = { 19 | "Content-Type": "application/json", 20 | "Authorization": f"Bearer {API_KEY}" 21 | } 22 | 23 | payload = { 24 | "model": "gpt-4o", 25 | "messages": [ 26 | { 27 | "role": "user", 28 | "content": [ 29 | { 30 | "type": "text", 31 | "text": prompt 32 | }, 33 | { 34 | "type": "image_url", 35 | "image_url": { 36 | "url": f"data:image/jpeg;base64,{base64_image}" 37 | } 38 | } 39 | ] 40 | } 41 | ], 42 | "max_tokens": 300 43 | } 44 | 45 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 46 | 47 | # print(response.json()) 48 | return response 49 | 50 | 51 | if __name__ == "__main__": 52 | prompt = "descripbe this image" 53 | path = "./vision/1.jpg" 54 | response = infer_path(prompt, path) 55 | print(response.json()) 56 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/stltoobj.mlx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/task_config_test.json: -------------------------------------------------------------------------------- 1 | {"position_tag": "behind", "rotation": "None", "selected_obj_names": ["bottle", "tissue box", "apple"], "selected_urdfs": ["ycb_16k_backup/006_mustard_bottle_google_16k/006_mustard_bottle_google_16k.urdf", "objaverse_rescale/dc4c91abf45342b4bb8822f50fa162b2/material_2.urdf", "objaverse_rescale/fbda0b25f41f40958ea984f460e4770b/material_2.urdf"], "target_obj_name": "apple", "instruction": "Place the apple behind the bottle on the table. We also need to specify the rotation of the object after placement: the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.", "init_obj_pos": [[0.3738532066345215, 0.17327244579792023, 0.30287155508995056, 5.603695899480954e-05, -3.935253698728047e-05, -0.03753087669610977, 0.9992955327033997, 0.0029977706726640463, 0.001985779032111168, -0.0012033769162371755, -0.03269371762871742, 0.04539608955383301, -0.03798031061887741], [0.44172099232673645, -0.32238009572029114, 0.3753003478050232, 0.7060639262199402, -0.037992026656866074, -0.037284620106220245, 0.7061444520950317, -0.00012565749057102948, 0.0002828052965924144, 0.00027510791551321745, -0.005133399739861488, -0.002302509034052491, 0.0013929366832599044], [0.5476588606834412, -0.07213786244392395, 0.3492436110973358, 0.11362186074256897, 0.05067095533013344, -0.08851055055856705, 0.9882755279541016, 0.004178161732852459, -0.00013288251648191363, -0.000834679405670613, 0.0010649901814758778, 0.08433020859956741, -0.0004798930021934211]], "position_instruction": "Place the apple behind the bottle on the table. 
We also need to specify the rotation of the object after placement: the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up."} -------------------------------------------------------------------------------- /Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/task_config_new.json: -------------------------------------------------------------------------------- 1 | {"orientation": "behind", "rotation": "None", "selected_obj_names": ["box", "apple"], "selected_urdfs": ["objaverse_final_norm/9660e0c0326b4f7386014e27717231ae/material_2.urdf", "objaverse_final_norm/f53d75bd123b40bca14d12d54286f432/material_2.urdf"], "target_obj_name": "apple", "instruction": "Place the apple behind the box on the table. We also need to specify the rotation of the object after placement: the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.", "init_obj_pos": [[0.5763212442398071, 0.24244019389152527, 0.3158315122127533, 0.00011814905155915767, 3.0217168387025595e-05, 0.057858873158693314, 0.9983247518539429, 0.0005872970796190202, 0.00024345181009266526, 1.8670303688850254e-05, 0.0013161733513697982, -0.0011025663698092103, -0.001989496871829033], [0.4732729494571686, 0.19301258027553558, 0.34965574741363525, 0.08372167497873306, -0.015573234297335148, -0.0979083776473999, 0.9915453195571899, 0.004182836972177029, 0.0017127282917499542, -0.001595060108229518, -0.02539602667093277, 0.09032362699508667, 0.01703813299536705]], "position_instruction": "Place the apple behind the box on the table. ", "rotation_instruction": "We also need to specify the rotation of the object after placement: the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.", "rotation_instruction_label": "upright", "obj_codes": ["9660e0c0326b4f7386014e27717231ae", "f53d75bd123b40bca14d12d54286f432"], "target_obj_code": "f53d75bd123b40bca14d12d54286f432", "anno_target": {"category": "apple", "annotation": {" the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.": {"quat": [[0.7071067690849304, 0.0, 0.0, 0.7071067690849304]], "stage": 1}}}} -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/link7.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 8 3 | 4 | newmtl Part__Mirroring001_004_002 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 0.250980 0.250980 0.250980 8 | Ks 0.015625 0.015625 0.015625 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | 14 | newmtl Part__Mirroring002_004_001 15 | Ns -1.960784 16 | Ka 1.000000 1.000000 1.000000 17 | Kd 0.250980 0.250980 0.250980 18 | Ks 0.031250 0.031250 0.031250 19 | Ke 0.000000 0.000000 0.000000 20 | Ni 1.000000 21 | d 1.000000 22 | illum 2 23 | 24 | newmtl Part__Mirroring003_004_001 25 | Ns -1.960784 26 | Ka 1.000000 1.000000 1.000000 27 | Kd 0.250980 0.250980 0.250980 28 | Ks 0.031250 0.031250 0.031250 29 | Ke 0.000000 0.000000 0.000000 30 | Ni 1.000000 31 | d 1.000000 32 | illum 2 33 | 34 | newmtl Part__Mirroring004_004_002 35 | Ns -1.960784 36 | Ka 1.000000 1.000000 1.000000 37 | Kd 1.000000 1.000000 1.000000 38 | Ks 0.031250 0.031250 0.031250 39 | Ke 
0.000000 0.000000 0.000000 40 | Ni 1.000000 41 | d 1.000000 42 | illum 2 43 | 44 | newmtl Part__Mirroring005_004_001 45 | Ns -1.960784 46 | Ka 1.000000 1.000000 1.000000 47 | Kd 0.250980 0.250980 0.250980 48 | Ks 0.031250 0.031250 0.031250 49 | Ke 0.000000 0.000000 0.000000 50 | Ni 1.000000 51 | d 1.000000 52 | illum 2 53 | 54 | newmtl Part__Mirroring006_004_001 55 | Ns -1.960784 56 | Ka 1.000000 1.000000 1.000000 57 | Kd 0.250980 0.250980 0.250980 58 | Ks 0.031250 0.031250 0.031250 59 | Ke 0.000000 0.000000 0.000000 60 | Ni 1.000000 61 | d 1.000000 62 | illum 2 63 | 64 | newmtl Part__Mirroring007_004_001 65 | Ns -1.960784 66 | Ka 1.000000 1.000000 1.000000 67 | Kd 0.250980 0.250980 0.250980 68 | Ks 0.031250 0.031250 0.031250 69 | Ke 0.000000 0.000000 0.000000 70 | Ni 1.000000 71 | d 1.000000 72 | illum 2 73 | 74 | newmtl Part__Mirroring_004_001 75 | Ns -1.960784 76 | Ka 1.000000 1.000000 1.000000 77 | Kd 0.898039 0.917647 0.929412 78 | Ks 0.031250 0.031250 0.031250 79 | Ke 0.000000 0.000000 0.000000 80 | Ni 1.000000 81 | d 1.000000 82 | illum 2 83 | -------------------------------------------------------------------------------- /Method/utils/vlm_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | 4 | # OpenAI API Key 5 | api_key = None 6 | # Function to encode the image 7 | def encode_image(image_path): 8 | with open(image_path, "rb") as image_file: 9 | return base64.b64encode(image_file.read()).decode('utf-8') 10 | 11 | def infer_path(prompt, path): 12 | # Getting the base64 string 13 | base64_image = encode_image(path) 14 | 15 | headers = { 16 | "Content-Type": "application/json", 17 | "Authorization": f"Bearer {api_key}" 18 | } 19 | 20 | payload = { 21 | "model": "gpt-4-vision-preview", 22 | "messages": [ 23 | { 24 | "role": "user", 25 | "content": [ 26 | { 27 | "type": "text", 28 | "text": prompt 29 | }, 30 | { 31 | "type": "image_url", 32 | "image_url": { 33 | "url": f"data:image/jpeg;base64,{base64_image}" 34 | } 35 | } 36 | ] 37 | } 38 | ], 39 | "max_tokens": 300 40 | } 41 | 42 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 43 | 44 | # print(response.json()) 45 | return response 46 | 47 | 48 | if __name__ == "__main__": 49 | prompt = "descripbe this image" 50 | path = "imgs/bana_cup_gsam_cup.jpg" 51 | response = infer_path(prompt, path) 52 | print(response.json()) 53 | # prompt_path = "pure_prompt.txt" 54 | # import os 55 | # os.makedirs("GPT4V-pure", exist_ok=True) 56 | # import glob, json, os 57 | # paths = glob.glob("result/*.png") 58 | # prompt_ori = open(prompt_path, "r").read() 59 | # total = len(paths) 60 | # for i, path in enumerate(paths): 61 | # name = path.split("/")[-1].split(".")[0] 62 | # print(name, i , total) 63 | # save_path = f"GPT4V-pure/{name}_pure.json" 64 | # if os.path.exists(save_path): 65 | # continue 66 | # # prompt = prompt_ori + open(f"pure_GAPartNet/{name}_pure_GAPartNet.txt", "r").read() 67 | # prompt = prompt_ori 68 | # response = infer_path(prompt, path) 69 | # json.dump(response.json(), open(save_path, "w")) 70 | # # import pdb; pdb.set_trace() -------------------------------------------------------------------------------- /Benchmark/task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/task_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "orientation": "behind", 3 | "rotation": "None", 4 | 
"selected_obj_names": [ 5 | "box", 6 | "apple" 7 | ], 8 | "selected_urdfs": [ 9 | "objaverse_final_norm/9660e0c0326b4f7386014e27717231ae/material_2.urdf", 10 | "objaverse_final_norm/f53d75bd123b40bca14d12d54286f432/material_2.urdf" 11 | ], 12 | "target_obj_name": "apple", 13 | "instruction": "Place the apple behind the box on the table. We also need to specify the rotation of the object after placement: the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.", 14 | "init_obj_pos": [ 15 | [ 16 | 0.5763212442398071, 17 | 0.24244019389152527, 18 | 0.3158315122127533, 19 | 0.00011814905155915767, 20 | 3.0217168387025595e-05, 21 | 0.057858873158693314, 22 | 0.9983247518539429, 23 | 0.0005872970796190202, 24 | 0.00024345181009266526, 25 | 1.8670303688850254e-05, 26 | 0.0013161733513697982, 27 | -0.0011025663698092103, 28 | -0.001989496871829033 29 | ], 30 | [ 31 | 0.4732729494571686, 32 | 0.19301258027553558, 33 | 0.34965574741363525, 34 | 0.08372167497873306, 35 | -0.015573234297335148, 36 | -0.0979083776473999, 37 | 0.9915453195571899, 38 | 0.004182836972177029, 39 | 0.0017127282917499542, 40 | -0.001595060108229518, 41 | -0.02539602667093277, 42 | 0.09032362699508667, 43 | 0.01703813299536705 44 | ] 45 | ], 46 | "position_instruction": "Place the apple behind the box on the table. ", 47 | "rotation_instruction": "We also need to specify the rotation of the object after placement: the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.", 48 | "rotation_instruction_label": "upright" 49 | } -------------------------------------------------------------------------------- /Method/isaacgym0/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os, glob 4 | import argparse 5 | import imageio 6 | from PIL import Image 7 | from isaacgym.torch_utils import * 8 | import torch 9 | import math 10 | import yaml 11 | 12 | def images_to_video(image_folder, video_path, frame_size=(1920, 1080), fps=30): 13 | images = sorted([img for img in os.listdir(image_folder) if img.endswith(".png") or img.endswith(".jpg") or img.endswith(".jpeg")]) 14 | 15 | if not images: 16 | print("No images found in the specified directory!") 17 | return 18 | 19 | writer = imageio.get_writer(video_path, fps=fps) 20 | 21 | for image in images: 22 | img_path = os.path.join(image_folder, image) 23 | img = imageio.imread(img_path) 24 | 25 | if img.shape[1] > frame_size[0] or img.shape[0] > frame_size[1]: 26 | print("Warning: frame size is smaller than the one of the images.") 27 | print("Images will be resized to match frame size.") 28 | img = np.array(Image.fromarray(img).resize(frame_size)) 29 | 30 | writer.append_data(img) 31 | 32 | writer.close() 33 | print("Video created successfully!") 34 | 35 | def quat_axis(q, axis=0): 36 | basis_vec = torch.zeros(q.shape[0], 3, device=q.device) 37 | basis_vec[:, axis] = 1 38 | return quat_rotate(q, basis_vec) 39 | 40 | 41 | def orientation_error(desired, current): 42 | cc = quat_conjugate(current) 43 | q_r = quat_mul(desired, cc) 44 | return q_r[:, 0:3] * torch.sign(q_r[:, 3]).unsqueeze(-1) 45 | 46 | 47 | def cube_grasping_yaw(q, corners): 48 | """ returns horizontal rotation required to grasp cube """ 49 | rc = quat_rotate(q, corners) 50 | yaw = (torch.atan2(rc[:, 1], rc[:, 0]) - 0.25 * math.pi) % (0.5 * math.pi) 51 | theta = 0.5 * yaw 52 | w = theta.cos() 53 | x = 
torch.zeros_like(w) 54 | y = torch.zeros_like(w) 55 | z = theta.sin() 56 | yaw_quats = torch.stack([x, y, z, w], dim=-1) 57 | return yaw_quats 58 | 59 | def read_yaml_config(file_path): 60 | with open(file_path, 'r') as file: 61 | # Load the YAML file into a Python dictionary 62 | config = yaml.safe_load(file) 63 | return config -------------------------------------------------------------------------------- /Benchmark/dataset/objects/scale.py: -------------------------------------------------------------------------------- 1 | import trimesh 2 | import os 3 | import json 4 | import math 5 | 6 | 7 | mesh_path = '/Users/selina/Desktop/projects/ObjectPlacement/assets/mesh/final_norm' 8 | category_path = '/Users/selina/Desktop/projects/Open6DOR/Benchmark/benchmark_catalogue/category_dictionary.json' 9 | object_path = '/Users/selina/Desktop/projects/Open6DOR/Benchmark/benchmark_catalogue/object_dictionary_complete_0702.json' 10 | new_path = "/Users/selina/Desktop/projects/Open6DOR/Benchmark/dataset/objects/rescale" 11 | 12 | category_dict = json.load(open(category_path, 'r')) 13 | object_dict = json.load(open(object_path, 'r')) 14 | for root, dirs, files in os.walk(mesh_path): 15 | for dir in dirs: 16 | try: 17 | obj_dir = os.path.join(root, dir) 18 | obj_name = dir 19 | if obj_name not in object_dict: 20 | continue 21 | obj_cat = object_dict[obj_name]['category'] 22 | obj_scale = category_dict[obj_cat]['scale'] 23 | obj_mesh = trimesh.load(os.path.join(mesh_path, dir) + '/material.obj') 24 | 25 | obj_mesh.apply_translation(-obj_mesh.centroid) 26 | 27 | if obj_mesh.bounding_box.extents.max() < 0.1: 28 | print(f"Object {obj_name} is too small") 29 | continue 30 | scale_factor = 0.7 * math.sqrt(obj_scale) / obj_mesh.bounding_box.extents.max() 31 | 32 | obj_mesh.apply_scale(scale_factor) 33 | if not os.path.exists(os.path.join(new_path, dir)): 34 | os.makedirs(os.path.join(new_path, dir), exist_ok=False) 35 | obj_mesh.export(os.path.join(new_path, dir) + '/material.obj') 36 | except: 37 | import pdb; pdb.set_trace() 38 | 39 | 40 | break 41 | 42 | # # Load a mesh from OBJ file 43 | # mesh = trimesh.load('/Users/selina/Desktop/projects/Open6DOR/Benchmark/dataset/objects/rescale/c61227cac7224b86b43c53ac2a2b6ec7/material.obj') 44 | 45 | # # Translate mesh to its centroid 46 | # mesh.apply_translation(-mesh.centroid) 47 | 48 | # # Scale the mesh (1 unit here) 49 | # # scale_factor = 1.0 / mesh.bounding_box.extents.max() 50 | # print(mesh.bounding_box.extents.max()) 51 | # # mesh.apply_scale(scale_factor) 52 | 53 | # # # save the new mesh to OBJ file 54 | # # mesh.export('2ab18cb4ec8f4a1f8dec637602362054.obj') -------------------------------------------------------------------------------- /Method/gym/vlm_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | import os 4 | 5 | # OpenAI API Key: never hard-code secrets in source; read the key from the environment 6 | api_key = os.getenv("API_KEY") 7 | if api_key is None: 8 | raise ValueError("please set API_KEY environment variable by running `export API_KEY=XXXX`") 9 | # Function to encode the image 10 | def encode_image(image_path): 11 | with open(image_path, "rb") as image_file: 12 | return base64.b64encode(image_file.read()).decode('utf-8') 13 | 14 | def infer_path(prompt, path): 15 | # Getting the base64 string 16 | base64_image = encode_image(path) 17 | 18 | headers = { 19 | "Content-Type": "application/json", 20 | "Authorization": 
f"Bearer {api_key}" 21 | } 22 | 23 | payload = { 24 | "model": "gpt-4-vision-preview", 25 | "messages": [ 26 | { 27 | "role": "user", 28 | "content": [ 29 | { 30 | "type": "text", 31 | "text": prompt 32 | }, 33 | { 34 | "type": "image_url", 35 | "image_url": { 36 | "url": f"data:image/jpeg;base64,{base64_image}" 37 | } 38 | } 39 | ] 40 | } 41 | ], 42 | "max_tokens": 300 43 | } 44 | 45 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 46 | 47 | # print(response.json()) 48 | return response 49 | 50 | 51 | if __name__ == "__main__": 52 | prompt = "descripbe this image" 53 | path = "imgs/bana_cup_gsam_cup.jpg" 54 | response = infer_path(prompt, path) 55 | print(response.json()) 56 | # prompt_path = "pure_prompt.txt" 57 | # import os 58 | # os.makedirs("GPT4V-pure", exist_ok=True) 59 | # import glob, json, os 60 | # paths = glob.glob("result/*.png") 61 | # prompt_ori = open(prompt_path, "r").read() 62 | # total = len(paths) 63 | # for i, path in enumerate(paths): 64 | # name = path.split("/")[-1].split(".")[0] 65 | # print(name, i , total) 66 | # save_path = f"GPT4V-pure/{name}_pure.json" 67 | # if os.path.exists(save_path): 68 | # continue 69 | # # prompt = prompt_ori + open(f"pure_GAPartNet/{name}_pure_GAPartNet.txt", "r").read() 70 | # prompt = prompt_ori 71 | # response = infer_path(prompt, path) 72 | # json.dump(response.json(), open(save_path, "w")) 73 | # # import pdb; pdb.set_trace() -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # checkpoint 134 | *.pth 135 | outputs/ 136 | 137 | .idea/ 138 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/link0.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 12 3 | 4 | newmtl Face636_001 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 0.901961 0.921569 0.929412 8 | Ks 0.125000 0.125000 0.125000 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | 14 | newmtl Part__Feature017_001 15 | Ns -1.960784 16 | Ka 1.000000 1.000000 1.000000 17 | Kd 1.000000 1.000000 1.000000 18 | Ks 0.500000 0.500000 0.500000 19 | Ke 0.000000 0.000000 0.000000 20 | Ni 1.000000 21 | d 1.000000 22 | illum 2 23 | 24 | newmtl Part__Feature018_001 25 | Ns -1.960784 26 | Ka 1.000000 1.000000 1.000000 27 | Kd 1.000000 1.000000 1.000000 28 | Ks 0.500000 0.500000 0.500000 29 | Ke 0.000000 0.000000 0.000000 30 | Ni 1.000000 31 | d 1.000000 32 | illum 2 33 | 34 | newmtl Part__Feature019_001 35 | Ns -1.960784 36 | Ka 1.000000 1.000000 1.000000 37 | Kd 1.000000 1.000000 1.000000 38 | Ks 0.125000 0.125000 0.125000 39 | Ke 0.000000 0.000000 0.000000 40 | Ni 1.000000 41 | d 1.000000 42 | illum 2 43 | 44 | newmtl Part__Feature022_001 45 | Ns -1.960784 46 | Ka 1.000000 1.000000 1.000000 47 | Kd 0.901961 0.921569 0.929412 48 | Ks 0.125000 0.125000 0.125000 49 | Ke 0.000000 0.000000 0.000000 50 | Ni 1.000000 51 | d 1.000000 52 | illum 2 53 | 54 | newmtl Part__Feature023_001 55 | Ns -1.960784 56 | Ka 1.000000 1.000000 1.000000 57 | Kd 0.250980 0.250980 0.250980 58 | Ks 0.125000 0.125000 0.125000 59 | Ke 0.000000 0.000000 0.000000 60 | Ni 1.000000 61 | d 1.000000 62 | illum 2 63 | 64 | newmtl Shell001_001 65 | Ns -1.960784 66 | Ka 1.000000 1.000000 1.000000 67 | Kd 0.250980 0.250980 0.250980 68 | Ks 0.125000 0.125000 0.125000 69 | Ke 0.000000 0.000000 0.000000 70 | Ni 1.000000 71 | d 1.000000 72 | illum 2 73 | 74 | newmtl Shell002_001 75 | Ns -1.960784 76 | Ka 1.000000 1.000000 1.000000 77 | Kd 0.901961 0.921569 0.929412 78 | Ks 0.125000 0.125000 0.125000 79 | Ke 0.000000 0.000000 0.000000 80 | Ni 1.000000 81 | d 1.000000 82 | illum 2 83 | 84 | newmtl Shell003_001 85 | Ns -1.960784 86 | Ka 1.000000 1.000000 1.000000 87 | Kd 0.901961 0.921569 0.929412 88 | Ks 0.125000 0.125000 0.125000 89 | Ke 0.000000 0.000000 0.000000 90 | Ni 1.000000 91 | d 1.000000 92 | illum 2 93 | 94 | newmtl Shell009_001 95 | Ns -1.960784 96 | Ka 1.000000 1.000000 1.000000 97 | Kd 0.250980 0.250980 0.250980 98 | Ks 0.125000 0.125000 0.125000 99 | Ke 0.000000 0.000000 0.000000 100 | Ni 1.000000 101 | d 1.000000 102 | illum 2 103 | 104 | newmtl Shell010_001 105 | Ns -1.960784 106 | Ka 1.000000 1.000000 1.000000 107 | Kd 0.901961 0.921569 0.929412 108 | Ks 0.125000 0.125000 0.125000 109 | Ke 0.000000 0.000000 0.000000 110 | Ni 1.000000 111 | d 1.000000 112 | 
illum 2 113 | 114 | newmtl Shell_001 115 | Ns -1.960784 116 | Ka 1.000000 1.000000 1.000000 117 | Kd 0.250980 0.250980 0.250980 118 | Ks 0.125000 0.125000 0.125000 119 | Ke 0.000000 0.000000 0.000000 120 | Ni 1.000000 121 | d 1.000000 122 | illum 2 123 | -------------------------------------------------------------------------------- /Method/README.md: -------------------------------------------------------------------------------- 1 | # Method Introduction 2 | 3 | 4 | 5 | ## Get Task 6 | A class that loads a task from a configuration file. It can be used to set up the simulation environment in IsaacGym, query task information, render, control the robot, etc. 7 | 8 | - _prepare_task: Load the simulation environment and gather task information 9 | 10 | - _init_gym: Initialize the gym environment 11 | 12 | - _setup_scene: Set up the scene 13 | 14 | - prepare_franka_asset: Load the Franka asset from `self.cfgs["asset"]["franka_asset_file"]` 15 | 16 | - _prepare_obj_assets: Load the object assets (table and objects) 17 | 18 | - _load_env: Load all assets into the environment and set up the scene 19 | 20 | - _init_observation: Initialize the observation space and the corresponding observation functions 21 | 22 | - refresh_observation: Get the observation dict from the environment 23 | 24 | - clean_up: Clean up the environment 25 | 26 | ## Open6DOR-GPT 27 | 28 | Install GroundedSAM: 29 | ``` 30 | cd Method/vision/GroundedSAM/GroundingDINO 31 | pip install -e . 32 | cd ../../../.. 33 | cd Method/vision/GroundedSAM/segment_anything 34 | pip install -e . 35 | cd ../../../.. 36 | ``` 37 | Additional system packages: 38 | ``` 39 | sudo apt update 40 | sudo apt install fonts-dejavu 41 | ``` 42 | 43 | If you encounter the error: 44 | ``` 45 | cannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' 46 | ``` 47 | try: 48 | ``` 49 | pip install --upgrade huggingface_hub 50 | ``` 51 | 52 | The SAM checkpoint is available [here](https://huggingface.co/spaces/abhishek/StableSAM/resolve/main/sam_vit_h_4b8939.pth) 53 | 54 | The GroundingDINO checkpoint is available [here](https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth) 55 | 56 | ## Task Generation 57 | 58 | The core code for task generation is in `Method/interaction.py`. The task generator is responsible for generating tasks for Open6DOR. 59 | 60 | #### Position Track 61 | ```bash 62 | python interaction.py --mode gen_task --task_root debug_gen_task_pos 63 | ``` 64 | 65 | 66 | #### Rotation Track 67 | ```bash 68 | python interaction.py --mode gen_task_pure_rot --task_root debug_gen_task_rot 69 | ``` 70 | 71 | #### 6DoF Track 72 | ```bash 73 | python interaction.py --mode gen_task_rot --task_root debug_gen_task_6dof 74 | ``` 75 | 76 | #### Large Dataset Generation 77 | If you want to generate a large dataset, you can use the following command: 78 | ```bash 79 | python run_multiple.py --f "YOUR COMMAND" --n YOUR_RUN_TIMES 80 | ``` 81 | 82 | #### Change Parameters 83 | You can change the parameters in `Method/interaction.py` to generate different tasks. 
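The same parameters can also be overridden in code before the environment or task generator is built. Below is a minimal sketch, assuming the `asset` config keys used in `test_gym.py` (`position_noise`, `rotation_noise`); adjust the key names to match your `config.yaml`:
```python
# Minimal sketch: tweak generation parameters programmatically instead of editing config.yaml.
# Assumes the helpers and config keys used elsewhere in this repo (see test_gym.py).
from gym.utils import read_yaml_config

cfgs = read_yaml_config("config.yaml")
cfgs["asset"]["position_noise"] = [0.1, 0.15]  # shrink the random x/y placement range
cfgs["asset"]["rotation_noise"] = 0            # disable random initial rotation
# ...then pass `cfgs` to the environment, e.g. ObjectGym(cfgs, ...) as in test_gym.py
```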
84 | 85 | ##### Object Number 86 | ```python 87 | if orientation == "center": 88 | selected_obj_num = np.random.randint(4, 5) 89 | elif orientation == "between": 90 | selected_obj_num = np.random.randint(3, 5) 91 | else: 92 | selected_obj_num = np.random.randint(2, 5) 93 | ``` 94 | 95 | ##### Object Position 96 | In config.yaml, you can change the object position range: 97 | ```yaml 98 | assets: 99 | position_noise: [0.2, 0.25] # x and y position random range, depends on the table size 100 | ``` 101 | 102 | -------------------------------------------------------------------------------- /Method/utils/get_assets.py: -------------------------------------------------------------------------------- 1 | 2 | import json, glob 3 | 4 | def get_assets_info(dataset_names): 5 | urdf_paths = [] 6 | obj_name = [] 7 | uuids = [] 8 | if "ycb" in dataset_names: 9 | # all the ycb urdf data 10 | json_dict = json.load(open("../Benchmark/benchmark_catalogue/object_dictionary_complete_0702.json")) 11 | all_uuid = json_dict.keys() 12 | 13 | #ycb_urdf_paths = glob.glob("assets/ycb_16k_backup/*/*.urdf") 14 | ycb_urdf_paths = glob.glob("benchmark/mesh/ycb/*/*.urdf") 15 | ycb_names = [urdf_path.split("/")[-2] for urdf_path in ycb_urdf_paths] 16 | ycb_obj_name = [" ".join(name.split("_")[1:-2]) for name in ycb_names] 17 | ycb_uuid = [urdf_path.split("/")[-2].split("_")[0] for urdf_path in ycb_urdf_paths] 18 | 19 | valid_idx = [i for i in range(len(ycb_uuid)) if ycb_uuid[i] in all_uuid] 20 | 21 | ycb_uuids = [ycb_uuid[i] for i in valid_idx] 22 | ycb_urdf_paths = [ycb_urdf_paths[i] for i in valid_idx] 23 | ycb_obj_name = [" ".join(json_dict[ycb_uuid[i]]['category'].split("_")) for i in valid_idx] 24 | urdf_paths+=ycb_urdf_paths 25 | obj_name+=ycb_obj_name 26 | uuids += ycb_uuids 27 | if "objaverse" in dataset_names: 28 | json_dict = json.load(open("../Benchmark/benchmark_catalogue/object_dictionary_complete_0702.json")) 29 | 30 | all_uuid = json_dict.keys() 31 | # all the objaverse data 32 | objaverse_urdf_paths = glob.glob("assets/objaverse_final_norm/*/*_2.urdf") 33 | objaverse_obj_uuid = [path.split("/")[-2] for path in objaverse_urdf_paths] 34 | 35 | valid_idx = [i for i in range(len(objaverse_obj_uuid)) if objaverse_obj_uuid[i] in all_uuid] 36 | objaverse_obj_uuids = [objaverse_obj_uuid[i] for i in valid_idx] 37 | objaverse_urdf_paths = [objaverse_urdf_paths[i] for i in valid_idx] 38 | objaverse_obj_name = [" ".join(json_dict[objaverse_obj_uuid[i]]['category'].split("_")) for i in valid_idx] 39 | urdf_paths+=objaverse_urdf_paths 40 | obj_name+=objaverse_obj_name 41 | uuids+=objaverse_obj_uuids 42 | if "objaverse_old" in dataset_names: 43 | json_dict = json.load(open("category_dictionary.json")) 44 | 45 | all_uuid = [] 46 | for key in json_dict.keys(): all_uuid+=json_dict[key]["object_uuids"] 47 | # all the objaverse data 48 | objaverse_urdf_paths = glob.glob("benchmark/mesh/objaverse_final_norm/*/*_2.urdf") 49 | objaverse_names = [urdf_path.split("/")[-2] for urdf_path in objaverse_urdf_paths] 50 | objaverse_obj_name = [" ".join(name.split("_")[1:]) for name in objaverse_names] 51 | objaverse_obj_uuid = [name.split("_")[0] for name in objaverse_names] 52 | valid_idx = [i for i in range(len(objaverse_obj_uuid)) if objaverse_obj_uuid[i] in all_uuid] 53 | objaverse_urdf_paths = [objaverse_urdf_paths[i] for i in valid_idx] 54 | objaverse_obj_name = [objaverse_obj_name[i] for i in valid_idx] 55 | # import pdb; pdb.set_trace() 56 | urdf_paths+=objaverse_urdf_paths 57 | obj_name+=objaverse_obj_name 58 | return 
urdf_paths,obj_name,uuids -------------------------------------------------------------------------------- /Method/test_gym.py: -------------------------------------------------------------------------------- 1 | # exit() 2 | import sys 3 | import os 4 | sys.path = [os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))] + sys.path 5 | sys.path = [os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))] + sys.path 6 | 7 | # import gym 8 | from gym.object_gym import ObjectGym 9 | from gym.utils import read_yaml_config 10 | 11 | import json, glob, random 12 | 13 | tag = "handle_right" 14 | # tag = "upside" 15 | anno_path = f"/home/haoran/Projects/Rearrangement/Open6DOR/Benchmark/benchmark_catalogue/annotation/annotation_{tag}.json" 16 | # anno_path = f"/home/haoran/Projects/Rearrangement/ObjectPlacement/rotation_anno/annotation_upright_1_.json" 17 | save_root_ = f"/home/haoran/Projects/Rearrangement/Open6DOR/anno_test/anno_images-final_-{tag}" 18 | anno_data = json.load(open(anno_path, 'r')) 19 | anno_keys = list(anno_data.keys()) 20 | # import pdb; pdb.set_trace() 21 | random.shuffle(anno_keys) 22 | for anno in anno_keys: 23 | # print(anno["object_name"], anno["upright"]) 24 | anno_data_i = anno_data[anno]['annotation'] 25 | obj_id = anno 26 | save_root = f"{save_root_}/{tag}-{obj_id}" 27 | # if os.path.exists(f"/home/haoran/Projects/Rearrangement/ObjectPlacement/rotation_anno/anno_images/upright-{obj_id}/task_config-rgb-0-0.png"): 28 | if os.path.exists(f"{save_root_}/{tag}-{obj_id}/task_config-rgb-0-0.png"): 29 | continue 30 | cfgs = read_yaml_config("config.yaml") 31 | 32 | if len(obj_id) > 10: # objaverse 33 | cfgs["asset"]["asset_files"] = [f"objaverse_final_norm/{obj_id}/material_2.urdf"] 34 | else: 35 | path = glob.glob(f"assets/ycb_16k_backup/{obj_id}*/{obj_id}*.urdf")[0] 36 | path_r = "/".join(path.split("/")[-3:]) 37 | cfgs["asset"]["asset_files"] = [path_r] 38 | if len(list(anno_data_i.keys())) > 1: 39 | import pdb; pdb.set_trace() 40 | try: 41 | quat_anno = anno_data_i[list(anno_data_i.keys())[0]]["quat"] 42 | except: 43 | continue 44 | if anno_data_i[list(anno_data_i.keys())[0]]["stage"] != 1 and anno_data_i[list(anno_data_i.keys())[0]]["stage"] != 2: 45 | import pdb; pdb.set_trace() 46 | 47 | cfgs["asset"]["obj_pose_ps"] = [[0.5, 0, 0.4]] 48 | try: 49 | cfgs["asset"]["obj_pose_rs"] = [[quat_anno[0][0], quat_anno[0][1], quat_anno[0][2],quat_anno[0][3],]] 50 | except: 51 | cfgs["asset"]["obj_pose_rs"] = [[quat_anno[0], quat_anno[1], quat_anno[2],quat_anno[3],]] 52 | 53 | cfgs["asset"]["position_noise"] = [0, 0] 54 | cfgs["asset"]["rotation_noise"] = 0 55 | # cfgs["asset"]["asset_files"] = [obj_id] 56 | # cfgs["asset"]["asset_files"] = anno["object_name"] 57 | gym = ObjectGym(cfgs, None, None, pre_steps = 0) 58 | 59 | print(list(anno_data_i.keys())[0]) 60 | gym.refresh_observation(get_visual_obs=False) 61 | # save_root = f"/home/haoran/Projects/Rearrangement/ObjectPlacement/rotation_anno/anno_images2/upright-{obj_id}" 62 | 63 | os.makedirs(save_root, exist_ok=True) 64 | points_envs, colors_envs, rgb_envs, depth_envs ,seg_envs, ori_points_envs, ori_colors_envs, pixel2pointid, pointid2pixel = gym.refresh_observation(get_visual_obs=True) 65 | gym.save_render(rgb_envs=rgb_envs, depth_envs=None, ori_points_env=None, ori_colors_env=None, points=None, colors=None, save_dir = save_root, save_name = "task_config") 66 | 67 | # gym.run_steps(1000) 68 | # import pdb; pdb.set_trace() 69 | gym.clean_up() 70 | 
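# Note on the config overrides above: position_noise = [0, 0], rotation_noise = 0, and
# obj_pose_rs set to the annotated quaternion pin each object at its labelled orientation,
# so the saved task_config-rgb-*.png renders show the annotation exactly; a fresh ObjectGym
# is created and released via gym.clean_up() for every annotation entry in the loop.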
-------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /Method/open6dor_gpt.py: -------------------------------------------------------------------------------- 1 | import json, imageio 2 | from gym.utils import read_yaml_config, prepare_gsam_model 3 | import numpy as np 4 | 5 | class Open6DOR_GPT: 6 | def __init__(self, cfgs): 7 | self.cfgs = cfgs 8 | self.device = cfgs["DEVICE"] 9 | self._prepare_ckpts() 10 | 11 | def _prepare_ckpts(self): 12 | # prepare gsam model 13 
| if self.cfgs["INFERENCE_GSAM"]: 14 | self._grounded_dino_model, self._sam_predictor = prepare_gsam_model(device=self.device) 15 | 16 | self._box_threshold = 0.3 17 | self._text_threshold = 0.25 18 | else: 19 | self._grounded_dino_model, self._sam_predictor = None, None 20 | 21 | def inference_vlm(self, prompt, image_path, print_ans = False): 22 | from gym.vlm_utils import infer_path 23 | # prepare vlm model 24 | response = infer_path(prompt, image_path) 25 | while 'choices' not in response.json(): 26 | response = infer_path(prompt, image_path) 27 | ans = response.json()['choices'][0]['message']['content'] 28 | if print_ans: 29 | print(ans) 30 | return ans 31 | 32 | def inference_gsam(self, image: np.ndarray = None, image_path: str = None, prompt = None): 33 | from vision.grounded_sam_demo import prepare_GroundedSAM_for_inference, inference_one_image 34 | if image is not None: 35 | masks = inference_one_image(image[..., :3], self._grounded_dino_model, self._sam_predictor, box_threshold=self._box_threshold, text_threshold=self._text_threshold, text_prompt=prompt, device=self.device) 36 | elif image_path is not None: 37 | image = imageio.imread(image_path) 38 | masks = inference_one_image(image[..., :3], self._grounded_dino_model, self._sam_predictor, box_threshold=self._box_threshold, text_threshold=self._text_threshold, text_prompt=prompt, device=self.device) 39 | return masks, image 40 | 41 | def inference_task(self, task_cfgs): 42 | # prepare task data 43 | task_data = self.prepare_task_data(task_cfgs) 44 | 45 | # inference 46 | pred_pose = self.inference(task_data, self._grounded_dino_model, self._sam_predictor) 47 | 48 | return pred_pose 49 | 50 | def test_vlm(): 51 | cfgs = read_yaml_config("config.yaml") 52 | open6dor_gpt = Open6DOR_GPT(cfgs=cfgs) 53 | prompt = "hello gpt, describe the image" 54 | image_path = "test_image.png" 55 | print("The ans is: ", open6dor_gpt.inference_vlm(prompt, image_path, print_ans=True)) 56 | print("vlm test passed!") 57 | import pdb; pdb.set_trace() 58 | 59 | def test_gsam(): 60 | image_path = "test_image.png" 61 | cfgs = read_yaml_config("config.yaml") 62 | open6dor_gpt = Open6DOR_GPT(cfgs=cfgs) 63 | masks, _image = open6dor_gpt.inference_gsam(image_path = image_path, prompt="calculator") 64 | _image[masks[0][0].cpu().numpy().astype(bool)] = 0 65 | imageio.imwrite("test_mask.png", _image) 66 | print("The mask is saved as test_mask.png, check it!") 67 | import pdb; pdb.set_trace() 68 | 69 | if __name__ == "__main__": 70 | # test_gsam() 71 | 72 | test_vlm() 73 | 74 | cfgs = read_yaml_config("config.yaml") 75 | task_cfgs_path = "/home/haoran/Projects/Rearrangement/Open6DOR/Method/tasks/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/task_config_new2.json" 76 | with open(task_cfgs_path, "r") as f: task_cfgs = json.load(f) 77 | 78 | open6dor_gpt = Open6DOR_GPT(cfgs=cfgs, task_cfgs=task_cfgs) 79 | -------------------------------------------------------------------------------- /assets/tasks/task_refine_6dof_example/behind/20240824-165044_no_interaction/task_config_new5.json: -------------------------------------------------------------------------------- 1 | { 2 | "selected_obj_names": [ 3 | "bottle", 4 | "tissue box", 5 | "apple" 6 | ], 7 | "selected_urdfs": [ 8 | "ycb_16k_backup/006_mustard_bottle_google_16k/006_mustard_bottle_google_16k.urdf", 9 | "objaverse_rescale/dc4c91abf45342b4bb8822f50fa162b2/material_2.urdf", 10 | "objaverse_rescale/fbda0b25f41f40958ea984f460e4770b/material_2.urdf" 11 | ], 12 | 
"target_obj_name": "apple", 13 | "instruction": "Place the apple behind the bottle on the table. We also need to specify the rotation of the object after placement: the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.", 14 | "init_obj_pos": [ 15 | [ 16 | 0.3738566040992737, 17 | 0.17327724397182465, 18 | 0.3028668463230133, 19 | -9.934652553056367e-06, 20 | 5.676249202224426e-06, 21 | -0.03726901113986969, 22 | 0.9993051886558533, 23 | 0.00011460757377790287, 24 | -0.0007374841370619833, 25 | -0.00024315444170497358, 26 | 0.01065916009247303, 27 | 0.000735661422368139, 28 | 0.0003395920793991536 29 | ], 30 | [ 31 | 0.4417206645011902, 32 | -0.3223787248134613, 33 | 0.3753006160259247, 34 | 0.7060578465461731, 35 | -0.03799350559711456, 36 | -0.037282224744558334, 37 | 0.7061506509780884, 38 | 5.145368413650431e-05, 39 | -0.00020104726718273014, 40 | 0.00014684736379422247, 41 | 0.003485196502879262, 42 | 5.90651725360658e-05, 43 | -0.0011944533325731754 44 | ], 45 | [ 46 | 0.5476366281509399, 47 | -0.07213471084833145, 48 | 0.3492423892021179, 49 | 0.1136084571480751, 50 | 0.050451405346393585, 51 | -0.0884791761636734, 52 | 0.9882911443710327, 53 | 5.16880136274267e-05, 54 | 0.0044308979995548725, 55 | -0.001778011559508741, 56 | -0.08829422295093536, 57 | -0.0050054253078997135, 58 | 0.015157821588218212 59 | ] 60 | ], 61 | "position_instruction": "Place the apple behind the bottle on the table. ", 62 | "rotation_instruction": "We also need to specify the rotation of the object after placement: the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.", 63 | "rotation_instruction_label": "upright", 64 | "obj_codes": [ 65 | "006", 66 | "dc4c91abf45342b4bb8822f50fa162b2", 67 | "fbda0b25f41f40958ea984f460e4770b" 68 | ], 69 | "target_obj_code": "fbda0b25f41f40958ea984f460e4770b", 70 | "anno_target": { 71 | "category": "apple", 72 | "annotation": { 73 | " the object is placed upright on the table and corresponds with how humans usually place the object, bottom down and top up.": { 74 | "quat": [ 75 | [ 76 | 0.7071067690849304, 77 | 0.0, 78 | 0.0, 79 | 0.7071067690849304 80 | ] 81 | ], 82 | "stage": 1, 83 | "axis": "z" 84 | } 85 | } 86 | }, 87 | "rot_tag_detail": "upright", 88 | "rot_tag_level": 0, 89 | "position_tag": "behind", 90 | "rotation_tag": "upright" 91 | } -------------------------------------------------------------------------------- /Method/vision/test_sam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import matplotlib.pyplot as plt 4 | import cv2 5 | import sys 6 | sys.path.append("..") 7 | 8 | from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator 9 | 10 | def show_anns(anns): 11 | if len(anns) == 0: 12 | return 13 | sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True) 14 | ax = plt.gca() 15 | ax.set_autoscale_on(False) 16 | 17 | img = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 4)) 18 | img[:,:,3] = 0 19 | for ann in sorted_anns: 20 | m = ann['segmentation'] 21 | color_mask = np.concatenate([np.random.random(3), [0.35]]) 22 | img[m] = color_mask 23 | ax.imshow(img) 24 | 25 | def show_mask(mask, ax, random_color=False): 26 | if random_color: 27 | color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) 28 | else: 29 | color = np.array([30/255, 144/255, 
255/255, 0.6]) 30 | h, w = mask.shape[-2:] 31 | mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) 32 | ax.imshow(mask_image) 33 | 34 | def show_points(coords, labels, ax, marker_size=375): 35 | pos_points = coords[labels==1] 36 | neg_points = coords[labels==0] 37 | ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25) 38 | ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25) 39 | 40 | def show_box(box, ax): 41 | x0, y0 = box[0], box[1] 42 | w, h = box[2] - box[0], box[3] - box[1] 43 | ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2)) 44 | 45 | image = cv2.imread('/home/haoran/Projects/ObjectPlacement/imgs/bana_cup.png') 46 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 47 | 48 | plt.figure(figsize=(10,10)) 49 | plt.imshow(image) 50 | plt.axis('on') 51 | plt.show() 52 | 53 | sam_checkpoint = "/home/haoran/Projects/ObjectPlacement/assets/ckpts/sam_vit_h_4b8939.pth" 54 | model_type = "vit_h" 55 | 56 | device = "cuda" 57 | 58 | sam = sam_model_registry[model_type](checkpoint=sam_checkpoint) 59 | sam.to(device=device) 60 | 61 | ############## Demo1: Mask Generation ################ 62 | mask_generator = SamAutomaticMaskGenerator(sam) 63 | 64 | masks = mask_generator.generate(image) 65 | print(len(masks)) 66 | print(masks[0].keys()) 67 | plt.figure(figsize=(20,20)) 68 | plt.imshow(image) 69 | show_anns(masks) 70 | plt.axis('off') 71 | plt.show() 72 | 73 | 74 | ############## Demo2: Mask Prediction with Input Point ################ 75 | predictor = SamPredictor(sam) 76 | 77 | predictor.set_image(image) 78 | 79 | input_point = np.array([[500, 375]]) 80 | input_label = np.array([1]) 81 | 82 | 83 | masks, scores, logits = predictor.predict( 84 | point_coords=input_point, 85 | point_labels=input_label, 86 | multimask_output=True, 87 | ) 88 | 89 | for i, (mask, score) in enumerate(zip(masks, scores)): 90 | plt.figure(figsize=(10,10)) 91 | plt.imshow(image) 92 | show_mask(mask, plt.gca()) 93 | show_points(input_point, input_label, plt.gca()) 94 | plt.title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18) 95 | plt.axis('off') 96 | plt.show() 97 | 98 | input_point = np.array([[500, 375], [1125, 625]]) 99 | input_label = np.array([1, 1]) 100 | 101 | mask_input = logits[np.argmax(scores), :, :] 102 | 103 | masks, _, _ = predictor.predict( 104 | point_coords=input_point, 105 | point_labels=input_label, 106 | mask_input=mask_input[None, :, :], 107 | multimask_output=False, 108 | ) -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/build_sam_hq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoderHQ, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_hq_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam_hq = build_sam_hq_vit_h 25 | 26 | 27 | def build_sam_hq_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_hq_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_hq_model_registry = { 48 | "default": build_sam_hq_vit_h, 49 | "vit_h": build_sam_hq_vit_h, 50 | "vit_l": build_sam_hq_vit_l, 51 | "vit_b": build_sam_hq_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoderHQ( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | vit_dim=encoder_embed_dim, 99 | ), 100 | pixel_mean=[123.675, 116.28, 103.53], 101 | pixel_std=[58.395, 57.12, 57.375], 102 | ) 103 | # sam.eval() 104 | if checkpoint is not None: 105 | with open(checkpoint, "rb") as f: 106 | state_dict = torch.load(f) 107 | info = sam.load_state_dict(state_dict, strict=False) 108 | print(info) 109 | 
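# Freeze all pretrained SAM weights below; only the HQ-SAM additions (hf_token, hf_mlp, compress_vit_feat, embedding_encoder, embedding_maskfeature) keep requires_grad=True for fine-tuning.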
for n, p in sam.named_parameters(): 110 | if 'hf_token' not in n and 'hf_mlp' not in n and 'compress_vit_feat' not in n and 'embedding_encoder' not in n and 'embedding_maskfeature' not in n: 111 | p.requires_grad = False 112 | 113 | return sam -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.whl 2 | anno_test 3 | Benchmark/renderer/envmap_lib 4 | Benchmark/renderer/blender-2.93.3* 5 | Benchmark/renderer/material_lib_v2.blend 6 | Benchmark/dataset/objects/rescale/ 7 | output/ 8 | # assets/ 9 | output_new/ 10 | results_overall/ 11 | *.zip 12 | *.DS_Store 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | *.DS_Store 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .nox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | *.py,cover 64 | .hypothesis/ 65 | .pytest_cache/ 66 | cover/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | db.sqlite3-journal 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | .pybuilder/ 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | # For a library or package, you might want to ignore these files since the code is 101 | # intended to run in multiple environments; otherwise, check them in: 102 | # .python-version 103 | 104 | # pipenv 105 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 106 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 107 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 108 | # install all needed dependencies. 109 | #Pipfile.lock 110 | 111 | # poetry 112 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 113 | # This is especially recommended for binary packages to ensure reproducibility, and is more 114 | # commonly ignored for libraries. 115 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 116 | #poetry.lock 117 | 118 | # pdm 119 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 120 | #pdm.lock 121 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 122 | # in version control. 123 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 124 | .pdm.toml 125 | .pdm-python 126 | .pdm-build/ 127 | 128 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 129 | __pypackages__/ 130 | 131 | # Celery stuff 132 | celerybeat-schedule 133 | celerybeat.pid 134 | 135 | # SageMath parsed files 136 | *.sage.py 137 | 138 | # Environments 139 | .env 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | .DS_Store 178 | .DS_Store 179 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/visual/link6.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 17 3 | 4 | newmtl Face064_002_001_002_001 5 | Ns -1.960784 6 | Ka 1.000000 1.000000 1.000000 7 | Kd 1.000000 0.000000 0.000000 8 | Ks 0.003906 0.003906 0.003906 9 | Ke 0.000000 0.000000 0.000000 10 | Ni 1.000000 11 | d 1.000000 12 | illum 2 13 | 14 | newmtl Face065_002_001_002_001 15 | Ns -1.960784 16 | Ka 1.000000 1.000000 1.000000 17 | Kd 0.000000 1.000000 0.000000 18 | Ks 0.003906 0.003906 0.003906 19 | Ke 0.000000 0.000000 0.000000 20 | Ni 1.000000 21 | d 1.000000 22 | illum 2 23 | 24 | newmtl Face374_002_001_002_001 25 | Ns -1.960784 26 | Ka 1.000000 1.000000 1.000000 27 | Kd 1.000000 1.000000 1.000000 28 | Ks 0.003906 0.003906 0.003906 29 | Ke 0.000000 0.000000 0.000000 30 | Ni 1.000000 31 | d 1.000000 32 | illum 2 33 | 34 | newmtl Face539_002_001_002_001 35 | Ns -1.960784 36 | Ka 1.000000 1.000000 1.000000 37 | Kd 0.250980 0.250980 0.250980 38 | Ks 0.003906 0.003906 0.003906 39 | Ke 0.000000 0.000000 0.000000 40 | Ni 1.000000 41 | d 1.000000 42 | illum 2 43 | 44 | newmtl Part__Feature001_009_001_002_001 45 | Ns -1.960784 46 | Ka 1.000000 1.000000 1.000000 47 | Kd 0.250980 0.250980 0.250980 48 | Ks 0.003906 0.003906 0.003906 49 | Ke 0.000000 0.000000 0.000000 50 | Ni 1.000000 51 | d 1.000000 52 | illum 2 53 | 54 | newmtl Part__Feature002_006_001_002_001 55 | Ns -1.960784 56 | Ka 1.000000 1.000000 1.000000 57 | Kd 0.250980 0.250980 0.250980 58 | Ks 0.003906 0.003906 0.003906 59 | Ke 0.000000 0.000000 0.000000 60 | Ni 1.000000 61 | d 1.000000 62 | illum 2 63 | 64 | newmtl Shell002_002_001_002_001 65 | Ns -1.960784 66 | Ka 1.000000 1.000000 1.000000 67 | Kd 1.000000 1.000000 1.000000 68 | Ks 0.003906 0.003906 0.003906 69 | Ke 0.000000 0.000000 0.000000 70 | Ni 1.000000 71 | d 1.000000 72 | illum 2 73 | 74 | newmtl Shell003_002_001_002_001 75 | Ns -1.960784 76 | Ka 1.000000 1.000000 1.000000 77 | Kd 1.000000 1.000000 1.000000 78 | Ks 0.003906 0.003906 0.003906 79 | Ke 0.000000 0.000000 0.000000 80 | Ni 1.000000 81 | d 1.000000 82 | illum 2 83 | 84 | newmtl Shell004_001_001_002_001 85 | Ns -1.960784 86 | Ka 1.000000 1.000000 1.000000 87 | Kd 1.000000 
1.000000 1.000000 88 | Ks 0.003906 0.003906 0.003906 89 | Ke 0.000000 0.000000 0.000000 90 | Ni 1.000000 91 | d 1.000000 92 | illum 2 93 | 94 | newmtl Shell005_001_001_002_001 95 | Ns -1.960784 96 | Ka 1.000000 1.000000 1.000000 97 | Kd 1.000000 1.000000 1.000000 98 | Ks 0.003906 0.003906 0.003906 99 | Ke 0.000000 0.000000 0.000000 100 | Ni 1.000000 101 | d 1.000000 102 | illum 2 103 | 104 | newmtl Shell006_003_002_001 105 | Ns -1.960784 106 | Ka 1.000000 1.000000 1.000000 107 | Kd 0.901961 0.921569 0.929412 108 | Ks 0.015625 0.015625 0.015625 109 | Ke 0.000000 0.000000 0.000000 110 | Ni 1.000000 111 | d 1.000000 112 | illum 2 113 | 114 | newmtl Shell007_002_002_001 115 | Ns -1.960784 116 | Ka 1.000000 1.000000 1.000000 117 | Kd 0.250000 0.250000 0.250000 118 | Ks 0.003906 0.003906 0.003906 119 | Ke 0.000000 0.000000 0.000000 120 | Ni 1.000000 121 | d 1.000000 122 | illum 2 123 | 124 | newmtl Shell011_002_002_001 125 | Ns -1.960784 126 | Ka 1.000000 1.000000 1.000000 127 | Kd 1.000000 1.000000 1.000000 128 | Ks 0.003906 0.003906 0.003906 129 | Ke 0.000000 0.000000 0.000000 130 | Ni 1.000000 131 | d 1.000000 132 | illum 2 133 | 134 | newmtl Shell012_002_002_001 135 | Ns -1.960784 136 | Ka 1.000000 1.000000 1.000000 137 | Kd 1.000000 1.000000 1.000000 138 | Ks 0.003906 0.003906 0.003906 139 | Ke 0.000000 0.000000 0.000000 140 | Ni 1.000000 141 | d 1.000000 142 | illum 2 143 | 144 | newmtl Shell_003_001_002_001 145 | Ns -1.960784 146 | Ka 1.000000 1.000000 1.000000 147 | Kd 0.250980 0.250980 0.250980 148 | Ks 0.003906 0.003906 0.003906 149 | Ke 0.000000 0.000000 0.000000 150 | Ni 1.000000 151 | d 1.000000 152 | illum 2 153 | 154 | newmtl Union001_001_001_002_001 155 | Ns -1.960784 156 | Ka 1.000000 1.000000 1.000000 157 | Kd 0.039216 0.541176 0.780392 158 | Ks 0.003906 0.003906 0.003906 159 | Ke 0.000000 0.000000 0.000000 160 | Ni 1.000000 161 | d 1.000000 162 | illum 2 163 | 164 | newmtl Union_001_001_002_001 165 | Ns -1.960784 166 | Ka 1.000000 1.000000 1.000000 167 | Kd 0.039216 0.541176 0.780392 168 | Ks 0.003906 0.003906 0.003906 169 | Ke 0.000000 0.000000 0.000000 170 | Ni 1.000000 171 | d 1.000000 172 | illum 2 173 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 
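Returns the image resized so that its longest side equals target_length, preserving the aspect ratio.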
29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 
97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /assets/robot/franka_description/meshes/collision/finger.obj: -------------------------------------------------------------------------------- 1 | #### 2 | # 3 | # OBJ File Generated by Meshlab 4 | # 5 | #### 6 | # Object finger.obj 7 | # 8 | # Vertices: 52 9 | # Faces: 32 10 | # 11 | #### 12 | vn 0.999991 0.003723 -0.001919 13 | v 0.010360 0.026403 0.000155 14 | vn 0.019341 -0.997893 -0.061925 15 | v 0.010449 0.002583 0.000147 16 | vn -0.999568 -0.025962 0.013789 17 | v -0.010387 0.002534 0.000132 18 | vn -0.999606 -0.009503 0.026403 19 | v -0.010479 0.016102 0.018988 20 | vn -0.000579 0.001464 -0.999999 21 | v -0.010401 0.026309 0.000167 22 | vn -0.044737 0.976483 0.210900 23 | v -0.010389 0.025220 0.019188 24 | vn -0.871286 -0.490748 0.005227 25 | v -0.008730 -0.000024 0.036165 26 | vn 0.999861 0.006488 0.015354 27 | v 0.010400 0.025253 0.019037 28 | vn 0.377718 0.867563 0.323518 29 | v 0.005840 0.014274 0.053803 30 | vn 0.736099 -0.021564 0.676530 31 | v 0.008616 0.013989 0.051328 32 | vn 0.999373 -0.008600 0.034345 33 | v 0.010495 0.015103 0.018436 34 | vn 0.013041 -0.999896 -0.006124 35 | v 0.008693 -0.000133 0.050166 36 | vn -0.998603 -0.032800 0.041418 37 | v -0.008623 -0.000057 0.050953 38 | vn -0.588468 -0.017705 0.808327 39 | v -0.005481 -0.000091 0.053725 40 | vn 0.004085 -0.008700 0.999954 41 | v -0.005278 0.014293 0.053849 42 | vn -0.691057 -0.012018 0.722700 43 | v -0.007778 0.014218 0.052366 44 | vn -0.665951 0.690851 0.281486 45 | v -0.008841 0.013918 0.050589 46 | vn 0.736099 -0.021564 0.676530 47 | v 0.006138 -0.000021 0.053578 48 | vn -0.002818 0.998255 0.058981 49 | v 0.010360 0.026403 0.000155 50 | vn 0.000073 0.000898 -1.000000 51 | v 0.010360 0.026403 0.000155 52 | vn 0.999898 -0.012431 0.007036 53 | v 0.010449 0.002583 0.000147 54 | vn 0.000724 0.000331 -1.000000 55 | v 0.010449 0.002583 0.000147 56 | vn -0.871286 -0.490748 0.005227 57 | v -0.010387 0.002534 0.000132 58 | vn 0.002403 -0.997480 -0.070914 59 | v -0.010387 0.002534 0.000132 60 | vn 0.000073 0.000898 -1.000000 61 | v -0.010387 0.002534 0.000132 62 | vn -0.004486 0.998354 0.057168 63 | v -0.010401 0.026309 0.000167 64 | vn -0.999988 0.004662 -0.001626 65 | v -0.010401 0.026309 0.000167 66 | vn -0.665951 0.690851 0.281486 67 | v -0.010389 0.025220 0.019188 68 | vn -0.999597 0.009346 0.026807 69 | v -0.010389 0.025220 0.019188 70 | vn 0.006493 -0.999457 -0.032313 71 | v -0.008730 -0.000024 0.036165 72 | vn 0.377718 0.867563 0.323518 73 | v 0.010400 0.025253 0.019037 74 | vn -0.000242 0.983230 0.182372 75 | v 0.010400 0.025253 0.019037 76 | vn 0.665647 0.002096 0.746264 77 | v 0.005840 0.014274 0.053803 78 | vn 0.008418 -0.012115 0.999891 79 | v 0.005840 0.014274 0.053803 80 | vn 0.001757 0.953702 0.300749 81 | v 0.005840 0.014274 0.053803 82 | vn 0.377718 0.867563 0.323518 83 | v 0.008616 0.013989 0.051328 84 | vn 0.998361 0.003310 0.057136 85 | v 0.008616 0.013989 0.051328 86 | vn 0.798906 -0.045001 0.599770 87 | v 0.008693 -0.000133 0.050166 88 | vn 0.998687 -0.025065 0.044683 89 | v 0.008693 -0.000133 0.050166 90 | vn -0.769031 -0.017753 0.638965 91 | v -0.008623 -0.000057 0.050953 92 | vn -0.008996 -0.999957 -0.002185 93 | v -0.008623 -0.000057 0.050953 94 | vn -0.871286 -0.490748 0.005227 95 | v -0.008623 -0.000057 0.050953 96 
| vn 0.008418 -0.012115 0.999891 97 | v -0.005481 -0.000091 0.053725 98 | vn -0.002059 -0.999940 0.010793 99 | v -0.005481 -0.000091 0.053725 100 | vn -0.510143 -0.000217 0.860089 101 | v -0.005278 0.014293 0.053849 102 | vn -0.108731 0.943365 0.313433 103 | v -0.005278 0.014293 0.053849 104 | vn -0.665951 0.690851 0.281486 105 | v -0.007778 0.014218 0.052366 106 | vn -0.218924 0.920873 0.322590 107 | v -0.007778 0.014218 0.052366 108 | vn -0.858159 -0.000049 0.513385 109 | v -0.008841 0.013918 0.050589 110 | vn -0.998665 -0.002749 0.051583 111 | v -0.008841 0.013918 0.050589 112 | vn 0.006542 -0.999267 0.037718 113 | v 0.006138 -0.000021 0.053578 114 | vn 0.012751 -0.015529 0.999798 115 | v 0.006138 -0.000021 0.053578 116 | # 52 vertices, 0 vertices normals 117 | 118 | f 20//20 22//22 25//25 119 | f 3//3 4//4 27//27 120 | f 27//27 4//4 29//29 121 | f 2//2 30//30 24//24 122 | f 32//32 6//6 35//35 123 | f 25//25 5//5 20//20 124 | f 37//37 11//11 8//8 125 | f 11//11 39//39 21//21 126 | f 37//37 39//39 11//11 127 | f 42//42 23//23 7//7 128 | f 2//2 12//12 30//30 129 | f 12//12 44//44 30//30 130 | f 8//8 11//11 21//21 131 | f 8//8 21//21 1//1 132 | f 32//32 19//19 6//6 133 | f 6//6 46//46 35//35 134 | f 48//48 46//46 6//6 135 | f 40//40 14//14 16//16 136 | f 3//3 13//13 4//4 137 | f 31//31 9//9 36//36 138 | f 19//19 26//26 6//6 139 | f 4//4 50//50 29//29 140 | f 17//17 47//47 28//28 141 | f 34//34 43//43 52//52 142 | f 15//15 43//43 34//34 143 | f 12//12 51//51 44//44 144 | f 18//18 38//38 10//10 145 | f 44//44 41//41 30//30 146 | f 16//16 14//14 45//45 147 | f 13//13 50//50 4//4 148 | f 18//18 10//10 33//33 149 | f 16//16 49//49 40//40 150 | # 32 faces, 0 coords texture 151 | 152 | # End of File 153 | -------------------------------------------------------------------------------- /Benchmark/evaluation/evaluator.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the evaluation metrics for Open6DOR Benchmark. 3 | We are currently refining the rotation eval section for fairer evaluation and easier comparison. 4 | Full version coming soon. 5 | """ 6 | import numpy as np 7 | import math 8 | from scipy.spatial.transform import Rotation as R 9 | 10 | 11 | 12 | def projection(rot_mat_A, rot_mat_B, axis): 13 | """ 14 | Project the relative rotation from A to B onto the axis. 
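The returned value is the angle, in degrees, between the axis and its image under the relative rotation B * A^(-1).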
15 | rot_mat: 3x3 rotation matrix 16 | A: ground truth rotation 17 | B: predicted rotation 18 | axis: 3x1 vector 19 | """ 20 | det = np.linalg.det(rot_mat_A) 21 | assert det != 0 # rotation matrix should have determinant +1 or -1 22 | v = np.linalg.inv(rot_mat_A) @ axis 23 | 24 | w = rot_mat_B @ v 25 | angle = np.arccos(np.dot(axis, w) / (np.linalg.norm(axis) * np.linalg.norm(w))) 26 | return np.degrees(angle) 27 | 28 | # quat_gt = [0.884556,-0.093848,-0.436286,0.135678] 29 | quat_gt = [-0.205673,-0.205673,-0.596955,0.772278] 30 | rot_gt = R.from_quat(quat_gt).as_matrix() 31 | # quat_pred = [0.972568,-0.128846,-0.164,0.103027] 32 | # quat_pred = [0.546952,-0.013245,-0.820748,0.16444] 33 | # quat_pred = [0.450043,-0.310077,-0.760036,0.351651] 34 | # quat_pred = [0.270194,-0.590044,-0.570659,0.503183] 35 | 36 | # quat_pred = [0.166216,-0.492937,-0.609121,0.59863] 37 | # quat_pred = [-0.058748,-0.690237,-0.377434,-0.377434] 38 | 39 | 40 | 41 | quat_pred = [0.107351,-0.684364,-0.220191,0.68676] 42 | rot_pred = R.from_quat(quat_pred).as_matrix() 43 | ax = "y" 44 | axis = ax 45 | if ax == "x": 46 | axis = np.array([1, 0, 0]) 47 | elif ax == "y": 48 | axis = np.array([0, 1, 0]) 49 | elif ax == "z": 50 | axis = np.array([0, 0, 1]) 51 | 52 | # if isinstance(axis, np.ndarray): 53 | # deviation = projection(rot_gt, rot_pred, axis) 54 | # print(f"Deviation along axis {axis}: {deviation}") 55 | 56 | 57 | def normalize_quat(quat): 58 | norm = math.sqrt(sum(q ** 2 for q in quat)) 59 | return [q / norm for q in quat] 60 | 61 | def angle_deviation(quat0, quat1): 62 | # Normalize the quaternions 63 | quat0 = normalize_quat(quat0) 64 | quat1 = normalize_quat(quat1) 65 | 66 | # Compute the dot product of the two quaternions 67 | dot_product = sum(q0 * q1 for q0, q1 in zip(quat0, quat1)) 68 | 69 | # Ensure the dot product is within the range [-1, 1] to avoid numerical errors 70 | dot_product = max(-1.0, min(1.0, dot_product)) 71 | 72 | # Compute the angle deviation (in radians) 73 | angle_deviation = 2 * math.acos(dot_product) 74 | 75 | # Convert the angle deviation to degrees 76 | angle_deviation_degrees = math.degrees(angle_deviation) 77 | 78 | return angle_deviation_degrees 79 | 80 | # # Example usage 81 | # quat0 = [0.7071, 0.0, 0.7071, 0.0] # Example quaternion 0 82 | # quat1 = [0.7, 0.0, 0.9, 0.0] # Example quaternion 1 83 | 84 | # angle_deviation = angle_deviation(quat0, quat1) 85 | # print(f"Angle deviation: {angle_deviation} degrees") 86 | 87 | 88 | 89 | def evaluate_rot(quat_gt, quat_pred): 90 | """ 91 | Evaluate the predicted rotation. 
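Returns the deviation in degrees: when the task defines a reference axis, the deviation of that axis direction between ground truth and prediction; otherwise the full quaternion angle difference.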
92 | task_id: str 93 | quat_pred: list of 4 floats 94 | """ 95 | # load the ground truth quaternion 96 | 97 | rot_gt = R.from_quat(quat_gt).as_matrix() 98 | rot_pred = R.from_quat(quat_pred).as_matrix() 99 | task_level = 0#TODO: load task level from the dataset 100 | obj_category = 0#TODO: load object category from the dataset 101 | if task_level == 0: 102 | ax = "z" 103 | elif task_level == 1: 104 | ax = "y" 105 | if obj_category in ["mug", "binder_clips", "toy", "wallet", "headphone"] : 106 | ax = "n" 107 | elif task_level == 2: 108 | ax = 0#TODO: load axis from the dataset 109 | else: 110 | raise ValueError(f"Invalid task level: {task_level}") 111 | axis = ax 112 | if ax == "x": 113 | axis = np.array([1, 0, 0]) 114 | elif ax == "y": 115 | axis = np.array([0, 1, 0]) 116 | elif ax == "z": 117 | axis = np.array([0, 0, 1]) 118 | 119 | deviation = -1 120 | if isinstance(axis, np.ndarray): 121 | deviation = projection(rot_gt, rot_pred, axis) 122 | else: 123 | deviation = angle_deviation(quat_gt, quat_pred) 124 | 125 | return deviation 126 | 127 | 128 | def evaluate_posi(sel_pos, tar_pos, mode): 129 | """ 130 | Evaluate the predicted position. 131 | """ 132 | if mode in ["left", "right", "front", "back", "behind", "top"]: 133 | if mode == "left": 134 | succ += sel_pos[1] > tar_pos[1] 135 | elif mode == "right": 136 | succ += sel_pos[1] < tar_pos[1] 137 | elif mode == "front": 138 | succ += sel_pos[0] > tar_pos[0] 139 | elif mode == "back" or mode == "behind": 140 | succ += sel_pos[0] < tar_pos[0] 141 | elif mode == "top": 142 | succ += sel_pos[2] <= tar_pos[2] 143 | elif mode == "between": 144 | max_sel_pos_x = np.max([sel_pos_1[0], sel_pos_2[0]]) 145 | max_sel_pos_y = np.max([sel_pos_1[1], sel_pos_2[1]]) 146 | min_sel_pos_x = np.min([sel_pos_1[0], sel_pos_2[0]]) 147 | min_sel_pos_y = np.min([sel_pos_1[1], sel_pos_2[1]]) 148 | tar_pos = result["final_obj_pos"][-1] 149 | succ += (min_sel_pos_x < tar_pos[0] < max_sel_pos_x) or (min_sel_pos_y < tar_pos[0] < max_sel_pos_y) 150 | elif mode == "center": 151 | max_sel_pos_x = np.max(sel_pos_all, axis=0)[0] 152 | min_sel_pos_x = np.min(sel_pos_all, axis=0)[0] 153 | max_sel_pos_y = np.max(sel_pos_all, axis=0)[1] 154 | min_sel_pos_y = np.min(sel_pos_all, axis=0)[1] 155 | succ += (min_sel_pos_x < tar_pos[0] < max_sel_pos_x) and (min_sel_pos_y < tar_pos[1] < max_sel_pos_y) -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/README.md: -------------------------------------------------------------------------------- 1 | # Segment Anything 2 | 3 | **[Meta AI Research, FAIR](https://ai.facebook.com/research/)** 4 | 5 | [Alexander Kirillov](https://alexander-kirillov.github.io/), [Eric Mintun](https://ericmintun.github.io/), [Nikhila Ravi](https://nikhilaravi.com/), [Hanzi Mao](https://hanzimao.me/), Chloe Rolland, Laura Gustafson, [Tete Xiao](https://tetexiao.com), [Spencer Whitehead](https://www.spencerwhitehead.com/), Alex Berg, Wan-Yen Lo, [Piotr Dollar](https://pdollar.github.io/), [Ross Girshick](https://www.rossgirshick.info/) 6 | 7 | [[`Paper`](https://ai.facebook.com/research/publications/segment-anything/)] [[`Project`](https://segment-anything.com/)] [[`Demo`](https://segment-anything.com/demo)] [[`Dataset`](https://segment-anything.com/dataset/index.html)] [[`Blog`](https://ai.facebook.com/blog/segment-anything-foundation-model-image-segmentation/)] 8 | 9 | ![SAM design](assets/model_diagram.png?raw=true) 10 | 11 | The **Segment Anything Model (SAM)** produces high quality 
object masks from input prompts such as points or boxes, and it can be used to generate masks for all objects in an image. It has been trained on a [dataset](https://segment-anything.com/dataset/index.html) of 11 million images and 1.1 billion masks, and has strong zero-shot performance on a variety of segmentation tasks. 12 | 13 |


16 | 17 | 18 | ## Installation 19 | 20 | The code requires `python>=3.8`, as well as `pytorch>=1.7` and `torchvision>=0.8`. Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install both PyTorch and TorchVision dependencies. Installing both PyTorch and TorchVision with CUDA support is strongly recommended. 21 | 22 | Install Segment Anything: 23 | 24 | ``` 25 | pip install git+https://github.com/facebookresearch/segment-anything.git 26 | ``` 27 | 28 | or clone the repository locally and install with 29 | 30 | ``` 31 | git clone git@github.com:facebookresearch/segment-anything.git 32 | cd segment-anything; pip install -e . 33 | ``` 34 | 35 | The following optional dependencies are necessary for mask post-processing, saving masks in COCO format, the example notebooks, and exporting the model in ONNX format. `jupyter` is also required to run the example notebooks. 36 | ``` 37 | pip install opencv-python pycocotools matplotlib onnxruntime onnx 38 | ``` 39 | 40 | 41 | ## Getting Started 42 | 43 | First download a [model checkpoint](#model-checkpoints). Then the model can be used in just a few lines to get masks from a given prompt: 44 | 45 | ``` 46 | from segment_anything import build_sam, SamPredictor 47 | predictor = SamPredictor(build_sam(checkpoint="<path/to/checkpoint>")) 48 | predictor.set_image(<your_image>) 49 | masks, _, _ = predictor.predict(<input_prompts>) 50 | ``` 51 | 52 | or generate masks for an entire image: 53 | 54 | ``` 55 | from segment_anything import build_sam, SamAutomaticMaskGenerator 56 | mask_generator = SamAutomaticMaskGenerator(build_sam(checkpoint="<path/to/checkpoint>")) 57 | masks = mask_generator.generate(<your_image>) 58 | ``` 59 | 60 | Additionally, masks can be generated for images from the command line: 61 | 62 | ``` 63 | python scripts/amg.py --checkpoint <path/to/checkpoint> --input <image_or_folder> --output <path/to/output> 64 | ``` 65 | 66 | See the example notebooks on [using SAM with prompts](/notebooks/predictor_example.ipynb) and [automatically generating masks](/notebooks/automatic_mask_generator_example.ipynb) for more details. 67 | 68 |
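For a fuller, runnable variant of the snippets above, mirroring how this repository's `Method/vision/test_sam.py` drives SAM (the image path, checkpoint filename, and CUDA device below are placeholder assumptions, not part of the original README):

```
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamPredictor

# Placeholder paths -- point these at your own image and downloaded checkpoint.
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
sam.to(device="cuda")
predictor = SamPredictor(sam)

image = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)
predictor.set_image(image)

# A single positive point prompt in pixel coordinates.
masks, scores, logits = predictor.predict(
    point_coords=np.array([[500, 375]]),
    point_labels=np.array([1]),
    multimask_output=True,
)
```

With `multimask_output=True`, `masks` holds three candidate masks of shape (H, W), each with its own confidence score in `scores`.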


72 | 73 | ## ONNX Export 74 | 75 | SAM's lightweight mask decoder can be exported to ONNX format so that it can be run in any environment that supports ONNX runtime, such as in-browser as showcased in the [demo](https://segment-anything.com/demo). Export the model with 76 | 77 | ``` 78 | python scripts/export_onnx_model.py --checkpoint --output 79 | ``` 80 | 81 | See the [example notebook](https://github.com/facebookresearch/segment-anything/blob/main/notebooks/onnx_model_example.ipynb) for details on how to combine image preprocessing via SAM's backbone with mask prediction using the ONNX model. It is recommended to use the latest stable version of PyTorch for ONNX export. 82 | 83 | ## Model Checkpoints 84 | 85 | Three model versions of the model are available with different backbone sizes. These models can be instantiated by running 86 | ``` 87 | from segment_anything import sam_model_registry 88 | sam = sam_model_registry[""](checkpoint="") 89 | ``` 90 | Click the links below to download the checkpoint for the corresponding model name. The default model in bold can also be instantiated with `build_sam`, as in the examples in [Getting Started](#getting-started). 91 | 92 | * **`default` or `vit_h`: [ViT-H SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth)** 93 | * `vit_l`: [ViT-L SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth) 94 | * `vit_b`: [ViT-B SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth) 95 | 96 | ## License 97 | The model is licensed under the [Apache 2.0 license](LICENSE). 98 | 99 | ## Contributing 100 | 101 | See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md). 102 | 103 | ## Contributors 104 | 105 | The Segment Anything project was made possible with the help of many contributors (alphabetical): 106 | 107 | Aaron Adcock, Vaibhav Aggarwal, Morteza Behrooz, Cheng-Yang Fu, Ashley Gabriel, Ahuva Goldstand, Allen Goodman, Sumanth Gurram, Jiabo Hu, Somya Jain, Devansh Kukreja, Robert Kuo, Joshua Lane, Yanghao Li, Lilian Luong, Jitendra Malik, Mallika Malhotra, William Ngan, Omkar Parkhi, Nikhil Raina, Dirk Rowe, Neil Sejoor, Vanessa Stark, Bala Varadarajan, Bram Wasti, Zachary Winstrom 108 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/utils/onnx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Tuple 12 | 13 | from ..modeling import Sam 14 | from .amg import calculate_stability_score 15 | 16 | 17 | class SamOnnxModel(nn.Module): 18 | """ 19 | This model should not be called directly, but is used in ONNX export. 20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam, 21 | with some functions modified to enable model tracing. Also supports extra 22 | options controlling what information. See the ONNX export script for details. 
23 | """ 24 | 25 | def __init__( 26 | self, 27 | model: Sam, 28 | return_single_mask: bool, 29 | use_stability_score: bool = False, 30 | return_extra_metrics: bool = False, 31 | ) -> None: 32 | super().__init__() 33 | self.mask_decoder = model.mask_decoder 34 | self.model = model 35 | self.img_size = model.image_encoder.img_size 36 | self.return_single_mask = return_single_mask 37 | self.use_stability_score = use_stability_score 38 | self.stability_score_offset = 1.0 39 | self.return_extra_metrics = return_extra_metrics 40 | 41 | @staticmethod 42 | def resize_longest_image_size( 43 | input_image_size: torch.Tensor, longest_side: int 44 | ) -> torch.Tensor: 45 | input_image_size = input_image_size.to(torch.float32) 46 | scale = longest_side / torch.max(input_image_size) 47 | transformed_size = scale * input_image_size 48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) 49 | return transformed_size 50 | 51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: 52 | point_coords = point_coords + 0.5 53 | point_coords = point_coords / self.img_size 54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) 55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) 56 | 57 | point_embedding = point_embedding * (point_labels != -1) 58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( 59 | point_labels == -1 60 | ) 61 | 62 | for i in range(self.model.prompt_encoder.num_point_embeddings): 63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ 64 | i 65 | ].weight * (point_labels == i) 66 | 67 | return point_embedding 68 | 69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: 70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) 71 | mask_embedding = mask_embedding + ( 72 | 1 - has_mask_input 73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) 74 | return mask_embedding 75 | 76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: 77 | masks = F.interpolate( 78 | masks, 79 | size=(self.img_size, self.img_size), 80 | mode="bilinear", 81 | align_corners=False, 82 | ) 83 | 84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size) 85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])] 86 | 87 | orig_im_size = orig_im_size.to(torch.int64) 88 | h, w = orig_im_size[0], orig_im_size[1] 89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False) 90 | return masks 91 | 92 | def select_masks( 93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int 94 | ) -> Tuple[torch.Tensor, torch.Tensor]: 95 | # Determine if we should return the multiclick mask or not from the number of points. 96 | # The reweighting is used to avoid control flow. 
97 | score_reweight = torch.tensor( 98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)] 99 | ).to(iou_preds.device) 100 | score = iou_preds + (num_points - 2.5) * score_reweight 101 | best_idx = torch.argmax(score, dim=1) 102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) 103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) 104 | 105 | return masks, iou_preds 106 | 107 | @torch.no_grad() 108 | def forward( 109 | self, 110 | image_embeddings: torch.Tensor, 111 | point_coords: torch.Tensor, 112 | point_labels: torch.Tensor, 113 | mask_input: torch.Tensor, 114 | has_mask_input: torch.Tensor, 115 | orig_im_size: torch.Tensor, 116 | ): 117 | sparse_embedding = self._embed_points(point_coords, point_labels) 118 | dense_embedding = self._embed_masks(mask_input, has_mask_input) 119 | 120 | masks, scores = self.model.mask_decoder.predict_masks( 121 | image_embeddings=image_embeddings, 122 | image_pe=self.model.prompt_encoder.get_dense_pe(), 123 | sparse_prompt_embeddings=sparse_embedding, 124 | dense_prompt_embeddings=dense_embedding, 125 | ) 126 | 127 | if self.use_stability_score: 128 | scores = calculate_stability_score( 129 | masks, self.model.mask_threshold, self.stability_score_offset 130 | ) 131 | 132 | if self.return_single_mask: 133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) 134 | 135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size) 136 | 137 | if self.return_extra_metrics: 138 | stability_scores = calculate_stability_score( 139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset 140 | ) 141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) 142 | return upscaled_masks, scores, stability_scores, areas, masks 143 | 144 | return upscaled_masks, scores, masks 145 | -------------------------------------------------------------------------------- /Benchmark/bench.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import imageio.v2 as imageio 4 | import os 5 | import argparse 6 | from evaluation import evaluator 7 | import yaml 8 | 9 | mesh_root = "meshes" 10 | def load_task(task_path, bench_config): 11 | # task_config 12 | task_config = json.load(open(task_path, 'r')) 13 | 14 | # task_instruction 15 | task_instruction = task_config["instruction"] 16 | print("instruction:", task_instruction) 17 | 18 | # task_image 19 | if bench_config["image_mode"] == "GIVEN_IMAGE_ISAACGYM": 20 | image_path = task_path.replace("task_config.json", "before-rgb-0-0.png") 21 | task_image = imageio.imread(image_path) 22 | 23 | elif bench_config["image_mode"] == "GIVEN_IMAGE_BLENDER": 24 | pass 25 | 26 | elif bench_config["image_mode"] == "RENDER_IMAGE_ISAACGYM": 27 | from ..Method.interaction import init_gym 28 | gym, cfgs, task_config_now= init_gym(task_config, index=i, random_task=True, no_position = True) 29 | 30 | points_envs, colors_envs, rgb_envs, depth_envs ,seg_envs, ori_points_envs, ori_colors_envs, \ 31 | pixel2pointid, pointid2pixel = gym.refresh_observation(get_visual_obs=True) 32 | 33 | task_image = colors_envs[0] 34 | 35 | elif bench_config["image_mode"] == "RENDER_IMAGE_BLENDER": 36 | from renderer import open6dor_renderer 37 | task_image = None 38 | output_root_path = bench_config["output_path"] 39 | obj_paths = task_config["selected_urdfs"] 40 | obj_ids = [path.split("/")[-2] for path in obj_paths] 41 | 42 | init_poses = task_config["init_obj_pos"] 43 | obj_poses = {} 44 | 
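# Each init_obj_pos entry is a 13-D Isaac Gym rigid-body state: position (3), quaternion in (x, y, z, w) order (4), then linear and angular velocity (3 + 3); only the first seven values (the pose) are used for rendering.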
45 | for i in range(len(obj_ids)): 46 | pos = init_poses[i] 47 | id = obj_ids[i] 48 | position = pos[:3] 49 | quaternion = pos[3:7] 50 | transformation_matrix = open6dor_renderer.create_transformation_matrix(position, quaternion) 51 | obj_poses[id] = transformation_matrix 52 | task_id = "my_test" 53 | script = generate_shell_script(output_root_path, task_id, obj_paths, init_poses, 54 | bench_config["background_material_id"], bench_config["env_map_id"], 55 | bench_config["cam_quaternion"], bench_config["cam_translation"]) 56 | # run shell script 57 | os.system(f"bash {script}") 58 | 59 | return task_config, task_instruction, task_image 60 | 61 | def generate_shell_script(output_root_path, task_id, obj_paths, init_poses, 62 | background_material_id, env_map_id, cam_quaternion, cam_translation): 63 | script_name = "renderer/run_renderer.sh" 64 | command = "cd renderer\n" 65 | command += f"./blender-2.93.3-linux-x64/blender material_lib_v2.blend --background --python open6dor_renderer.py -- \\\n" 66 | command += f" --output_root_path {output_root_path} \\\n" 67 | command += f" --task_id {task_id} \\\n" 68 | command += f" --obj_paths {' '.join(obj_paths)} \\\n" 69 | init_obj_pos_flat = ' '.join(map(str, [item for sublist in init_poses for item in sublist])) 70 | command += f" --init_obj_pos {init_obj_pos_flat} \\\n" 71 | command += f" --background_material_id {background_material_id} \\\n" 72 | command += f" --env_map_id {env_map_id} \\\n" 73 | command += f" --cam_quaternion {' '.join(map(str, cam_quaternion))} \\\n" 74 | command += f" --cam_translation {' '.join(map(str, cam_translation))}\n" 75 | 76 | shell_file_content = f"#!/bin/bash\n\n{command}" 77 | 78 | with open(script_name, "w") as shell_file: 79 | shell_file.write(shell_file_content) 80 | 81 | print(f"Shell script {script_name} generated successfully.") 82 | print("=============================================") 83 | 84 | return script_name 85 | 86 | def eval_task(cfgs, pred_pose, use_rot = False): 87 | if use_rot: 88 | pred_rot = pred_pose["rotation"] 89 | rot_gt = list(cfgs['anno_target']['annotation'].values())[0]["quat"] 90 | rot_deviation = evaluator.evaluate_rot(rot_gt, pred_rot) 91 | print(f"Rotation deviation: {rot_deviation} degrees") 92 | 93 | pos_bases = cfgs['init_obj_pos'] 94 | pred_pos = pred_pose["position"] 95 | pos_eval = evaluator.evaluate_posi(pred_pos, pos_bases, "behind") 96 | 97 | return rot_deviation, pos_eval 98 | 99 | def method_template(cfgs, task_instruction, task_image): 100 | pred_pose = { 101 | "position": [0,0,0], 102 | "rotation": [0,0,0,0] 103 | } 104 | return pred_pose 105 | 106 | if __name__ == "__main__": 107 | 108 | parser = argparse.ArgumentParser(description="Benchmarking script for task evaluation") 109 | parser.add_argument("--mode", type=str, choices=["load_test", "eval"], help="Path to the task configuration file") 110 | parser.add_argument("--task_data", type=str, default="6dof", help="path set or single path to the task configuration file") 111 | parser.add_argument("--image_mode", type=str, default="GIVEN_IMAGE_ISAACGYM", help="Image mode") 112 | parser.add_argument("--output_path", type=str, default="../output/test", help="Path to the output directory") 113 | 114 | _args = parser.parse_args() 115 | 116 | render_configs = yaml.load(open("bench_config.yaml", 'r'), Loader=yaml.FullLoader) 117 | import pdb; pdb.set_trace() 118 | # merge the two configs 119 | bench_config = {**_args.__dict__, **render_configs} 120 | if bench_config["task_data"] == "6dof": 121 | task_paths = 
glob.glob('tasks/6DoF/*/*/*/task_config_new2.json') 122 | elif bench_config["task_data"] == "position": 123 | task_paths = glob.glob('tasks/position/*/*/*/task_config_new2.json') 124 | elif bench_config["task_data"] == "rotation": 125 | task_paths = glob.glob('tasks/rotation/*/*/*/task_config_new2.json') 126 | else: 127 | task_paths = [bench_config["task_data"]] 128 | 129 | if bench_config["mode"] == "load_test": 130 | for task_path in task_paths: 131 | task_config, task_instruction, task_image = load_task(task_path, bench_config) 132 | 133 | elif bench_config["mode"] == "eval": 134 | USE_ROT = False if bench_config["task_data"] == "position" else True 135 | for task_path in task_paths: 136 | task_config = json.load(open(task_path, 'r')) 137 | task_config, task_instruction, task_image = load_task(task_path, bench_config) 138 | pred_pose = method_template(task_config, task_instruction, task_image) 139 | eval_task(task_config, pred_pose, use_rot = USE_ROT) 140 | -------------------------------------------------------------------------------- /Method/isaacgym0/asset_info.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | 4 | NVIDIA CORPORATION and its licensors retain all intellectual property 5 | and proprietary rights in and to this software, related documentation 6 | and any modifications thereto. Any use, reproduction, disclosure or 7 | distribution of this software and related documentation without an express 8 | license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | 10 | 11 | Asset and Environment Information 12 | --------------------------------- 13 | Demonstrates introspection capabilities of the gym api at the asset and environment levels 14 | - Once an asset is loaded its properties can be queried 15 | - Assets in environments can be queried and their current states be retrieved 16 | """ 17 | 18 | import os 19 | from isaacgym import gymapi 20 | from isaacgym import gymutil 21 | 22 | 23 | def print_asset_info(asset, name): 24 | print("======== Asset info %s: ========" % (name)) 25 | num_bodies = gym.get_asset_rigid_body_count(asset) 26 | num_joints = gym.get_asset_joint_count(asset) 27 | num_dofs = gym.get_asset_dof_count(asset) 28 | print("Got %d bodies, %d joints, and %d DOFs" % 29 | (num_bodies, num_joints, num_dofs)) 30 | 31 | # Iterate through bodies 32 | print("Bodies:") 33 | for i in range(num_bodies): 34 | name = gym.get_asset_rigid_body_name(asset, i) 35 | print(" %2d: '%s'" % (i, name)) 36 | 37 | # Iterate through joints 38 | print("Joints:") 39 | for i in range(num_joints): 40 | name = gym.get_asset_joint_name(asset, i) 41 | type = gym.get_asset_joint_type(asset, i) 42 | type_name = gym.get_joint_type_string(type) 43 | print(" %2d: '%s' (%s)" % (i, name, type_name)) 44 | 45 | # iterate through degrees of freedom (DOFs) 46 | print("DOFs:") 47 | for i in range(num_dofs): 48 | name = gym.get_asset_dof_name(asset, i) 49 | type = gym.get_asset_dof_type(asset, i) 50 | type_name = gym.get_dof_type_string(type) 51 | print(" %2d: '%s' (%s)" % (i, name, type_name)) 52 | 53 | 54 | def print_actor_info(gym, env, actor_handle): 55 | 56 | name = gym.get_actor_name(env, actor_handle) 57 | 58 | body_names = gym.get_actor_rigid_body_names(env, actor_handle) 59 | body_dict = gym.get_actor_rigid_body_dict(env, actor_handle) 60 | 61 | joint_names = gym.get_actor_joint_names(env, actor_handle) 62 | joint_dict = gym.get_actor_joint_dict(env, actor_handle) 63 | 64 | 
dof_names = gym.get_actor_dof_names(env, actor_handle) 65 | dof_dict = gym.get_actor_dof_dict(env, actor_handle) 66 | 67 | print() 68 | print("===== Actor: %s =======================================" % name) 69 | 70 | print("\nBodies") 71 | print(body_names) 72 | print(body_dict) 73 | 74 | print("\nJoints") 75 | print(joint_names) 76 | print(joint_dict) 77 | 78 | print("\n Degrees Of Freedom (DOFs)") 79 | print(dof_names) 80 | print(dof_dict) 81 | print() 82 | 83 | # Get body state information 84 | body_states = gym.get_actor_rigid_body_states( 85 | env, actor_handle, gymapi.STATE_ALL) 86 | 87 | # Print some state slices 88 | print("Poses from Body State:") 89 | print(body_states['pose']) # print just the poses 90 | 91 | print("\nVelocities from Body State:") 92 | print(body_states['vel']) # print just the velocities 93 | print() 94 | 95 | # iterate through bodies and print name and position 96 | body_positions = body_states['pose']['p'] 97 | for i in range(len(body_names)): 98 | print("Body '%s' has position" % body_names[i], body_positions[i]) 99 | 100 | print("\nDOF states:") 101 | 102 | # get DOF states 103 | dof_states = gym.get_actor_dof_states(env, actor_handle, gymapi.STATE_ALL) 104 | 105 | # print some state slices 106 | # Print all states for each degree of freedom 107 | print(dof_states) 108 | print() 109 | 110 | # iterate through DOFs and print name and position 111 | dof_positions = dof_states['pos'] 112 | for i in range(len(dof_names)): 113 | print("DOF '%s' has position" % dof_names[i], dof_positions[i]) 114 | 115 | 116 | # initialize gym 117 | gym = gymapi.acquire_gym() 118 | 119 | # parse arguments 120 | args = gymutil.parse_arguments(description="Asset and Environment Information") 121 | 122 | # create simulation context 123 | sim_params = gymapi.SimParams() 124 | 125 | sim_params.use_gpu_pipeline = False 126 | if args.use_gpu_pipeline: 127 | print("WARNING: Forcing CPU pipeline.") 128 | 129 | sim = gym.create_sim(args.compute_device_id, args.graphics_device_id, args.physics_engine, sim_params) 130 | 131 | if sim is None: 132 | print("*** Failed to create sim") 133 | quit() 134 | 135 | # Print out the working directory 136 | # helpful in determining the relative location that assets will be loaded from 137 | print("Working directory: %s" % os.getcwd()) 138 | 139 | # Path where assets are searched, relative to the current working directory 140 | asset_root = "../../assets" 141 | 142 | # List of assets that will be loaded, both URDF and MJCF files are supported 143 | asset_files = ["urdf/cartpole.urdf", 144 | "urdf/franka_description/robots/franka_panda.urdf", 145 | "mjcf/nv_ant.xml"] 146 | asset_names = ["cartpole", "franka", "ant"] 147 | loaded_assets = [] 148 | 149 | # Load the assets and ensure that we are successful 150 | for asset in asset_files: 151 | print("Loading asset '%s' from '%s'" % (asset, asset_root)) 152 | 153 | current_asset = gym.load_asset(sim, asset_root, asset) 154 | 155 | if current_asset is None: 156 | print("*** Failed to load asset '%s'" % (asset, asset_root)) 157 | quit() 158 | loaded_assets.append(current_asset) 159 | 160 | for i in range(len(loaded_assets)): 161 | print() 162 | print_asset_info(loaded_assets[i], asset_names[i]) 163 | 164 | # Setup environment spacing 165 | spacing = 2.0 166 | lower = gymapi.Vec3(-spacing, 0.0, -spacing) 167 | upper = gymapi.Vec3(spacing, spacing, spacing) 168 | 169 | # Create one environment 170 | env = gym.create_env(sim, lower, upper, 1) 171 | 172 | # Add actors to environment 173 | pose = gymapi.Transform() 
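# Stack the assets two units apart along z, each rotated -90 degrees about the x-axis (gymapi.Quat takes (x, y, z, w)).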
174 | for i in range(len(loaded_assets)): 175 | pose.p = gymapi.Vec3(0.0, 0.0, i * 2) 176 | pose.r = gymapi.Quat(-0.707107, 0.0, 0.0, 0.707107) 177 | gym.create_actor(env, loaded_assets[i], pose, asset_names[i], -1, -1) 178 | 179 | print("=== Environment info: ================================================") 180 | 181 | actor_count = gym.get_actor_count(env) 182 | print("%d actors total" % actor_count) 183 | 184 | # Iterate through all actors for the environment 185 | for i in range(actor_count): 186 | actor_handle = gym.get_actor_handle(env, i) 187 | print_actor_info(gym, env, actor_handle) 188 | 189 | # Cleanup the simulator 190 | gym.destroy_sim(sim) 191 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/scripts/export_onnx_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from segment_anything import build_sam, build_sam_vit_b, build_sam_vit_l 10 | from segment_anything.utils.onnx import SamOnnxModel 11 | 12 | import argparse 13 | import warnings 14 | 15 | try: 16 | import onnxruntime # type: ignore 17 | 18 | onnxruntime_exists = True 19 | except ImportError: 20 | onnxruntime_exists = False 21 | 22 | parser = argparse.ArgumentParser( 23 | description="Export the SAM prompt encoder and mask decoder to an ONNX model." 24 | ) 25 | 26 | parser.add_argument( 27 | "--checkpoint", type=str, required=True, help="The path to the SAM model checkpoint." 28 | ) 29 | 30 | parser.add_argument( 31 | "--output", type=str, required=True, help="The filename to save the ONNX model to." 32 | ) 33 | 34 | parser.add_argument( 35 | "--model-type", 36 | type=str, 37 | default="default", 38 | help="In ['default', 'vit_b', 'vit_l']. Which type of SAM model to export.", 39 | ) 40 | 41 | parser.add_argument( 42 | "--return-single-mask", 43 | action="store_true", 44 | help=( 45 | "If true, the exported ONNX model will only return the best mask, " 46 | "instead of returning multiple masks. For high resolution images " 47 | "this can improve runtime when upscaling masks is expensive." 48 | ), 49 | ) 50 | 51 | parser.add_argument( 52 | "--opset", 53 | type=int, 54 | default=17, 55 | help="The ONNX opset version to use. Must be >=11", 56 | ) 57 | 58 | parser.add_argument( 59 | "--quantize-out", 60 | type=str, 61 | default=None, 62 | help=( 63 | "If set, will quantize the model and save it with this name. " 64 | "Quantization is performed with quantize_dynamic from onnxruntime.quantization.quantize." 65 | ), 66 | ) 67 | 68 | parser.add_argument( 69 | "--gelu-approximate", 70 | action="store_true", 71 | help=( 72 | "Replace GELU operations with approximations using tanh. Useful " 73 | "for some runtimes that have slow or unimplemented erf ops, used in GELU." 74 | ), 75 | ) 76 | 77 | parser.add_argument( 78 | "--use-stability-score", 79 | action="store_true", 80 | help=( 81 | "Replaces the model's predicted mask quality score with the stability " 82 | "score calculated on the low resolution masks using an offset of 1.0.
" 83 | ), 84 | ) 85 | 86 | parser.add_argument( 87 | "--return-extra-metrics", 88 | action="store_true", 89 | help=( 90 | "The model will return five results: (masks, scores, stability_scores, " 91 | "areas, low_res_logits) instead of the usual three. This can be " 92 | "significantly slower for high resolution outputs." 93 | ), 94 | ) 95 | 96 | 97 | def run_export( 98 | model_type: str, 99 | checkpoint: str, 100 | output: str, 101 | opset: int, 102 | return_single_mask: bool, 103 | gelu_approximate: bool = False, 104 | use_stability_score: bool = False, 105 | return_extra_metrics=False, 106 | ): 107 | print("Loading model...") 108 | if model_type == "vit_b": 109 | sam = build_sam_vit_b(checkpoint) 110 | elif model_type == "vit_l": 111 | sam = build_sam_vit_l(checkpoint) 112 | else: 113 | sam = build_sam(checkpoint) 114 | 115 | onnx_model = SamOnnxModel( 116 | model=sam, 117 | return_single_mask=return_single_mask, 118 | use_stability_score=use_stability_score, 119 | return_extra_metrics=return_extra_metrics, 120 | ) 121 | 122 | if gelu_approximate: 123 | for n, m in onnx_model.named_modules(): 124 | if isinstance(m, torch.nn.GELU): 125 | m.approximate = "tanh" 126 | 127 | dynamic_axes = { 128 | "point_coords": {1: "num_points"}, 129 | "point_labels": {1: "num_points"}, 130 | } 131 | 132 | embed_dim = sam.prompt_encoder.embed_dim 133 | embed_size = sam.prompt_encoder.image_embedding_size 134 | mask_input_size = [4 * x for x in embed_size] 135 | dummy_inputs = { 136 | "image_embeddings": torch.randn(1, embed_dim, *embed_size, dtype=torch.float), 137 | "point_coords": torch.randint(low=0, high=1024, size=(1, 5, 2), dtype=torch.float), 138 | "point_labels": torch.randint(low=0, high=4, size=(1, 5), dtype=torch.float), 139 | "mask_input": torch.randn(1, 1, *mask_input_size, dtype=torch.float), 140 | "has_mask_input": torch.tensor([1], dtype=torch.float), 141 | "orig_im_size": torch.tensor([1500, 2250], dtype=torch.float), 142 | } 143 | 144 | _ = onnx_model(**dummy_inputs) 145 | 146 | output_names = ["masks", "iou_predictions", "low_res_masks"] 147 | 148 | with warnings.catch_warnings(): 149 | warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) 150 | warnings.filterwarnings("ignore", category=UserWarning) 151 | with open(output, "wb") as f: 152 | print(f"Exporing onnx model to {output}...") 153 | torch.onnx.export( 154 | onnx_model, 155 | tuple(dummy_inputs.values()), 156 | f, 157 | export_params=True, 158 | verbose=False, 159 | opset_version=opset, 160 | do_constant_folding=True, 161 | input_names=list(dummy_inputs.keys()), 162 | output_names=output_names, 163 | dynamic_axes=dynamic_axes, 164 | ) 165 | 166 | if onnxruntime_exists: 167 | ort_inputs = {k: to_numpy(v) for k, v in dummy_inputs.items()} 168 | ort_session = onnxruntime.InferenceSession(output) 169 | _ = ort_session.run(None, ort_inputs) 170 | print("Model has successfully been run with ONNXRuntime.") 171 | 172 | 173 | def to_numpy(tensor): 174 | return tensor.cpu().numpy() 175 | 176 | 177 | if __name__ == "__main__": 178 | args = parser.parse_args() 179 | run_export( 180 | model_type=args.model_type, 181 | checkpoint=args.checkpoint, 182 | output=args.output, 183 | opset=args.opset, 184 | return_single_mask=args.return_single_mask, 185 | gelu_approximate=args.gelu_approximate, 186 | use_stability_score=args.use_stability_score, 187 | return_extra_metrics=args.return_extra_metrics, 188 | ) 189 | 190 | if args.quantize_out is not None: 191 | assert onnxruntime_exists, "onnxruntime is required to quantize the model." 
192 | from onnxruntime.quantization import QuantType # type: ignore 193 | from onnxruntime.quantization.quantize import quantize_dynamic # type: ignore 194 | 195 | print(f"Quantizing model and writing to {args.quantize_out}...") 196 | quantize_dynamic( 197 | model_input=args.output, 198 | model_output=args.quantize_out, 199 | optimize_model=True, 200 | per_channel=False, 201 | reduce_range=False, 202 | weight_type=QuantType.QUInt8, 203 | ) 204 | print("Done!") 205 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/modeling/mask_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | 11 | from typing import List, Tuple, Type 12 | 13 | from .common import LayerNorm2d 14 | 15 | 16 | class MaskDecoder(nn.Module): 17 | def __init__( 18 | self, 19 | *, 20 | transformer_dim: int, 21 | transformer: nn.Module, 22 | num_multimask_outputs: int = 3, 23 | activation: Type[nn.Module] = nn.GELU, 24 | iou_head_depth: int = 3, 25 | iou_head_hidden_dim: int = 256, 26 | ) -> None: 27 | """ 28 | Predicts masks given an image and prompt embeddings, using a 29 | transformer architecture. 30 | 31 | Arguments: 32 | transformer_dim (int): the channel dimension of the transformer 33 | transformer (nn.Module): the transformer used to predict masks 34 | num_multimask_outputs (int): the number of masks to predict 35 | when disambiguating masks 36 | activation (nn.Module): the type of activation to use when 37 | upscaling masks 38 | iou_head_depth (int): the depth of the MLP used to predict 39 | mask quality 40 | iou_head_hidden_dim (int): the hidden dimension of the MLP 41 | used to predict mask quality 42 | """ 43 | super().__init__() 44 | self.transformer_dim = transformer_dim 45 | self.transformer = transformer 46 | 47 | self.num_multimask_outputs = num_multimask_outputs 48 | 49 | self.iou_token = nn.Embedding(1, transformer_dim) 50 | self.num_mask_tokens = num_multimask_outputs + 1 51 | self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) 52 | 53 | self.output_upscaling = nn.Sequential( 54 | nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2), 55 | LayerNorm2d(transformer_dim // 4), 56 | activation(), 57 | nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2), 58 | activation(), 59 | ) 60 | self.output_hypernetworks_mlps = nn.ModuleList( 61 | [ 62 | MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) 63 | for i in range(self.num_mask_tokens) 64 | ] 65 | ) 66 | 67 | self.iou_prediction_head = MLP( 68 | transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth 69 | ) 70 | 71 | def forward( 72 | self, 73 | image_embeddings: torch.Tensor, 74 | image_pe: torch.Tensor, 75 | sparse_prompt_embeddings: torch.Tensor, 76 | dense_prompt_embeddings: torch.Tensor, 77 | multimask_output: bool, 78 | hq_token_only: bool, 79 | interm_embeddings: torch.Tensor, 80 | ) -> Tuple[torch.Tensor, torch.Tensor]: 81 | """ 82 | Predict masks given image and prompt embeddings. 
83 | 84 | Arguments: 85 | image_embeddings (torch.Tensor): the embeddings from the image encoder 86 | image_pe (torch.Tensor): positional encoding with the shape of image_embeddings 87 | sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes 88 | dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs 89 | multimask_output (bool): Whether to return multiple masks or a single 90 | mask. 91 | 92 | Returns: 93 | torch.Tensor: batched predicted masks 94 | torch.Tensor: batched predictions of mask quality 95 | """ 96 | masks, iou_pred = self.predict_masks( 97 | image_embeddings=image_embeddings, 98 | image_pe=image_pe, 99 | sparse_prompt_embeddings=sparse_prompt_embeddings, 100 | dense_prompt_embeddings=dense_prompt_embeddings, 101 | ) 102 | 103 | # Select the correct mask or masks for output 104 | if multimask_output: 105 | mask_slice = slice(1, None) 106 | else: 107 | mask_slice = slice(0, 1) 108 | masks = masks[:, mask_slice, :, :] 109 | iou_pred = iou_pred[:, mask_slice] 110 | 111 | # Prepare output 112 | return masks, iou_pred 113 | 114 | def predict_masks( 115 | self, 116 | image_embeddings: torch.Tensor, 117 | image_pe: torch.Tensor, 118 | sparse_prompt_embeddings: torch.Tensor, 119 | dense_prompt_embeddings: torch.Tensor, 120 | ) -> Tuple[torch.Tensor, torch.Tensor]: 121 | """Predicts masks. See 'forward' for more details.""" 122 | # Concatenate output tokens 123 | output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0) 124 | output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1) 125 | tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) 126 | 127 | # Expand per-image data in batch direction to be per-mask 128 | src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) 129 | src = src + dense_prompt_embeddings 130 | pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) 131 | b, c, h, w = src.shape 132 | 133 | # Run the transformer 134 | hs, src = self.transformer(src, pos_src, tokens) 135 | iou_token_out = hs[:, 0, :] 136 | mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :] 137 | 138 | # Upscale mask embeddings and predict masks using the mask tokens 139 | src = src.transpose(1, 2).view(b, c, h, w) 140 | upscaled_embedding = self.output_upscaling(src) 141 | hyper_in_list: List[torch.Tensor] = [] 142 | for i in range(self.num_mask_tokens): 143 | hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])) 144 | hyper_in = torch.stack(hyper_in_list, dim=1) 145 | b, c, h, w = upscaled_embedding.shape 146 | masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) 147 | 148 | # Generate mask quality predictions 149 | iou_pred = self.iou_prediction_head(iou_token_out) 150 | 151 | return masks, iou_pred 152 | 153 | 154 | # Lightly adapted from 155 | # https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa 156 | class MLP(nn.Module): 157 | def __init__( 158 | self, 159 | input_dim: int, 160 | hidden_dim: int, 161 | output_dim: int, 162 | num_layers: int, 163 | sigmoid_output: bool = False, 164 | ) -> None: 165 | super().__init__() 166 | self.num_layers = num_layers 167 | h = [hidden_dim] * (num_layers - 1) 168 | self.layers = nn.ModuleList( 169 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 170 | ) 171 | self.sigmoid_output = sigmoid_output 172 | 173 | def forward(self, x): 174 | for i, layer in 
enumerate(self.layers): 175 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 176 | if self.sigmoid_output: 177 | x = F.sigmoid(x) 178 | return x -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/segment_anything/modeling/sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Any, Dict, List, Tuple 12 | 13 | from .image_encoder import ImageEncoderViT 14 | from .mask_decoder import MaskDecoder 15 | from .prompt_encoder import PromptEncoder 16 | 17 | 18 | class Sam(nn.Module): 19 | mask_threshold: float = 0.0 20 | image_format: str = "RGB" 21 | 22 | def __init__( 23 | self, 24 | image_encoder: ImageEncoderViT, 25 | prompt_encoder: PromptEncoder, 26 | mask_decoder: MaskDecoder, 27 | pixel_mean: List[float] = [123.675, 116.28, 103.53], 28 | pixel_std: List[float] = [58.395, 57.12, 57.375], 29 | ) -> None: 30 | """ 31 | SAM predicts object masks from an image and input prompts. 32 | 33 | Arguments: 34 | image_encoder (ImageEncoderViT): The backbone used to encode the 35 | image into image embeddings that allow for efficient mask prediction. 36 | prompt_encoder (PromptEncoder): Encodes various types of input prompts. 37 | mask_decoder (MaskDecoder): Predicts masks from the image embeddings 38 | and encoded prompts. 39 | pixel_mean (list(float)): Mean values for normalizing pixels in the input image. 40 | pixel_std (list(float)): Std values for normalizing pixels in the input image. 41 | """ 42 | super().__init__() 43 | self.image_encoder = image_encoder 44 | self.prompt_encoder = prompt_encoder 45 | self.mask_decoder = mask_decoder 46 | self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) 47 | self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) 48 | 49 | @property 50 | def device(self) -> Any: 51 | return self.pixel_mean.device 52 | 53 | @torch.no_grad() 54 | def forward( 55 | self, 56 | batched_input: List[Dict[str, Any]], 57 | multimask_output: bool, 58 | ) -> List[Dict[str, torch.Tensor]]: 59 | """ 60 | Predicts masks end-to-end from provided images and prompts. 61 | If prompts are not known in advance, using SamPredictor is 62 | recommended over calling the model directly. 63 | 64 | Arguments: 65 | batched_input (list(dict)): A list over input images, each a 66 | dictionary with the following keys. A prompt key can be 67 | excluded if it is not present. 68 | 'image': The image as a torch tensor in 3xHxW format, 69 | already transformed for input to the model. 70 | 'original_size': (tuple(int, int)) The original size of 71 | the image before transformation, as (H, W). 72 | 'point_coords': (torch.Tensor) Batched point prompts for 73 | this image, with shape BxNx2. Already transformed to the 74 | input frame of the model. 75 | 'point_labels': (torch.Tensor) Batched labels for point prompts, 76 | with shape BxN. 77 | 'boxes': (torch.Tensor) Batched box inputs, with shape Bx4. 78 | Already transformed to the input frame of the model. 79 | 'mask_inputs': (torch.Tensor) Batched mask inputs to the model, 80 | in the form Bx1xHxW. 
81 | multimask_output (bool): Whether the model should predict multiple 82 | disambiguating masks, or return a single mask. 83 | 84 | Returns: 85 | (list(dict)): A list over input images, where each element is 86 | as dictionary with the following keys. 87 | 'masks': (torch.Tensor) Batched binary mask predictions, 88 | with shape BxCxHxW, where B is the number of input promts, 89 | C is determiend by multimask_output, and (H, W) is the 90 | original size of the image. 91 | 'iou_predictions': (torch.Tensor) The model's predictions 92 | of mask quality, in shape BxC. 93 | 'low_res_logits': (torch.Tensor) Low resolution logits with 94 | shape BxCxHxW, where H=W=256. Can be passed as mask input 95 | to subsequent iterations of prediction. 96 | """ 97 | input_images = torch.stack([self.preprocess(x["image"]) for x in batched_input], dim=0) 98 | image_embeddings = self.image_encoder(input_images) 99 | 100 | outputs = [] 101 | for image_record, curr_embedding in zip(batched_input, image_embeddings): 102 | if "point_coords" in image_record: 103 | points = (image_record["point_coords"], image_record["point_labels"]) 104 | else: 105 | points = None 106 | sparse_embeddings, dense_embeddings = self.prompt_encoder( 107 | points=points, 108 | boxes=image_record.get("boxes", None), 109 | masks=image_record.get("mask_inputs", None), 110 | ) 111 | low_res_masks, iou_predictions = self.mask_decoder( 112 | image_embeddings=curr_embedding.unsqueeze(0), 113 | image_pe=self.prompt_encoder.get_dense_pe(), 114 | sparse_prompt_embeddings=sparse_embeddings, 115 | dense_prompt_embeddings=dense_embeddings, 116 | multimask_output=multimask_output, 117 | ) 118 | masks = self.postprocess_masks( 119 | low_res_masks, 120 | input_size=image_record["image"].shape[-2:], 121 | original_size=image_record["original_size"], 122 | ) 123 | masks = masks > self.mask_threshold 124 | outputs.append( 125 | { 126 | "masks": masks, 127 | "iou_predictions": iou_predictions, 128 | "low_res_logits": low_res_masks, 129 | } 130 | ) 131 | return outputs 132 | 133 | def postprocess_masks( 134 | self, 135 | masks: torch.Tensor, 136 | input_size: Tuple[int, ...], 137 | original_size: Tuple[int, ...], 138 | ) -> torch.Tensor: 139 | """ 140 | Remove padding and upscale masks to the original image size. 141 | 142 | Arguments: 143 | masks (torch.Tensor): Batched masks from the mask_decoder, 144 | in BxCxHxW format. 145 | input_size (tuple(int, int)): The size of the image input to the 146 | model, in (H, W) format. Used to remove padding. 147 | original_size (tuple(int, int)): The original size of the image 148 | before resizing for input to the model, in (H, W) format. 149 | 150 | Returns: 151 | (torch.Tensor): Batched masks in BxCxHxW format, where (H, W) 152 | is given by original_size. 
153 | """ 154 | masks = F.interpolate( 155 | masks, 156 | (self.image_encoder.img_size, self.image_encoder.img_size), 157 | mode="bilinear", 158 | align_corners=False, 159 | ) 160 | masks = masks[..., : input_size[0], : input_size[1]] 161 | masks = F.interpolate(masks, original_size, mode="bilinear", align_corners=False) 162 | return masks 163 | 164 | def preprocess(self, x: torch.Tensor) -> torch.Tensor: 165 | """Normalize pixel values and pad to a square input.""" 166 | # Normalize colors 167 | x = (x - self.pixel_mean) / self.pixel_std 168 | 169 | # Pad 170 | h, w = x.shape[-2:] 171 | padh = self.image_encoder.img_size - h 172 | padw = self.image_encoder.img_size - w 173 | x = F.pad(x, (0, padw, 0, padh)) 174 | return x 175 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/segment_anything/scripts/amg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import cv2 # type: ignore 8 | 9 | from segment_anything import SamAutomaticMaskGenerator, sam_model_registry 10 | 11 | import argparse 12 | import json 13 | import os 14 | from typing import Any, Dict, List 15 | 16 | parser = argparse.ArgumentParser( 17 | description=( 18 | "Runs automatic mask generation on an input image or directory of images, " 19 | "and outputs masks as either PNGs or COCO-style RLEs. Requires open-cv, " 20 | "as well as pycocotools if saving in RLE format." 21 | ) 22 | ) 23 | 24 | parser.add_argument( 25 | "--input", 26 | type=str, 27 | required=True, 28 | help="Path to either a single input image or folder of images.", 29 | ) 30 | 31 | parser.add_argument( 32 | "--output", 33 | type=str, 34 | required=True, 35 | help=( 36 | "Path to the directory where masks will be output. Output will be either a folder " 37 | "of PNGs per image or a single json with COCO-style masks." 38 | ), 39 | ) 40 | 41 | parser.add_argument( 42 | "--model-type", 43 | type=str, 44 | default="default", 45 | help="The type of model to load, in ['default', 'vit_l', 'vit_b']", 46 | ) 47 | 48 | parser.add_argument( 49 | "--checkpoint", 50 | type=str, 51 | required=True, 52 | help="The path to the SAM checkpoint to use for mask generation.", 53 | ) 54 | 55 | parser.add_argument("--device", type=str, default="cuda", help="The device to run generation on.") 56 | 57 | parser.add_argument( 58 | "--convert-to-rle", 59 | action="store_true", 60 | help=( 61 | "Save masks as COCO RLEs in a single json instead of as a folder of PNGs. " 62 | "Requires pycocotools." 
63 | ), 64 | ) 65 | 66 | amg_settings = parser.add_argument_group("AMG Settings") 67 | 68 | amg_settings.add_argument( 69 | "--points-per-side", 70 | type=int, 71 | default=None, 72 | help="Generate masks by sampling a grid over the image with this many points to a side.", 73 | ) 74 | 75 | amg_settings.add_argument( 76 | "--points-per-batch", 77 | type=int, 78 | default=None, 79 | help="How many input points to process simultaneously in one batch.", 80 | ) 81 | 82 | amg_settings.add_argument( 83 | "--pred-iou-thresh", 84 | type=float, 85 | default=None, 86 | help="Exclude masks with a predicted score from the model that is lower than this threshold.", 87 | ) 88 | 89 | amg_settings.add_argument( 90 | "--stability-score-thresh", 91 | type=float, 92 | default=None, 93 | help="Exclude masks with a stability score lower than this threshold.", 94 | ) 95 | 96 | amg_settings.add_argument( 97 | "--stability-score-offset", 98 | type=float, 99 | default=None, 100 | help="Larger values perturb the mask more when measuring stability score.", 101 | ) 102 | 103 | amg_settings.add_argument( 104 | "--box-nms-thresh", 105 | type=float, 106 | default=None, 107 | help="The overlap threshold for excluding a duplicate mask.", 108 | ) 109 | 110 | amg_settings.add_argument( 111 | "--crop-n-layers", 112 | type=int, 113 | default=None, 114 | help=( 115 | "If >0, mask generation is run on smaller crops of the image to generate more masks. " 116 | "The value sets how many different scales to crop at." 117 | ), 118 | ) 119 | 120 | amg_settings.add_argument( 121 | "--crop-nms-thresh", 122 | type=float, 123 | default=None, 124 | help="The overlap threshold for excluding duplicate masks across different crops.", 125 | ) 126 | 127 | amg_settings.add_argument( 128 | "--crop-overlap-ratio", 129 | type=int, 130 | default=None, 131 | help="Larger numbers mean image crops will overlap more.", 132 | ) 133 | 134 | amg_settings.add_argument( 135 | "--crop-n-points-downscale-factor", 136 | type=int, 137 | default=None, 138 | help="The number of points-per-side in each layer of crop is reduced by this factor.", 139 | ) 140 | 141 | amg_settings.add_argument( 142 | "--min-mask-region-area", 143 | type=int, 144 | default=None, 145 | help=( 146 | "Disconnected mask regions or holes with area smaller than this value " 147 | "in pixels are removed by postprocessing." 
148 | ), 149 | ) 150 | 151 | 152 | def write_masks_to_folder(masks: List[Dict[str, Any]], path: str) -> None: 153 | header = "id,area,bbox_x0,bbox_y0,bbox_w,bbox_h,point_input_x,point_input_y,predicted_iou,stability_score,crop_box_x0,crop_box_y0,crop_box_w,crop_box_h" # noqa 154 | metadata = [header] 155 | for i, mask_data in enumerate(masks): 156 | mask = mask_data["segmentation"] 157 | filename = f"{i}.png" 158 | cv2.imwrite(os.path.join(path, filename), mask * 255) 159 | mask_metadata = [ 160 | str(i), 161 | str(mask_data["area"]), 162 | *[str(x) for x in mask_data["bbox"]], 163 | *[str(x) for x in mask_data["point_coords"][0]], 164 | str(mask_data["predicted_iou"]), 165 | str(mask_data["stability_score"]), 166 | *[str(x) for x in mask_data["crop_box"]], 167 | ] 168 | row = ",".join(mask_metadata) 169 | metadata.append(row) 170 | metadata_path = os.path.join(path, "metadata.csv") 171 | with open(metadata_path, "w") as f: 172 | f.write("\n".join(metadata)) 173 | 174 | return 175 | 176 | 177 | def get_amg_kwargs(args): 178 | amg_kwargs = { 179 | "points_per_side": args.points_per_side, 180 | "points_per_batch": args.points_per_batch, 181 | "pred_iou_thresh": args.pred_iou_thresh, 182 | "stability_score_thresh": args.stability_score_thresh, 183 | "stability_score_offset": args.stability_score_offset, 184 | "box_nms_thresh": args.box_nms_thresh, 185 | "crop_n_layers": args.crop_n_layers, 186 | "crop_nms_thresh": args.crop_nms_thresh, 187 | "crop_overlap_ratio": args.crop_overlap_ratio, 188 | "crop_n_points_downscale_factor": args.crop_n_points_downscale_factor, 189 | "min_mask_region_area": args.min_mask_region_area, 190 | } 191 | amg_kwargs = {k: v for k, v in amg_kwargs.items() if v is not None} 192 | return amg_kwargs 193 | 194 | 195 | def main(args: argparse.Namespace) -> None: 196 | print("Loading model...") 197 | sam = sam_model_registry[args.model_type](checkpoint=args.checkpoint) 198 | _ = sam.to(device=args.device) 199 | output_mode = "coco_rle" if args.convert_to_rle else "binary_mask" 200 | amg_kwargs = get_amg_kwargs(args) 201 | generator = SamAutomaticMaskGenerator(sam, output_mode=output_mode, **amg_kwargs) 202 | 203 | if not os.path.isdir(args.input): 204 | targets = [args.input] 205 | else: 206 | targets = [ 207 | f for f in os.listdir(args.input) if not os.path.isdir(os.path.join(args.input, f)) 208 | ] 209 | targets = [os.path.join(args.input, f) for f in targets] 210 | 211 | os.makedirs(args.output, exist_ok=True) 212 | 213 | for t in targets: 214 | print(f"Processing '{t}'...") 215 | image = cv2.imread(t) 216 | if image is None: 217 | print(f"Could not load '{t}' as an image, skipping...") 218 | continue 219 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 220 | 221 | masks = generator.generate(image) 222 | 223 | base = os.path.basename(t) 224 | base = os.path.splitext(base)[0] 225 | save_base = os.path.join(args.output, base) 226 | if output_mode == "binary_mask": 227 | os.makedirs(save_base, exist_ok=False) 228 | write_masks_to_folder(masks, save_base) 229 | else: 230 | save_file = save_base + ".json" 231 | with open(save_file, "w") as f: 232 | json.dump(masks, f) 233 | print("Done!") 234 | 235 | 236 | if __name__ == "__main__": 237 | args = parser.parse_args() 238 | main(args) 239 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Open6DOR: Benchmarking Open-instruction 6-DoF Object Rearrangement and A VLM-based Approach 3 | 4 | IROS 2024 5 | 6 | 7 |
8 | 9 | 10 | 11 | Paper arXiv 12 | 13 | Project Page 14 |
15 |

16 | 17 | 18 | ![Teaser](./images/teaser_final1.jpg) 19 | This is the official repository of [Open6DOR: Benchmarking Open-instruction 6-DoF Object Rearrangement and A VLM-based Approach](https://pku-epic.github.io/Open6DOR/). In this work, we pioneer the construction of a benchmark and approach for table-top Open-instruction 6-DoF Object Rearrangement (Open6DOR). Specifically, we collect a synthetic dataset of 200+ objects and carefully design 2400+ Open6DOR tasks. These tasks are divided into the Position-track, Rotation-track, and 6-DoF-track for evaluating different embodied agents on predicting the positions and rotations of target objects. Besides, we also propose a VLM-based approach for Open6DOR, named Open6DOR-GPT, which empowers GPT-4V with 3D awareness and simulation assistance while exploiting its strengths in generalizability and instruction-following. We compare existing embodied agents with Open6DOR-GPT on the proposed Open6DOR benchmark and find that Open6DOR-GPT achieves state-of-the-art performance. We further demonstrate the impressive performance of Open6DOR-GPT in diverse real-world experiments. 20 | We plan to release the final version of the benchmark, along with our refined method, in early September, and we recommend waiting until then to download the dataset. 21 | 22 | ## News 23 | - The method has been updated to use the GPT-4o API; see the Method folder. 24 | 25 | ## Benchmark 26 | The Open6DOR Benchmark is specifically designed for table-top Open6DOR tasks within a simulation environment. Our dataset encompasses 200+ high-quality objects, forming diverse scenes and totaling 2400+ diverse tasks. All tasks are carefully configured and accompanied by detailed annotations. To ensure comprehensive evaluation, we provide three specialized benchmark tracks: the Rotation-track Benchmark ($B_r$), the Position-track Benchmark ($B_p$), and the 6-DoF-track Benchmark ($B_\text{6DoF}$). 27 | In this repository, we provide: 28 | - A dataset of diverse objects 29 | - 2400+ Open6DOR tasks with detailed annotations 30 | - A set of evaluation metrics for each track of tasks 31 | 32 | 33 | ### Installation 34 | **Environment Setup** 35 | 36 | We recommend using a Linux system for better compatibility with our modules (including Blender and Isaac Gym). 37 | ``` 38 | # Clone the repository 39 | git clone git@github.com:Selina2023/Open6DOR.git 40 | cd Open6DOR 41 | # Create an environment 42 | conda create -n Open6DOR python=3.9 43 | # Install dependencies 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | 48 | **Dataset Downloads** 49 | 50 | Refer to the **File Structure** section below for specific file locations. 51 | - Download the [object datasets](https://drive.google.com/drive/folders/1Gm30OtQWRb5NitIdnLSJlfLdAG_rWHQX?usp=sharing) and uncompress. 52 | - Download the [task datasets](https://drive.google.com/drive/folders/11o2I20Q8uJrSXO_JvnbH7dEoR43V9fKa?usp=sharing) and uncompress. (The refined version will be released along with our paper.) 53 | 54 | **Rendering Dependencies** 55 | 56 | - Download [Blender 2.93.3 (Linux x64)](https://download.blender.org/release/Blender2.93/blender-2.93.3-linux-x64.tar.xz) and uncompress. 57 | - Download the [environment map asset](https://drive.google.com/file/d/1qbXc-fT04GcLqZX6D1WhbEtQo_Uav-FL/view?usp=sharing) and uncompress. 58 | - Download the [.blend file](https://drive.google.com/file/d/1Rg9fHn9D9RcNt1XFTvHP-RRa73lgzspF/view?usp=sharing) and uncompress. 59 | - Install the Python packages (NumPy, etc.)
into the Blender built-in Python environment. 60 | ``` 61 | cd Benchmark/renderer/blender-2.93.3-linux-x64/2.93/python/bin 62 | ./python3.9 -m ensurepip 63 | ./python3.9 -m pip install --upgrade pip --user 64 | ./python3.9 -m pip install numpy --user 65 | ``` 66 | 67 | 68 | 69 | **File Structure** 70 | 71 | After downloading the datasets, organize the file structure as follows: 72 | 73 | ``` 74 | Benchmark 75 | ├── benchmark_catalogue 76 | │   ├── annotation 77 | │   │   └── ... 78 | │   ├── category_dictionary.json 79 | │   └── ... 80 | ├── dataset 81 | │   ├── objects 82 | │   │   ├── objaverse_rescale 83 | │   │   └── ycb 84 | │   └── tasks 85 | │       ├── 6DoF_track 86 | │       ├── position_track 87 | │       └── rotation_track 88 | ├── evaluation 89 | │   └── evaluator.py 90 | ├── renderer 91 | │   ├── blender-2.93.3-linux-x64 92 | │   ├── envmap_lib 93 | │   │   ├── abandoned_factory_canteen_01_1k.hdr 94 | │   │   └── ... 95 | │   ├── texture 96 | │   │   └── texture0.jpg 97 | │   ├── material_lib_v2.blend 98 | │   ├── modify_material.py 99 | │   └── open6dor_renderer.py 100 | ├── task_examples 101 | │   ├── 6DoF 102 | │   ├── position 103 | │   └── rotation 104 | └── bench.py 105 | 106 | ``` 107 | 108 | ### Usage 109 | Along with the dataset, we provide several functions for visualization and evaluation of the tasks: 110 | - To load a task example, run the following command (you may change `--image_mode` to `RENDER_IMAGE_BLENDER` or other modes): 111 | ``` 112 | cd Benchmark 113 | python bench.py load_task --task_path ./task_examples/6DoF/behind/Place_the_apple_behind_the_box_on_the_table.__upright/20240704-145831_no_interaction/task_config.json --image_mode GIVEN_IMAGE_ISAACGYM --output_path ./output/test 114 | ``` 115 | For personalized rendering, you may try arbitrary camera positions and background settings: 116 | ``` 117 | python bench.py load_task --task_path ./task_examples/rotation/None/mug_handle_left/20240717-075819_no_interaction/task_config.json --image_mode RENDER_IMAGE_BLENDER --cam_quaternion 0 0 0.0 1.0 --cam_translation 0.0 0.0 4 --background_material_id 44 --env_map_id 25 118 | ``` 119 | - To evaluate a task, run the following command (you need to fill the predicted pose into a JSON file; an illustrative `pred_pose.json` sketch is given after the Method section below): 120 | 121 | ``` 122 | python bench.py eval_task --task_id my_test --pred_pose path/to/pred_pose.json 123 | ``` 124 | 125 | - Besides evaluating the numerical results of the pose prediction directly, we provide another set of metrics where users are allowed to control the robot arm and interact with the simulation environment. This evaluation is based solely on the final pose of the target object after execution. To do this, run the following command (currently not available): 126 | 127 | ``` 128 | python interaction.py 129 | ``` 130 | 136 | 137 | ## Method 138 | ![Method](./images/overall_pipeline_final1.jpg) 139 | By incorporating 3D awareness and simulation assistance, we effectively tackle the Open6DOR task through a decomposed approach. 140 | Specifically, Open6DOR-GPT takes an RGB-D image and an instruction as input and outputs the corresponding robot motion trajectory. First, the preprocessing module extracts the object names and masks. Then, the position and rotation modules predict the position and rotation of the target object in a decoupled way. Finally, the planning module generates a trajectory for execution. 141 | 142 | See the README in the `Method` folder.
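
For intuition, here is a minimal sketch of how the decomposed pipeline above could be wired together. The function names, signatures, and return values are illustrative placeholders only and do not correspond to the actual APIs in the `Method` folder.

```
# Illustrative sketch of the decomposed Open6DOR-GPT pipeline described above.
# All names and values are hypothetical placeholders, not the real implementation.

def preprocess(rgb, depth, instruction):
    # Extract object names from the instruction and per-object masks from the image
    # (the repository ships a GroundedSAM-based module under Method/vision for this step).
    objects = {"target": "apple", "anchor": "box"}
    masks = {"apple": None, "box": None}
    return objects, masks

def predict_position(objects, masks, depth, instruction):
    # Position module: ground the spatial relation (e.g. "behind", "left of")
    # into a target (x, y, z) location on the table.
    return (0.10, -0.20, 0.05)

def predict_rotation(objects, masks, instruction):
    # Rotation module: resolve the orientation constraint (e.g. "upright") into a
    # target quaternion (x, y, z, w); it runs independently of the position module.
    return (0.0, 0.0, 0.0, 1.0)

def plan_trajectory(position, rotation):
    # Planning module: turn the predicted 6-DoF goal into a motion trajectory.
    return [("pick", "target"), ("place", (position, rotation))]

def open6dor_gpt(rgb, depth, instruction):
    objects, masks = preprocess(rgb, depth, instruction)
    position = predict_position(objects, masks, depth, instruction)
    rotation = predict_rotation(objects, masks, instruction)
    return plan_trajectory(position, rotation)
```

With these stubs, calling `open6dor_gpt(None, None, "Place the apple behind the box, upright")` simply returns a dummy trajectory; the real modules in `Method/` replace each stub.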
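
As mentioned in the Usage section, the `eval_task` command expects a predicted-pose JSON file via `--pred_pose`. The exact schema is defined by `Benchmark/evaluation/evaluator.py` and is not documented here, so the snippet below is only an assumed, illustrative layout (a predicted position plus a quaternion) showing the kind of file you would write:

```
import json

# Hypothetical example of a predicted-pose file for `bench.py eval_task`.
# The field names and conventions are assumptions for illustration; check
# Benchmark/evaluation/evaluator.py for the schema it actually expects.
pred_pose = {
    "position": [0.10, -0.20, 0.05],   # predicted object position (x, y, z)
    "rotation": [0.0, 0.0, 0.0, 1.0],  # predicted object quaternion (x, y, z, w)
}

with open("path/to/pred_pose.json", "w") as f:
    json.dump(pred_pose, f, indent=2)
```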
143 | 144 | 153 | 154 | 155 | 156 | 160 | 161 | ## Contact 162 | For further details or questions, please feel free to contact us: 163 | - [Yufei Ding](https://selina2023.github.io/): selina@stu.pku.edu.cn 164 | - [Haoran Geng](https://geng-haoran.github.io/): ghr@berkeley.edu 165 | - [He Wang](https://hughw19.github.io/): hewang@pku.edu.cn 166 | -------------------------------------------------------------------------------- /Method/vision/GroundedSAM/grounded_sam_demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import copy 4 | 5 | import numpy as np 6 | import json 7 | import torch 8 | from PIL import Image, ImageDraw, ImageFont 9 | 10 | # Grounding DINO 11 | import GroundingDINO.groundingdino.datasets.transforms as T 12 | from GroundingDINO.groundingdino.models import build_model 13 | from GroundingDINO.groundingdino.util import box_ops 14 | from GroundingDINO.groundingdino.util.slconfig import SLConfig 15 | from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap 16 | 17 | # segment anything 18 | from segment_anything.segment_anything import ( 19 | sam_model_registry, 20 | sam_hq_model_registry, 21 | SamPredictor 22 | ) 23 | import cv2 24 | import numpy as np 25 | import matplotlib.pyplot as plt 26 | 27 | 28 | def load_image(image_path): 29 | # load image 30 | image_pil = Image.open(image_path).convert("RGB") # load image 31 | 32 | transform = T.Compose( 33 | [ 34 | T.RandomResize([800], max_size=1333), 35 | T.ToTensor(), 36 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 37 | ] 38 | ) 39 | image, _ = transform(image_pil, None) # 3, h, w 40 | return image_pil, image 41 | 42 | 43 | def load_model(model_config_path, model_checkpoint_path, device): 44 | args = SLConfig.fromfile(model_config_path) 45 | args.device = device 46 | model = build_model(args) 47 | checkpoint = torch.load(model_checkpoint_path, map_location="cpu") 48 | load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False) 49 | print(load_res) 50 | _ = model.eval() 51 | return model 52 | 53 | 54 | def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"): 55 | caption = caption.lower() 56 | caption = caption.strip() 57 | if not caption.endswith("."): 58 | caption = caption + "." 
59 | model = model.to(device) 60 | image = image.to(device) 61 | with torch.no_grad(): 62 | outputs = model(image[None], captions=[caption]) 63 | logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256) 64 | boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4) 65 | logits.shape[0] 66 | 67 | # filter output 68 | logits_filt = logits.clone() 69 | boxes_filt = boxes.clone() 70 | filt_mask = logits_filt.max(dim=1)[0] > box_threshold 71 | logits_filt = logits_filt[filt_mask] # num_filt, 256 72 | boxes_filt = boxes_filt[filt_mask] # num_filt, 4 73 | logits_filt.shape[0] 74 | 75 | # get phrase 76 | tokenlizer = model.tokenizer 77 | tokenized = tokenlizer(caption) 78 | # build pred 79 | pred_phrases = [] 80 | for logit, box in zip(logits_filt, boxes_filt): 81 | pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer) 82 | if with_logits: 83 | pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") 84 | else: 85 | pred_phrases.append(pred_phrase) 86 | 87 | return boxes_filt, pred_phrases 88 | 89 | def show_mask(mask, ax, random_color=False): 90 | if random_color: 91 | color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) 92 | else: 93 | color = np.array([30/255, 144/255, 255/255, 0.6]) 94 | h, w = mask.shape[-2:] 95 | mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) 96 | ax.imshow(mask_image) 97 | 98 | 99 | def show_box(box, ax, label): 100 | x0, y0 = box[0], box[1] 101 | w, h = box[2] - box[0], box[3] - box[1] 102 | ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2)) 103 | ax.text(x0, y0, label) 104 | 105 | 106 | def save_mask_data(output_dir, mask_list, box_list, label_list): 107 | value = 0 # 0 for background 108 | 109 | mask_img = torch.zeros(mask_list.shape[-2:]) 110 | for idx, mask in enumerate(mask_list): 111 | mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1 112 | plt.figure(figsize=(10, 10)) 113 | plt.imshow(mask_img.numpy()) 114 | plt.axis('off') 115 | plt.savefig(os.path.join(output_dir, 'mask.jpg'), bbox_inches="tight", dpi=300, pad_inches=0.0) 116 | 117 | json_data = [{ 118 | 'value': value, 119 | 'label': 'background' 120 | }] 121 | for label, box in zip(label_list, box_list): 122 | value += 1 123 | name, logit = label.split('(') 124 | logit = logit[:-1] # the last is ')' 125 | json_data.append({ 126 | 'value': value, 127 | 'label': name, 128 | 'logit': float(logit), 129 | 'box': box.numpy().tolist(), 130 | }) 131 | with open(os.path.join(output_dir, 'mask.json'), 'w') as f: 132 | json.dump(json_data, f) 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | parser = argparse.ArgumentParser("Grounded-Segment-Anything Demo", add_help=True) 138 | parser.add_argument("--config", type=str, required=True, help="path to config file") 139 | parser.add_argument( 140 | "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file" 141 | ) 142 | parser.add_argument( 143 | "--sam_version", type=str, default="vit_h", required=False, help="SAM ViT version: vit_b / vit_l / vit_h" 144 | ) 145 | parser.add_argument( 146 | "--sam_checkpoint", type=str, required=False, help="path to sam checkpoint file" 147 | ) 148 | parser.add_argument( 149 | "--sam_hq_checkpoint", type=str, default=None, help="path to sam-hq checkpoint file" 150 | ) 151 | parser.add_argument( 152 | "--use_sam_hq", action="store_true", help="using sam-hq for prediction" 153 | ) 154 | parser.add_argument("--input_image", type=str, required=True, help="path to image file") 155 | 
parser.add_argument("--text_prompt", type=str, required=True, help="text prompt") 156 | parser.add_argument( 157 | "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory" 158 | ) 159 | 160 | parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold") 161 | parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold") 162 | 163 | parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False") 164 | args = parser.parse_args() 165 | 166 | # cfg 167 | config_file = args.config # change the path of the model config file 168 | grounded_checkpoint = args.grounded_checkpoint # change the path of the model 169 | sam_version = args.sam_version 170 | sam_checkpoint = args.sam_checkpoint 171 | sam_hq_checkpoint = args.sam_hq_checkpoint 172 | use_sam_hq = args.use_sam_hq 173 | image_path = args.input_image 174 | text_prompt = args.text_prompt 175 | output_dir = args.output_dir 176 | box_threshold = args.box_threshold 177 | text_threshold = args.text_threshold 178 | device = args.device 179 | 180 | # make dir 181 | os.makedirs(output_dir, exist_ok=True) 182 | # load image 183 | image_pil, image = load_image(image_path) 184 | # load model 185 | model = load_model(config_file, grounded_checkpoint, device=device) 186 | 187 | # visualize raw image 188 | image_pil.save(os.path.join(output_dir, "raw_image.jpg")) 189 | 190 | # run grounding dino model 191 | boxes_filt, pred_phrases = get_grounding_output( 192 | model, image, text_prompt, box_threshold, text_threshold, device=device 193 | ) 194 | 195 | # initialize SAM 196 | if use_sam_hq: 197 | predictor = SamPredictor(sam_hq_model_registry[sam_version](checkpoint=sam_hq_checkpoint).to(device)) 198 | else: 199 | predictor = SamPredictor(sam_model_registry[sam_version](checkpoint=sam_checkpoint).to(device)) 200 | image = cv2.imread(image_path) 201 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 202 | predictor.set_image(image) 203 | 204 | size = image_pil.size 205 | H, W = size[1], size[0] 206 | for i in range(boxes_filt.size(0)): 207 | boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H]) 208 | boxes_filt[i][:2] -= boxes_filt[i][2:] / 2 209 | boxes_filt[i][2:] += boxes_filt[i][:2] 210 | 211 | boxes_filt = boxes_filt.cpu() 212 | transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(device) 213 | 214 | masks, _, _ = predictor.predict_torch( 215 | point_coords = None, 216 | point_labels = None, 217 | boxes = transformed_boxes.to(device), 218 | multimask_output = False, 219 | ) 220 | 221 | # draw output image 222 | plt.figure(figsize=(10, 10)) 223 | plt.imshow(image) 224 | for mask in masks: 225 | show_mask(mask.cpu().numpy(), plt.gca(), random_color=True) 226 | for box, label in zip(boxes_filt, pred_phrases): 227 | show_box(box.numpy(), plt.gca(), label) 228 | 229 | plt.axis('off') 230 | plt.savefig( 231 | os.path.join(output_dir, "grounded_sam_output.jpg"), 232 | bbox_inches="tight", dpi=300, pad_inches=0.0 233 | ) 234 | 235 | save_mask_data(output_dir, masks, boxes_filt, pred_phrases) 236 | -------------------------------------------------------------------------------- /assets/robot/franka_description/robots/franka_panda.urdf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 
| 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 177 | 178 | 181 | 182 | 183 | 184 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | --------------------------------------------------------------------------------