├── LICENSE ├── README.md ├── ViewSpatial-Bench └── readme.md ├── data_process ├── coco_process │ ├── coco_single_life_object_filtered_by_area .json │ ├── get_person_by_area.py │ └── head2body_orientation_data.py └── scannet_process │ ├── Sce_Sim_make.py │ ├── bbox3d_project.py │ ├── frame_sampling.py │ ├── scannet_utils.py │ ├── scannetv2_train.txt │ └── scannetv2_val.txt ├── docs ├── flat_patternmaking.png ├── icon │ ├── avatar.png │ └── avatar1.png ├── main_result.png ├── pipeline_and_case.png └── pipline.png ├── eval └── ViewSpatial-Bench.json ├── evaluate.py └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

ViewSpatial-Bench: Evaluating Multi-perspective Spatial Localization in Vision-Language Models

2 | 3 |
4 | [arXiv](https://arxiv.org/abs/2505.21500) · [ViewSpatial-Bench (Hugging Face)](https://huggingface.co/datasets/lidingm/ViewSpatial-Bench) · Webpage 13 |
14 | 15 | 16 | Our work presents a range of spatial localization tasks requiring reasoning from both camera-centric and human-centric perspectives, revealing the challenges vision-language models (VLMs) face in multi-viewpoint spatial understanding. Current VLMs are predominantly trained on image-text pairs from the web that lack explicit 3D spatial annotations, limiting their cross-perspective spatial reasoning capabilities. 17 | 18 | ## 📖ViewSpatial-Bench 19 | 20 | To address this gap, we introduce **ViewSpatial-Bench**, a comprehensive benchmark with over 5,700 question-answer pairs across 1,000+ 3D scenes from ScanNet and MS-COCO validation sets. This benchmark evaluates VLMs' spatial localization capabilities from multiple perspectives, specifically testing both egocentric (camera) and allocentric (human subject) viewpoints across five distinct task types. The figure below shows the construction pipeline and example demonstrations of our benchmark. 21 | 22 | 23 | 24 | ## 🤖Multi-View Spatial Model 25 | 26 | We present the Multi-View Spatial Model (MVSM), developed to address limitations in perspective-dependent spatial reasoning in vision-language models. Following the ViewSpatial-Bench pipeline, we constructed a training dataset of ~43K diverse spatial relationship samples across five task categories, utilizing automated spatial annotations from ScanNet and MS-COCO data, supplemented with Spatial-MM for person-perspective tasks. Using consistent language templates and standardized directional classifications, we implemented a Multi-Perspective Fine-Tuning strategy on Qwen2.5-VL (3B) to enhance reasoning across different observational viewpoints. This approach enables MVSM to develop unified 3D spatial relationship representations that robustly support both camera and human perspective reasoning. 27 | 28 | ## 👁️‍🗨️Results 29 | 30 | 31 | 32 | Accuracy comparison across multiple VLMs on camera and human perspective spatial tasks. Our Multi-View Spatial Model (MVSM) significantly outperforms all baseline models across all task categories, demonstrating the effectiveness of our multi-perspective spatial fine-tuning approach. These results reveal fundamental limitations in perspective-based spatial reasoning capabilities among current VLMs. Even powerful proprietary models like GPT-4o (34.98%) and Gemini-2.0-Flash (32.56%) perform only marginally above random chance (26.33%), confirming our hypothesis that standard VLMs struggle with perspective-dependent spatial reasoning despite their strong performance on other vision-language tasks. 33 | 34 | 35 | ## ⚒️QuickStart 36 | 37 | ```plaintext 38 | ViewSpatial-Bench 39 | ├── data_process # Script code for processing raw datasets to obtain metadata 40 | ├── eval # Used to store the raw dataset of ViewSpatial-Bench 41 | ├── ViewSpatial-Bench # Used to store the source images in ViewSpatial-Bench (can be downloaded from Hugging Face) 42 | ├── README.md 43 | ├── evaluate.py # Script code for evaluating multiple VLMs on ViewSpatial-Bench 44 | └── requirements.txt # Dependencies for evaluation 45 | ``` 46 | 47 | **Note**: [COCO dataset](https://cocodataset.org/) processing in `data_process` uses the original dataset's annotation files (download from official source). Head orientation calculations use [Orient Anything](https://github.com/SpatialVision/Orient-Anything)'s open-source code and model; place `head2body_orientation_data.py` in its root directory to run. 48 | 49 | ## 👀Evaluation on Your Own Model 50 | 51 | **I.
With HuggingFace datasets library.** 52 | 53 | ```py 54 | # NOTE: pip install datasets 55 | 56 | from datasets import load_dataset 57 | ds = load_dataset("lidingm/ViewSpatial-Bench") 58 | ``` 59 | 60 | **II. Evaluation using Open-Source Code.** 61 | 62 | Evaluate using our open-source evaluation code available on Github.(Coming Soon) 63 | 64 | ```py 65 | # Clone the repository 66 | git clone https://github.com/ZJU-REAL/ViewSpatial-Bench.git 67 | cd ViewSpatial-Bench 68 | 69 | # Install dependencies 70 | pip install -r requirements.txt 71 | 72 | # Run evaluation 73 | python evaluate.py --model_path your_model_path 74 | ``` 75 | 76 | You can configure the appropriate model parameters and evaluation settings according to the framework's requirements to obtain performance evaluation results on the ViewSpatial-Bench dataset. 77 | 78 | ## Acknowledgement 79 | 80 | We thank the creators of the [ScanNet](https://github.com/ScanNet/ScanNet) and [MS-COCO](https://cocodataset.org/) datasets for their open-source contributions, which provided the foundational 3D scene data and visual content for our spatial annotation pipeline. We also acknowledge the developers of the [Orient Anything](https://github.com/SpatialVision/Orient-Anything) model for their valuable open-source work that supported our annotation framework development. 81 | 82 | ## Citation 83 | 84 | ``` 85 | @misc{li2025viewspatialbenchevaluatingmultiperspectivespatial, 86 | title={ViewSpatial-Bench: Evaluating Multi-perspective Spatial Localization in Vision-Language Models}, 87 | author={Dingming Li and Hongxing Li and Zixuan Wang and Yuchen Yan and Hang Zhang and Siqi Chen and Guiyang Hou and Shengpei Jiang and Wenqi Zhang and Yongliang Shen and Weiming Lu and Yueting Zhuang}, 88 | year={2025}, 89 | eprint={2505.21500}, 90 | archivePrefix={arXiv}, 91 | primaryClass={cs.CV}, 92 | url={https://arxiv.org/abs/2505.21500}, 93 | } 94 | ``` 95 | 96 | -------------------------------------------------------------------------------- /ViewSpatial-Bench/readme.md: -------------------------------------------------------------------------------- 1 | Simply download and extract the source image sets `scannetv2_val` and `val2017` from [Huggingface](https://huggingface.co/datasets/lidingm/ViewSpatial-Bench) to this directory. -------------------------------------------------------------------------------- /data_process/coco_process/get_person_by_area.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pycocotools.coco import COCO 3 | from tqdm import tqdm 4 | 5 | # Initialize the COCO API for instance annotations 6 | dataDir = 'annotations_trainval2017' # Update this to your COCO dataset path 7 | dataType = 'train2017' # Change this if you're using a different split (train2017, etc.) 
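# Expected layout (assuming the official annotations_trainval2017.zip is unpacked into dataDir):
#   annotations_trainval2017/annotations/instances_train2017.json
#   annotations_trainval2017/annotations/captions_train2017.json
# Adjust dataDir/dataType above if your local COCO copy is organized differently.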
8 | annFile = '{}/annotations/instances_{}.json'.format(dataDir, dataType) 9 | coco = COCO(annFile) 10 | 11 | # Initialize the COCO API for caption annotations 12 | captionAnnFile = '{}/annotations/captions_{}.json'.format(dataDir, dataType) 13 | coco_caps = COCO(captionAnnFile) 14 | 15 | # Categories we're interested in 16 | life_categories = ["person"] 17 | 18 | # Get category IDs for our target categories 19 | target_cat_ids = [] 20 | for category in life_categories: 21 | catIds = coco.getCatIds(catNms=[category]) 22 | target_cat_ids.extend(catIds) 23 | 24 | # Area threshold (e.g., object must occupy at least 1% of the image) 25 | area_ratio_threshold = 0.2 26 | 27 | print(f"Finding images with exactly one object from specified categories and enough area...") 28 | filtered_images = [] 29 | 30 | # Get all image IDs that contain any of our target categories 31 | for category in tqdm(life_categories): 32 | catIds = coco.getCatIds(catNms=[category]) 33 | imgIds = coco.getImgIds(catIds=catIds) 34 | 35 | for img_id in imgIds: 36 | obj_ann_ids = coco.getAnnIds(imgIds=img_id) 37 | obj_anns = coco.loadAnns(obj_ann_ids) 38 | 39 | target_objects = [] 40 | for ann in obj_anns: 41 | if ann['category_id'] in target_cat_ids: 42 | target_objects.append(ann) 43 | 44 | if len(target_objects) == 1: 45 | target_ann = target_objects[0] 46 | img_info = coco.loadImgs(img_id)[0] 47 | img_area = img_info['width'] * img_info['height'] 48 | obj_area = target_ann.get('area', 0) 49 | 50 | if obj_area / img_area >= area_ratio_threshold: 51 | cat_info = coco.loadCats(target_ann['category_id'])[0] 52 | filtered_images.append((img_id, cat_info['name'])) 53 | 54 | print(f"Found {len(filtered_images)} images with exactly one large-enough object from the specified categories") 55 | 56 | dataset = [] 57 | print("Creating dataset entries for each filtered image...") 58 | for img_id, category in tqdm(filtered_images): 59 | try: 60 | img_info = coco.loadImgs(img_id)[0] 61 | ann_ids = coco_caps.getAnnIds(imgIds=img_id) 62 | captions = coco_caps.loadAnns(ann_ids) 63 | 64 | item = { 65 | 'image_id': img_id, 66 | 'file_name': img_info['file_name'], 67 | 'coco_url': img_info['coco_url'], 68 | 'width': img_info['width'], 69 | 'height': img_info['height'], 70 | 'captions': [ann['caption'] for ann in captions], 71 | 'category': category 72 | } 73 | 74 | dataset.append(item) 75 | except Exception as e: 76 | print(f"Error processing image {img_id}: {e}") 77 | 78 | # Save to JSON 79 | output_file = 'coco_single_life_object_filtered_by_area.json' 80 | with open(output_file, 'w') as f: 81 | json.dump(dataset, f, indent=2) 82 | 83 | print(f"Dataset created with {len(dataset)} items and saved to {output_file}") 84 | 85 | # Summary statistics 86 | category_counts = {} 87 | for _, category in filtered_images: 88 | category_counts[category] = category_counts.get(category, 0) + 1 89 | 90 | print("\nCategory distribution in filtered dataset:") 91 | for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True): 92 | print(f"{category}: {count} images") 93 | -------------------------------------------------------------------------------- /data_process/coco_process/head2body_orientation_data.py: -------------------------------------------------------------------------------- 1 | # from paths import * 2 | from vision_tower import DINOv2_MLP 3 | from transformers import AutoImageProcessor 4 | import torch 5 | from PIL import Image 6 | import json 7 | from utils import * 8 | from inference import get_3angle 9 | from tqdm 
import tqdm 10 | import os 11 | import requests 12 | import json 13 | 14 | 15 | def get_keypoint_coordinates(keypoints, index): 16 | """ 17 | Get keypoint coordinates and visibility from keypoints list at specified index. 18 | keypoints: list of length 51, containing x, y, v for 17 keypoints. 19 | index: keypoint index, range [0, 16]. 20 | return: (x, y, v) 21 | """ 22 | x = keypoints[index * 3] 23 | y = keypoints[index * 3 + 1] 24 | v = keypoints[index * 3 + 2] 25 | return x, y, v 26 | 27 | 28 | def get_azimuth_direction(azimuth: float) -> str: 29 | """ 30 | Determine direction name based on azimuth angle 31 | 32 | Args: 33 | azimuth: azimuth angle in degrees 34 | 35 | Returns: 36 | direction name (front, front-right, right side, etc.) 37 | """ 38 | # Normalize angle to 0-360 range 39 | azimuth = azimuth % 360 40 | 41 | if 337.5 <= azimuth or azimuth < 22.5: 42 | return "back" 43 | elif 22.5 <= azimuth < 67.5: 44 | return "back-left" 45 | elif 67.5 <= azimuth < 112.5: 46 | return "left" 47 | elif 112.5 <= azimuth < 157.5: 48 | return "front-left" 49 | elif 157.5 <= azimuth < 202.5: 50 | return "front" 51 | elif 202.5 <= azimuth < 247.5: 52 | return "front-right" 53 | elif 247.5 <= azimuth < 292.5: 54 | return "right" 55 | elif 292.5 <= azimuth < 337.5: 56 | return "back-right" 57 | else: 58 | return "wrong" 59 | 60 | 61 | annotations_file_path = 'annotations_trainval2017/annotations/person_keypoints_train2017.json' 62 | 63 | # Read COCO annotation file 64 | with open(annotations_file_path, 'r') as f: 65 | coco_data = json.load(f) 66 | 67 | def get_ket_and_bbox(image_id): 68 | annotations = [ann for ann in coco_data['annotations'] if ann['image_id'] == image_id] 69 | return annotations[0]['keypoints'], annotations[0]['bbox'] 70 | 71 | 72 | def analyze_head_turn( 73 | image: Image.Image, 74 | bbox: list, # [x1, y1, x2, y2] 75 | keypoints: list, # [[x, y, conf], ..., [x, y, conf]] length at least 7 76 | dino, 77 | val_preprocess, 78 | device 79 | ): 80 | # Step 1: Crop person image from bbox 81 | x1, y1, x2, y2 = map(int, bbox) 82 | 83 | # Correct bbox coordinate order to ensure top-left corner comes first 84 | x1, x2 = min(x1, x2), max(x1, x2) 85 | y1, y2 = min(y1, y2), max(y1, y2) 86 | 87 | # Check for out-of-bounds coordinates (prevent exceeding image boundaries) 88 | img_width, img_height = image.size 89 | x1 = max(0, min(x1, img_width - 1)) 90 | x2 = max(0, min(x2, img_width)) 91 | y1 = max(0, min(y1, img_height - 1)) 92 | y2 = max(0, min(y2, img_height)) 93 | 94 | person_image = image.crop((x1, y1, x2, y2)) 95 | 96 | # Keypoint indices 97 | left_shoulder_idx = 5 98 | right_shoulder_idx = 6 99 | 100 | # Get keypoint coordinates and visibility 101 | left_shoulder = get_keypoint_coordinates(keypoints, left_shoulder_idx) 102 | right_shoulder = get_keypoint_coordinates(keypoints, right_shoulder_idx) 103 | if left_shoulder[2] == 0 or right_shoulder[2] == 0: 104 | return False, False, False, False, False 105 | 106 | # Step 2: Get left and right shoulder y coordinates (relative to cropped image) 107 | left_shoulder_y = left_shoulder[1] - y1 108 | right_shoulder_y = right_shoulder[1] - y1 109 | 110 | cut_y = int(min(left_shoulder_y, right_shoulder_y)) 111 | 112 | # Prevent abnormal cut_y values 113 | if cut_y <= 0 or cut_y >= (y2 - y1): 114 | return False, False, False, False, False 115 | 116 | # Step 3: Segment head/body images 117 | head_image = person_image.crop((0, 0, person_image.width, cut_y)) 118 | body_image = person_image.crop((0, cut_y, person_image.width, person_image.height)) 119 | 
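# Note: PIL's Image.crop takes a (left, upper, right, lower) box, so the two crops above
# split the cropped person at the shoulder line: head_image keeps the rows above cut_y
# and body_image keeps the rows from cut_y down to the bottom of the person crop.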
120 | if head_image.height == 0 or head_image.width == 0 or body_image.height == 0 or body_image.width == 0: 121 | head_image = person_image 122 | body_image = person_image 123 | 124 | # Step 4: Call model to get angles 125 | head_angles = get_3angle(head_image, dino, val_preprocess, device) 126 | body_angles = get_3angle(body_image, dino, val_preprocess, device) 127 | 128 | azimuth_head = float(head_angles[0]) 129 | azimuth_body = float(body_angles[0]) 130 | 131 | # Step 5: Determine head turn direction 132 | def relative_head_direction(az_head, az_body): 133 | delta = (az_head - az_body + 540) % 360 - 180 134 | 135 | if -90 <= delta < -60: 136 | return "left" 137 | elif -60 <= delta < -20: 138 | return "front-left" 139 | elif -20 <= delta <= 20: 140 | return "front" 141 | elif 20 < delta <= 60: 142 | return "front-right" 143 | elif 60 < delta <= 90: 144 | return "right" 145 | else: 146 | return "wrong" 147 | 148 | direction = relative_head_direction(azimuth_head, azimuth_body) 149 | 150 | return azimuth_head, azimuth_body, direction, float(head_angles[3]), float(body_angles[3]) 151 | 152 | 153 | ckpt_path = "dino_weight.pt" 154 | 155 | save_path = './' 156 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 157 | dino = DINOv2_MLP( 158 | dino_mode='large', 159 | in_dim=1024, 160 | out_dim=360 + 180 + 180 + 2, 161 | evaluate=True, 162 | mask_dino=False, 163 | frozen_back=False 164 | ) 165 | 166 | dino.eval() 167 | print('model create') 168 | dino.load_state_dict(torch.load(ckpt_path, map_location='cpu')) 169 | dino = dino.to(device) 170 | print('weight loaded') 171 | val_preprocess = AutoImageProcessor.from_pretrained("dinov2-large", cache_dir='./') 172 | 173 | 174 | def check_image_path(image_path): 175 | if os.path.exists(image_path): 176 | return True 177 | else: 178 | return False 179 | 180 | 181 | # ========== Utility Functions ========== 182 | def download_image(img_path, url): 183 | """Download image and save to specified path""" 184 | try: 185 | r = requests.get(url, timeout=10) 186 | if r.status_code == 200: 187 | with open(img_path, 'wb') as f: 188 | f.write(r.content) 189 | return True 190 | else: 191 | print(f"Download failed with status code: {r.status_code}") 192 | return False 193 | except Exception as e: 194 | print(f"Download failed: {e}") 195 | return False 196 | 197 | 198 | DATASET_FILE = 'coco_single_life_object_filtered_by_area.json' 199 | with open(DATASET_FILE, 'r') as f: 200 | dataset = json.load(f) 201 | result = [] 202 | for item in tqdm(dataset): 203 | if item['category'] != 'person': 204 | continue 205 | file_name = item['file_name'] 206 | image_path = "train2017/" + file_name 207 | if not check_image_path(image_path): 208 | success = download_image(image_path, item['coco_url']) 209 | if not success: 210 | print("Download Failed!") 211 | continue 212 | origin_image = Image.open(image_path).convert('RGB') 213 | keypoints, bbox = get_ket_and_bbox(item['image_id']) 214 | 215 | try: 216 | azimuth_head, azimuth_body, direction, head_confidence, body_confidence = analyze_head_turn(origin_image, bbox, 217 | keypoints, dino, 218 | val_preprocess, 219 | device) 220 | except: 221 | continue 222 | if azimuth_head == False: 223 | continue 224 | 225 | angles = get_3angle(origin_image, dino, val_preprocess, device) 226 | azimuth = float(angles[0]) 227 | polar = float(angles[1]) 228 | rotation = float(angles[2]) 229 | confidence = float(angles[3]) 230 | one = { 231 | 'image_id': item['image_id'], 232 | 'coco_url': item['coco_url'], 233 | 'width': item['width'], 234 | 
'height': item['height'], 235 | 'captions': item['captions'], 236 | "azimuth": azimuth, 237 | "overall_confidence": confidence, 238 | 'azimuth_head': azimuth_head, 239 | 'azimuth_body': azimuth_body, 240 | 'person_direction': direction, 241 | 'camera_direction_v1': get_azimuth_direction(azimuth), 242 | 'camera_direction_v2': get_azimuth_direction(azimuth_head), 243 | 'head_confidence': head_confidence, 244 | 'body_confidence': body_confidence 245 | } 246 | result.append(one) 247 | 248 | # Save to JSON 249 | output_file = 'train_data.json' 250 | with open(output_file, 'w') as f: 251 | json.dump(result, f, indent=2) 252 | 253 | print(f"Dataset created with {len(result)} items and saved to {output_file}") -------------------------------------------------------------------------------- /data_process/scannet_process/Sce_Sim_make.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import random 3 | import os, sys 4 | import inspect 5 | from collections import Counter 6 | from frame_sampling import get_full_images 7 | try: 8 | import numpy as np 9 | except: 10 | print("Failed to import numpy package.") 11 | sys.exit(-1) 12 | 13 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 14 | from scannet_utils import * 15 | 16 | 17 | def read_aggregation(filename): 18 | assert os.path.isfile(filename) 19 | object_id_to_segs = {} 20 | label_to_segs = {} 21 | with open(filename) as f: 22 | data = json.load(f) 23 | num_objects = len(data["segGroups"]) 24 | for i in range(num_objects): 25 | object_id = ( 26 | data["segGroups"][i]["objectId"] + 1 27 | ) # instance ids should be 1-indexed 28 | label = data["segGroups"][i]["label"] 29 | segs = data["segGroups"][i]["segments"] 30 | object_id_to_segs[object_id] = segs 31 | if label in label_to_segs: 32 | label_to_segs[label].extend(segs) 33 | else: 34 | label_to_segs[label] = segs 35 | return object_id_to_segs, label_to_segs 36 | 37 | 38 | def read_segmentation(filename): 39 | assert os.path.isfile(filename) 40 | seg_to_verts = {} 41 | with open(filename) as f: 42 | data = json.load(f) 43 | num_verts = len(data["segIndices"]) 44 | for i in range(num_verts): 45 | seg_id = data["segIndices"][i] 46 | if seg_id in seg_to_verts: 47 | seg_to_verts[seg_id].append(i) 48 | else: 49 | seg_to_verts[seg_id] = [i] 50 | return seg_to_verts, num_verts 51 | 52 | 53 | def export(mesh_file, agg_file, seg_file, meta_file, label_map_file): 54 | """points are XYZ RGB (RGB in 0-255), 55 | semantic label as nyu40 ids, 56 | instance label as 1-#instance, 57 | box as (cx,cy,cz,dx,dy,dz,semantic_label) 58 | """ 59 | label_map = read_label_mapping( 60 | label_map_file, label_from="raw_category", label_to="nyu40id" 61 | ) 62 | mesh_vertices = read_mesh_vertices_rgb(mesh_file) 63 | 64 | # Load scene axis alignment matrix 65 | lines = open(meta_file).readlines() 66 | for line in lines: 67 | if "axisAlignment" in line: 68 | axis_align_matrix = [ 69 | float(x) for x in line.rstrip().strip("axisAlignment = ").split(" ") 70 | ] 71 | break 72 | axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4)) 73 | pts = np.ones((mesh_vertices.shape[0], 4)) 74 | pts[:, 0:3] = mesh_vertices[:, 0:3] 75 | pts = np.dot(pts, axis_align_matrix.transpose()) # Nx4 76 | mesh_vertices[:, 0:3] = pts[:, 0:3] 77 | 78 | # Load semantic and instance labels 79 | object_id_to_segs, label_to_segs = read_aggregation(agg_file) 80 | seg_to_verts, num_verts = read_segmentation(seg_file) 81 | label_ids = 
np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated 82 | object_id_to_label_id = {} 83 | for label, segs in label_to_segs.items(): 84 | label_id = label_map[label] 85 | for seg in segs: 86 | verts = seg_to_verts[seg] 87 | label_ids[verts] = label_id 88 | instance_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated 89 | num_instances = len(np.unique(list(object_id_to_segs.keys()))) 90 | for object_id, segs in object_id_to_segs.items(): 91 | for seg in segs: 92 | verts = seg_to_verts[seg] 93 | instance_ids[verts] = object_id 94 | if object_id not in object_id_to_label_id: 95 | object_id_to_label_id[object_id] = label_ids[verts][0] 96 | instance_bboxes = np.zeros((num_instances, 7)) 97 | for obj_id in object_id_to_segs: 98 | label_id = object_id_to_label_id[obj_id] 99 | obj_pc = mesh_vertices[instance_ids == obj_id, 0:3] 100 | if len(obj_pc) == 0: 101 | continue 102 | # Compute axis aligned box 103 | # An axis aligned bounding box is parameterized by 104 | # (cx,cy,cz) and (dx,dy,dz) and label id 105 | # where (cx,cy,cz) is the center point of the box, 106 | # dx is the x-axis length of the box. 107 | xmin = np.min(obj_pc[:, 0]) 108 | ymin = np.min(obj_pc[:, 1]) 109 | zmin = np.min(obj_pc[:, 2]) 110 | xmax = np.max(obj_pc[:, 0]) 111 | ymax = np.max(obj_pc[:, 1]) 112 | zmax = np.max(obj_pc[:, 2]) 113 | bbox = np.array( 114 | [ 115 | (xmin + xmax) / 2, 116 | (ymin + ymax) / 2, 117 | (zmin + zmax) / 2, 118 | xmax - xmin, 119 | ymax - ymin, 120 | zmax - zmin, 121 | label_id, 122 | ] 123 | ) 124 | # NOTE: this assumes obj_id is in 1,2,3,.,,,.NUM_INSTANCES 125 | instance_bboxes[obj_id - 1, :] = bbox 126 | 127 | 128 | return ( 129 | mesh_vertices, 130 | label_ids, 131 | instance_ids, 132 | instance_bboxes, 133 | object_id_to_label_id 134 | ) 135 | 136 | def get_3d_box(scene_name, pointcloud_folder, label_map_file): 137 | scan_path = f"{pointcloud_folder}/{scene_name}" 138 | 139 | scan_name = os.path.split(scan_path)[-1] 140 | mesh_file = os.path.join(scan_path, scan_name + "_vh_clean_2.ply") 141 | agg_file = os.path.join(scan_path, scan_name + ".aggregation.json") 142 | seg_file = os.path.join(scan_path, scan_name + "_vh_clean_2.0.010000.segs.json") 143 | meta_file = os.path.join( 144 | scan_path, scan_name + ".txt" 145 | ) # includes axisAlignment info for the train set scans. 
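# export() aligns the mesh with the scene's axisAlignment matrix (homogeneous vertices
# right-multiplied by the matrix transpose) and then derives one axis-aligned box per
# instance in the form (cx, cy, cz, dx, dy, dz, nyu40_label_id).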
146 | mesh_vertices, label_ids, instance_ids, instance_bboxes, object_id_to_label_id = export( 147 | mesh_file, agg_file, seg_file, meta_file, label_map_file 148 | ) 149 | return instance_bboxes 150 | 151 | 152 | def calculate_relative_position(A, B, C): 153 | A, B, C = map(np.array, (A, B, C)) 154 | 155 | vector_AB = B - A 156 | if np.linalg.norm(vector_AB) < 1e-6: 157 | raise ValueError("Objects A and B are at the same position.") 158 | 159 | forward = vector_AB / np.linalg.norm(vector_AB) 160 | world_up = np.array([0.0, 0.0, 1.0]) 161 | right = np.cross(forward, world_up) 162 | 163 | if np.linalg.norm(right) < 1e-6: 164 | world_up = np.array([0.0, 1.0, 0.0]) 165 | right = np.cross(forward, world_up) 166 | right /= np.linalg.norm(right) 167 | else: 168 | right /= np.linalg.norm(right) 169 | 170 | up = np.cross(right, forward) 171 | 172 | vector_AC = C - A 173 | local_x = np.dot(vector_AC, right) 174 | local_y = np.dot(vector_AC, up) 175 | local_z = np.dot(vector_AC, forward) 176 | 177 | return local_x, local_y, local_z 178 | 179 | 180 | def get_direction(local_x, local_z): 181 | angle = np.degrees(np.arctan2(local_x, local_z)) 182 | angle = (angle + 360) % 360 183 | 184 | if 22.5 <= angle < 67.5: 185 | return "front-right" 186 | elif 67.5 <= angle < 112.5: 187 | return "right" 188 | elif 112.5 <= angle < 157.5: 189 | return "back-right" 190 | elif 157.5 <= angle < 202.5: 191 | return "back" 192 | elif 202.5 <= angle < 247.5: 193 | return "back-left" 194 | elif 247.5 <= angle < 292.5: 195 | return "left" 196 | elif 292.5 <= angle < 337.5: 197 | return "front-left" 198 | else: 199 | return "front" 200 | 201 | 202 | def generate_qa_pairs(obj1, obj2, obj3, label1, label2, label3): 203 | """Generate QA pairs describing the relative position.""" 204 | try: 205 | x, y, z = calculate_relative_position(obj1, obj2, obj3) 206 | except ValueError: 207 | return [] 208 | 209 | direction = get_direction(x, z) 210 | if direction == "same position": 211 | return [] 212 | 213 | qa_templates = [ 214 | (f"If you stand at {label1} facing {label2}, where is {label3}?", 215 | f"If I stand at {label1} and face {label2}, then {label3} would be to my {direction}."), 216 | 217 | (f"Imagine standing at {label1} looking towards {label2}, where is {label3}?", 218 | f"Picture me standing at {label1}, facing {label2}—then {label3} would be on my {direction}."), 219 | 220 | (f"When positioned at {label1} facing {label2}, where can you find {label3}?", 221 | f"From my vantage point at {label1}, with my eyes fixed on {label2}, {label3} is located to my {direction}."), 222 | 223 | (f"Standing at {label1}, gazing at {label2}, where should {label3} be?", 224 | f"From this spot at {label1}, looking directly at {label2}, I’d locate {label3} on my {direction} side.") 225 | ] 226 | # All possible options 227 | all_options = ["left", "right", "front", "back", "back-right", "back-left", 228 | "front-left", "front-right"] 229 | qa_pairs = [] 230 | 231 | q_template = [random.choice(qa_templates)] 232 | for q, a in q_template: 233 | distractors = [opt for opt in all_options if opt not in direction and direction not in opt] 234 | selected_distractors = random.sample(distractors, 3) 235 | options = [direction] + selected_distractors 236 | random.shuffle(options) 237 | option_letters = ["A", "B", "C", "D"] 238 | correct_letter_index = options.index(direction) 239 | correct_option = f"{option_letters[correct_letter_index]}. {direction}" 240 | formatted_options = "\n".join([f"{option_letters[i]}. 
{options[i]}" for i in range(4)]) 241 | question = f"{q}\n{formatted_options}" 242 | qa_pairs.append({ 243 | "question": question, 244 | "answer": correct_option 245 | }) 246 | 247 | return qa_pairs 248 | 249 | 250 | def get_random_combinations(lst, max_samples=10000): 251 | all_combinations = list(itertools.combinations(lst, 3)) 252 | num_samples = min(max_samples, len(all_combinations)) 253 | return random.sample(all_combinations, num_samples) 254 | 255 | def get_jpg_files(folder_path): 256 | jpg_files = [] 257 | for filename in os.listdir(folder_path): 258 | file_path = os.path.join(folder_path, filename) 259 | 260 | if os.path.isfile(file_path) and filename.lower().endswith('.jpg'): 261 | jpg_files.append(filename) 262 | 263 | return jpg_files 264 | 265 | 266 | if __name__ == "__main__": 267 | nyu40_to_category = { 268 | 0: "unlabeled", 1: "wall", 2: "floor", 3: "cabinet", 4: "bed", 269 | 5: "chair", 6: "sofa", 7: "table", 8: "door", 9: "window", 270 | 10: "bookshelf", 11: "picture", 12: "counter", 13: "blinds", 271 | 14: "desk", 15: "shelves", 16: "curtain", 17: "dresser", 272 | 18: "pillow", 19: "mirror", 20: "floor mat", 21: "clothes", 273 | 22: "ceiling", 23: "books", 24: "refrigerator", 25: "television", 274 | 26: "paper", 27: "towel", 28: "shower curtain", 29: "box", 275 | 30: "whiteboard", 31: "person", 32: "nightstand", 33: "toilet", 276 | 34: "sink", 35: "lamp", 36: "bathtub", 37: "bag", 277 | 38: "other structure", 39: "other furniture", 40: "other prop" 278 | } 279 | 280 | scene_root = "scannet_metadata" 281 | output_path = r"scannet_metadata/perspective_3d.json" 282 | 283 | # Get all point cloud files and label mapping file in the scene 284 | pointcloud_folder = "/datasets/scannet/scans" 285 | label_map_file = "/datasets/scannet/scannetv2-labels.combined.tsv" 286 | 287 | qa_dataset = [] 288 | scene_num = 0 289 | 290 | with open('scannetv2_val.txt', 'r', encoding='utf-8') as file: 291 | lines = file.readlines() 292 | 293 | scenes = [line.strip() for line in lines] 294 | for i, scene in enumerate(scenes): 295 | scene_name = scene 296 | 297 | img_size = (1296, 968) 298 | instance_bboxes = get_3d_box(scene_name, pointcloud_folder, label_map_file) 299 | 300 | # 3D bounding box 301 | bboxes_3d = [tuple(bbox) for bbox in instance_bboxes if bbox[6] not in [0, 1, 2, 22, 38, 39, 40]] 302 | 303 | bbox_6_counts = Counter(bbox[6] for bbox in bboxes_3d) 304 | unique_bboxes_3d = [bbox for bbox in bboxes_3d if bbox_6_counts[bbox[6]] == 1] 305 | le = len(unique_bboxes_3d) 306 | if le < 3: 307 | continue 308 | scene_num = scene_num+1 309 | 310 | combinations_3d = get_random_combinations(unique_bboxes_3d, 40) 311 | 312 | for combination in combinations_3d: 313 | obj1 = (combination[0][0], combination[0][1],combination[0][2]) 314 | obj2 = (combination[1][0], combination[1][1], combination[1][2]) 315 | obj3 = (combination[2][0], combination[2][1], combination[2][2]) 316 | 317 | category_name1 = nyu40_to_category.get(int(combination[0][6]), "unknown") 318 | category_name2 = nyu40_to_category.get(int(combination[1][6]), "unknown") 319 | category_name3 = nyu40_to_category.get(int(combination[2][6]), "unknown") 320 | labels = (category_name1, category_name2, category_name3) 321 | 322 | jpg_files_list = get_full_images(scene_name, labels) 323 | if not jpg_files_list: 324 | img_path = os.path.join(scene_root, scene_name) 325 | img_path = img_path + "/original_images" 326 | jpg_files_list = get_jpg_files(img_path) 327 | jpg_files_list = [scene_name + "/original_images/" + a for a in jpg_files_list] 
328 | # generate QA-pairs 329 | qa_set = generate_qa_pairs(obj1, obj2, obj3, *labels) 330 | 331 | for num, qa in enumerate(qa_set[:4], 1): 332 | qa_dataset.append({"image_path": scene_name, "question": qa['question'], "answer": qa['answer'], "image": jpg_files_list}) 333 | 334 | print(f"Scene {i} has been successfully saved!") 335 | 336 | with open(output_path, 'w', encoding='utf-8') as output_file: 337 | json.dump(qa_dataset, output_file, ensure_ascii=False, indent=4) 338 | print(f"QA data has been saved to {output_path}, total of {scene_num} scenes! {len(qa_dataset)} questions!") -------------------------------------------------------------------------------- /data_process/scannet_process/bbox3d_project.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import numpy as np 5 | import torch 6 | import cv2 7 | from PIL import Image, ImageDraw 8 | import json 9 | from scannet_utils import * 10 | # Set environment variable to resolve OpenMP error 11 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' 12 | 13 | 14 | def load_matrix_from_txt(path, shape=(4, 4)): 15 | with open(path) as f: 16 | txt = f.readlines() 17 | txt = ''.join(txt).replace('\n', ' ') 18 | matrix = [float(v) for v in txt.split()] 19 | return np.array(matrix).reshape(shape) 20 | 21 | 22 | def get_align_matrix(meta_file): 23 | lines = open(meta_file).readlines() 24 | for line in lines: 25 | if "axisAlignment" in line: 26 | axis_align_matrix = [ 27 | float(x) for x in line.rstrip().strip("axisAlignment = ").split(" ") 28 | ] 29 | break 30 | axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4)) 31 | return axis_align_matrix 32 | 33 | 34 | def get_3d_bbox_corners(centers, sizes): 35 | """ 36 | Batch generate 8 corner points for multiple 3D bounding boxes 37 | 38 | Parameters: 39 | centers: numpy array with shape (N, 3), representing N bounding box centers 40 | sizes: numpy array with shape (N, 3), representing N bounding box dimensions [length, width, height] 41 | 42 | Returns: 43 | corners: numpy array with shape (N, 8, 3), representing 8 corner points for N bounding boxes 44 | """ 45 | N = centers.shape[0] # 边界框数量 46 | corners = np.zeros((N, 8, 3)) 47 | 48 | for i in range(N): 49 | x, y, z = centers[i] 50 | l, w, h = sizes[i] / 2.0 51 | 52 | # 定义8个角点的相对坐标 53 | corners[i] = np.array([ 54 | [x + l, y + w, z + h], [x + l, y + w, z - h], [x + l, y - w, z + h], [x + l, y - w, z - h], 55 | [x - l, y + w, z + h], [x - l, y + w, z - h], [x - l, y - w, z + h], [x - l, y - w, z - h] 56 | ]) 57 | 58 | return corners 59 | 60 | 61 | def draw_3d_bboxes(image, bboxes_2d, visibilities, colors=None, thickness=2, show_invisible=False): 62 | """ 63 | Batch draw multiple projected 3D bounding boxes on an image 64 | 65 | Parameters: 66 | image: PIL Image object 67 | bboxes_2d: numpy array with shape (N, 8, 2), representing N sets of projected 2D points 68 | visibilities: numpy array with shape (N, 8), indicating point visibility 69 | colors: list of length N containing N color tuples, auto-generated if None 70 | thickness: integer representing line thickness 71 | show_invisible: boolean indicating whether to show invisible edges (displayed as dashed lines) 72 | 73 | Returns: 74 | image: PIL Image object with bounding boxes drawn 75 | """ 76 | N = bboxes_2d.shape[0] # Number of bounding boxes 77 | 78 | # Connection line indices for bounding box edges 79 | lines = [ 80 | [0, 1], [0, 2], [1, 3], [2, 3], 81 | [4, 5], [4, 6], [5, 7], [6, 7], 82 | [0, 4], [1, 5], [2, 6], [3, 7] 83 
| ] 84 | 85 | # Generate different colors automatically if none provided 86 | if colors is None: 87 | colors = [] 88 | for i in range(N): 89 | # Generate random colors while avoiding colors that are too dark or too bright 90 | color = ( 91 | np.random.randint(50, 200), 92 | np.random.randint(50, 200), 93 | np.random.randint(50, 200) 94 | ) 95 | colors.append(color) 96 | 97 | # Convert image to OpenCV format 98 | img_cv = np.array(image) 99 | 100 | # Draw edges for each bounding box 101 | for i in range(N): 102 | bbox_2d = bboxes_2d[i] 103 | visibility = visibilities[i] 104 | color = colors[i] 105 | 106 | # Draw edges 107 | for [j, k] in lines: 108 | pt1 = (int(bbox_2d[j, 0]), int(bbox_2d[j, 1])) 109 | pt2 = (int(bbox_2d[k, 0]), int(bbox_2d[k, 1])) 110 | 111 | # Draw solid line if both endpoints are visible 112 | if visibility[j] and visibility[k]: 113 | cv2.line(img_cv, pt1, pt2, color, thickness) 114 | # Draw dashed line if show_invisible is set and at least one endpoint is visible 115 | elif show_invisible and (visibility[j] or visibility[k]): 116 | # Create dashed line 117 | pts = np.array([pt1, pt2], np.int32).reshape((-1, 1, 2)) 118 | cv2.polylines(img_cv, [pts], False, color, thickness=1, lineType=cv2.LINE_AA, shift=0) 119 | 120 | # Draw visible points 121 | for j, vis in enumerate(visibility): 122 | if vis: 123 | pt = (int(bbox_2d[j, 0]), int(bbox_2d[j, 1])) 124 | cv2.circle(img_cv, pt, 3, color, -1) 125 | 126 | # Convert back to PIL Image 127 | return Image.fromarray(img_cv) 128 | 129 | def project_3d_bbox_to_2d(bboxes_3d, intrinsic, pose, image_size, depth_image=None, depth_scale=1000.0, 130 | occlusion_threshold=0.1): 131 | """ 132 | Batch project multiple 3D bounding boxes to 2D image plane and detect occlusion, 133 | resolving size mismatch between depth map and color image 134 | 135 | Parameters: 136 | bboxes_3d: numpy array with shape (N, 8, 3), representing 8 corner points of N 3D bounding boxes 137 | intrinsic: numpy array with shape (4, 4), camera intrinsic matrix 138 | pose: numpy array with shape (4, 4), camera extrinsic matrix (camera pose) 139 | image_size: tuple (width, height), representing image dimensions 140 | depth_image: numpy array with shape (height, width), representing depth image, no occlusion detection if None 141 | depth_scale: float, scale factor for depth image to convert depth values to meters 142 | occlusion_threshold: float, depth difference threshold in meters for determining point occlusion 143 | 144 | Returns: 145 | bboxes_2d: numpy array with shape (N, 8, 2), representing projected 2D points 146 | visibilities: numpy array with shape (N, 8), indicating point visibility 147 | """ 148 | N = bboxes_3d.shape[0] # Number of bounding boxes 149 | 150 | # Initialize results 151 | bboxes_2d = np.zeros((N, 8, 2)) 152 | visibilities = np.zeros((N, 8), dtype=bool) 153 | 154 | # Get depth image dimensions (if available) 155 | depth_height, depth_width = 0, 0 156 | color_width, color_height = image_size 157 | depth_to_color_scale_x, depth_to_color_scale_y = 1.0, 1.0 158 | 159 | if depth_image is not None: 160 | depth_height, depth_width = depth_image.shape[:2] 161 | # Calculate scaling ratio from depth image to color image 162 | depth_to_color_scale_x = color_width / depth_width 163 | depth_to_color_scale_y = color_height / depth_height 164 | 165 | # Calculate transformation from world coordinate system to camera coordinate system 166 | world_to_cam = np.linalg.inv(pose) 167 | 168 | # Process all N objects in the scene 169 | for i in range(N): 170 | # Get 8 
corner points of current bounding box 171 | bbox_3d = bboxes_3d[i] 172 | 173 | # Convert 3D bounding box to homogeneous coordinates 174 | bbox_3d_homogeneous = np.hstack([bbox_3d, np.ones((bbox_3d.shape[0], 1))]) # (8, 4) 175 | 176 | # Transform 3D points from world coordinate system to camera coordinate system 177 | cam_points = bbox_3d_homogeneous @ world_to_cam.T # (8, 4) 178 | 179 | # Check if points are in front of camera (z > 0) 180 | visibility = cam_points[:, 2] > 0 181 | 182 | # Apply projection matrix to project points onto image plane 183 | points_2d_homogeneous = cam_points @ intrinsic.T # (8, 4) 184 | 185 | # Perspective division: convert homogeneous coordinates to image coordinates 186 | points_2d = points_2d_homogeneous[:, :2] / points_2d_homogeneous[:, 2:3] 187 | 188 | # Check if points are within image bounds 189 | in_image = (points_2d[:, 0] >= 0) & (points_2d[:, 0] < color_width) & \ 190 | (points_2d[:, 1] >= 0) & (points_2d[:, 1] < color_height) 191 | 192 | # Update visibility: points must be in front of camera and within image bounds 193 | visibility = visibility & in_image 194 | 195 | # Detect occlusion if depth image is available 196 | if depth_image is not None: 197 | for j in range(8): 198 | if visibility[j]: 199 | # Get pixel coordinates of projected point in color image 200 | color_x, color_y = int(points_2d[j, 0]), int(points_2d[j, 1]) 201 | 202 | # Convert color image coordinates to depth image coordinates 203 | depth_x = int(color_x / depth_to_color_scale_x) 204 | depth_y = int(color_y / depth_to_color_scale_y) 205 | 206 | # Ensure point is within depth image bounds 207 | if 0 <= depth_x < depth_width and 0 <= depth_y < depth_height: 208 | # Get actual depth from depth map 209 | actual_depth = float(depth_image[depth_y, depth_x]) / depth_scale # Convert to meters 210 | 211 | # Get calculated depth (z value in camera coordinate system) 212 | calculated_depth = float(cam_points[j, 2]) 213 | 214 | # Compare actual depth with calculated depth to determine occlusion 215 | # Point is considered occluded only when depth value is valid (>0) and calculated depth is significantly greater than actual depth 216 | if actual_depth > 0 and calculated_depth - actual_depth > occlusion_threshold: 217 | visibility[j] = False 218 | else: 219 | # Maintain current visibility state if point is outside depth image bounds 220 | pass 221 | 222 | # Save results for storage and later inclusion in visibility data JSON file 223 | bboxes_2d[i] = points_2d 224 | visibilities[i] = visibility 225 | 226 | return bboxes_2d, visibilities 227 | 228 | 229 | def load_3d_boxes(json_file): 230 | """ 231 | Load 3D bounding box data from JSON file 232 | 233 | Parameters: 234 | json_file: file path to JSON file containing 3D bounding box data 235 | 236 | Returns: 237 | centers: numpy array with shape (N, 3), representing bounding box center points 238 | sizes: numpy array with shape (N, 3), representing bounding box dimensions 239 | labels: list containing bounding box labels 240 | object_ids: list containing bounding box object IDs 241 | """ 242 | data = json_file 243 | 244 | centers = [] 245 | sizes = [] 246 | labels = [] 247 | object_ids = [] 248 | 249 | # 解析JSON数据 250 | for box in data['boxes']: 251 | center = np.array(box['center']) 252 | size = np.array(box['size']) 253 | label = box.get('label', 'unknown') 254 | object_id = box.get('object_id', -1) 255 | 256 | centers.append(center) 257 | sizes.append(size) 258 | labels.append(label) 259 | object_ids.append(object_id) 260 | 261 | return 
np.array(centers), np.array(sizes), labels, object_ids 262 | 263 | 264 | def process_image_with_boxes(image_path, boxes_json, intrinsic_path, pose_path, meta_file, output_path=None, 265 | visibility_json_path=None, depth_image_path=None, depth_scale=1000.0, 266 | occlusion_threshold=0.1, draw_picture=False): 267 | """ 268 | Process a single image by drawing all 3D bounding boxes and saving visible object information, 269 | with occlusion detection and handling of size mismatch between depth map and color image 270 | 271 | Parameters: 272 | image_path: file path to the image 273 | boxes_json: JSON object containing 3D bounding box data 274 | intrinsic_path: file path to camera intrinsic parameters 275 | pose_path: file path to camera pose 276 | meta_file: file path to scene metadata 277 | output_path: output image path, uses default path if None 278 | visibility_json_path: visibility JSON file path, uses default path if None 279 | depth_image_path: depth image file path, no occlusion detection if None 280 | depth_scale: float, scale factor for depth image to convert depth values to meters 281 | occlusion_threshold: float, depth difference threshold in meters for determining point occlusion 282 | 283 | Returns: 284 | output_path: path to the output image 285 | visibility_json_path: path to the visibility JSON file 286 | """ 287 | 288 | image = Image.open(image_path) 289 | image_size = image.size 290 | 291 | def cv_imread(file_path): 292 | cv_img = cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1) 293 | return cv_img 294 | 295 | # 加载深度图像(如果提供) 296 | depth_image = None 297 | if depth_image_path and os.path.exists(depth_image_path): 298 | depth_image = cv_imread(depth_image_path) 299 | if len(depth_image.shape) > 2: 300 | depth_image = depth_image[:, :, 0] 301 | 302 | intrinsic = load_matrix_from_txt(intrinsic_path) 303 | pose = load_matrix_from_txt(pose_path) 304 | 305 | axis_align_matrix = get_align_matrix(meta_file) 306 | 307 | pose = axis_align_matrix @ pose 308 | 309 | world_to_cam = np.linalg.inv(pose) 310 | 311 | centers, sizes, labels, object_ids = load_3d_boxes(boxes_json) 312 | 313 | bboxes_3d = get_3d_bbox_corners(centers, sizes) 314 | 315 | bboxes_2d, visibilities = project_3d_bbox_to_2d( 316 | bboxes_3d, intrinsic, pose, image_size, depth_image, depth_scale, occlusion_threshold 317 | ) 318 | 319 | # ------------------------------------------------Store the image file for drawing------------------------------------------------ 320 | if draw_picture: 321 | unique_labels = list(set(labels)) 322 | label_colors = {} 323 | for i, label in enumerate(unique_labels): 324 | h = (i * 30) % 180 325 | hsv = np.array([[[h, 255, 255]]], dtype=np.uint8) 326 | rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)[0][0] 327 | label_colors[label] = (int(rgb[0]), int(rgb[1]), int(rgb[2])) 328 | 329 | colors = [label_colors[label] for label in labels] 330 | result_image = draw_3d_bboxes(image, bboxes_2d, visibilities, colors=colors, show_invisible=True) 331 | 332 | img_draw = ImageDraw.Draw(result_image) 333 | for i, (bbox_2d, visibility, label) in enumerate(zip(bboxes_2d, visibilities, labels)): 334 | visible_points = bbox_2d[visibility] 335 | if len(visible_points) > 0: 336 | top_point = visible_points[np.argmin(visible_points[:, 1])] 337 | x, y = int(top_point[0]), int(top_point[1] - 10) 338 | img_draw.text((x, y), label, fill=colors[i]) 339 | 340 | if output_path is None: 341 | output_path = os.path.splitext(image_path)[0] + "_with_boxes.jpg" 342 | 343 | result_image.save(output_path) 344 | 345 | # 
------------------------------------------------Store visibility data in JSON format------------------------------------------------ 346 | visibility_data = { 347 | "image_path": os.path.basename(image_path), 348 | "visible_objects": [] 349 | } 350 | 351 | # It was originally set up, but the obstruction filtering will reduce the visibility by 8 points. Therefore, after being stored in the visibility JSON file, 352 | # you can decide for yourself at what level to filter (I currently set it to be greater than 0.2, because two points indicate that one side of the box can be seen). 353 | visibility_threshold = 0.01 354 | 355 | bboxes_3d_cam = [] 356 | for bbox_3d in bboxes_3d: 357 | bbox_3d_homogeneous = np.hstack([bbox_3d, np.ones((bbox_3d.shape[0], 1))]) # (8, 4) 358 | cam_points = bbox_3d_homogeneous @ world_to_cam.T # (8, 4) 359 | bbox_3d_cam = cam_points[:, :3] 360 | bboxes_3d_cam.append(bbox_3d_cam) 361 | 362 | for i, (bbox_2d, bbox_3d_cam, visibility, label, object_id) in enumerate( 363 | zip(bboxes_2d, bboxes_3d_cam, visibilities, labels, object_ids)): 364 | 365 | visibility_ratio = np.mean(visibility) 366 | is_visible = visibility_ratio >= visibility_threshold 367 | 368 | if is_visible: 369 | visible_points_count = np.sum(visibility) 370 | 371 | bbox_2d_list = bbox_2d.tolist() 372 | bbox_3d_cam_list = bbox_3d_cam.tolist() 373 | 374 | # Store coordinate center points in their respective coordinate systems 375 | bbox_2d_cam_center = np.mean(bbox_2d, axis=0) 376 | bbox_3d_cam_center = np.mean(bbox_3d_cam, axis=0) 377 | 378 | visibility_data["visible_objects"].append({ 379 | "object_id": object_id, 380 | "label": label, 381 | "visibility_ratio": float(visibility_ratio), 382 | "visible_points_count": int(visible_points_count), 383 | "bbox_2d_center": bbox_2d_cam_center.tolist(), 384 | "bbox_3d_center": bbox_3d_cam_center.tolist(), 385 | "vertices_visibility": visibility.tolist(), 386 | "occlusion_checked": depth_image is not None 387 | }) 388 | 389 | if visibility_json_path is None: 390 | visibility_json_path = os.path.splitext(output_path)[0] + "_visibility.json" 391 | 392 | with open(visibility_json_path, 'w') as f: 393 | json.dump(visibility_data, f, indent=2) 394 | 395 | return output_path, visibility_json_path 396 | 397 | 398 | def batch_process_images(image_folder, image_chosen, boxes_json, intrinsic_path, meta_file, output_folder, 399 | visibility_folder, depth_folder, pose_folder, depth_scale=1000.0, occlusion_threshold=0.1, draw_picture=False): 400 | """ 401 | Batch process all images in a folder with occlusion detection 402 | 403 | Parameters: 404 | image_folder: folder path containing images 405 | boxes_json: JSON object containing 3D bounding box data 406 | intrinsic_path: file path to camera intrinsic parameters 407 | meta_file: file path to scene metadata 408 | output_folder: output folder path, uses default path if None 409 | visibility_folder: visibility JSON folder path, uses output_folder if None 410 | depth_folder: depth image folder path, no occlusion detection if None 411 | depth_scale: float, scale factor for depth image to convert depth values to meters 412 | occlusion_threshold: float, depth difference threshold in meters for determining point occlusion 413 | 414 | Returns: 415 | processed_images: list of processed image file paths 416 | visibility_jsons: list of visibility JSON file paths 417 | """ 418 | 419 | os.makedirs(output_folder, exist_ok=True) 420 | os.makedirs(visibility_folder, exist_ok=True) 421 | 422 | # Obtain all image files 423 | image_files = 
424 | 425 | 426 | processed_images = [] 427 | visibility_jsons = [] 428 | 429 | for image_file in image_files: 430 | image_path = os.path.join(image_folder, image_file) 431 | 432 | # Reuse the image file name with a .txt extension to look up the corresponding camera pose in the "pose" folder 433 | pose_file = image_file.replace('.jpg', '.txt') 434 | pose_path = os.path.join(pose_folder, pose_file) 435 | 436 | if not os.path.exists(pose_path): 437 | print(f"Pose file does not exist: {pose_path}") 438 | continue 439 | 440 | depth_image_path = None 441 | if depth_folder: 442 | depth_file = image_file.replace('.jpg', '.png') 443 | depth_image_path = os.path.join(depth_folder, depth_file) 444 | if not os.path.exists(depth_image_path): 445 | print(f"Depth image does not exist: {depth_image_path}") 446 | depth_image_path = None 447 | 448 | output_path = os.path.join(output_folder, f"{os.path.splitext(image_file)[0]}_with_boxes.jpg") 449 | visibility_json_path = os.path.join(visibility_folder, f"{os.path.splitext(image_file)[0]}_visibility.json") 450 | 451 | processed_path, vis_json_path = process_image_with_boxes( 452 | image_path, boxes_json, intrinsic_path, pose_path, meta_file, 453 | output_path, visibility_json_path, depth_image_path, depth_scale, occlusion_threshold, draw_picture 454 | ) 455 | 456 | processed_images.append(processed_path) 457 | visibility_jsons.append(vis_json_path) 458 | 459 | 460 | summary_data = { 461 | "scene": os.path.basename(os.path.dirname(image_folder)), 462 | "image_count": len(processed_images), 463 | "depth_images_used": depth_folder is not None, 464 | "occlusion_threshold": occlusion_threshold, 465 | "per_image_visibility": [] 466 | } 467 | 468 | # Aggregate per-image visibility information 469 | for vis_json_path in visibility_jsons: 470 | try: 471 | with open(vis_json_path, 'r') as f: 472 | vis_data = json.load(f) 473 | summary_data["per_image_visibility"].append({ 474 | "image_path": vis_data["image_path"], 475 | "visible_object_count": len(vis_data["visible_objects"]), 476 | 477 | "visible_object_ids": [obj["object_id"] for obj in vis_data["visible_objects"] if obj["visibility_ratio"] > 0.1], 478 | "visible_object_labels": [obj["label"] for obj in vis_data["visible_objects"] if obj["visibility_ratio"] > 0.1], 479 | 480 | "occlusion_checked": any(obj.get("occlusion_checked", False) for obj in vis_data["visible_objects"]) 481 | }) 482 | except Exception as e: 483 | print(f"Error reading visibility file {vis_json_path}: {e}") 484 | 485 | summary_path = os.path.join(visibility_folder, "visibility_summary.json") 486 | 487 | def extract_number(image_path): 488 | match = re.search(r'(\d+)\.jpg', image_path) 489 | if match: 490 | return int(match.group(1)) 491 | return 0 492 | 493 | summary_data['per_image_visibility'] = sorted(summary_data['per_image_visibility'], key=lambda x: extract_number(x["image_path"])) 494 | 495 | with open(summary_path, 'w') as f: 496 | json.dump(summary_data, f, indent=2) 497 | 498 | return processed_images, visibility_jsons 499 | 500 | 501 | def get_3d_box(scene_name, pointcloud_folder, label_map_file): 502 | scan_path = f"{pointcloud_folder}/{scene_name}" 503 | 504 | scan_name = os.path.split(scan_path)[-1] 505 | mesh_file = os.path.join(scan_path, scan_name + "_vh_clean_2.ply") 506 | agg_file = os.path.join(scan_path, scan_name + ".aggregation.json") 507 | seg_file = os.path.join(scan_path, scan_name + "_vh_clean_2.0.010000.segs.json") 508 | meta_file = os.path.join( 509 | scan_path, scan_name + ".txt" 510 | ) # includes axisAlignment info for the
train set scans. 511 | mesh_vertices, label_ids, instance_ids, instance_bboxes, object_id_to_label_id, json_boxes = export( 512 | mesh_file, agg_file, seg_file, meta_file, label_map_file 513 | ) 514 | return json_boxes 515 | 516 | 517 | def process(scene_name, draw_picture=False): 518 | # Original dataset path (modifiable) 519 | scan_path = f"/datasets/scannet/data/scans/{scene_name}" 520 | 521 | # Get all RGB-D images in the scene (modifiable) 522 | image_folder = f"/datasets/scannet/scenes/{scene_name}/mc_frames" 523 | 524 | # Get all point cloud files and label mapping file in the scene 525 | pointcloud_folder = "/datasets/scannet/scans" 526 | label_map_file = "/datasets/scannet/scannetv2-labels.combined.tsv" 527 | 528 | # Output folders (modifiable) 529 | output_folder = f"scannet_metadata/{scene_name}/output_images" # Store rendered images 530 | visibility_folder = f"scannet_metadata/{scene_name}/visibility_data" # Store object information, coordinates, and visibility data for each image 531 | 532 | image_all_files = os.listdir(image_folder) 533 | image_chosen = [file for file in image_all_files if file.lower().endswith('.jpg')] 534 | 535 | # Depth image file path 536 | depth_folder = os.path.join(scan_path, "depth") 537 | # Camera parameter file path 538 | pose_folder = os.path.join(scan_path, "pose") 539 | # Camera intrinsic file path, only RGB-D intrinsic data is used here 540 | intrinsic_path = os.path.join(scan_path, "intrinsic_color.txt") 541 | # For obtaining alignment matrix 542 | meta_file = os.path.join( 543 | pointcloud_folder, scene_name, scene_name + ".txt" 544 | ) 545 | 546 | boxes_json = get_3d_box(scene_name, pointcloud_folder, label_map_file) 547 | 548 | # Set depth image scale factor and occlusion threshold 549 | depth_scale = 1000.0 # Assume depth units are in millimeters, convert to meters 550 | occlusion_threshold = 0.1 # Set depth difference threshold to 10 centimeters 551 | 552 | # Batch process all images 553 | processed_images, visibility_jsons = batch_process_images( 554 | image_folder, image_chosen, boxes_json, intrinsic_path, meta_file, 555 | output_folder, visibility_folder, depth_folder, pose_folder, depth_scale, occlusion_threshold, draw_picture 556 | ) 557 | 558 | 559 | 560 | if __name__ == "__main__": 561 | 562 | # draw_picture boolean value determines whether to draw new images to the output folder (drawing every 100 scenes as shown below) 563 | # Open file and read each line 564 | with open('scannetv2_val.txt', 'r', encoding='utf-8') as file: 565 | lines = file.readlines() 566 | 567 | scenes = [line.strip() for line in lines] 568 | 569 | for i, scene in enumerate(scenes): 570 | draw_picture = False 571 | if i%100 == 0: 572 | draw_picture = True 573 | print(f"Processed {i} scenes") 574 | scene_name = scene 575 | process(scene_name, draw_picture) -------------------------------------------------------------------------------- /data_process/scannet_process/frame_sampling.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def find_continuous_frames(json_data, target_labels): 5 | """ 6 | Find consecutive video frames that collectively contain the specified three objects 7 | 8 | Parameters: 9 | json_data (dict): parsed JSON data 10 | target_labels (list): list of three target object labels 11 | 12 | Returns: 13 | list: list of consecutive frame filenames 14 | """ 15 | # Record visible target objects in each frame 16 | frames_info = [] 17 | for frame in json_data["per_image_visibility"]: 18 | 
visible_targets = set(frame["visible_object_labels"]) & set(target_labels) 19 | frames_info.append({ 20 | "image_path": frame["image_path"], 21 | "visible_targets": visible_targets, 22 | "visible_count": len(visible_targets) 23 | }) 24 | 25 | # Find the frame where target objects first appear 26 | start_index = None 27 | for i, frame in enumerate(frames_info): 28 | if frame["visible_count"] > 0: 29 | start_index = i 30 | break 31 | 32 | if start_index is None: 33 | return [] 34 | 35 | # Starting from the first appearance frame, search for consecutive frame sequences until all target objects are covered 36 | current_index = start_index 37 | found_targets = set() 38 | 39 | while current_index < len(frames_info) and len(found_targets) < len(target_labels): 40 | found_targets.update(frames_info[current_index]["visible_targets"]) 41 | current_index += 1 42 | 43 | if len(found_targets) == len(target_labels) or current_index == len(frames_info): 44 | if len(found_targets) == len(target_labels) and current_index != len(frames_info): 45 | current_index += 1 46 | break 47 | 48 | # Check if all the target objects have been found 49 | if len(found_targets) < len(target_labels): 50 | return [] 51 | 52 | result_frames = [frames_info[i]["image_path"] for i in range(start_index, current_index)] 53 | return result_frames 54 | 55 | def get_full_images(scene_name, target_labels): 56 | 57 | with open(f'scannet_metadata/{scene_name}/visibility_data/visibility_summary.json', 'r') as file: 58 | data = json.load(file) 59 | result = find_continuous_frames(data, target_labels) 60 | return result 61 | -------------------------------------------------------------------------------- /data_process/scannet_process/scannet_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
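# Helper routines for reading ScanNet scene assets: the *_vh_clean_2.ply mesh, the
# .aggregation.json instance groups, the *.segs.json over-segmentation, and the
# scannetv2-labels.combined.tsv mapping from raw category names to nyu40 ids,
# all consumed by export() at the bottom of this file.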
5 | 6 | """ Ref: https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts """ 7 | import os 8 | import sys 9 | import json 10 | import csv 11 | 12 | try: 13 | import numpy as np 14 | except: 15 | print("Failed to import numpy package.") 16 | sys.exit(-1) 17 | 18 | try: 19 | from plyfile import PlyData, PlyElement 20 | except: 21 | print("Please install the module 'plyfile' for PLY i/o, e.g.") 22 | print("pip install plyfile") 23 | sys.exit(-1) 24 | 25 | 26 | def represents_int(s): 27 | """if string s represents an int.""" 28 | try: 29 | int(s) 30 | return True 31 | except ValueError: 32 | return False 33 | 34 | 35 | def read_label_mapping(filename, label_from="raw_category", label_to="nyu40id"): 36 | assert os.path.isfile(filename) 37 | mapping = dict() 38 | with open(filename) as csvfile: 39 | reader = csv.DictReader(csvfile, delimiter="\t") 40 | for row in reader: 41 | mapping[row[label_from]] = int(row[label_to]) 42 | if represents_int(list(mapping.keys())[0]): 43 | mapping = {int(k): v for k, v in mapping.items()} 44 | return mapping 45 | 46 | 47 | def read_mesh_vertices(filename): 48 | """read XYZ for each vertex.""" 49 | assert os.path.isfile(filename) 50 | with open(filename, "rb") as f: 51 | plydata = PlyData.read(f) 52 | num_verts = plydata["vertex"].count 53 | vertices = np.zeros(shape=[num_verts, 3], dtype=np.float32) 54 | vertices[:, 0] = plydata["vertex"].data["x"] 55 | vertices[:, 1] = plydata["vertex"].data["y"] 56 | vertices[:, 2] = plydata["vertex"].data["z"] 57 | return vertices 58 | 59 | 60 | def read_mesh_vertices_rgb(filename): 61 | """read XYZ RGB for each vertex. 62 | Note: RGB values are in 0-255 63 | """ 64 | assert os.path.isfile(filename) 65 | with open(filename, "rb") as f: 66 | plydata = PlyData.read(f) 67 | num_verts = plydata["vertex"].count 68 | vertices = np.zeros(shape=[num_verts, 6], dtype=np.float32) 69 | vertices[:, 0] = plydata["vertex"].data["x"] 70 | vertices[:, 1] = plydata["vertex"].data["y"] 71 | vertices[:, 2] = plydata["vertex"].data["z"] 72 | vertices[:, 3] = plydata["vertex"].data["red"] 73 | vertices[:, 4] = plydata["vertex"].data["green"] 74 | vertices[:, 5] = plydata["vertex"].data["blue"] 75 | return vertices 76 | 77 | 78 | 79 | def read_aggregation(filename): 80 | assert os.path.isfile(filename) 81 | object_id_to_segs = {} 82 | label_to_segs = {} 83 | with open(filename) as f: 84 | data = json.load(f) 85 | num_objects = len(data["segGroups"]) 86 | for i in range(num_objects): 87 | object_id = ( 88 | data["segGroups"][i]["objectId"] + 1 89 | ) # instance ids should be 1-indexed 90 | label = data["segGroups"][i]["label"] 91 | segs = data["segGroups"][i]["segments"] 92 | object_id_to_segs[object_id] = segs 93 | if label in label_to_segs: 94 | label_to_segs[label].extend(segs) 95 | else: 96 | label_to_segs[label] = segs 97 | return object_id_to_segs, label_to_segs 98 | 99 | 100 | def read_segmentation(filename): 101 | assert os.path.isfile(filename) 102 | seg_to_verts = {} 103 | with open(filename) as f: 104 | data = json.load(f) 105 | num_verts = len(data["segIndices"]) 106 | for i in range(num_verts): 107 | seg_id = data["segIndices"][i] 108 | if seg_id in seg_to_verts: 109 | seg_to_verts[seg_id].append(i) 110 | else: 111 | seg_to_verts[seg_id] = [i] 112 | return seg_to_verts, num_verts 113 | 114 | 115 | 116 | 117 | def export(mesh_file, agg_file, seg_file, meta_file, label_map_file, output_file=None, json_file=None): 118 | """points are XYZ RGB (RGB in 0-255), 119 | semantic label as nyu40 ids, 120 | instance label as 1-#instance, 
121 | box as (cx,cy,cz,dx,dy,dz,semantic_label) 122 | """ 123 | label_map = read_label_mapping( 124 | label_map_file, label_from="raw_category", label_to="nyu40id" 125 | ) 126 | mesh_vertices = read_mesh_vertices_rgb(mesh_file) 127 | 128 | # Load scene axis alignment matrix 129 | lines = open(meta_file).readlines() 130 | axis_align_matrix = np.eye(4) 131 | for line in lines: 132 | if "axisAlignment" in line: 133 | axis_align_matrix = np.array([ 134 | float(x) for x in line.rstrip().strip("axisAlignment = ").split(" ") 135 | ]).reshape((4, 4)) 136 | break 137 | 138 | pts = np.ones((mesh_vertices.shape[0], 4)) 139 | pts[:, 0:3] = mesh_vertices[:, 0:3] 140 | pts = np.dot(pts, axis_align_matrix.transpose()) # Nx4 141 | mesh_vertices[:, 0:3] = pts[:, 0:3] 142 | 143 | # Load semantic and instance labels 144 | object_id_to_segs, label_to_segs = read_aggregation(agg_file) 145 | seg_to_verts, num_verts = read_segmentation(seg_file) 146 | label_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated 147 | object_id_to_label_id = {} 148 | 149 | for label, segs in label_to_segs.items(): 150 | label_id = label_map.get(label, 0) 151 | for seg in segs: 152 | verts = seg_to_verts[seg] 153 | label_ids[verts] = label_id 154 | 155 | instance_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated 156 | num_instances = len(np.unique(list(object_id_to_segs.keys()))) 157 | for object_id, segs in object_id_to_segs.items(): 158 | for seg in segs: 159 | verts = seg_to_verts[seg] 160 | instance_ids[verts] = object_id 161 | if object_id not in object_id_to_label_id: 162 | object_id_to_label_id[object_id] = label_ids[verts][0] 163 | 164 | instance_bboxes = np.zeros((num_instances, 7)) 165 | json_boxes = {"boxes": []} 166 | 167 | for obj_id in object_id_to_segs: 168 | label_id = object_id_to_label_id.get(obj_id, 0) 169 | obj_pc = mesh_vertices[instance_ids == obj_id, 0:3] 170 | if len(obj_pc) == 0: 171 | continue 172 | if label_id in [0, 1, 2, 22, 38, 39, 40]: 173 | continue 174 | # Compute axis-aligned bounding box 175 | xmin, ymin, zmin = np.min(obj_pc, axis=0) 176 | xmax, ymax, zmax = np.max(obj_pc, axis=0) 177 | bbox = np.array([ 178 | (xmin + xmax) / 2, 179 | (ymin + ymax) / 2, 180 | (zmin + zmax) / 2, 181 | xmax - xmin, 182 | ymax - ymin, 183 | zmax - zmin, 184 | label_id, 185 | ]) 186 | instance_bboxes[obj_id - 1, :] = bbox 187 | 188 | nyu40_to_category = { 189 | 0: "unlabeled", 1: "wall", 2: "floor", 3: "cabinet", 4: "bed", 190 | 5: "chair", 6: "sofa", 7: "table", 8: "door", 9: "window", 191 | 10: "bookshelf", 11: "picture", 12: "counter", 13: "blinds", 192 | 14: "desk", 15: "shelves", 16: "curtain", 17: "dresser", 193 | 18: "pillow", 19: "mirror", 20: "floor mat", 21: "clothes", 194 | 22: "ceiling", 23: "books", 24: "refrigerator", 25: "television", 195 | 26: "paper", 27: "towel", 28: "shower curtain", 29: "box", 196 | 30: "whiteboard", 31: "person", 32: "nightstand", 33: "toilet", 197 | 34: "sink", 35: "lamp", 36: "bathtub", 37: "bag", 198 | 38: "other structure", 39: "other furniture", 40: "other prop" 199 | } 200 | 201 | json_boxes["boxes"].append({ 202 | "center": bbox[:3].tolist(), 203 | "size": bbox[3:6].tolist(), 204 | "label": nyu40_to_category.get(label_id), 205 | "object_id": int(obj_id) 206 | }) 207 | 208 | if output_file: 209 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 210 | np.save(output_file + "_vert.npy", mesh_vertices) 211 | np.save(output_file + "_sem_label.npy", label_ids) 212 | np.save(output_file + "_ins_label.npy", instance_ids) 213 | 
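# Shapes of the arrays written here (as produced above): *_vert.npy is (N, 6) XYZ+RGB per
# vertex, *_sem_label.npy and *_ins_label.npy are (N,) per-vertex nyu40/instance ids, and
# *_bbox.npy is (num_instances, 7) rows of (cx, cy, cz, dx, dy, dz, nyu40 label id).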
np.save(output_file + "_bbox.npy", instance_bboxes) 214 | 215 | return ( 216 | mesh_vertices, 217 | label_ids, 218 | instance_ids, 219 | instance_bboxes, 220 | object_id_to_label_id, 221 | json_boxes 222 | ) 223 | 224 | -------------------------------------------------------------------------------- /data_process/scannet_process/scannetv2_train.txt: -------------------------------------------------------------------------------- 1 | scene0000_00 2 | scene0000_01 3 | scene0000_02 4 | scene0001_00 5 | scene0001_01 6 | scene0002_00 7 | scene0002_01 8 | scene0003_00 9 | scene0003_01 10 | scene0003_02 11 | scene0004_00 12 | scene0005_00 13 | scene0005_01 14 | scene0006_00 15 | scene0006_01 16 | scene0006_02 17 | scene0007_00 18 | scene0008_00 19 | scene0009_00 20 | scene0009_01 21 | scene0009_02 22 | scene0010_00 23 | scene0010_01 24 | scene0012_00 25 | scene0012_01 26 | scene0012_02 27 | scene0013_00 28 | scene0013_01 29 | scene0013_02 30 | scene0014_00 31 | scene0016_00 32 | scene0016_01 33 | scene0016_02 34 | scene0017_00 35 | scene0017_01 36 | scene0017_02 37 | scene0018_00 38 | scene0020_00 39 | scene0020_01 40 | scene0021_00 41 | scene0022_00 42 | scene0022_01 43 | scene0023_00 44 | scene0024_00 45 | scene0024_01 46 | scene0024_02 47 | scene0026_00 48 | scene0027_00 49 | scene0027_01 50 | scene0027_02 51 | scene0028_00 52 | scene0029_00 53 | scene0029_01 54 | scene0029_02 55 | scene0031_00 56 | scene0031_01 57 | scene0031_02 58 | scene0032_00 59 | scene0032_01 60 | scene0033_00 61 | scene0034_00 62 | scene0034_01 63 | scene0034_02 64 | scene0035_00 65 | scene0035_01 66 | scene0036_00 67 | scene0036_01 68 | scene0037_00 69 | scene0038_00 70 | scene0038_01 71 | scene0038_02 72 | scene0039_00 73 | scene0039_01 74 | scene0040_00 75 | scene0040_01 76 | scene0041_00 77 | scene0041_01 78 | scene0042_00 79 | scene0042_01 80 | scene0042_02 81 | scene0043_00 82 | scene0043_01 83 | scene0044_00 84 | scene0044_01 85 | scene0044_02 86 | scene0045_00 87 | scene0045_01 88 | scene0047_00 89 | scene0048_00 90 | scene0048_01 91 | scene0049_00 92 | scene0051_00 93 | scene0051_01 94 | scene0051_02 95 | scene0051_03 96 | scene0052_00 97 | scene0052_01 98 | scene0052_02 99 | scene0053_00 100 | scene0054_00 101 | scene0055_00 102 | scene0055_01 103 | scene0055_02 104 | scene0056_00 105 | scene0056_01 106 | scene0057_00 107 | scene0057_01 108 | scene0058_00 109 | scene0058_01 110 | scene0059_00 111 | scene0059_01 112 | scene0059_02 113 | scene0060_00 114 | scene0060_01 115 | scene0061_00 116 | scene0061_01 117 | scene0062_00 118 | scene0062_01 119 | scene0062_02 120 | scene0065_00 121 | scene0065_01 122 | scene0065_02 123 | scene0066_00 124 | scene0067_00 125 | scene0067_01 126 | scene0067_02 127 | scene0068_00 128 | scene0068_01 129 | scene0069_00 130 | scene0070_00 131 | scene0071_00 132 | scene0072_00 133 | scene0072_01 134 | scene0072_02 135 | scene0073_00 136 | scene0073_01 137 | scene0073_02 138 | scene0073_03 139 | scene0074_00 140 | scene0074_01 141 | scene0074_02 142 | scene0075_00 143 | scene0076_00 144 | scene0078_00 145 | scene0078_01 146 | scene0078_02 147 | scene0079_00 148 | scene0079_01 149 | scene0080_00 150 | scene0080_01 151 | scene0080_02 152 | scene0082_00 153 | scene0083_00 154 | scene0083_01 155 | scene0085_00 156 | scene0085_01 157 | scene0087_00 158 | scene0087_01 159 | scene0087_02 160 | scene0089_00 161 | scene0089_01 162 | scene0089_02 163 | scene0090_00 164 | scene0091_00 165 | scene0092_00 166 | scene0092_01 167 | scene0092_02 168 | scene0092_03 169 | scene0092_04 170 | scene0093_00 
171 | scene0093_01 172 | scene0093_02 173 | scene0094_00 174 | scene0096_00 175 | scene0096_01 176 | scene0096_02 177 | scene0097_00 178 | scene0098_00 179 | scene0098_01 180 | scene0099_00 181 | scene0099_01 182 | scene0101_00 183 | scene0101_01 184 | scene0101_02 185 | scene0101_03 186 | scene0101_04 187 | scene0101_05 188 | scene0102_00 189 | scene0102_01 190 | scene0103_00 191 | scene0103_01 192 | scene0104_00 193 | scene0105_00 194 | scene0105_01 195 | scene0105_02 196 | scene0106_00 197 | scene0106_01 198 | scene0106_02 199 | scene0107_00 200 | scene0108_00 201 | scene0109_00 202 | scene0109_01 203 | scene0110_00 204 | scene0110_01 205 | scene0110_02 206 | scene0111_00 207 | scene0111_01 208 | scene0111_02 209 | scene0112_00 210 | scene0112_01 211 | scene0112_02 212 | scene0113_00 213 | scene0113_01 214 | scene0114_00 215 | scene0114_01 216 | scene0114_02 217 | scene0115_00 218 | scene0115_01 219 | scene0115_02 220 | scene0116_00 221 | scene0116_01 222 | scene0116_02 223 | scene0117_00 224 | scene0118_00 225 | scene0118_01 226 | scene0118_02 227 | scene0119_00 228 | scene0120_00 229 | scene0120_01 230 | scene0121_00 231 | scene0121_01 232 | scene0121_02 233 | scene0122_00 234 | scene0122_01 235 | scene0123_00 236 | scene0123_01 237 | scene0123_02 238 | scene0124_00 239 | scene0124_01 240 | scene0125_00 241 | scene0126_00 242 | scene0126_01 243 | scene0126_02 244 | scene0127_00 245 | scene0127_01 246 | scene0128_00 247 | scene0129_00 248 | scene0130_00 249 | scene0132_00 250 | scene0132_01 251 | scene0132_02 252 | scene0133_00 253 | scene0134_00 254 | scene0134_01 255 | scene0134_02 256 | scene0135_00 257 | scene0136_00 258 | scene0136_01 259 | scene0136_02 260 | scene0137_00 261 | scene0137_01 262 | scene0137_02 263 | scene0138_00 264 | scene0140_00 265 | scene0140_01 266 | scene0141_00 267 | scene0141_01 268 | scene0141_02 269 | scene0142_00 270 | scene0142_01 271 | scene0143_00 272 | scene0143_01 273 | scene0143_02 274 | scene0145_00 275 | scene0147_00 276 | scene0147_01 277 | scene0148_00 278 | scene0150_00 279 | scene0150_01 280 | scene0150_02 281 | scene0151_00 282 | scene0151_01 283 | scene0152_00 284 | scene0152_01 285 | scene0152_02 286 | scene0154_00 287 | scene0155_00 288 | scene0155_01 289 | scene0155_02 290 | scene0156_00 291 | scene0157_00 292 | scene0157_01 293 | scene0158_00 294 | scene0158_01 295 | scene0158_02 296 | scene0159_00 297 | scene0160_00 298 | scene0160_01 299 | scene0160_02 300 | scene0160_03 301 | scene0160_04 302 | scene0161_00 303 | scene0161_01 304 | scene0161_02 305 | scene0162_00 306 | scene0163_00 307 | scene0163_01 308 | scene0165_00 309 | scene0165_01 310 | scene0165_02 311 | scene0166_00 312 | scene0166_01 313 | scene0166_02 314 | scene0167_00 315 | scene0168_00 316 | scene0168_01 317 | scene0168_02 318 | scene0170_00 319 | scene0170_01 320 | scene0170_02 321 | scene0171_00 322 | scene0171_01 323 | scene0172_00 324 | scene0172_01 325 | scene0173_00 326 | scene0173_01 327 | scene0173_02 328 | scene0174_00 329 | scene0174_01 330 | scene0175_00 331 | scene0176_00 332 | scene0177_00 333 | scene0177_01 334 | scene0177_02 335 | scene0178_00 336 | scene0179_00 337 | scene0180_00 338 | scene0181_00 339 | scene0181_01 340 | scene0181_02 341 | scene0181_03 342 | scene0182_00 343 | scene0182_01 344 | scene0182_02 345 | scene0183_00 346 | scene0184_00 347 | scene0185_00 348 | scene0186_00 349 | scene0186_01 350 | scene0188_00 351 | scene0189_00 352 | scene0190_00 353 | scene0191_00 354 | scene0191_01 355 | scene0191_02 356 | scene0192_00 357 | scene0192_01 
358 | scene0192_02 359 | scene0194_00 360 | scene0195_00 361 | scene0195_01 362 | scene0195_02 363 | scene0197_00 364 | scene0197_01 365 | scene0197_02 366 | scene0198_00 367 | scene0199_00 368 | scene0200_00 369 | scene0200_01 370 | scene0200_02 371 | scene0201_00 372 | scene0201_01 373 | scene0201_02 374 | scene0202_00 375 | scene0204_00 376 | scene0204_01 377 | scene0204_02 378 | scene0205_00 379 | scene0205_01 380 | scene0205_02 381 | scene0206_00 382 | scene0206_01 383 | scene0206_02 384 | scene0209_00 385 | scene0209_01 386 | scene0209_02 387 | scene0210_00 388 | scene0210_01 389 | scene0211_00 390 | scene0211_01 391 | scene0211_02 392 | scene0211_03 393 | scene0212_00 394 | scene0212_01 395 | scene0212_02 396 | scene0213_00 397 | scene0214_00 398 | scene0214_01 399 | scene0214_02 400 | scene0215_00 401 | scene0215_01 402 | scene0216_00 403 | scene0218_00 404 | scene0218_01 405 | scene0219_00 406 | scene0220_00 407 | scene0220_01 408 | scene0220_02 409 | scene0223_00 410 | scene0223_01 411 | scene0223_02 412 | scene0224_00 413 | scene0225_00 414 | scene0226_00 415 | scene0226_01 416 | scene0227_00 417 | scene0228_00 418 | scene0229_00 419 | scene0229_01 420 | scene0229_02 421 | scene0230_00 422 | scene0232_00 423 | scene0232_01 424 | scene0232_02 425 | scene0233_00 426 | scene0233_01 427 | scene0234_00 428 | scene0235_00 429 | scene0236_00 430 | scene0236_01 431 | scene0237_00 432 | scene0237_01 433 | scene0238_00 434 | scene0238_01 435 | scene0239_00 436 | scene0239_01 437 | scene0239_02 438 | scene0240_00 439 | scene0241_00 440 | scene0241_01 441 | scene0241_02 442 | scene0242_00 443 | scene0242_01 444 | scene0242_02 445 | scene0243_00 446 | scene0244_00 447 | scene0244_01 448 | scene0245_00 449 | scene0247_00 450 | scene0247_01 451 | scene0248_00 452 | scene0248_01 453 | scene0248_02 454 | scene0250_00 455 | scene0250_01 456 | scene0250_02 457 | scene0252_00 458 | scene0253_00 459 | scene0254_00 460 | scene0254_01 461 | scene0255_00 462 | scene0255_01 463 | scene0255_02 464 | scene0258_00 465 | scene0259_00 466 | scene0259_01 467 | scene0260_00 468 | scene0260_01 469 | scene0260_02 470 | scene0261_00 471 | scene0261_01 472 | scene0261_02 473 | scene0261_03 474 | scene0262_00 475 | scene0262_01 476 | scene0263_00 477 | scene0263_01 478 | scene0264_00 479 | scene0264_01 480 | scene0264_02 481 | scene0265_00 482 | scene0265_01 483 | scene0265_02 484 | scene0266_00 485 | scene0266_01 486 | scene0267_00 487 | scene0268_00 488 | scene0268_01 489 | scene0268_02 490 | scene0269_00 491 | scene0269_01 492 | scene0269_02 493 | scene0270_00 494 | scene0270_01 495 | scene0270_02 496 | scene0271_00 497 | scene0271_01 498 | scene0272_00 499 | scene0272_01 500 | scene0273_00 501 | scene0273_01 502 | scene0274_00 503 | scene0274_01 504 | scene0274_02 505 | scene0275_00 506 | scene0276_00 507 | scene0276_01 508 | scene0279_00 509 | scene0279_01 510 | scene0279_02 511 | scene0280_00 512 | scene0280_01 513 | scene0280_02 514 | scene0281_00 515 | scene0282_00 516 | scene0282_01 517 | scene0282_02 518 | scene0283_00 519 | scene0284_00 520 | scene0285_00 521 | scene0286_00 522 | scene0286_01 523 | scene0286_02 524 | scene0286_03 525 | scene0287_00 526 | scene0288_00 527 | scene0288_01 528 | scene0288_02 529 | scene0289_00 530 | scene0289_01 531 | scene0290_00 532 | scene0291_00 533 | scene0291_01 534 | scene0291_02 535 | scene0292_00 536 | scene0292_01 537 | scene0293_00 538 | scene0293_01 539 | scene0294_00 540 | scene0294_01 541 | scene0294_02 542 | scene0295_00 543 | scene0295_01 544 | scene0296_00 
545 | scene0296_01 546 | scene0297_00 547 | scene0297_01 548 | scene0297_02 549 | scene0298_00 550 | scene0299_00 551 | scene0299_01 552 | scene0301_00 553 | scene0301_01 554 | scene0301_02 555 | scene0302_00 556 | scene0302_01 557 | scene0303_00 558 | scene0303_01 559 | scene0303_02 560 | scene0305_00 561 | scene0305_01 562 | scene0306_00 563 | scene0306_01 564 | scene0308_00 565 | scene0309_00 566 | scene0309_01 567 | scene0310_00 568 | scene0310_01 569 | scene0310_02 570 | scene0311_00 571 | scene0312_00 572 | scene0312_01 573 | scene0312_02 574 | scene0313_00 575 | scene0313_01 576 | scene0313_02 577 | scene0315_00 578 | scene0317_00 579 | scene0317_01 580 | scene0318_00 581 | scene0319_00 582 | scene0320_00 583 | scene0320_01 584 | scene0320_02 585 | scene0320_03 586 | scene0321_00 587 | scene0322_00 588 | scene0323_00 589 | scene0323_01 590 | scene0324_00 591 | scene0324_01 592 | scene0325_00 593 | scene0325_01 594 | scene0326_00 595 | scene0327_00 596 | scene0330_00 597 | scene0331_00 598 | scene0331_01 599 | scene0332_00 600 | scene0332_01 601 | scene0332_02 602 | scene0333_00 603 | scene0335_00 604 | scene0335_01 605 | scene0335_02 606 | scene0336_00 607 | scene0336_01 608 | scene0337_00 609 | scene0337_01 610 | scene0337_02 611 | scene0339_00 612 | scene0340_00 613 | scene0340_01 614 | scene0340_02 615 | scene0341_00 616 | scene0341_01 617 | scene0344_00 618 | scene0344_01 619 | scene0345_00 620 | scene0345_01 621 | scene0346_00 622 | scene0346_01 623 | scene0347_00 624 | scene0347_01 625 | scene0347_02 626 | scene0348_00 627 | scene0348_01 628 | scene0348_02 629 | scene0349_00 630 | scene0349_01 631 | scene0350_00 632 | scene0350_01 633 | scene0350_02 634 | scene0352_00 635 | scene0352_01 636 | scene0352_02 637 | scene0358_00 638 | scene0358_01 639 | scene0358_02 640 | scene0359_00 641 | scene0359_01 642 | scene0360_00 643 | scene0361_00 644 | scene0361_01 645 | scene0361_02 646 | scene0362_00 647 | scene0362_01 648 | scene0362_02 649 | scene0362_03 650 | scene0363_00 651 | scene0364_00 652 | scene0364_01 653 | scene0365_00 654 | scene0365_01 655 | scene0365_02 656 | scene0366_00 657 | scene0367_00 658 | scene0367_01 659 | scene0368_00 660 | scene0368_01 661 | scene0369_00 662 | scene0369_01 663 | scene0369_02 664 | scene0370_00 665 | scene0370_01 666 | scene0370_02 667 | scene0371_00 668 | scene0371_01 669 | scene0372_00 670 | scene0373_00 671 | scene0373_01 672 | scene0374_00 673 | scene0375_00 674 | scene0375_01 675 | scene0375_02 676 | scene0376_00 677 | scene0376_01 678 | scene0376_02 679 | scene0379_00 680 | scene0380_00 681 | scene0380_01 682 | scene0380_02 683 | scene0381_00 684 | scene0381_01 685 | scene0381_02 686 | scene0383_00 687 | scene0383_01 688 | scene0383_02 689 | scene0384_00 690 | scene0385_00 691 | scene0385_01 692 | scene0385_02 693 | scene0386_00 694 | scene0387_00 695 | scene0387_01 696 | scene0387_02 697 | scene0388_00 698 | scene0388_01 699 | scene0390_00 700 | scene0391_00 701 | scene0392_00 702 | scene0392_01 703 | scene0392_02 704 | scene0393_00 705 | scene0393_01 706 | scene0393_02 707 | scene0394_00 708 | scene0394_01 709 | scene0395_00 710 | scene0395_01 711 | scene0395_02 712 | scene0396_00 713 | scene0396_01 714 | scene0396_02 715 | scene0397_00 716 | scene0397_01 717 | scene0398_00 718 | scene0398_01 719 | scene0399_00 720 | scene0399_01 721 | scene0400_00 722 | scene0400_01 723 | scene0401_00 724 | scene0402_00 725 | scene0403_00 726 | scene0403_01 727 | scene0404_00 728 | scene0404_01 729 | scene0404_02 730 | scene0405_00 731 | scene0407_00 
732 | scene0407_01 733 | scene0408_00 734 | scene0408_01 735 | scene0409_00 736 | scene0409_01 737 | scene0410_00 738 | scene0410_01 739 | scene0411_00 740 | scene0411_01 741 | scene0411_02 742 | scene0413_00 743 | scene0415_00 744 | scene0415_01 745 | scene0415_02 746 | scene0416_00 747 | scene0416_01 748 | scene0416_02 749 | scene0416_03 750 | scene0416_04 751 | scene0417_00 752 | scene0418_00 753 | scene0418_01 754 | scene0418_02 755 | scene0419_00 756 | scene0419_01 757 | scene0419_02 758 | scene0420_00 759 | scene0420_01 760 | scene0420_02 761 | scene0421_00 762 | scene0421_01 763 | scene0421_02 764 | scene0422_00 765 | scene0424_00 766 | scene0424_01 767 | scene0424_02 768 | scene0425_00 769 | scene0425_01 770 | scene0428_00 771 | scene0428_01 772 | scene0429_00 773 | scene0431_00 774 | scene0433_00 775 | scene0434_00 776 | scene0434_01 777 | scene0434_02 778 | scene0436_00 779 | scene0437_00 780 | scene0437_01 781 | scene0438_00 782 | scene0439_00 783 | scene0439_01 784 | scene0440_00 785 | scene0440_01 786 | scene0440_02 787 | scene0442_00 788 | scene0443_00 789 | scene0444_00 790 | scene0444_01 791 | scene0445_00 792 | scene0445_01 793 | scene0446_00 794 | scene0446_01 795 | scene0447_00 796 | scene0447_01 797 | scene0447_02 798 | scene0448_00 799 | scene0448_01 800 | scene0448_02 801 | scene0449_00 802 | scene0449_01 803 | scene0449_02 804 | scene0450_00 805 | scene0451_00 806 | scene0451_01 807 | scene0451_02 808 | scene0451_03 809 | scene0451_04 810 | scene0451_05 811 | scene0452_00 812 | scene0452_01 813 | scene0452_02 814 | scene0453_00 815 | scene0453_01 816 | scene0454_00 817 | scene0455_00 818 | scene0456_00 819 | scene0456_01 820 | scene0457_00 821 | scene0457_01 822 | scene0457_02 823 | scene0459_00 824 | scene0459_01 825 | scene0460_00 826 | scene0463_00 827 | scene0463_01 828 | scene0464_00 829 | scene0465_00 830 | scene0465_01 831 | scene0466_00 832 | scene0466_01 833 | scene0467_00 834 | scene0468_00 835 | scene0468_01 836 | scene0468_02 837 | scene0469_00 838 | scene0469_01 839 | scene0469_02 840 | scene0470_00 841 | scene0470_01 842 | scene0471_00 843 | scene0471_01 844 | scene0471_02 845 | scene0472_00 846 | scene0472_01 847 | scene0472_02 848 | scene0473_00 849 | scene0473_01 850 | scene0475_00 851 | scene0475_01 852 | scene0475_02 853 | scene0476_00 854 | scene0476_01 855 | scene0476_02 856 | scene0477_00 857 | scene0477_01 858 | scene0478_00 859 | scene0478_01 860 | scene0479_00 861 | scene0479_01 862 | scene0479_02 863 | scene0480_00 864 | scene0480_01 865 | scene0481_00 866 | scene0481_01 867 | scene0482_00 868 | scene0482_01 869 | scene0483_00 870 | scene0484_00 871 | scene0484_01 872 | scene0485_00 873 | scene0486_00 874 | scene0487_00 875 | scene0487_01 876 | scene0489_00 877 | scene0489_01 878 | scene0489_02 879 | scene0491_00 880 | scene0492_00 881 | scene0492_01 882 | scene0493_00 883 | scene0493_01 884 | scene0495_00 885 | scene0497_00 886 | scene0498_00 887 | scene0498_01 888 | scene0498_02 889 | scene0499_00 890 | scene0501_00 891 | scene0501_01 892 | scene0501_02 893 | scene0502_00 894 | scene0502_01 895 | scene0502_02 896 | scene0503_00 897 | scene0504_00 898 | scene0505_00 899 | scene0505_01 900 | scene0505_02 901 | scene0505_03 902 | scene0505_04 903 | scene0506_00 904 | scene0507_00 905 | scene0508_00 906 | scene0508_01 907 | scene0508_02 908 | scene0509_00 909 | scene0509_01 910 | scene0509_02 911 | scene0510_00 912 | scene0510_01 913 | scene0510_02 914 | scene0511_00 915 | scene0511_01 916 | scene0512_00 917 | scene0513_00 918 | scene0514_00 
919 | scene0514_01 920 | scene0515_00 921 | scene0515_01 922 | scene0515_02 923 | scene0516_00 924 | scene0516_01 925 | scene0517_00 926 | scene0517_01 927 | scene0517_02 928 | scene0519_00 929 | scene0520_00 930 | scene0520_01 931 | scene0521_00 932 | scene0522_00 933 | scene0523_00 934 | scene0523_01 935 | scene0523_02 936 | scene0524_00 937 | scene0524_01 938 | scene0525_00 939 | scene0525_01 940 | scene0525_02 941 | scene0526_00 942 | scene0526_01 943 | scene0528_00 944 | scene0528_01 945 | scene0529_00 946 | scene0529_01 947 | scene0529_02 948 | scene0530_00 949 | scene0531_00 950 | scene0532_00 951 | scene0532_01 952 | scene0533_00 953 | scene0533_01 954 | scene0534_00 955 | scene0534_01 956 | scene0536_00 957 | scene0536_01 958 | scene0536_02 959 | scene0537_00 960 | scene0538_00 961 | scene0539_00 962 | scene0539_01 963 | scene0539_02 964 | scene0540_00 965 | scene0540_01 966 | scene0540_02 967 | scene0541_00 968 | scene0541_01 969 | scene0541_02 970 | scene0542_00 971 | scene0543_00 972 | scene0543_01 973 | scene0543_02 974 | scene0544_00 975 | scene0545_00 976 | scene0545_01 977 | scene0545_02 978 | scene0546_00 979 | scene0547_00 980 | scene0547_01 981 | scene0547_02 982 | scene0548_00 983 | scene0548_01 984 | scene0548_02 985 | scene0551_00 986 | scene0554_00 987 | scene0554_01 988 | scene0555_00 989 | scene0556_00 990 | scene0556_01 991 | scene0557_00 992 | scene0557_01 993 | scene0557_02 994 | scene0560_00 995 | scene0561_00 996 | scene0561_01 997 | scene0562_00 998 | scene0563_00 999 | scene0564_00 1000 | scene0566_00 1001 | scene0567_00 1002 | scene0567_01 1003 | scene0569_00 1004 | scene0569_01 1005 | scene0570_00 1006 | scene0570_01 1007 | scene0570_02 1008 | scene0571_00 1009 | scene0571_01 1010 | scene0572_00 1011 | scene0572_01 1012 | scene0572_02 1013 | scene0573_00 1014 | scene0573_01 1015 | scene0576_00 1016 | scene0576_01 1017 | scene0576_02 1018 | scene0577_00 1019 | scene0579_00 1020 | scene0579_01 1021 | scene0579_02 1022 | scene0581_00 1023 | scene0581_01 1024 | scene0581_02 1025 | scene0582_00 1026 | scene0582_01 1027 | scene0582_02 1028 | scene0584_00 1029 | scene0584_01 1030 | scene0584_02 1031 | scene0585_00 1032 | scene0585_01 1033 | scene0586_00 1034 | scene0586_01 1035 | scene0586_02 1036 | scene0587_00 1037 | scene0587_01 1038 | scene0587_02 1039 | scene0587_03 1040 | scene0588_00 1041 | scene0588_01 1042 | scene0588_02 1043 | scene0588_03 1044 | scene0589_00 1045 | scene0589_01 1046 | scene0589_02 1047 | scene0590_00 1048 | scene0590_01 1049 | scene0592_00 1050 | scene0592_01 1051 | scene0594_00 1052 | scene0596_00 1053 | scene0596_01 1054 | scene0596_02 1055 | scene0597_00 1056 | scene0597_01 1057 | scene0597_02 1058 | scene0600_00 1059 | scene0600_01 1060 | scene0600_02 1061 | scene0601_00 1062 | scene0601_01 1063 | scene0602_00 1064 | scene0603_00 1065 | scene0603_01 1066 | scene0604_00 1067 | scene0604_01 1068 | scene0604_02 1069 | scene0605_00 1070 | scene0605_01 1071 | scene0610_00 1072 | scene0610_01 1073 | scene0610_02 1074 | scene0611_00 1075 | scene0611_01 1076 | scene0612_00 1077 | scene0612_01 1078 | scene0613_00 1079 | scene0613_01 1080 | scene0613_02 1081 | scene0614_00 1082 | scene0614_01 1083 | scene0614_02 1084 | scene0615_00 1085 | scene0615_01 1086 | scene0617_00 1087 | scene0619_00 1088 | scene0620_00 1089 | scene0620_01 1090 | scene0622_00 1091 | scene0622_01 1092 | scene0623_00 1093 | scene0623_01 1094 | scene0624_00 1095 | scene0625_00 1096 | scene0625_01 1097 | scene0626_00 1098 | scene0626_01 1099 | scene0626_02 1100 | 
scene0627_00 1101 | scene0627_01 1102 | scene0628_00 1103 | scene0628_01 1104 | scene0628_02 1105 | scene0630_00 1106 | scene0630_01 1107 | scene0630_02 1108 | scene0630_03 1109 | scene0630_04 1110 | scene0630_05 1111 | scene0630_06 1112 | scene0631_00 1113 | scene0631_01 1114 | scene0631_02 1115 | scene0632_00 1116 | scene0634_00 1117 | scene0635_00 1118 | scene0635_01 1119 | scene0636_00 1120 | scene0637_00 1121 | scene0638_00 1122 | scene0639_00 1123 | scene0640_00 1124 | scene0640_01 1125 | scene0640_02 1126 | scene0641_00 1127 | scene0642_00 1128 | scene0642_01 1129 | scene0642_02 1130 | scene0642_03 1131 | scene0646_00 1132 | scene0646_01 1133 | scene0646_02 1134 | scene0649_00 1135 | scene0649_01 1136 | scene0650_00 1137 | scene0654_00 1138 | scene0654_01 1139 | scene0656_00 1140 | scene0656_01 1141 | scene0656_02 1142 | scene0656_03 1143 | scene0657_00 1144 | scene0659_00 1145 | scene0659_01 1146 | scene0661_00 1147 | scene0662_00 1148 | scene0662_01 1149 | scene0662_02 1150 | scene0666_00 1151 | scene0666_01 1152 | scene0666_02 1153 | scene0667_00 1154 | scene0667_01 1155 | scene0667_02 1156 | scene0668_00 1157 | scene0669_00 1158 | scene0669_01 1159 | scene0672_00 1160 | scene0672_01 1161 | scene0673_00 1162 | scene0673_01 1163 | scene0673_02 1164 | scene0673_03 1165 | scene0673_04 1166 | scene0673_05 1167 | scene0674_00 1168 | scene0674_01 1169 | scene0675_00 1170 | scene0675_01 1171 | scene0676_00 1172 | scene0676_01 1173 | scene0677_00 1174 | scene0677_01 1175 | scene0677_02 1176 | scene0679_00 1177 | scene0679_01 1178 | scene0680_00 1179 | scene0680_01 1180 | scene0681_00 1181 | scene0682_00 1182 | scene0683_00 1183 | scene0687_00 1184 | scene0688_00 1185 | scene0691_00 1186 | scene0691_01 1187 | scene0692_00 1188 | scene0692_01 1189 | scene0692_02 1190 | scene0692_03 1191 | scene0692_04 1192 | scene0694_00 1193 | scene0694_01 1194 | scene0698_00 1195 | scene0698_01 1196 | scene0703_00 1197 | scene0703_01 1198 | scene0705_00 1199 | scene0705_01 1200 | scene0705_02 1201 | scene0706_00 -------------------------------------------------------------------------------- /data_process/scannet_process/scannetv2_val.txt: -------------------------------------------------------------------------------- 1 | scene0011_00 2 | scene0011_01 3 | scene0015_00 4 | scene0019_00 5 | scene0019_01 6 | scene0025_00 7 | scene0025_01 8 | scene0025_02 9 | scene0030_00 10 | scene0030_01 11 | scene0030_02 12 | scene0046_00 13 | scene0046_01 14 | scene0046_02 15 | scene0050_00 16 | scene0050_01 17 | scene0050_02 18 | scene0063_00 19 | scene0064_00 20 | scene0064_01 21 | scene0077_00 22 | scene0077_01 23 | scene0081_00 24 | scene0081_01 25 | scene0081_02 26 | scene0084_00 27 | scene0084_01 28 | scene0084_02 29 | scene0086_00 30 | scene0086_01 31 | scene0086_02 32 | scene0088_00 33 | scene0088_01 34 | scene0088_02 35 | scene0088_03 36 | scene0095_00 37 | scene0095_01 38 | scene0100_00 39 | scene0100_01 40 | scene0100_02 41 | scene0131_00 42 | scene0131_01 43 | scene0131_02 44 | scene0139_00 45 | scene0144_00 46 | scene0144_01 47 | scene0146_00 48 | scene0146_01 49 | scene0146_02 50 | scene0149_00 51 | scene0153_00 52 | scene0153_01 53 | scene0164_00 54 | scene0164_01 55 | scene0164_02 56 | scene0164_03 57 | scene0169_00 58 | scene0169_01 59 | scene0187_00 60 | scene0187_01 61 | scene0193_00 62 | scene0193_01 63 | scene0196_00 64 | scene0203_00 65 | scene0203_01 66 | scene0203_02 67 | scene0207_00 68 | scene0207_01 69 | scene0207_02 70 | scene0208_00 71 | scene0217_00 72 | scene0221_00 73 | scene0221_01 74 | 
scene0222_00 75 | scene0222_01 76 | scene0231_00 77 | scene0231_01 78 | scene0231_02 79 | scene0246_00 80 | scene0249_00 81 | scene0251_00 82 | scene0256_00 83 | scene0256_01 84 | scene0256_02 85 | scene0257_00 86 | scene0277_00 87 | scene0277_01 88 | scene0277_02 89 | scene0278_00 90 | scene0278_01 91 | scene0300_00 92 | scene0300_01 93 | scene0304_00 94 | scene0307_00 95 | scene0307_01 96 | scene0307_02 97 | scene0314_00 98 | scene0316_00 99 | scene0328_00 100 | scene0329_00 101 | scene0329_01 102 | scene0329_02 103 | scene0334_00 104 | scene0334_01 105 | scene0334_02 106 | scene0338_00 107 | scene0338_01 108 | scene0338_02 109 | scene0342_00 110 | scene0343_00 111 | scene0351_00 112 | scene0351_01 113 | scene0353_00 114 | scene0353_01 115 | scene0353_02 116 | scene0354_00 117 | scene0355_00 118 | scene0355_01 119 | scene0356_00 120 | scene0356_01 121 | scene0356_02 122 | scene0357_00 123 | scene0357_01 124 | scene0377_00 125 | scene0377_01 126 | scene0377_02 127 | scene0378_00 128 | scene0378_01 129 | scene0378_02 130 | scene0382_00 131 | scene0382_01 132 | scene0389_00 133 | scene0406_00 134 | scene0406_01 135 | scene0406_02 136 | scene0412_00 137 | scene0412_01 138 | scene0414_00 139 | scene0423_00 140 | scene0423_01 141 | scene0423_02 142 | scene0426_00 143 | scene0426_01 144 | scene0426_02 145 | scene0426_03 146 | scene0427_00 147 | scene0430_00 148 | scene0430_01 149 | scene0432_00 150 | scene0432_01 151 | scene0435_00 152 | scene0435_01 153 | scene0435_02 154 | scene0435_03 155 | scene0441_00 156 | scene0458_00 157 | scene0458_01 158 | scene0461_00 159 | scene0462_00 160 | scene0474_00 161 | scene0474_01 162 | scene0474_02 163 | scene0474_03 164 | scene0474_04 165 | scene0474_05 166 | scene0488_00 167 | scene0488_01 168 | scene0490_00 169 | scene0494_00 170 | scene0496_00 171 | scene0500_00 172 | scene0500_01 173 | scene0518_00 174 | scene0527_00 175 | scene0535_00 176 | scene0549_00 177 | scene0549_01 178 | scene0550_00 179 | scene0552_00 180 | scene0552_01 181 | scene0553_00 182 | scene0553_01 183 | scene0553_02 184 | scene0558_00 185 | scene0558_01 186 | scene0558_02 187 | scene0559_00 188 | scene0559_01 189 | scene0559_02 190 | scene0565_00 191 | scene0568_00 192 | scene0568_01 193 | scene0568_02 194 | scene0574_00 195 | scene0574_01 196 | scene0574_02 197 | scene0575_00 198 | scene0575_01 199 | scene0575_02 200 | scene0578_00 201 | scene0578_01 202 | scene0578_02 203 | scene0580_00 204 | scene0580_01 205 | scene0583_00 206 | scene0583_01 207 | scene0583_02 208 | scene0591_00 209 | scene0591_01 210 | scene0591_02 211 | scene0593_00 212 | scene0593_01 213 | scene0595_00 214 | scene0598_00 215 | scene0598_01 216 | scene0598_02 217 | scene0599_00 218 | scene0599_01 219 | scene0599_02 220 | scene0606_00 221 | scene0606_01 222 | scene0606_02 223 | scene0607_00 224 | scene0607_01 225 | scene0608_00 226 | scene0608_01 227 | scene0608_02 228 | scene0609_00 229 | scene0609_01 230 | scene0609_02 231 | scene0609_03 232 | scene0616_00 233 | scene0616_01 234 | scene0618_00 235 | scene0621_00 236 | scene0629_00 237 | scene0629_01 238 | scene0629_02 239 | scene0633_00 240 | scene0633_01 241 | scene0643_00 242 | scene0644_00 243 | scene0645_00 244 | scene0645_01 245 | scene0645_02 246 | scene0647_00 247 | scene0647_01 248 | scene0648_00 249 | scene0648_01 250 | scene0651_00 251 | scene0651_01 252 | scene0651_02 253 | scene0652_00 254 | scene0653_00 255 | scene0653_01 256 | scene0655_00 257 | scene0655_01 258 | scene0655_02 259 | scene0658_00 260 | scene0660_00 261 | scene0663_00 262 | 
scene0663_01 263 | scene0663_02 264 | scene0664_00 265 | scene0664_01 266 | scene0664_02 267 | scene0665_00 268 | scene0665_01 269 | scene0670_00 270 | scene0670_01 271 | scene0671_00 272 | scene0671_01 273 | scene0678_00 274 | scene0678_01 275 | scene0678_02 276 | scene0684_00 277 | scene0684_01 278 | scene0685_00 279 | scene0685_01 280 | scene0685_02 281 | scene0686_00 282 | scene0686_01 283 | scene0686_02 284 | scene0689_00 285 | scene0690_00 286 | scene0690_01 287 | scene0693_00 288 | scene0693_01 289 | scene0693_02 290 | scene0695_00 291 | scene0695_01 292 | scene0695_02 293 | scene0695_03 294 | scene0696_00 295 | scene0696_01 296 | scene0696_02 297 | scene0697_00 298 | scene0697_01 299 | scene0697_02 300 | scene0697_03 301 | scene0699_00 302 | scene0700_00 303 | scene0700_01 304 | scene0700_02 305 | scene0701_00 306 | scene0701_01 307 | scene0701_02 308 | scene0702_00 309 | scene0702_01 310 | scene0702_02 311 | scene0704_00 312 | scene0704_01 -------------------------------------------------------------------------------- /docs/flat_patternmaking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/flat_patternmaking.png -------------------------------------------------------------------------------- /docs/icon/avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/icon/avatar.png -------------------------------------------------------------------------------- /docs/icon/avatar1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/icon/avatar1.png -------------------------------------------------------------------------------- /docs/main_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/main_result.png -------------------------------------------------------------------------------- /docs/pipeline_and_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/pipeline_and_case.png -------------------------------------------------------------------------------- /docs/pipline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/pipline.png -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import os, re, csv, json, torch, base64 2 | import random, argparse 3 | import numpy as np 4 | from PIL import Image 5 | from random import seed 6 | from openai import OpenAI 7 | from tqdm.auto import tqdm 8 | from collections import defaultdict 9 | from transformers import AutoModelForCausalLM, AutoProcessor 10 | # Llama-3.2-11B-Vision 11 | from transformers import MllamaForConditionalGeneration 12 | # Qwen2-VL 13 | from transformers import Qwen2VLForConditionalGeneration 14 | # Qwen2.5-VL 15 | from qwen_vl_utils import 
process_vision_info 16 | from transformers import Qwen2_5_VLForConditionalGeneration 17 | # LlavaOnevision 18 | from transformers import LlavaOnevisionForConditionalGeneration 19 | # Intern2.5/3 20 | from lmdeploy.vl import load_image 21 | from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig 22 | # LlavaNextVideo 23 | from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration 24 | 25 | 26 | seed(1234) 27 | np.random.seed(1234) 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--model_path", type=str, default="gpt-4o") 31 | # parser.add_argument("--device", type=int, default=-1) 32 | args = parser.parse_args() 33 | 34 | model_path = args.model_path 35 | model_name = model_path.split("/")[-1] 36 | # device = torch.device(f"cuda:{args.device}" if args.device >= 0 else "cpu") 37 | prompt_format = "\nReply only to the corresponding option.\nAnswer:" 38 | 39 | # Set the size of the incoming image for qwen 40 | min_pixels = 256*28*28 41 | max_pixels = 1280*28*28 42 | 43 | 44 | # Set up the model 45 | if model_name == 'gemini-2.0-flash-001': 46 | API_KEY = "" # your api key 47 | base_url = "" # Change to your own base_url 48 | client = OpenAI(api_key=API_KEY, base_url=base_url) 49 | print(f"Model gemini-2.0-flash series:{model_name} is running!") 50 | 51 | elif model_name == 'gpt-4o': 52 | client = OpenAI(api_key="") # your api key 53 | client.base_url = "" # Change to your own base_url 54 | print(f"Model gpt-4o series:{model_name} is running!") 55 | 56 | elif "Qwen2.5-VL" in model_name: 57 | model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 58 | model_path, torch_dtype="auto", device_map="auto" 59 | ) 60 | processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels) 61 | print(f"Model Qwen2.5-VL series:{model_name} is running!") 62 | 63 | elif "Qwen2-VL" in model_name : 64 | model = Qwen2VLForConditionalGeneration.from_pretrained( 65 | model_path, torch_dtype="auto", device_map="auto" 66 | ) 67 | processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels) 68 | print(f"Model Qwen2-VL series:{model_name} is running!") 69 | 70 | elif "InternVL2_5" in model_name: 71 | model = model_path 72 | pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=1000000)) 73 | print(f"Model InternVL2_5 series:{model_name} is running!") 74 | 75 | elif "LLaVA-NeXT" in model_name: 76 | model = LlavaNextVideoForConditionalGeneration.from_pretrained( 77 | model_path, 78 | torch_dtype=torch.float16, 79 | low_cpu_mem_usage=True, 80 | device_map="auto" 81 | ) 82 | processor = LlavaNextVideoProcessor.from_pretrained(model_path) 83 | print(f"Model LLaVA-NeXT series:{model_name} is running!") 84 | 85 | elif model_name == "llava-onevision-qwen2-7b-ov-hf": 86 | model = LlavaOnevisionForConditionalGeneration.from_pretrained( 87 | model_path, 88 | torch_dtype=torch.float16, 89 | low_cpu_mem_usage=True, 90 | device_map="auto" 91 | ) 92 | processor = AutoProcessor.from_pretrained(model_path) 93 | print(f"Model llava-onevision series:{model_name} is running!") 94 | 95 | elif model_name == "Llama-3.2-11B-Vision-Instruct": 96 | model = MllamaForConditionalGeneration.from_pretrained( 97 | model_path, 98 | torch_dtype=torch.bfloat16, 99 | device_map="auto", 100 | ) 101 | processor = AutoProcessor.from_pretrained(model_path) 102 | print(f"Model Llama-3.2-11B-Vision series:{model_name} is running!") 103 | 104 | elif model_name == "Kimi-VL-A3B-Instruct": 105 | model = 
AutoModelForCausalLM.from_pretrained( 106 | model_path, 107 | torch_dtype=torch.bfloat16, 108 | device_map="auto", 109 | trust_remote_code=True, 110 | ) 111 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 112 | print(f"Model Kimi-VL series:{model_name} is running!") 113 | 114 | elif model_name == "InternVL3-14B": 115 | model = model_path 116 | pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=1000000,tp=1), chat_template_config=ChatTemplateConfig(model_name='internvl2_5')) 117 | print(f"Model InternVL3 series:{model_name} is running!") 118 | elif model_name == "random": 119 | model = None 120 | processor = None 121 | else: 122 | model = None 123 | processor = None 124 | 125 | def extract_option(text): 126 | match = re.search(r"\b([A-D])\b", text, re.IGNORECASE) 127 | return match.group(1).upper() if match else None 128 | 129 | def url_to_base64(url): 130 | if os.path.exists(url): 131 | with open(url, "rb") as f: 132 | return "data:image/jpeg;base64," + base64.b64encode(f.read()).decode("utf-8") 133 | else: 134 | print(f"该图片{url}不存在!") 135 | return False 136 | 137 | 138 | 139 | def get_output(image_path, question): 140 | image_url = [url_to_base64(image) for image in image_path] 141 | 142 | if model_name == 'gemini-2.0-flash-001': 143 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 144 | 145 | chat_completion = client.chat.completions.create( 146 | model='google/gemini-2.0-flash-001', 147 | messages=[ 148 | { 149 | "role": "user", 150 | "content": [ 151 | { 152 | "type": "text", 153 | "text": question 154 | }, 155 | *content 156 | ] 157 | } 158 | ] 159 | ) 160 | pred = chat_completion.choices[0].message.content 161 | 162 | elif model_name == 'gpt-4o': 163 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 164 | 165 | chat_completion = client.chat.completions.create( 166 | model="gpt-4o", 167 | messages=[ 168 | { 169 | "role": "user", 170 | "content": [ 171 | { 172 | "type": "text", 173 | "text": question 174 | }, 175 | *content 176 | ] 177 | } 178 | ] 179 | ) 180 | pred = chat_completion.choices[0].message.content 181 | 182 | 183 | elif "Qwen2.5-VL" in model_name: 184 | content = [{"type": "image", "image": path,"resized_height": 280,"resized_width": 420} for path in image_path] 185 | 186 | messages = [ 187 | { 188 | "role": "user", 189 | "content": [ 190 | *content, 191 | { 192 | "type": "text", 193 | "text": question 194 | }, 195 | ], 196 | } 197 | ] 198 | text = processor.apply_chat_template( 199 | messages, tokenize=False, add_generation_prompt=True 200 | ) 201 | image_inputs, video_inputs = process_vision_info(messages) 202 | inputs = processor( 203 | text=[text], 204 | images=image_inputs, 205 | videos=video_inputs, 206 | padding=True, 207 | return_tensors="pt", 208 | ) 209 | inputs = inputs.to("cuda") 210 | generated_ids = model.generate(**inputs, max_new_tokens=128) 211 | generated_ids_trimmed = [ 212 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 213 | ] 214 | output_text = processor.batch_decode( 215 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 216 | ) 217 | torch.cuda.empty_cache() 218 | torch.cuda.ipc_collect() 219 | pred = str(output_text[0]) 220 | 221 | elif "LLaVA-NeXT" in model_name: 222 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 223 | 224 | conversation = [ 225 | { 226 | "role": "user", 227 | "content": [ 228 | {"type": "text", 
"text": question}, 229 | *content 230 | ], 231 | }, 232 | ] 233 | inputs = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True, 234 | return_dict=True, padding=True, return_tensors="pt").to("cuda") 235 | generate_ids = model.generate(**inputs, max_new_tokens=100, eos_token_id=2, pad_token_id=2) 236 | pred = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0] 237 | match = re.search(r'ASSISTANT:\s*(.*)', pred, re.DOTALL) 238 | pred = match.group(1) 239 | 240 | elif "InternVL2_5" in model_name: 241 | images = [load_image(image) for image in image_url] 242 | formatted_lines = '' 243 | for i, item in enumerate(images, start=1): 244 | formatted_lines = formatted_lines + "Image-" + str(i) + ": {IMAGE_TOKEN}\n" 245 | response = pipe((f'{formatted_lines}{question}', images)) 246 | pred = response.text 247 | 248 | elif model_name == "llava-onevision-qwen2-7b-ov-hf": 249 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 250 | 251 | conversation = [ 252 | { 253 | "role": "user", 254 | "content": [ 255 | {"type": "text", "text": question}, 256 | *content 257 | ], 258 | }, 259 | ] 260 | inputs = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True, 261 | return_dict=True, padding=True, return_tensors="pt").to("cuda") 262 | generate_ids = model.generate(**inputs, max_new_tokens=100) 263 | pred = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0] 264 | 265 | 266 | elif model_name == "Llama-3.2-11B-Vision-Instruct": 267 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 268 | 269 | conversation = [ 270 | { 271 | "role": "user", 272 | "content": [ 273 | {"type": "text", "text": question}, 274 | *content 275 | ], 276 | }, 277 | ] 278 | inputs = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True, 279 | return_dict=True, padding=True, return_tensors="pt").to("cuda") 280 | generate_ids = model.generate(**inputs, max_new_tokens=100) 281 | pred = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 282 | 283 | elif model_name == "Kimi-VL-A3B-Instruct": 284 | images_ = [Image.open(path) for path in image_path] 285 | images = [path.resize((path.width // 4, path.height // 4), Image.Resampling.LANCZOS) for path in images_] 286 | content = [{"type": "image", "image": path} for path in images] 287 | messages = [ 288 | { 289 | "role": "user", 290 | "content": [ {"type": "text","text": question},*content] 291 | }, 292 | ] 293 | text = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") 294 | inputs = processor(images=images, text=text, return_tensors="pt", padding=True, truncation=True).to("cuda") 295 | generated_ids = model.generate(**inputs, max_new_tokens=2048) 296 | generated_ids_trimmed = [ 297 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 298 | ] 299 | pred = processor.batch_decode( 300 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 301 | )[0] 302 | 303 | elif model_name == "InternVL3-14B": 304 | images = [load_image(image) for image in image_url] 305 | formatted_lines = '' 306 | for i, item in enumerate(images, start=1): 307 | formatted_lines = formatted_lines + "Image-" + str(i) + ": {IMAGE_TOKEN}\n" 308 | response = pipe((f'{formatted_lines}{question}', images)) 309 | pred = 
response.text 310 | 311 | else: 312 | pred = '' 313 | 314 | return pred 315 | 316 | def evaluate_vlm(benchmark_file): 317 | with open(benchmark_file, "r", encoding="utf-8") as f: 318 | benchmark_data = json.load(f) 319 | 320 | stats = defaultdict(lambda: {"correct": 0, "total": 0}) 321 | total_correct = 0 322 | total_questions = 0 323 | 324 | output_path = f"result/{model_name}" 325 | if not os.path.exists(output_path): 326 | os.makedirs(output_path) 327 | 328 | result_file = f"{output_path}/result_{model_name}.csv" 329 | with open(result_file, "w", newline="", encoding="utf-8") as csvfile: 330 | writer = csv.writer(csvfile) 331 | writer.writerow(["ID", "Question", "Question_Type", "Predicted Answer", "Correct Answer", "IsCorrect"]) 332 | 333 | for i, item in enumerate(tqdm(benchmark_data)): 334 | try: 335 | image_path = item['image_path'] 336 | question = item["question"] + prompt_format 337 | correct_answer = item["answer"] 338 | question_type = item["question_type"] 339 | stats[question_type]["total"] += 1 340 | total_questions += 1 341 | 342 | predicted_answer = get_output(image_path, question) 343 | predicted_answer_ = predicted_answer.split("\n")[-1] 344 | is_correct = extract_option(predicted_answer_) == extract_option(correct_answer) 345 | 346 | if is_correct: 347 | stats[question_type]["correct"] += 1 348 | total_correct += 1 349 | writer.writerow([i, question, question_type, predicted_answer, correct_answer, is_correct]) 350 | except Exception as e: 351 | print(f"Error on item {i}: {e}") 352 | continue 353 | 354 | print("Benchmark Evaluation Results:") 355 | print("----------------------------------------------------------") 356 | for qtype, values in stats.items(): 357 | correct = values["correct"] 358 | total = values["total"] 359 | accuracy = correct / total 360 | print(f"{qtype}: {correct}/{total} = {accuracy:.2%}") 361 | overall_accuracy = total_correct / total_questions 362 | print("----------------------------------------------------------") 363 | print(f"The accuracy rate of {model_name} on the benchmark test set: {overall_accuracy:.2%} Correct quantity:{total_correct} Total quantity:{total_questions}") 364 | print("----------------------------------------------------------") 365 | print(f"The result has been saved to {result_file}") 366 | 367 | if __name__ == '__main__': 368 | benchmark_file = "eval/ViewSpatial-Bench.json" 369 | evaluate_vlm(benchmark_file) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | inference~=0.50.1 2 | lmdeploy~=0.7.3 3 | numpy~=2.2.6 4 | openai~=1.82.0 5 | opencv_python~=4.10.0.84 6 | Pillow~=11.2.1 7 | plyfile~=1.1 8 | pycocotools~=2.0.4 9 | qwen_vl_utils~=0.0.11 10 | Requests~=2.32.3 11 | torch~=2.5.1 12 | tqdm~=4.66.6 13 | transformers~=4.51.1 --------------------------------------------------------------------------------
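A minimal invocation sketch for the evaluation script above, assuming the image paths referenced in eval/ViewSpatial-Bench.json are available locally and that the checkpoint path (Qwen/Qwen2.5-VL-7B-Instruct is only an example) matches one of the model branches in evaluate.py:

    pip install -r requirements.txt
    python evaluate.py --model_path Qwen/Qwen2.5-VL-7B-Instruct
    # per-question predictions are written to result/<model_name>/result_<model_name>.csv;
    # per-question-type and overall accuracy are printed to stdout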