├── LICENSE ├── README.md ├── ViewSpatial-Bench └── readme.md ├── data_process ├── coco_process │ ├── coco_single_life_object_filtered_by_area .json │ ├── get_person_by_area.py │ └── head2body_orientation_data.py └── scannet_process │ ├── Sce_Sim_make.py │ ├── bbox3d_project.py │ ├── frame_sampling.py │ ├── scannet_utils.py │ ├── scannetv2_train.txt │ └── scannetv2_val.txt ├── docs ├── flat_patternmaking.png ├── icon │ ├── avatar.png │ └── avatar1.png ├── main_result.png ├── pipeline_and_case.png └── pipline.png ├── eval └── ViewSpatial-Bench.json ├── evaluate.py └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

ViewSpatial-Bench: Evaluating Multi-perspective Spatial Localization in Vision-Language Models

2 | 3 |
4 | [arXiv](https://arxiv.org/abs/2505.21500) · [ViewSpatial-Bench (Hugging Face)](https://huggingface.co/datasets/lidingm/ViewSpatial-Bench) · Webpage 13 |
14 | 15 | 16 | Our work presents a range of spatial localization tasks requiring reasoning from both camera-centric and human-centric perspectives, revealing the challenges vision-language models (VLMs) face in multi-viewpoint spatial understanding. Current VLMs are predominantly trained on image-text pairs from the web that lack explicit 3D spatial annotations, limiting their cross-perspective spatial reasoning capabilities. 17 | 18 | ## 📖ViewSpatial-Bench 19 | 20 | To address this gap, we introduce **ViewSpatial-Bench**, a comprehensive benchmark with over 5,700 question-answer pairs across 1,000+ 3D scenes from ScanNet and MS-COCO validation sets. This benchmark evaluates VLMs' spatial localization capabilities from multiple perspectives, specifically testing both egocentric (camera) and allocentric (human subject) viewpoints across five distinct task types. The figure below shows the construction pipeline and example demonstrations of our benchmark. 21 | 22 | 23 | 24 | ## 🤖Multi-View Spatial Model 25 | 26 | We present the Multi-View Spatial Model (MVSM), developed to address limitations in perspective-dependent spatial reasoning in vision-language models. Following the ViewSpatial-Bench pipeline, we constructed a training dataset of ~43K diverse spatial relationship samples across five task categories, utilizing automated spatial annotations from ScanNet and MS-COCO data, supplemented with Spatial-MM for person-perspective tasks. Using consistent language templates and standardized directional classifications, we implemented a Multi-Perspective Fine-Tuning strategy on Qwen2.5-VL (3B) to enhance reasoning across different observational viewpoints. This approach enables MVSM to develop unified 3D spatial relationship representations that robustly support both camera and human perspective reasoning. 27 | 28 | ## 👁️‍🗨️Results 29 | 30 | 31 | 32 | Accuracy comparison across multiple VLMs on camera and human perspective spatial tasks. Our Multi-View Spatial Model (MVSM) significantly outperforms all baseline models across all task categories, demonstrating the effectiveness of our multi-perspective spatial fine-tuning approach. These results reveal fundamental limitations in perspective-based spatial reasoning capabilities among current VLMs. Even powerful proprietary models like GPT-4o (34.98%) and Gemini-2.0-Flash (32.56%) perform only marginally above random chance (26.33%), confirming our hypothesis that standard VLMs struggle with perspective-dependent spatial reasoning despite their strong performance on other vision-language tasks. 33 | 34 | 35 | ## ⚒️QuickStart 36 | 37 | ```plaintext 38 | ViewSpatial-Bench 39 | ├── data_process # Script code for processing raw datasets to obtain metadata 40 | ├── eval # Used to store the raw dataset of ViewSpatial-Bench 41 | ├── ViewSpatial-Bench # Used to store the source images in ViewSpatial-Bench (can be downloaded from Hugging Face) 42 | ├── README.md 43 | ├── evaluate.py # Script code for evaluating multiple VLMs on ViewSpatial-Bench 44 | └── requirements.txt # Dependencies for evaluation 45 | ``` 46 | 47 | **Note**: [COCO dataset](https://cocodataset.org/) processing in `data_process` uses the original dataset's annotation files (download from official source). Head orientation calculations use [Orient Anything](https://github.com/SpatialVision/Orient-Anything)'s open-source code and model; place `head2body_orientation_data.py` in its root directory to run. 48 | 49 | ## 👀Evaluation on Your Own Model 50 | 51 | **I.
With HuggingFace datasets library.** 52 | 53 | ```py 54 | # NOTE: pip install datasets 55 | 56 | from datasets import load_dataset 57 | ds = load_dataset("lidingm/ViewSpatial-Bench") 58 | ``` 59 | 60 | **II. Evaluation using Open-Source Code.** 61 | 62 | Evaluate using our open-source evaluation code available on Github.(Coming Soon) 63 | 64 | ```py 65 | # Clone the repository 66 | git clone https://github.com/ZJU-REAL/ViewSpatial-Bench.git 67 | cd ViewSpatial-Bench 68 | 69 | # Install dependencies 70 | pip install -r requirements.txt 71 | 72 | # Run evaluation 73 | python evaluate.py --model_path your_model_path 74 | ``` 75 | 76 | You can configure the appropriate model parameters and evaluation settings according to the framework's requirements to obtain performance evaluation results on the ViewSpatial-Bench dataset. 77 | 78 | ## Acknowledgement 79 | 80 | We thank the creators of the [ScanNet](https://github.com/ScanNet/ScanNet) and [MS-COCO](https://cocodataset.org/) datasets for their open-source contributions, which provided the foundational 3D scene data and visual content for our spatial annotation pipeline. We also acknowledge the developers of the [Orient Anything](https://github.com/SpatialVision/Orient-Anything) model for their valuable open-source work that supported our annotation framework development. 81 | 82 | ## Citation 83 | 84 | ``` 85 | @misc{li2025viewspatialbenchevaluatingmultiperspectivespatial, 86 | title={ViewSpatial-Bench: Evaluating Multi-perspective Spatial Localization in Vision-Language Models}, 87 | author={Dingming Li and Hongxing Li and Zixuan Wang and Yuchen Yan and Hang Zhang and Siqi Chen and Guiyang Hou and Shengpei Jiang and Wenqi Zhang and Yongliang Shen and Weiming Lu and Yueting Zhuang}, 88 | year={2025}, 89 | eprint={2505.21500}, 90 | archivePrefix={arXiv}, 91 | primaryClass={cs.CV}, 92 | url={https://arxiv.org/abs/2505.21500}, 93 | } 94 | ``` 95 | 96 | -------------------------------------------------------------------------------- /ViewSpatial-Bench/readme.md: -------------------------------------------------------------------------------- 1 | Simply download and extract the source image sets `scannetv2_val` and `val2017` from [Huggingface](https://huggingface.co/datasets/lidingm/ViewSpatial-Bench) to this directory. -------------------------------------------------------------------------------- /data_process/coco_process/get_person_by_area.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pycocotools.coco import COCO 3 | from tqdm import tqdm 4 | 5 | # Initialize the COCO API for instance annotations 6 | dataDir = 'annotations_trainval2017' # Update this to your COCO dataset path 7 | dataType = 'train2017' # Change this if you're using a different split (train2017, etc.) 
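# Expected layout (assuming the official annotations_trainval2017.zip is unpacked into dataDir):
#   annotations_trainval2017/annotations/instances_train2017.json
#   annotations_trainval2017/annotations/captions_train2017.json
# Adjust dataDir/dataType above if your local COCO copy is organized differently.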
8 | annFile = '{}/annotations/instances_{}.json'.format(dataDir, dataType) 9 | coco = COCO(annFile) 10 | 11 | # Initialize the COCO API for caption annotations 12 | captionAnnFile = '{}/annotations/captions_{}.json'.format(dataDir, dataType) 13 | coco_caps = COCO(captionAnnFile) 14 | 15 | # Categories we're interested in 16 | life_categories = ["person"] 17 | 18 | # Get category IDs for our target categories 19 | target_cat_ids = [] 20 | for category in life_categories: 21 | catIds = coco.getCatIds(catNms=[category]) 22 | target_cat_ids.extend(catIds) 23 | 24 | # Area threshold (e.g., object must occupy at least 1% of the image) 25 | area_ratio_threshold = 0.2 26 | 27 | print(f"Finding images with exactly one object from specified categories and enough area...") 28 | filtered_images = [] 29 | 30 | # Get all image IDs that contain any of our target categories 31 | for category in tqdm(life_categories): 32 | catIds = coco.getCatIds(catNms=[category]) 33 | imgIds = coco.getImgIds(catIds=catIds) 34 | 35 | for img_id in imgIds: 36 | obj_ann_ids = coco.getAnnIds(imgIds=img_id) 37 | obj_anns = coco.loadAnns(obj_ann_ids) 38 | 39 | target_objects = [] 40 | for ann in obj_anns: 41 | if ann['category_id'] in target_cat_ids: 42 | target_objects.append(ann) 43 | 44 | if len(target_objects) == 1: 45 | target_ann = target_objects[0] 46 | img_info = coco.loadImgs(img_id)[0] 47 | img_area = img_info['width'] * img_info['height'] 48 | obj_area = target_ann.get('area', 0) 49 | 50 | if obj_area / img_area >= area_ratio_threshold: 51 | cat_info = coco.loadCats(target_ann['category_id'])[0] 52 | filtered_images.append((img_id, cat_info['name'])) 53 | 54 | print(f"Found {len(filtered_images)} images with exactly one large-enough object from the specified categories") 55 | 56 | dataset = [] 57 | print("Creating dataset entries for each filtered image...") 58 | for img_id, category in tqdm(filtered_images): 59 | try: 60 | img_info = coco.loadImgs(img_id)[0] 61 | ann_ids = coco_caps.getAnnIds(imgIds=img_id) 62 | captions = coco_caps.loadAnns(ann_ids) 63 | 64 | item = { 65 | 'image_id': img_id, 66 | 'file_name': img_info['file_name'], 67 | 'coco_url': img_info['coco_url'], 68 | 'width': img_info['width'], 69 | 'height': img_info['height'], 70 | 'captions': [ann['caption'] for ann in captions], 71 | 'category': category 72 | } 73 | 74 | dataset.append(item) 75 | except Exception as e: 76 | print(f"Error processing image {img_id}: {e}") 77 | 78 | # Save to JSON 79 | output_file = 'coco_single_life_object_filtered_by_area.json' 80 | with open(output_file, 'w') as f: 81 | json.dump(dataset, f, indent=2) 82 | 83 | print(f"Dataset created with {len(dataset)} items and saved to {output_file}") 84 | 85 | # Summary statistics 86 | category_counts = {} 87 | for _, category in filtered_images: 88 | category_counts[category] = category_counts.get(category, 0) + 1 89 | 90 | print("\nCategory distribution in filtered dataset:") 91 | for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True): 92 | print(f"{category}: {count} images") 93 | -------------------------------------------------------------------------------- /data_process/coco_process/head2body_orientation_data.py: -------------------------------------------------------------------------------- 1 | # from paths import * 2 | from vision_tower import DINOv2_MLP 3 | from transformers import AutoImageProcessor 4 | import torch 5 | from PIL import Image 6 | import json 7 | from utils import * 8 | from inference import get_3angle 9 | from tqdm 
import tqdm 10 | import os 11 | import requests 12 | import json 13 | 14 | 15 | def get_keypoint_coordinates(keypoints, index): 16 | """ 17 | Get keypoint coordinates and visibility from keypoints list at specified index. 18 | keypoints: list of length 51, containing x, y, v for 17 keypoints. 19 | index: keypoint index, range [0, 16]. 20 | return: (x, y, v) 21 | """ 22 | x = keypoints[index * 3] 23 | y = keypoints[index * 3 + 1] 24 | v = keypoints[index * 3 + 2] 25 | return x, y, v 26 | 27 | 28 | def get_azimuth_direction(azimuth: float) -> str: 29 | """ 30 | Determine direction name based on azimuth angle 31 | 32 | Args: 33 | azimuth: azimuth angle in degrees 34 | 35 | Returns: 36 | direction name (front, front-right, right side, etc.) 37 | """ 38 | # Normalize angle to 0-360 range 39 | azimuth = azimuth % 360 40 | 41 | if 337.5 <= azimuth or azimuth < 22.5: 42 | return "back" 43 | elif 22.5 <= azimuth < 67.5: 44 | return "back-left" 45 | elif 67.5 <= azimuth < 112.5: 46 | return "left" 47 | elif 112.5 <= azimuth < 157.5: 48 | return "front-left" 49 | elif 157.5 <= azimuth < 202.5: 50 | return "front" 51 | elif 202.5 <= azimuth < 247.5: 52 | return "front-right" 53 | elif 247.5 <= azimuth < 292.5: 54 | return "right" 55 | elif 292.5 <= azimuth < 337.5: 56 | return "back-right" 57 | else: 58 | return "wrong" 59 | 60 | 61 | annotations_file_path = 'annotations_trainval2017/annotations/person_keypoints_train2017.json' 62 | 63 | # Read COCO annotation file 64 | with open(annotations_file_path, 'r') as f: 65 | coco_data = json.load(f) 66 | 67 | def get_ket_and_bbox(image_id): 68 | annotations = [ann for ann in coco_data['annotations'] if ann['image_id'] == image_id] 69 | return annotations[0]['keypoints'], annotations[0]['bbox'] 70 | 71 | 72 | def analyze_head_turn( 73 | image: Image.Image, 74 | bbox: list, # [x1, y1, x2, y2] 75 | keypoints: list, # [[x, y, conf], ..., [x, y, conf]] length at least 7 76 | dino, 77 | val_preprocess, 78 | device 79 | ): 80 | # Step 1: Crop person image from bbox 81 | x1, y1, x2, y2 = map(int, bbox) 82 | 83 | # Correct bbox coordinate order to ensure top-left corner comes first 84 | x1, x2 = min(x1, x2), max(x1, x2) 85 | y1, y2 = min(y1, y2), max(y1, y2) 86 | 87 | # Check for out-of-bounds coordinates (prevent exceeding image boundaries) 88 | img_width, img_height = image.size 89 | x1 = max(0, min(x1, img_width - 1)) 90 | x2 = max(0, min(x2, img_width)) 91 | y1 = max(0, min(y1, img_height - 1)) 92 | y2 = max(0, min(y2, img_height)) 93 | 94 | person_image = image.crop((x1, y1, x2, y2)) 95 | 96 | # Keypoint indices 97 | left_shoulder_idx = 5 98 | right_shoulder_idx = 6 99 | 100 | # Get keypoint coordinates and visibility 101 | left_shoulder = get_keypoint_coordinates(keypoints, left_shoulder_idx) 102 | right_shoulder = get_keypoint_coordinates(keypoints, right_shoulder_idx) 103 | if left_shoulder[2] == 0 or right_shoulder[2] == 0: 104 | return False, False, False, False, False 105 | 106 | # Step 2: Get left and right shoulder y coordinates (relative to cropped image) 107 | left_shoulder_y = left_shoulder[1] - y1 108 | right_shoulder_y = right_shoulder[1] - y1 109 | 110 | cut_y = int(min(left_shoulder_y, right_shoulder_y)) 111 | 112 | # Prevent abnormal cut_y values 113 | if cut_y <= 0 or cut_y >= (y2 - y1): 114 | return False, False, False, False, False 115 | 116 | # Step 3: Segment head/body images 117 | head_image = person_image.crop((0, 0, person_image.width, cut_y)) 118 | body_image = person_image.crop((0, cut_y, person_image.width, person_image.height)) 119 | 
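# Note: PIL's Image.crop takes a (left, upper, right, lower) box, so the two crops above
# split the cropped person at the shoulder line: head_image keeps the rows above cut_y
# and body_image keeps the rows from cut_y down to the bottom of the person crop.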
120 | if head_image.height == 0 or head_image.width == 0 or body_image.height == 0 or body_image.width == 0: 121 | head_image = person_image 122 | body_image = person_image 123 | 124 | # Step 4: Call model to get angles 125 | head_angles = get_3angle(head_image, dino, val_preprocess, device) 126 | body_angles = get_3angle(body_image, dino, val_preprocess, device) 127 | 128 | azimuth_head = float(head_angles[0]) 129 | azimuth_body = float(body_angles[0]) 130 | 131 | # Step 5: Determine head turn direction 132 | def relative_head_direction(az_head, az_body): 133 | delta = (az_head - az_body + 540) % 360 - 180 134 | 135 | if -90 <= delta < -60: 136 | return "left" 137 | elif -60 <= delta < -20: 138 | return "front-left" 139 | elif -20 <= delta <= 20: 140 | return "front" 141 | elif 20 < delta <= 60: 142 | return "front-right" 143 | elif 60 < delta <= 90: 144 | return "right" 145 | else: 146 | return "wrong" 147 | 148 | direction = relative_head_direction(azimuth_head, azimuth_body) 149 | 150 | return azimuth_head, azimuth_body, direction, float(head_angles[3]), float(body_angles[3]) 151 | 152 | 153 | ckpt_path = "dino_weight.pt" 154 | 155 | save_path = './' 156 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 157 | dino = DINOv2_MLP( 158 | dino_mode='large', 159 | in_dim=1024, 160 | out_dim=360 + 180 + 180 + 2, 161 | evaluate=True, 162 | mask_dino=False, 163 | frozen_back=False 164 | ) 165 | 166 | dino.eval() 167 | print('model create') 168 | dino.load_state_dict(torch.load(ckpt_path, map_location='cpu')) 169 | dino = dino.to(device) 170 | print('weight loaded') 171 | val_preprocess = AutoImageProcessor.from_pretrained("dinov2-large", cache_dir='./') 172 | 173 | 174 | def check_image_path(image_path): 175 | if os.path.exists(image_path): 176 | return True 177 | else: 178 | return False 179 | 180 | 181 | # ========== Utility Functions ========== 182 | def download_image(img_path, url): 183 | """Download image and save to specified path""" 184 | try: 185 | r = requests.get(url, timeout=10) 186 | if r.status_code == 200: 187 | with open(img_path, 'wb') as f: 188 | f.write(r.content) 189 | return True 190 | else: 191 | print(f"Download failed with status code: {r.status_code}") 192 | return False 193 | except Exception as e: 194 | print(f"Download failed: {e}") 195 | return False 196 | 197 | 198 | DATASET_FILE = 'coco_single_life_object_filtered_by_area.json' 199 | with open(DATASET_FILE, 'r') as f: 200 | dataset = json.load(f) 201 | result = [] 202 | for item in tqdm(dataset): 203 | if item['category'] != 'person': 204 | continue 205 | file_name = item['file_name'] 206 | image_path = "train2017/" + file_name 207 | if not check_image_path(image_path): 208 | success = download_image(image_path, item['coco_url']) 209 | if not success: 210 | print("Download Failed!") 211 | continue 212 | origin_image = Image.open(image_path).convert('RGB') 213 | keypoints, bbox = get_ket_and_bbox(item['image_id']) 214 | 215 | try: 216 | azimuth_head, azimuth_body, direction, head_confidence, body_confidence = analyze_head_turn(origin_image, bbox, 217 | keypoints, dino, 218 | val_preprocess, 219 | device) 220 | except: 221 | continue 222 | if azimuth_head == False: 223 | continue 224 | 225 | angles = get_3angle(origin_image, dino, val_preprocess, device) 226 | azimuth = float(angles[0]) 227 | polar = float(angles[1]) 228 | rotation = float(angles[2]) 229 | confidence = float(angles[3]) 230 | one = { 231 | 'image_id': item['image_id'], 232 | 'coco_url': item['coco_url'], 233 | 'width': item['width'], 234 | 
'height': item['height'], 235 | 'captions': item['captions'], 236 | "azimuth": azimuth, 237 | "overall_confidence": confidence, 238 | 'azimuth_head': azimuth_head, 239 | 'azimuth_body': azimuth_body, 240 | 'person_direction': direction, 241 | 'camera_direction_v1': get_azimuth_direction(azimuth), 242 | 'camera_direction_v2': get_azimuth_direction(azimuth_head), 243 | 'head_confidence': head_confidence, 244 | 'body_confidence': body_confidence 245 | } 246 | result.append(one) 247 | 248 | # Save to JSON 249 | output_file = 'train_data.json' 250 | with open(output_file, 'w') as f: 251 | json.dump(result, f, indent=2) 252 | 253 | print(f"Dataset created with {len(result)} items and saved to {output_file}") -------------------------------------------------------------------------------- /data_process/scannet_process/Sce_Sim_make.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import random 3 | import os, sys 4 | import inspect 5 | from collections import Counter 6 | from frame_sampling import get_full_images 7 | try: 8 | import numpy as np 9 | except: 10 | print("Failed to import numpy package.") 11 | sys.exit(-1) 12 | 13 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 14 | from scannet_utils import * 15 | 16 | 17 | def read_aggregation(filename): 18 | assert os.path.isfile(filename) 19 | object_id_to_segs = {} 20 | label_to_segs = {} 21 | with open(filename) as f: 22 | data = json.load(f) 23 | num_objects = len(data["segGroups"]) 24 | for i in range(num_objects): 25 | object_id = ( 26 | data["segGroups"][i]["objectId"] + 1 27 | ) # instance ids should be 1-indexed 28 | label = data["segGroups"][i]["label"] 29 | segs = data["segGroups"][i]["segments"] 30 | object_id_to_segs[object_id] = segs 31 | if label in label_to_segs: 32 | label_to_segs[label].extend(segs) 33 | else: 34 | label_to_segs[label] = segs 35 | return object_id_to_segs, label_to_segs 36 | 37 | 38 | def read_segmentation(filename): 39 | assert os.path.isfile(filename) 40 | seg_to_verts = {} 41 | with open(filename) as f: 42 | data = json.load(f) 43 | num_verts = len(data["segIndices"]) 44 | for i in range(num_verts): 45 | seg_id = data["segIndices"][i] 46 | if seg_id in seg_to_verts: 47 | seg_to_verts[seg_id].append(i) 48 | else: 49 | seg_to_verts[seg_id] = [i] 50 | return seg_to_verts, num_verts 51 | 52 | 53 | def export(mesh_file, agg_file, seg_file, meta_file, label_map_file): 54 | """points are XYZ RGB (RGB in 0-255), 55 | semantic label as nyu40 ids, 56 | instance label as 1-#instance, 57 | box as (cx,cy,cz,dx,dy,dz,semantic_label) 58 | """ 59 | label_map = read_label_mapping( 60 | label_map_file, label_from="raw_category", label_to="nyu40id" 61 | ) 62 | mesh_vertices = read_mesh_vertices_rgb(mesh_file) 63 | 64 | # Load scene axis alignment matrix 65 | lines = open(meta_file).readlines() 66 | for line in lines: 67 | if "axisAlignment" in line: 68 | axis_align_matrix = [ 69 | float(x) for x in line.rstrip().strip("axisAlignment = ").split(" ") 70 | ] 71 | break 72 | axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4)) 73 | pts = np.ones((mesh_vertices.shape[0], 4)) 74 | pts[:, 0:3] = mesh_vertices[:, 0:3] 75 | pts = np.dot(pts, axis_align_matrix.transpose()) # Nx4 76 | mesh_vertices[:, 0:3] = pts[:, 0:3] 77 | 78 | # Load semantic and instance labels 79 | object_id_to_segs, label_to_segs = read_aggregation(agg_file) 80 | seg_to_verts, num_verts = read_segmentation(seg_file) 81 | label_ids = 
np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated 82 | object_id_to_label_id = {} 83 | for label, segs in label_to_segs.items(): 84 | label_id = label_map[label] 85 | for seg in segs: 86 | verts = seg_to_verts[seg] 87 | label_ids[verts] = label_id 88 | instance_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated 89 | num_instances = len(np.unique(list(object_id_to_segs.keys()))) 90 | for object_id, segs in object_id_to_segs.items(): 91 | for seg in segs: 92 | verts = seg_to_verts[seg] 93 | instance_ids[verts] = object_id 94 | if object_id not in object_id_to_label_id: 95 | object_id_to_label_id[object_id] = label_ids[verts][0] 96 | instance_bboxes = np.zeros((num_instances, 7)) 97 | for obj_id in object_id_to_segs: 98 | label_id = object_id_to_label_id[obj_id] 99 | obj_pc = mesh_vertices[instance_ids == obj_id, 0:3] 100 | if len(obj_pc) == 0: 101 | continue 102 | # Compute axis aligned box 103 | # An axis aligned bounding box is parameterized by 104 | # (cx,cy,cz) and (dx,dy,dz) and label id 105 | # where (cx,cy,cz) is the center point of the box, 106 | # dx is the x-axis length of the box. 107 | xmin = np.min(obj_pc[:, 0]) 108 | ymin = np.min(obj_pc[:, 1]) 109 | zmin = np.min(obj_pc[:, 2]) 110 | xmax = np.max(obj_pc[:, 0]) 111 | ymax = np.max(obj_pc[:, 1]) 112 | zmax = np.max(obj_pc[:, 2]) 113 | bbox = np.array( 114 | [ 115 | (xmin + xmax) / 2, 116 | (ymin + ymax) / 2, 117 | (zmin + zmax) / 2, 118 | xmax - xmin, 119 | ymax - ymin, 120 | zmax - zmin, 121 | label_id, 122 | ] 123 | ) 124 | # NOTE: this assumes obj_id is in 1,2,3,.,,,.NUM_INSTANCES 125 | instance_bboxes[obj_id - 1, :] = bbox 126 | 127 | 128 | return ( 129 | mesh_vertices, 130 | label_ids, 131 | instance_ids, 132 | instance_bboxes, 133 | object_id_to_label_id 134 | ) 135 | 136 | def get_3d_box(scene_name, pointcloud_folder, label_map_file): 137 | scan_path = f"{pointcloud_folder}/{scene_name}" 138 | 139 | scan_name = os.path.split(scan_path)[-1] 140 | mesh_file = os.path.join(scan_path, scan_name + "_vh_clean_2.ply") 141 | agg_file = os.path.join(scan_path, scan_name + ".aggregation.json") 142 | seg_file = os.path.join(scan_path, scan_name + "_vh_clean_2.0.010000.segs.json") 143 | meta_file = os.path.join( 144 | scan_path, scan_name + ".txt" 145 | ) # includes axisAlignment info for the train set scans. 
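# export() aligns the mesh with the scene's axisAlignment matrix (homogeneous vertices
# right-multiplied by the matrix transpose) and then derives one axis-aligned box per
# instance in the form (cx, cy, cz, dx, dy, dz, nyu40_label_id).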
146 | mesh_vertices, label_ids, instance_ids, instance_bboxes, object_id_to_label_id = export( 147 | mesh_file, agg_file, seg_file, meta_file, label_map_file 148 | ) 149 | return instance_bboxes 150 | 151 | 152 | def calculate_relative_position(A, B, C): 153 | A, B, C = map(np.array, (A, B, C)) 154 | 155 | vector_AB = B - A 156 | if np.linalg.norm(vector_AB) < 1e-6: 157 | raise ValueError("Objects A and B are at the same position.") 158 | 159 | forward = vector_AB / np.linalg.norm(vector_AB) 160 | world_up = np.array([0.0, 0.0, 1.0]) 161 | right = np.cross(forward, world_up) 162 | 163 | if np.linalg.norm(right) < 1e-6: 164 | world_up = np.array([0.0, 1.0, 0.0]) 165 | right = np.cross(forward, world_up) 166 | right /= np.linalg.norm(right) 167 | else: 168 | right /= np.linalg.norm(right) 169 | 170 | up = np.cross(right, forward) 171 | 172 | vector_AC = C - A 173 | local_x = np.dot(vector_AC, right) 174 | local_y = np.dot(vector_AC, up) 175 | local_z = np.dot(vector_AC, forward) 176 | 177 | return local_x, local_y, local_z 178 | 179 | 180 | def get_direction(local_x, local_z): 181 | angle = np.degrees(np.arctan2(local_x, local_z)) 182 | angle = (angle + 360) % 360 183 | 184 | if 22.5 <= angle < 67.5: 185 | return "front-right" 186 | elif 67.5 <= angle < 112.5: 187 | return "right" 188 | elif 112.5 <= angle < 157.5: 189 | return "back-right" 190 | elif 157.5 <= angle < 202.5: 191 | return "back" 192 | elif 202.5 <= angle < 247.5: 193 | return "back-left" 194 | elif 247.5 <= angle < 292.5: 195 | return "left" 196 | elif 292.5 <= angle < 337.5: 197 | return "front-left" 198 | else: 199 | return "front" 200 | 201 | 202 | def generate_qa_pairs(obj1, obj2, obj3, label1, label2, label3): 203 | """Generate QA pairs describing the relative position.""" 204 | try: 205 | x, y, z = calculate_relative_position(obj1, obj2, obj3) 206 | except ValueError: 207 | return [] 208 | 209 | direction = get_direction(x, z) 210 | if direction == "same position": 211 | return [] 212 | 213 | qa_templates = [ 214 | (f"If you stand at {label1} facing {label2}, where is {label3}?", 215 | f"If I stand at {label1} and face {label2}, then {label3} would be to my {direction}."), 216 | 217 | (f"Imagine standing at {label1} looking towards {label2}, where is {label3}?", 218 | f"Picture me standing at {label1}, facing {label2}—then {label3} would be on my {direction}."), 219 | 220 | (f"When positioned at {label1} facing {label2}, where can you find {label3}?", 221 | f"From my vantage point at {label1}, with my eyes fixed on {label2}, {label3} is located to my {direction}."), 222 | 223 | (f"Standing at {label1}, gazing at {label2}, where should {label3} be?", 224 | f"From this spot at {label1}, looking directly at {label2}, I’d locate {label3} on my {direction} side.") 225 | ] 226 | # All possible options 227 | all_options = ["left", "right", "front", "back", "back-right", "back-left", 228 | "front-left", "front-right"] 229 | qa_pairs = [] 230 | 231 | q_template = [random.choice(qa_templates)] 232 | for q, a in q_template: 233 | distractors = [opt for opt in all_options if opt not in direction and direction not in opt] 234 | selected_distractors = random.sample(distractors, 3) 235 | options = [direction] + selected_distractors 236 | random.shuffle(options) 237 | option_letters = ["A", "B", "C", "D"] 238 | correct_letter_index = options.index(direction) 239 | correct_option = f"{option_letters[correct_letter_index]}. {direction}" 240 | formatted_options = "\n".join([f"{option_letters[i]}. 
{options[i]}" for i in range(4)]) 241 | question = f"{q}\n{formatted_options}" 242 | qa_pairs.append({ 243 | "question": question, 244 | "answer": correct_option 245 | }) 246 | 247 | return qa_pairs 248 | 249 | 250 | def get_random_combinations(lst, max_samples=10000): 251 | all_combinations = list(itertools.combinations(lst, 3)) 252 | num_samples = min(max_samples, len(all_combinations)) 253 | return random.sample(all_combinations, num_samples) 254 | 255 | def get_jpg_files(folder_path): 256 | jpg_files = [] 257 | for filename in os.listdir(folder_path): 258 | file_path = os.path.join(folder_path, filename) 259 | 260 | if os.path.isfile(file_path) and filename.lower().endswith('.jpg'): 261 | jpg_files.append(filename) 262 | 263 | return jpg_files 264 | 265 | 266 | if __name__ == "__main__": 267 | nyu40_to_category = { 268 | 0: "unlabeled", 1: "wall", 2: "floor", 3: "cabinet", 4: "bed", 269 | 5: "chair", 6: "sofa", 7: "table", 8: "door", 9: "window", 270 | 10: "bookshelf", 11: "picture", 12: "counter", 13: "blinds", 271 | 14: "desk", 15: "shelves", 16: "curtain", 17: "dresser", 272 | 18: "pillow", 19: "mirror", 20: "floor mat", 21: "clothes", 273 | 22: "ceiling", 23: "books", 24: "refrigerator", 25: "television", 274 | 26: "paper", 27: "towel", 28: "shower curtain", 29: "box", 275 | 30: "whiteboard", 31: "person", 32: "nightstand", 33: "toilet", 276 | 34: "sink", 35: "lamp", 36: "bathtub", 37: "bag", 277 | 38: "other structure", 39: "other furniture", 40: "other prop" 278 | } 279 | 280 | scene_root = "scannet_metadata" 281 | output_path = r"scannet_metadata/perspective_3d.json" 282 | 283 | # Get all point cloud files and label mapping file in the scene 284 | pointcloud_folder = "/datasets/scannet/scans" 285 | label_map_file = "/datasets/scannet/scannetv2-labels.combined.tsv" 286 | 287 | qa_dataset = [] 288 | scene_num = 0 289 | 290 | with open('scannetv2_val.txt', 'r', encoding='utf-8') as file: 291 | lines = file.readlines() 292 | 293 | scenes = [line.strip() for line in lines] 294 | for i, scene in enumerate(scenes): 295 | scene_name = scene 296 | 297 | img_size = (1296, 968) 298 | instance_bboxes = get_3d_box(scene_name, pointcloud_folder, label_map_file) 299 | 300 | # 3D bounding box 301 | bboxes_3d = [tuple(bbox) for bbox in instance_bboxes if bbox[6] not in [0, 1, 2, 22, 38, 39, 40]] 302 | 303 | bbox_6_counts = Counter(bbox[6] for bbox in bboxes_3d) 304 | unique_bboxes_3d = [bbox for bbox in bboxes_3d if bbox_6_counts[bbox[6]] == 1] 305 | le = len(unique_bboxes_3d) 306 | if le < 3: 307 | continue 308 | scene_num = scene_num+1 309 | 310 | combinations_3d = get_random_combinations(unique_bboxes_3d, 40) 311 | 312 | for combination in combinations_3d: 313 | obj1 = (combination[0][0], combination[0][1],combination[0][2]) 314 | obj2 = (combination[1][0], combination[1][1], combination[1][2]) 315 | obj3 = (combination[2][0], combination[2][1], combination[2][2]) 316 | 317 | category_name1 = nyu40_to_category.get(int(combination[0][6]), "unknown") 318 | category_name2 = nyu40_to_category.get(int(combination[1][6]), "unknown") 319 | category_name3 = nyu40_to_category.get(int(combination[2][6]), "unknown") 320 | labels = (category_name1, category_name2, category_name3) 321 | 322 | jpg_files_list = get_full_images(scene_name, labels) 323 | if not jpg_files_list: 324 | img_path = os.path.join(scene_root, scene_name) 325 | img_path = img_path + "/original_images" 326 | jpg_files_list = get_jpg_files(img_path) 327 | jpg_files_list = [scene_name + "/original_images/" + a for a in jpg_files_list] 
328 | # generate QA-pairs 329 | qa_set = generate_qa_pairs(obj1, obj2, obj3, *labels) 330 | 331 | for num, qa in enumerate(qa_set[:4], 1): 332 | qa_dataset.append({"image_path": scene_name, "question": qa['question'], "answer": qa['answer'], "image": jpg_files_list}) 333 | 334 | print(f"Scene {i} has been successfully saved!") 335 | 336 | with open(output_path, 'w', encoding='utf-8') as output_file: 337 | json.dump(qa_dataset, output_file, ensure_ascii=False, indent=4) 338 | print(f"QA data has been saved to {output_path}, total of {scene_num} scenes! {len(qa_dataset)} questions!") -------------------------------------------------------------------------------- /data_process/scannet_process/bbox3d_project.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import numpy as np 5 | import torch 6 | import cv2 7 | from PIL import Image, ImageDraw 8 | import json 9 | from scannet_utils import * 10 | # Set environment variable to resolve OpenMP error 11 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' 12 | 13 | 14 | def load_matrix_from_txt(path, shape=(4, 4)): 15 | with open(path) as f: 16 | txt = f.readlines() 17 | txt = ''.join(txt).replace('\n', ' ') 18 | matrix = [float(v) for v in txt.split()] 19 | return np.array(matrix).reshape(shape) 20 | 21 | 22 | def get_align_matrix(meta_file): 23 | lines = open(meta_file).readlines() 24 | for line in lines: 25 | if "axisAlignment" in line: 26 | axis_align_matrix = [ 27 | float(x) for x in line.rstrip().strip("axisAlignment = ").split(" ") 28 | ] 29 | break 30 | axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4)) 31 | return axis_align_matrix 32 | 33 | 34 | def get_3d_bbox_corners(centers, sizes): 35 | """ 36 | Batch generate 8 corner points for multiple 3D bounding boxes 37 | 38 | Parameters: 39 | centers: numpy array with shape (N, 3), representing N bounding box centers 40 | sizes: numpy array with shape (N, 3), representing N bounding box dimensions [length, width, height] 41 | 42 | Returns: 43 | corners: numpy array with shape (N, 8, 3), representing 8 corner points for N bounding boxes 44 | """ 45 | N = centers.shape[0] # 边界框数量 46 | corners = np.zeros((N, 8, 3)) 47 | 48 | for i in range(N): 49 | x, y, z = centers[i] 50 | l, w, h = sizes[i] / 2.0 51 | 52 | # 定义8个角点的相对坐标 53 | corners[i] = np.array([ 54 | [x + l, y + w, z + h], [x + l, y + w, z - h], [x + l, y - w, z + h], [x + l, y - w, z - h], 55 | [x - l, y + w, z + h], [x - l, y + w, z - h], [x - l, y - w, z + h], [x - l, y - w, z - h] 56 | ]) 57 | 58 | return corners 59 | 60 | 61 | def draw_3d_bboxes(image, bboxes_2d, visibilities, colors=None, thickness=2, show_invisible=False): 62 | """ 63 | Batch draw multiple projected 3D bounding boxes on an image 64 | 65 | Parameters: 66 | image: PIL Image object 67 | bboxes_2d: numpy array with shape (N, 8, 2), representing N sets of projected 2D points 68 | visibilities: numpy array with shape (N, 8), indicating point visibility 69 | colors: list of length N containing N color tuples, auto-generated if None 70 | thickness: integer representing line thickness 71 | show_invisible: boolean indicating whether to show invisible edges (displayed as dashed lines) 72 | 73 | Returns: 74 | image: PIL Image object with bounding boxes drawn 75 | """ 76 | N = bboxes_2d.shape[0] # Number of bounding boxes 77 | 78 | # Connection line indices for bounding box edges 79 | lines = [ 80 | [0, 1], [0, 2], [1, 3], [2, 3], 81 | [4, 5], [4, 6], [5, 7], [6, 7], 82 | [0, 4], [1, 5], [2, 6], [3, 7] 83 
| ] 84 | 85 | # Generate different colors automatically if none provided 86 | if colors is None: 87 | colors = [] 88 | for i in range(N): 89 | # Generate random colors while avoiding colors that are too dark or too bright 90 | color = ( 91 | np.random.randint(50, 200), 92 | np.random.randint(50, 200), 93 | np.random.randint(50, 200) 94 | ) 95 | colors.append(color) 96 | 97 | # Convert image to OpenCV format 98 | img_cv = np.array(image) 99 | 100 | # Draw edges for each bounding box 101 | for i in range(N): 102 | bbox_2d = bboxes_2d[i] 103 | visibility = visibilities[i] 104 | color = colors[i] 105 | 106 | # Draw edges 107 | for [j, k] in lines: 108 | pt1 = (int(bbox_2d[j, 0]), int(bbox_2d[j, 1])) 109 | pt2 = (int(bbox_2d[k, 0]), int(bbox_2d[k, 1])) 110 | 111 | # Draw solid line if both endpoints are visible 112 | if visibility[j] and visibility[k]: 113 | cv2.line(img_cv, pt1, pt2, color, thickness) 114 | # Draw dashed line if show_invisible is set and at least one endpoint is visible 115 | elif show_invisible and (visibility[j] or visibility[k]): 116 | # Create dashed line 117 | pts = np.array([pt1, pt2], np.int32).reshape((-1, 1, 2)) 118 | cv2.polylines(img_cv, [pts], False, color, thickness=1, lineType=cv2.LINE_AA, shift=0) 119 | 120 | # Draw visible points 121 | for j, vis in enumerate(visibility): 122 | if vis: 123 | pt = (int(bbox_2d[j, 0]), int(bbox_2d[j, 1])) 124 | cv2.circle(img_cv, pt, 3, color, -1) 125 | 126 | # Convert back to PIL Image 127 | return Image.fromarray(img_cv) 128 | 129 | def project_3d_bbox_to_2d(bboxes_3d, intrinsic, pose, image_size, depth_image=None, depth_scale=1000.0, 130 | occlusion_threshold=0.1): 131 | """ 132 | Batch project multiple 3D bounding boxes to 2D image plane and detect occlusion, 133 | resolving size mismatch between depth map and color image 134 | 135 | Parameters: 136 | bboxes_3d: numpy array with shape (N, 8, 3), representing 8 corner points of N 3D bounding boxes 137 | intrinsic: numpy array with shape (4, 4), camera intrinsic matrix 138 | pose: numpy array with shape (4, 4), camera extrinsic matrix (camera pose) 139 | image_size: tuple (width, height), representing image dimensions 140 | depth_image: numpy array with shape (height, width), representing depth image, no occlusion detection if None 141 | depth_scale: float, scale factor for depth image to convert depth values to meters 142 | occlusion_threshold: float, depth difference threshold in meters for determining point occlusion 143 | 144 | Returns: 145 | bboxes_2d: numpy array with shape (N, 8, 2), representing projected 2D points 146 | visibilities: numpy array with shape (N, 8), indicating point visibility 147 | """ 148 | N = bboxes_3d.shape[0] # Number of bounding boxes 149 | 150 | # Initialize results 151 | bboxes_2d = np.zeros((N, 8, 2)) 152 | visibilities = np.zeros((N, 8), dtype=bool) 153 | 154 | # Get depth image dimensions (if available) 155 | depth_height, depth_width = 0, 0 156 | color_width, color_height = image_size 157 | depth_to_color_scale_x, depth_to_color_scale_y = 1.0, 1.0 158 | 159 | if depth_image is not None: 160 | depth_height, depth_width = depth_image.shape[:2] 161 | # Calculate scaling ratio from depth image to color image 162 | depth_to_color_scale_x = color_width / depth_width 163 | depth_to_color_scale_y = color_height / depth_height 164 | 165 | # Calculate transformation from world coordinate system to camera coordinate system 166 | world_to_cam = np.linalg.inv(pose) 167 | 168 | # Process all N objects in the scene 169 | for i in range(N): 170 | # Get 8 
corner points of current bounding box 171 | bbox_3d = bboxes_3d[i] 172 | 173 | # Convert 3D bounding box to homogeneous coordinates 174 | bbox_3d_homogeneous = np.hstack([bbox_3d, np.ones((bbox_3d.shape[0], 1))]) # (8, 4) 175 | 176 | # Transform 3D points from world coordinate system to camera coordinate system 177 | cam_points = bbox_3d_homogeneous @ world_to_cam.T # (8, 4) 178 | 179 | # Check if points are in front of camera (z > 0) 180 | visibility = cam_points[:, 2] > 0 181 | 182 | # Apply projection matrix to project points onto image plane 183 | points_2d_homogeneous = cam_points @ intrinsic.T # (8, 4) 184 | 185 | # Perspective division: convert homogeneous coordinates to image coordinates 186 | points_2d = points_2d_homogeneous[:, :2] / points_2d_homogeneous[:, 2:3] 187 | 188 | # Check if points are within image bounds 189 | in_image = (points_2d[:, 0] >= 0) & (points_2d[:, 0] < color_width) & \ 190 | (points_2d[:, 1] >= 0) & (points_2d[:, 1] < color_height) 191 | 192 | # Update visibility: points must be in front of camera and within image bounds 193 | visibility = visibility & in_image 194 | 195 | # Detect occlusion if depth image is available 196 | if depth_image is not None: 197 | for j in range(8): 198 | if visibility[j]: 199 | # Get pixel coordinates of projected point in color image 200 | color_x, color_y = int(points_2d[j, 0]), int(points_2d[j, 1]) 201 | 202 | # Convert color image coordinates to depth image coordinates 203 | depth_x = int(color_x / depth_to_color_scale_x) 204 | depth_y = int(color_y / depth_to_color_scale_y) 205 | 206 | # Ensure point is within depth image bounds 207 | if 0 <= depth_x < depth_width and 0 <= depth_y < depth_height: 208 | # Get actual depth from depth map 209 | actual_depth = float(depth_image[depth_y, depth_x]) / depth_scale # Convert to meters 210 | 211 | # Get calculated depth (z value in camera coordinate system) 212 | calculated_depth = float(cam_points[j, 2]) 213 | 214 | # Compare actual depth with calculated depth to determine occlusion 215 | # Point is considered occluded only when depth value is valid (>0) and calculated depth is significantly greater than actual depth 216 | if actual_depth > 0 and calculated_depth - actual_depth > occlusion_threshold: 217 | visibility[j] = False 218 | else: 219 | # Maintain current visibility state if point is outside depth image bounds 220 | pass 221 | 222 | # Save results for storage and later inclusion in visibility data JSON file 223 | bboxes_2d[i] = points_2d 224 | visibilities[i] = visibility 225 | 226 | return bboxes_2d, visibilities 227 | 228 | 229 | def load_3d_boxes(json_file): 230 | """ 231 | Load 3D bounding box data from JSON file 232 | 233 | Parameters: 234 | json_file: file path to JSON file containing 3D bounding box data 235 | 236 | Returns: 237 | centers: numpy array with shape (N, 3), representing bounding box center points 238 | sizes: numpy array with shape (N, 3), representing bounding box dimensions 239 | labels: list containing bounding box labels 240 | object_ids: list containing bounding box object IDs 241 | """ 242 | data = json_file 243 | 244 | centers = [] 245 | sizes = [] 246 | labels = [] 247 | object_ids = [] 248 | 249 | # 解析JSON数据 250 | for box in data['boxes']: 251 | center = np.array(box['center']) 252 | size = np.array(box['size']) 253 | label = box.get('label', 'unknown') 254 | object_id = box.get('object_id', -1) 255 | 256 | centers.append(center) 257 | sizes.append(size) 258 | labels.append(label) 259 | object_ids.append(object_id) 260 | 261 | return 
np.array(centers), np.array(sizes), labels, object_ids 262 | 263 | 264 | def process_image_with_boxes(image_path, boxes_json, intrinsic_path, pose_path, meta_file, output_path=None, 265 | visibility_json_path=None, depth_image_path=None, depth_scale=1000.0, 266 | occlusion_threshold=0.1, draw_picture=False): 267 | """ 268 | Process a single image by drawing all 3D bounding boxes and saving visible object information, 269 | with occlusion detection and handling of size mismatch between depth map and color image 270 | 271 | Parameters: 272 | image_path: file path to the image 273 | boxes_json: JSON object containing 3D bounding box data 274 | intrinsic_path: file path to camera intrinsic parameters 275 | pose_path: file path to camera pose 276 | meta_file: file path to scene metadata 277 | output_path: output image path, uses default path if None 278 | visibility_json_path: visibility JSON file path, uses default path if None 279 | depth_image_path: depth image file path, no occlusion detection if None 280 | depth_scale: float, scale factor for depth image to convert depth values to meters 281 | occlusion_threshold: float, depth difference threshold in meters for determining point occlusion 282 | 283 | Returns: 284 | output_path: path to the output image 285 | visibility_json_path: path to the visibility JSON file 286 | """ 287 | 288 | image = Image.open(image_path) 289 | image_size = image.size 290 | 291 | def cv_imread(file_path): 292 | cv_img = cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1) 293 | return cv_img 294 | 295 | # 加载深度图像(如果提供) 296 | depth_image = None 297 | if depth_image_path and os.path.exists(depth_image_path): 298 | depth_image = cv_imread(depth_image_path) 299 | if len(depth_image.shape) > 2: 300 | depth_image = depth_image[:, :, 0] 301 | 302 | intrinsic = load_matrix_from_txt(intrinsic_path) 303 | pose = load_matrix_from_txt(pose_path) 304 | 305 | axis_align_matrix = get_align_matrix(meta_file) 306 | 307 | pose = axis_align_matrix @ pose 308 | 309 | world_to_cam = np.linalg.inv(pose) 310 | 311 | centers, sizes, labels, object_ids = load_3d_boxes(boxes_json) 312 | 313 | bboxes_3d = get_3d_bbox_corners(centers, sizes) 314 | 315 | bboxes_2d, visibilities = project_3d_bbox_to_2d( 316 | bboxes_3d, intrinsic, pose, image_size, depth_image, depth_scale, occlusion_threshold 317 | ) 318 | 319 | # ------------------------------------------------Store the image file for drawing------------------------------------------------ 320 | if draw_picture: 321 | unique_labels = list(set(labels)) 322 | label_colors = {} 323 | for i, label in enumerate(unique_labels): 324 | h = (i * 30) % 180 325 | hsv = np.array([[[h, 255, 255]]], dtype=np.uint8) 326 | rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)[0][0] 327 | label_colors[label] = (int(rgb[0]), int(rgb[1]), int(rgb[2])) 328 | 329 | colors = [label_colors[label] for label in labels] 330 | result_image = draw_3d_bboxes(image, bboxes_2d, visibilities, colors=colors, show_invisible=True) 331 | 332 | img_draw = ImageDraw.Draw(result_image) 333 | for i, (bbox_2d, visibility, label) in enumerate(zip(bboxes_2d, visibilities, labels)): 334 | visible_points = bbox_2d[visibility] 335 | if len(visible_points) > 0: 336 | top_point = visible_points[np.argmin(visible_points[:, 1])] 337 | x, y = int(top_point[0]), int(top_point[1] - 10) 338 | img_draw.text((x, y), label, fill=colors[i]) 339 | 340 | if output_path is None: 341 | output_path = os.path.splitext(image_path)[0] + "_with_boxes.jpg" 342 | 343 | result_image.save(output_path) 344 | 345 | # 
------------------------------------------------Store visibility data in JSON format------------------------------------------------ 346 | visibility_data = { 347 | "image_path": os.path.basename(image_path), 348 | "visible_objects": [] 349 | } 350 | 351 | # It was originally set up, but the obstruction filtering will reduce the visibility by 8 points. Therefore, after being stored in the visibility JSON file, 352 | # you can decide for yourself at what level to filter (I currently set it to be greater than 0.2, because two points indicate that one side of the box can be seen). 353 | visibility_threshold = 0.01 354 | 355 | bboxes_3d_cam = [] 356 | for bbox_3d in bboxes_3d: 357 | bbox_3d_homogeneous = np.hstack([bbox_3d, np.ones((bbox_3d.shape[0], 1))]) # (8, 4) 358 | cam_points = bbox_3d_homogeneous @ world_to_cam.T # (8, 4) 359 | bbox_3d_cam = cam_points[:, :3] 360 | bboxes_3d_cam.append(bbox_3d_cam) 361 | 362 | for i, (bbox_2d, bbox_3d_cam, visibility, label, object_id) in enumerate( 363 | zip(bboxes_2d, bboxes_3d_cam, visibilities, labels, object_ids)): 364 | 365 | visibility_ratio = np.mean(visibility) 366 | is_visible = visibility_ratio >= visibility_threshold 367 | 368 | if is_visible: 369 | visible_points_count = np.sum(visibility) 370 | 371 | bbox_2d_list = bbox_2d.tolist() 372 | bbox_3d_cam_list = bbox_3d_cam.tolist() 373 | 374 | # Store coordinate center points in their respective coordinate systems 375 | bbox_2d_cam_center = np.mean(bbox_2d, axis=0) 376 | bbox_3d_cam_center = np.mean(bbox_3d_cam, axis=0) 377 | 378 | visibility_data["visible_objects"].append({ 379 | "object_id": object_id, 380 | "label": label, 381 | "visibility_ratio": float(visibility_ratio), 382 | "visible_points_count": int(visible_points_count), 383 | "bbox_2d_center": bbox_2d_cam_center.tolist(), 384 | "bbox_3d_center": bbox_3d_cam_center.tolist(), 385 | "vertices_visibility": visibility.tolist(), 386 | "occlusion_checked": depth_image is not None 387 | }) 388 | 389 | if visibility_json_path is None: 390 | visibility_json_path = os.path.splitext(output_path)[0] + "_visibility.json" 391 | 392 | with open(visibility_json_path, 'w') as f: 393 | json.dump(visibility_data, f, indent=2) 394 | 395 | return output_path, visibility_json_path 396 | 397 | 398 | def batch_process_images(image_folder, image_chosen, boxes_json, intrinsic_path, meta_file, output_folder, 399 | visibility_folder, depth_folder, pose_folder, depth_scale=1000.0, occlusion_threshold=0.1, draw_picture=False): 400 | """ 401 | Batch process all images in a folder with occlusion detection 402 | 403 | Parameters: 404 | image_folder: folder path containing images 405 | boxes_json: JSON object containing 3D bounding box data 406 | intrinsic_path: file path to camera intrinsic parameters 407 | meta_file: file path to scene metadata 408 | output_folder: output folder path, uses default path if None 409 | visibility_folder: visibility JSON folder path, uses output_folder if None 410 | depth_folder: depth image folder path, no occlusion detection if None 411 | depth_scale: float, scale factor for depth image to convert depth values to meters 412 | occlusion_threshold: float, depth difference threshold in meters for determining point occlusion 413 | 414 | Returns: 415 | processed_images: list of processed image file paths 416 | visibility_jsons: list of visibility JSON file paths 417 | """ 418 | 419 | os.makedirs(output_folder, exist_ok=True) 420 | os.makedirs(visibility_folder, exist_ok=True) 421 | 422 | # Obtain all image files 423 | image_files = 
424 | 425 | 426 | processed_images = [] 427 | visibility_jsons = [] 428 | 429 | for image_file in image_files: 430 | image_path = os.path.join(image_folder, image_file) 431 | 432 | # Reuse the image file name with a .txt extension to look up the corresponding camera pose in the "pose" folder 433 | pose_file = image_file.replace('.jpg', '.txt') 434 | pose_path = os.path.join(pose_folder, pose_file) 435 | 436 | if not os.path.exists(pose_path): 437 | print(f"Pose file does not exist: {pose_path}") 438 | continue 439 | 440 | depth_image_path = None 441 | if depth_folder: 442 | depth_file = image_file.replace('.jpg', '.png') 443 | depth_image_path = os.path.join(depth_folder, depth_file) 444 | if not os.path.exists(depth_image_path): 445 | print(f"Depth image does not exist: {depth_image_path}") 446 | depth_image_path = None 447 | 448 | output_path = os.path.join(output_folder, f"{os.path.splitext(image_file)[0]}_with_boxes.jpg") 449 | visibility_json_path = os.path.join(visibility_folder, f"{os.path.splitext(image_file)[0]}_visibility.json") 450 | 451 | processed_path, vis_json_path = process_image_with_boxes( 452 | image_path, boxes_json, intrinsic_path, pose_path, meta_file, 453 | output_path, visibility_json_path, depth_image_path, depth_scale, occlusion_threshold, draw_picture 454 | ) 455 | 456 | processed_images.append(processed_path) 457 | visibility_jsons.append(vis_json_path) 458 | 459 | 460 | summary_data = { 461 | "scene": os.path.basename(os.path.dirname(image_folder)), 462 | "image_count": len(processed_images), 463 | "depth_images_used": depth_folder is not None, 464 | "occlusion_threshold": occlusion_threshold, 465 | "per_image_visibility": [] 466 | } 467 | 468 | # Aggregate per-image visibility information 469 | for vis_json_path in visibility_jsons: 470 | try: 471 | with open(vis_json_path, 'r') as f: 472 | vis_data = json.load(f) 473 | summary_data["per_image_visibility"].append({ 474 | "image_path": vis_data["image_path"], 475 | "visible_object_count": len(vis_data["visible_objects"]), 476 | 477 | "visible_object_ids": [obj["object_id"] for obj in vis_data["visible_objects"] if obj["visibility_ratio"] > 0.1], 478 | "visible_object_labels": [obj["label"] for obj in vis_data["visible_objects"] if obj["visibility_ratio"] > 0.1], 479 | 480 | "occlusion_checked": any(obj.get("occlusion_checked", False) for obj in vis_data["visible_objects"]) 481 | }) 482 | except Exception as e: 483 | print(f"Error reading visibility file {vis_json_path}: {e}") 484 | 485 | summary_path = os.path.join(visibility_folder, "visibility_summary.json") 486 | 487 | def extract_number(image_path): 488 | match = re.search(r'(\d+)\.jpg', image_path) 489 | if match: 490 | return int(match.group(1)) 491 | return 0 492 | 493 | summary_data['per_image_visibility'] = sorted(summary_data['per_image_visibility'], key=lambda x: extract_number(x["image_path"])) 494 | 495 | with open(summary_path, 'w') as f: 496 | json.dump(summary_data, f, indent=2) 497 | 498 | return processed_images, visibility_jsons 499 | 500 | 501 | def get_3d_box(scene_name, pointcloud_folder, label_map_file): 502 | scan_path = f"{pointcloud_folder}/{scene_name}" 503 | 504 | scan_name = os.path.split(scan_path)[-1] 505 | mesh_file = os.path.join(scan_path, scan_name + "_vh_clean_2.ply") 506 | agg_file = os.path.join(scan_path, scan_name + ".aggregation.json") 507 | seg_file = os.path.join(scan_path, scan_name + "_vh_clean_2.0.010000.segs.json") 508 | meta_file = os.path.join( 509 | scan_path, scan_name + ".txt" 510 | ) # includes axisAlignment info for the
train set scans. 511 | mesh_vertices, label_ids, instance_ids, instance_bboxes, object_id_to_label_id, json_boxes = export( 512 | mesh_file, agg_file, seg_file, meta_file, label_map_file 513 | ) 514 | return json_boxes 515 | 516 | 517 | def process(scene_name, draw_picture=False): 518 | # Original dataset path (modifiable) 519 | scan_path = f"/datasets/scannet/data/scans/{scene_name}" 520 | 521 | # Get all RGB-D images in the scene (modifiable) 522 | image_folder = f"/datasets/scannet/scenes/{scene_name}/mc_frames" 523 | 524 | # Get all point cloud files and label mapping file in the scene 525 | pointcloud_folder = "/datasets/scannet/scans" 526 | label_map_file = "/datasets/scannet/scannetv2-labels.combined.tsv" 527 | 528 | # Output folders (modifiable) 529 | output_folder = f"scannet_metadata/{scene_name}/output_images" # Store rendered images 530 | visibility_folder = f"scannet_metadata/{scene_name}/visibility_data" # Store object information, coordinates, and visibility data for each image 531 | 532 | image_all_files = os.listdir(image_folder) 533 | image_chosen = [file for file in image_all_files if file.lower().endswith('.jpg')] 534 | 535 | # Depth image file path 536 | depth_folder = os.path.join(scan_path, "depth") 537 | # Camera parameter file path 538 | pose_folder = os.path.join(scan_path, "pose") 539 | # Camera intrinsic file path, only RGB-D intrinsic data is used here 540 | intrinsic_path = os.path.join(scan_path, "intrinsic_color.txt") 541 | # For obtaining alignment matrix 542 | meta_file = os.path.join( 543 | pointcloud_folder, scene_name, scene_name + ".txt" 544 | ) 545 | 546 | boxes_json = get_3d_box(scene_name, pointcloud_folder, label_map_file) 547 | 548 | # Set depth image scale factor and occlusion threshold 549 | depth_scale = 1000.0 # Assume depth units are in millimeters, convert to meters 550 | occlusion_threshold = 0.1 # Set depth difference threshold to 10 centimeters 551 | 552 | # Batch process all images 553 | processed_images, visibility_jsons = batch_process_images( 554 | image_folder, image_chosen, boxes_json, intrinsic_path, meta_file, 555 | output_folder, visibility_folder, depth_folder, pose_folder, depth_scale, occlusion_threshold, draw_picture 556 | ) 557 | 558 | 559 | 560 | if __name__ == "__main__": 561 | 562 | # draw_picture boolean value determines whether to draw new images to the output folder (drawing every 100 scenes as shown below) 563 | # Open file and read each line 564 | with open('scannetv2_val.txt', 'r', encoding='utf-8') as file: 565 | lines = file.readlines() 566 | 567 | scenes = [line.strip() for line in lines] 568 | 569 | for i, scene in enumerate(scenes): 570 | draw_picture = False 571 | if i%100 == 0: 572 | draw_picture = True 573 | print(f"Processed {i} scenes") 574 | scene_name = scene 575 | process(scene_name, draw_picture) -------------------------------------------------------------------------------- /data_process/scannet_process/frame_sampling.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def find_continuous_frames(json_data, target_labels): 5 | """ 6 | Find consecutive video frames that collectively contain the specified three objects 7 | 8 | Parameters: 9 | json_data (dict): parsed JSON data 10 | target_labels (list): list of three target object labels 11 | 12 | Returns: 13 | list: list of consecutive frame filenames 14 | """ 15 | # Record visible target objects in each frame 16 | frames_info = [] 17 | for frame in json_data["per_image_visibility"]: 18 | 
visible_targets = set(frame["visible_object_labels"]) & set(target_labels) 19 | frames_info.append({ 20 | "image_path": frame["image_path"], 21 | "visible_targets": visible_targets, 22 | "visible_count": len(visible_targets) 23 | }) 24 | 25 | # Find the frame where target objects first appear 26 | start_index = None 27 | for i, frame in enumerate(frames_info): 28 | if frame["visible_count"] > 0: 29 | start_index = i 30 | break 31 | 32 | if start_index is None: 33 | return [] 34 | 35 | # Starting from the first appearance frame, search for consecutive frame sequences until all target objects are covered 36 | current_index = start_index 37 | found_targets = set() 38 | 39 | while current_index < len(frames_info) and len(found_targets) < len(target_labels): 40 | found_targets.update(frames_info[current_index]["visible_targets"]) 41 | current_index += 1 42 | 43 | if len(found_targets) == len(target_labels) or current_index == len(frames_info): 44 | if len(found_targets) == len(target_labels) and current_index != len(frames_info): 45 | current_index += 1 46 | break 47 | 48 | # Check if all the target objects have been found 49 | if len(found_targets) < len(target_labels): 50 | return [] 51 | 52 | result_frames = [frames_info[i]["image_path"] for i in range(start_index, current_index)] 53 | return result_frames 54 | 55 | def get_full_images(scene_name, target_labels): 56 | 57 | with open(f'scannet_metadata/{scene_name}/visibility_data/visibility_summary.json', 'r') as file: 58 | data = json.load(file) 59 | result = find_continuous_frames(data, target_labels) 60 | return result 61 | -------------------------------------------------------------------------------- /data_process/scannet_process/scannet_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
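# Helper routines for reading ScanNet scene assets: the *_vh_clean_2.ply mesh, the
# .aggregation.json instance groups, the *.segs.json over-segmentation, and the
# scannetv2-labels.combined.tsv mapping from raw category names to nyu40 ids,
# all consumed by export() at the bottom of this file.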
5 | 6 | """ Ref: https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts """ 7 | import os 8 | import sys 9 | import json 10 | import csv 11 | 12 | try: 13 | import numpy as np 14 | except: 15 | print("Failed to import numpy package.") 16 | sys.exit(-1) 17 | 18 | try: 19 | from plyfile import PlyData, PlyElement 20 | except: 21 | print("Please install the module 'plyfile' for PLY i/o, e.g.") 22 | print("pip install plyfile") 23 | sys.exit(-1) 24 | 25 | 26 | def represents_int(s): 27 | """if string s represents an int.""" 28 | try: 29 | int(s) 30 | return True 31 | except ValueError: 32 | return False 33 | 34 | 35 | def read_label_mapping(filename, label_from="raw_category", label_to="nyu40id"): 36 | assert os.path.isfile(filename) 37 | mapping = dict() 38 | with open(filename) as csvfile: 39 | reader = csv.DictReader(csvfile, delimiter="\t") 40 | for row in reader: 41 | mapping[row[label_from]] = int(row[label_to]) 42 | if represents_int(list(mapping.keys())[0]): 43 | mapping = {int(k): v for k, v in mapping.items()} 44 | return mapping 45 | 46 | 47 | def read_mesh_vertices(filename): 48 | """read XYZ for each vertex.""" 49 | assert os.path.isfile(filename) 50 | with open(filename, "rb") as f: 51 | plydata = PlyData.read(f) 52 | num_verts = plydata["vertex"].count 53 | vertices = np.zeros(shape=[num_verts, 3], dtype=np.float32) 54 | vertices[:, 0] = plydata["vertex"].data["x"] 55 | vertices[:, 1] = plydata["vertex"].data["y"] 56 | vertices[:, 2] = plydata["vertex"].data["z"] 57 | return vertices 58 | 59 | 60 | def read_mesh_vertices_rgb(filename): 61 | """read XYZ RGB for each vertex. 62 | Note: RGB values are in 0-255 63 | """ 64 | assert os.path.isfile(filename) 65 | with open(filename, "rb") as f: 66 | plydata = PlyData.read(f) 67 | num_verts = plydata["vertex"].count 68 | vertices = np.zeros(shape=[num_verts, 6], dtype=np.float32) 69 | vertices[:, 0] = plydata["vertex"].data["x"] 70 | vertices[:, 1] = plydata["vertex"].data["y"] 71 | vertices[:, 2] = plydata["vertex"].data["z"] 72 | vertices[:, 3] = plydata["vertex"].data["red"] 73 | vertices[:, 4] = plydata["vertex"].data["green"] 74 | vertices[:, 5] = plydata["vertex"].data["blue"] 75 | return vertices 76 | 77 | 78 | 79 | def read_aggregation(filename): 80 | assert os.path.isfile(filename) 81 | object_id_to_segs = {} 82 | label_to_segs = {} 83 | with open(filename) as f: 84 | data = json.load(f) 85 | num_objects = len(data["segGroups"]) 86 | for i in range(num_objects): 87 | object_id = ( 88 | data["segGroups"][i]["objectId"] + 1 89 | ) # instance ids should be 1-indexed 90 | label = data["segGroups"][i]["label"] 91 | segs = data["segGroups"][i]["segments"] 92 | object_id_to_segs[object_id] = segs 93 | if label in label_to_segs: 94 | label_to_segs[label].extend(segs) 95 | else: 96 | label_to_segs[label] = segs 97 | return object_id_to_segs, label_to_segs 98 | 99 | 100 | def read_segmentation(filename): 101 | assert os.path.isfile(filename) 102 | seg_to_verts = {} 103 | with open(filename) as f: 104 | data = json.load(f) 105 | num_verts = len(data["segIndices"]) 106 | for i in range(num_verts): 107 | seg_id = data["segIndices"][i] 108 | if seg_id in seg_to_verts: 109 | seg_to_verts[seg_id].append(i) 110 | else: 111 | seg_to_verts[seg_id] = [i] 112 | return seg_to_verts, num_verts 113 | 114 | 115 | 116 | 117 | def export(mesh_file, agg_file, seg_file, meta_file, label_map_file, output_file=None, json_file=None): 118 | """points are XYZ RGB (RGB in 0-255), 119 | semantic label as nyu40 ids, 120 | instance label as 1-#instance, 
121 | box as (cx,cy,cz,dx,dy,dz,semantic_label) 122 | """ 123 | label_map = read_label_mapping( 124 | label_map_file, label_from="raw_category", label_to="nyu40id" 125 | ) 126 | mesh_vertices = read_mesh_vertices_rgb(mesh_file) 127 | 128 | # Load scene axis alignment matrix 129 | lines = open(meta_file).readlines() 130 | axis_align_matrix = np.eye(4) 131 | for line in lines: 132 | if "axisAlignment" in line: 133 | axis_align_matrix = np.array([ 134 | float(x) for x in line.rstrip().strip("axisAlignment = ").split(" ") 135 | ]).reshape((4, 4)) 136 | break 137 | 138 | pts = np.ones((mesh_vertices.shape[0], 4)) 139 | pts[:, 0:3] = mesh_vertices[:, 0:3] 140 | pts = np.dot(pts, axis_align_matrix.transpose()) # Nx4 141 | mesh_vertices[:, 0:3] = pts[:, 0:3] 142 | 143 | # Load semantic and instance labels 144 | object_id_to_segs, label_to_segs = read_aggregation(agg_file) 145 | seg_to_verts, num_verts = read_segmentation(seg_file) 146 | label_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated 147 | object_id_to_label_id = {} 148 | 149 | for label, segs in label_to_segs.items(): 150 | label_id = label_map.get(label, 0) 151 | for seg in segs: 152 | verts = seg_to_verts[seg] 153 | label_ids[verts] = label_id 154 | 155 | instance_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated 156 | num_instances = len(np.unique(list(object_id_to_segs.keys()))) 157 | for object_id, segs in object_id_to_segs.items(): 158 | for seg in segs: 159 | verts = seg_to_verts[seg] 160 | instance_ids[verts] = object_id 161 | if object_id not in object_id_to_label_id: 162 | object_id_to_label_id[object_id] = label_ids[verts][0] 163 | 164 | instance_bboxes = np.zeros((num_instances, 7)) 165 | json_boxes = {"boxes": []} 166 | 167 | for obj_id in object_id_to_segs: 168 | label_id = object_id_to_label_id.get(obj_id, 0) 169 | obj_pc = mesh_vertices[instance_ids == obj_id, 0:3] 170 | if len(obj_pc) == 0: 171 | continue 172 | if label_id in [0, 1, 2, 22, 38, 39, 40]: 173 | continue 174 | # Compute axis-aligned bounding box 175 | xmin, ymin, zmin = np.min(obj_pc, axis=0) 176 | xmax, ymax, zmax = np.max(obj_pc, axis=0) 177 | bbox = np.array([ 178 | (xmin + xmax) / 2, 179 | (ymin + ymax) / 2, 180 | (zmin + zmax) / 2, 181 | xmax - xmin, 182 | ymax - ymin, 183 | zmax - zmin, 184 | label_id, 185 | ]) 186 | instance_bboxes[obj_id - 1, :] = bbox 187 | 188 | nyu40_to_category = { 189 | 0: "unlabeled", 1: "wall", 2: "floor", 3: "cabinet", 4: "bed", 190 | 5: "chair", 6: "sofa", 7: "table", 8: "door", 9: "window", 191 | 10: "bookshelf", 11: "picture", 12: "counter", 13: "blinds", 192 | 14: "desk", 15: "shelves", 16: "curtain", 17: "dresser", 193 | 18: "pillow", 19: "mirror", 20: "floor mat", 21: "clothes", 194 | 22: "ceiling", 23: "books", 24: "refrigerator", 25: "television", 195 | 26: "paper", 27: "towel", 28: "shower curtain", 29: "box", 196 | 30: "whiteboard", 31: "person", 32: "nightstand", 33: "toilet", 197 | 34: "sink", 35: "lamp", 36: "bathtub", 37: "bag", 198 | 38: "other structure", 39: "other furniture", 40: "other prop" 199 | } 200 | 201 | json_boxes["boxes"].append({ 202 | "center": bbox[:3].tolist(), 203 | "size": bbox[3:6].tolist(), 204 | "label": nyu40_to_category.get(label_id), 205 | "object_id": int(obj_id) 206 | }) 207 | 208 | if output_file: 209 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 210 | np.save(output_file + "_vert.npy", mesh_vertices) 211 | np.save(output_file + "_sem_label.npy", label_ids) 212 | np.save(output_file + "_ins_label.npy", instance_ids) 213 | 
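# Shapes of the arrays written here (as produced above): *_vert.npy is (N, 6) XYZ+RGB per
# vertex, *_sem_label.npy and *_ins_label.npy are (N,) per-vertex nyu40/instance ids, and
# *_bbox.npy is (num_instances, 7) rows of (cx, cy, cz, dx, dy, dz, nyu40 label id).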
np.save(output_file + "_bbox.npy", instance_bboxes) 214 | 215 | return ( 216 | mesh_vertices, 217 | label_ids, 218 | instance_ids, 219 | instance_bboxes, 220 | object_id_to_label_id, 221 | json_boxes 222 | ) 223 | 224 | -------------------------------------------------------------------------------- /data_process/scannet_process/scannetv2_train.txt: -------------------------------------------------------------------------------- 1 | scene0000_00 2 | scene0000_01 3 | scene0000_02 4 | scene0001_00 5 | scene0001_01 6 | scene0002_00 7 | scene0002_01 8 | scene0003_00 9 | scene0003_01 10 | scene0003_02 11 | scene0004_00 12 | scene0005_00 13 | scene0005_01 14 | scene0006_00 15 | scene0006_01 16 | scene0006_02 17 | scene0007_00 18 | scene0008_00 19 | scene0009_00 20 | scene0009_01 21 | scene0009_02 22 | scene0010_00 23 | scene0010_01 24 | scene0012_00 25 | scene0012_01 26 | scene0012_02 27 | scene0013_00 28 | scene0013_01 29 | scene0013_02 30 | scene0014_00 31 | scene0016_00 32 | scene0016_01 33 | scene0016_02 34 | scene0017_00 35 | scene0017_01 36 | scene0017_02 37 | scene0018_00 38 | scene0020_00 39 | scene0020_01 40 | scene0021_00 41 | scene0022_00 42 | scene0022_01 43 | scene0023_00 44 | scene0024_00 45 | scene0024_01 46 | scene0024_02 47 | scene0026_00 48 | scene0027_00 49 | scene0027_01 50 | scene0027_02 51 | scene0028_00 52 | scene0029_00 53 | scene0029_01 54 | scene0029_02 55 | scene0031_00 56 | scene0031_01 57 | scene0031_02 58 | scene0032_00 59 | scene0032_01 60 | scene0033_00 61 | scene0034_00 62 | scene0034_01 63 | scene0034_02 64 | scene0035_00 65 | scene0035_01 66 | scene0036_00 67 | scene0036_01 68 | scene0037_00 69 | scene0038_00 70 | scene0038_01 71 | scene0038_02 72 | scene0039_00 73 | scene0039_01 74 | scene0040_00 75 | scene0040_01 76 | scene0041_00 77 | scene0041_01 78 | scene0042_00 79 | scene0042_01 80 | scene0042_02 81 | scene0043_00 82 | scene0043_01 83 | scene0044_00 84 | scene0044_01 85 | scene0044_02 86 | scene0045_00 87 | scene0045_01 88 | scene0047_00 89 | scene0048_00 90 | scene0048_01 91 | scene0049_00 92 | scene0051_00 93 | scene0051_01 94 | scene0051_02 95 | scene0051_03 96 | scene0052_00 97 | scene0052_01 98 | scene0052_02 99 | scene0053_00 100 | scene0054_00 101 | scene0055_00 102 | scene0055_01 103 | scene0055_02 104 | scene0056_00 105 | scene0056_01 106 | scene0057_00 107 | scene0057_01 108 | scene0058_00 109 | scene0058_01 110 | scene0059_00 111 | scene0059_01 112 | scene0059_02 113 | scene0060_00 114 | scene0060_01 115 | scene0061_00 116 | scene0061_01 117 | scene0062_00 118 | scene0062_01 119 | scene0062_02 120 | scene0065_00 121 | scene0065_01 122 | scene0065_02 123 | scene0066_00 124 | scene0067_00 125 | scene0067_01 126 | scene0067_02 127 | scene0068_00 128 | scene0068_01 129 | scene0069_00 130 | scene0070_00 131 | scene0071_00 132 | scene0072_00 133 | scene0072_01 134 | scene0072_02 135 | scene0073_00 136 | scene0073_01 137 | scene0073_02 138 | scene0073_03 139 | scene0074_00 140 | scene0074_01 141 | scene0074_02 142 | scene0075_00 143 | scene0076_00 144 | scene0078_00 145 | scene0078_01 146 | scene0078_02 147 | scene0079_00 148 | scene0079_01 149 | scene0080_00 150 | scene0080_01 151 | scene0080_02 152 | scene0082_00 153 | scene0083_00 154 | scene0083_01 155 | scene0085_00 156 | scene0085_01 157 | scene0087_00 158 | scene0087_01 159 | scene0087_02 160 | scene0089_00 161 | scene0089_01 162 | scene0089_02 163 | scene0090_00 164 | scene0091_00 165 | scene0092_00 166 | scene0092_01 167 | scene0092_02 168 | scene0092_03 169 | scene0092_04 170 | scene0093_00 
171 | scene0093_01 172 | scene0093_02 173 | scene0094_00 174 | scene0096_00 175 | scene0096_01 176 | scene0096_02 177 | scene0097_00 178 | scene0098_00 179 | scene0098_01 180 | scene0099_00 181 | scene0099_01 182 | scene0101_00 183 | scene0101_01 184 | scene0101_02 185 | scene0101_03 186 | scene0101_04 187 | scene0101_05 188 | scene0102_00 189 | scene0102_01 190 | scene0103_00 191 | scene0103_01 192 | scene0104_00 193 | scene0105_00 194 | scene0105_01 195 | scene0105_02 196 | scene0106_00 197 | scene0106_01 198 | scene0106_02 199 | scene0107_00 200 | scene0108_00 201 | scene0109_00 202 | scene0109_01 203 | scene0110_00 204 | scene0110_01 205 | scene0110_02 206 | scene0111_00 207 | scene0111_01 208 | scene0111_02 209 | scene0112_00 210 | scene0112_01 211 | scene0112_02 212 | scene0113_00 213 | scene0113_01 214 | scene0114_00 215 | scene0114_01 216 | scene0114_02 217 | scene0115_00 218 | scene0115_01 219 | scene0115_02 220 | scene0116_00 221 | scene0116_01 222 | scene0116_02 223 | scene0117_00 224 | scene0118_00 225 | scene0118_01 226 | scene0118_02 227 | scene0119_00 228 | scene0120_00 229 | scene0120_01 230 | scene0121_00 231 | scene0121_01 232 | scene0121_02 233 | scene0122_00 234 | scene0122_01 235 | scene0123_00 236 | scene0123_01 237 | scene0123_02 238 | scene0124_00 239 | scene0124_01 240 | scene0125_00 241 | scene0126_00 242 | scene0126_01 243 | scene0126_02 244 | scene0127_00 245 | scene0127_01 246 | scene0128_00 247 | scene0129_00 248 | scene0130_00 249 | scene0132_00 250 | scene0132_01 251 | scene0132_02 252 | scene0133_00 253 | scene0134_00 254 | scene0134_01 255 | scene0134_02 256 | scene0135_00 257 | scene0136_00 258 | scene0136_01 259 | scene0136_02 260 | scene0137_00 261 | scene0137_01 262 | scene0137_02 263 | scene0138_00 264 | scene0140_00 265 | scene0140_01 266 | scene0141_00 267 | scene0141_01 268 | scene0141_02 269 | scene0142_00 270 | scene0142_01 271 | scene0143_00 272 | scene0143_01 273 | scene0143_02 274 | scene0145_00 275 | scene0147_00 276 | scene0147_01 277 | scene0148_00 278 | scene0150_00 279 | scene0150_01 280 | scene0150_02 281 | scene0151_00 282 | scene0151_01 283 | scene0152_00 284 | scene0152_01 285 | scene0152_02 286 | scene0154_00 287 | scene0155_00 288 | scene0155_01 289 | scene0155_02 290 | scene0156_00 291 | scene0157_00 292 | scene0157_01 293 | scene0158_00 294 | scene0158_01 295 | scene0158_02 296 | scene0159_00 297 | scene0160_00 298 | scene0160_01 299 | scene0160_02 300 | scene0160_03 301 | scene0160_04 302 | scene0161_00 303 | scene0161_01 304 | scene0161_02 305 | scene0162_00 306 | scene0163_00 307 | scene0163_01 308 | scene0165_00 309 | scene0165_01 310 | scene0165_02 311 | scene0166_00 312 | scene0166_01 313 | scene0166_02 314 | scene0167_00 315 | scene0168_00 316 | scene0168_01 317 | scene0168_02 318 | scene0170_00 319 | scene0170_01 320 | scene0170_02 321 | scene0171_00 322 | scene0171_01 323 | scene0172_00 324 | scene0172_01 325 | scene0173_00 326 | scene0173_01 327 | scene0173_02 328 | scene0174_00 329 | scene0174_01 330 | scene0175_00 331 | scene0176_00 332 | scene0177_00 333 | scene0177_01 334 | scene0177_02 335 | scene0178_00 336 | scene0179_00 337 | scene0180_00 338 | scene0181_00 339 | scene0181_01 340 | scene0181_02 341 | scene0181_03 342 | scene0182_00 343 | scene0182_01 344 | scene0182_02 345 | scene0183_00 346 | scene0184_00 347 | scene0185_00 348 | scene0186_00 349 | scene0186_01 350 | scene0188_00 351 | scene0189_00 352 | scene0190_00 353 | scene0191_00 354 | scene0191_01 355 | scene0191_02 356 | scene0192_00 357 | scene0192_01 
358 | scene0192_02 359 | scene0194_00 360 | scene0195_00 361 | scene0195_01 362 | scene0195_02 363 | scene0197_00 364 | scene0197_01 365 | scene0197_02 366 | scene0198_00 367 | scene0199_00 368 | scene0200_00 369 | scene0200_01 370 | scene0200_02 371 | scene0201_00 372 | scene0201_01 373 | scene0201_02 374 | scene0202_00 375 | scene0204_00 376 | scene0204_01 377 | scene0204_02 378 | scene0205_00 379 | scene0205_01 380 | scene0205_02 381 | scene0206_00 382 | scene0206_01 383 | scene0206_02 384 | scene0209_00 385 | scene0209_01 386 | scene0209_02 387 | scene0210_00 388 | scene0210_01 389 | scene0211_00 390 | scene0211_01 391 | scene0211_02 392 | scene0211_03 393 | scene0212_00 394 | scene0212_01 395 | scene0212_02 396 | scene0213_00 397 | scene0214_00 398 | scene0214_01 399 | scene0214_02 400 | scene0215_00 401 | scene0215_01 402 | scene0216_00 403 | scene0218_00 404 | scene0218_01 405 | scene0219_00 406 | scene0220_00 407 | scene0220_01 408 | scene0220_02 409 | scene0223_00 410 | scene0223_01 411 | scene0223_02 412 | scene0224_00 413 | scene0225_00 414 | scene0226_00 415 | scene0226_01 416 | scene0227_00 417 | scene0228_00 418 | scene0229_00 419 | scene0229_01 420 | scene0229_02 421 | scene0230_00 422 | scene0232_00 423 | scene0232_01 424 | scene0232_02 425 | scene0233_00 426 | scene0233_01 427 | scene0234_00 428 | scene0235_00 429 | scene0236_00 430 | scene0236_01 431 | scene0237_00 432 | scene0237_01 433 | scene0238_00 434 | scene0238_01 435 | scene0239_00 436 | scene0239_01 437 | scene0239_02 438 | scene0240_00 439 | scene0241_00 440 | scene0241_01 441 | scene0241_02 442 | scene0242_00 443 | scene0242_01 444 | scene0242_02 445 | scene0243_00 446 | scene0244_00 447 | scene0244_01 448 | scene0245_00 449 | scene0247_00 450 | scene0247_01 451 | scene0248_00 452 | scene0248_01 453 | scene0248_02 454 | scene0250_00 455 | scene0250_01 456 | scene0250_02 457 | scene0252_00 458 | scene0253_00 459 | scene0254_00 460 | scene0254_01 461 | scene0255_00 462 | scene0255_01 463 | scene0255_02 464 | scene0258_00 465 | scene0259_00 466 | scene0259_01 467 | scene0260_00 468 | scene0260_01 469 | scene0260_02 470 | scene0261_00 471 | scene0261_01 472 | scene0261_02 473 | scene0261_03 474 | scene0262_00 475 | scene0262_01 476 | scene0263_00 477 | scene0263_01 478 | scene0264_00 479 | scene0264_01 480 | scene0264_02 481 | scene0265_00 482 | scene0265_01 483 | scene0265_02 484 | scene0266_00 485 | scene0266_01 486 | scene0267_00 487 | scene0268_00 488 | scene0268_01 489 | scene0268_02 490 | scene0269_00 491 | scene0269_01 492 | scene0269_02 493 | scene0270_00 494 | scene0270_01 495 | scene0270_02 496 | scene0271_00 497 | scene0271_01 498 | scene0272_00 499 | scene0272_01 500 | scene0273_00 501 | scene0273_01 502 | scene0274_00 503 | scene0274_01 504 | scene0274_02 505 | scene0275_00 506 | scene0276_00 507 | scene0276_01 508 | scene0279_00 509 | scene0279_01 510 | scene0279_02 511 | scene0280_00 512 | scene0280_01 513 | scene0280_02 514 | scene0281_00 515 | scene0282_00 516 | scene0282_01 517 | scene0282_02 518 | scene0283_00 519 | scene0284_00 520 | scene0285_00 521 | scene0286_00 522 | scene0286_01 523 | scene0286_02 524 | scene0286_03 525 | scene0287_00 526 | scene0288_00 527 | scene0288_01 528 | scene0288_02 529 | scene0289_00 530 | scene0289_01 531 | scene0290_00 532 | scene0291_00 533 | scene0291_01 534 | scene0291_02 535 | scene0292_00 536 | scene0292_01 537 | scene0293_00 538 | scene0293_01 539 | scene0294_00 540 | scene0294_01 541 | scene0294_02 542 | scene0295_00 543 | scene0295_01 544 | scene0296_00 
545 | scene0296_01 546 | scene0297_00 547 | scene0297_01 548 | scene0297_02 549 | scene0298_00 550 | scene0299_00 551 | scene0299_01 552 | scene0301_00 553 | scene0301_01 554 | scene0301_02 555 | scene0302_00 556 | scene0302_01 557 | scene0303_00 558 | scene0303_01 559 | scene0303_02 560 | scene0305_00 561 | scene0305_01 562 | scene0306_00 563 | scene0306_01 564 | scene0308_00 565 | scene0309_00 566 | scene0309_01 567 | scene0310_00 568 | scene0310_01 569 | scene0310_02 570 | scene0311_00 571 | scene0312_00 572 | scene0312_01 573 | scene0312_02 574 | scene0313_00 575 | scene0313_01 576 | scene0313_02 577 | scene0315_00 578 | scene0317_00 579 | scene0317_01 580 | scene0318_00 581 | scene0319_00 582 | scene0320_00 583 | scene0320_01 584 | scene0320_02 585 | scene0320_03 586 | scene0321_00 587 | scene0322_00 588 | scene0323_00 589 | scene0323_01 590 | scene0324_00 591 | scene0324_01 592 | scene0325_00 593 | scene0325_01 594 | scene0326_00 595 | scene0327_00 596 | scene0330_00 597 | scene0331_00 598 | scene0331_01 599 | scene0332_00 600 | scene0332_01 601 | scene0332_02 602 | scene0333_00 603 | scene0335_00 604 | scene0335_01 605 | scene0335_02 606 | scene0336_00 607 | scene0336_01 608 | scene0337_00 609 | scene0337_01 610 | scene0337_02 611 | scene0339_00 612 | scene0340_00 613 | scene0340_01 614 | scene0340_02 615 | scene0341_00 616 | scene0341_01 617 | scene0344_00 618 | scene0344_01 619 | scene0345_00 620 | scene0345_01 621 | scene0346_00 622 | scene0346_01 623 | scene0347_00 624 | scene0347_01 625 | scene0347_02 626 | scene0348_00 627 | scene0348_01 628 | scene0348_02 629 | scene0349_00 630 | scene0349_01 631 | scene0350_00 632 | scene0350_01 633 | scene0350_02 634 | scene0352_00 635 | scene0352_01 636 | scene0352_02 637 | scene0358_00 638 | scene0358_01 639 | scene0358_02 640 | scene0359_00 641 | scene0359_01 642 | scene0360_00 643 | scene0361_00 644 | scene0361_01 645 | scene0361_02 646 | scene0362_00 647 | scene0362_01 648 | scene0362_02 649 | scene0362_03 650 | scene0363_00 651 | scene0364_00 652 | scene0364_01 653 | scene0365_00 654 | scene0365_01 655 | scene0365_02 656 | scene0366_00 657 | scene0367_00 658 | scene0367_01 659 | scene0368_00 660 | scene0368_01 661 | scene0369_00 662 | scene0369_01 663 | scene0369_02 664 | scene0370_00 665 | scene0370_01 666 | scene0370_02 667 | scene0371_00 668 | scene0371_01 669 | scene0372_00 670 | scene0373_00 671 | scene0373_01 672 | scene0374_00 673 | scene0375_00 674 | scene0375_01 675 | scene0375_02 676 | scene0376_00 677 | scene0376_01 678 | scene0376_02 679 | scene0379_00 680 | scene0380_00 681 | scene0380_01 682 | scene0380_02 683 | scene0381_00 684 | scene0381_01 685 | scene0381_02 686 | scene0383_00 687 | scene0383_01 688 | scene0383_02 689 | scene0384_00 690 | scene0385_00 691 | scene0385_01 692 | scene0385_02 693 | scene0386_00 694 | scene0387_00 695 | scene0387_01 696 | scene0387_02 697 | scene0388_00 698 | scene0388_01 699 | scene0390_00 700 | scene0391_00 701 | scene0392_00 702 | scene0392_01 703 | scene0392_02 704 | scene0393_00 705 | scene0393_01 706 | scene0393_02 707 | scene0394_00 708 | scene0394_01 709 | scene0395_00 710 | scene0395_01 711 | scene0395_02 712 | scene0396_00 713 | scene0396_01 714 | scene0396_02 715 | scene0397_00 716 | scene0397_01 717 | scene0398_00 718 | scene0398_01 719 | scene0399_00 720 | scene0399_01 721 | scene0400_00 722 | scene0400_01 723 | scene0401_00 724 | scene0402_00 725 | scene0403_00 726 | scene0403_01 727 | scene0404_00 728 | scene0404_01 729 | scene0404_02 730 | scene0405_00 731 | scene0407_00 
732 | scene0407_01 733 | scene0408_00 734 | scene0408_01 735 | scene0409_00 736 | scene0409_01 737 | scene0410_00 738 | scene0410_01 739 | scene0411_00 740 | scene0411_01 741 | scene0411_02 742 | scene0413_00 743 | scene0415_00 744 | scene0415_01 745 | scene0415_02 746 | scene0416_00 747 | scene0416_01 748 | scene0416_02 749 | scene0416_03 750 | scene0416_04 751 | scene0417_00 752 | scene0418_00 753 | scene0418_01 754 | scene0418_02 755 | scene0419_00 756 | scene0419_01 757 | scene0419_02 758 | scene0420_00 759 | scene0420_01 760 | scene0420_02 761 | scene0421_00 762 | scene0421_01 763 | scene0421_02 764 | scene0422_00 765 | scene0424_00 766 | scene0424_01 767 | scene0424_02 768 | scene0425_00 769 | scene0425_01 770 | scene0428_00 771 | scene0428_01 772 | scene0429_00 773 | scene0431_00 774 | scene0433_00 775 | scene0434_00 776 | scene0434_01 777 | scene0434_02 778 | scene0436_00 779 | scene0437_00 780 | scene0437_01 781 | scene0438_00 782 | scene0439_00 783 | scene0439_01 784 | scene0440_00 785 | scene0440_01 786 | scene0440_02 787 | scene0442_00 788 | scene0443_00 789 | scene0444_00 790 | scene0444_01 791 | scene0445_00 792 | scene0445_01 793 | scene0446_00 794 | scene0446_01 795 | scene0447_00 796 | scene0447_01 797 | scene0447_02 798 | scene0448_00 799 | scene0448_01 800 | scene0448_02 801 | scene0449_00 802 | scene0449_01 803 | scene0449_02 804 | scene0450_00 805 | scene0451_00 806 | scene0451_01 807 | scene0451_02 808 | scene0451_03 809 | scene0451_04 810 | scene0451_05 811 | scene0452_00 812 | scene0452_01 813 | scene0452_02 814 | scene0453_00 815 | scene0453_01 816 | scene0454_00 817 | scene0455_00 818 | scene0456_00 819 | scene0456_01 820 | scene0457_00 821 | scene0457_01 822 | scene0457_02 823 | scene0459_00 824 | scene0459_01 825 | scene0460_00 826 | scene0463_00 827 | scene0463_01 828 | scene0464_00 829 | scene0465_00 830 | scene0465_01 831 | scene0466_00 832 | scene0466_01 833 | scene0467_00 834 | scene0468_00 835 | scene0468_01 836 | scene0468_02 837 | scene0469_00 838 | scene0469_01 839 | scene0469_02 840 | scene0470_00 841 | scene0470_01 842 | scene0471_00 843 | scene0471_01 844 | scene0471_02 845 | scene0472_00 846 | scene0472_01 847 | scene0472_02 848 | scene0473_00 849 | scene0473_01 850 | scene0475_00 851 | scene0475_01 852 | scene0475_02 853 | scene0476_00 854 | scene0476_01 855 | scene0476_02 856 | scene0477_00 857 | scene0477_01 858 | scene0478_00 859 | scene0478_01 860 | scene0479_00 861 | scene0479_01 862 | scene0479_02 863 | scene0480_00 864 | scene0480_01 865 | scene0481_00 866 | scene0481_01 867 | scene0482_00 868 | scene0482_01 869 | scene0483_00 870 | scene0484_00 871 | scene0484_01 872 | scene0485_00 873 | scene0486_00 874 | scene0487_00 875 | scene0487_01 876 | scene0489_00 877 | scene0489_01 878 | scene0489_02 879 | scene0491_00 880 | scene0492_00 881 | scene0492_01 882 | scene0493_00 883 | scene0493_01 884 | scene0495_00 885 | scene0497_00 886 | scene0498_00 887 | scene0498_01 888 | scene0498_02 889 | scene0499_00 890 | scene0501_00 891 | scene0501_01 892 | scene0501_02 893 | scene0502_00 894 | scene0502_01 895 | scene0502_02 896 | scene0503_00 897 | scene0504_00 898 | scene0505_00 899 | scene0505_01 900 | scene0505_02 901 | scene0505_03 902 | scene0505_04 903 | scene0506_00 904 | scene0507_00 905 | scene0508_00 906 | scene0508_01 907 | scene0508_02 908 | scene0509_00 909 | scene0509_01 910 | scene0509_02 911 | scene0510_00 912 | scene0510_01 913 | scene0510_02 914 | scene0511_00 915 | scene0511_01 916 | scene0512_00 917 | scene0513_00 918 | scene0514_00 
919 | scene0514_01 920 | scene0515_00 921 | scene0515_01 922 | scene0515_02 923 | scene0516_00 924 | scene0516_01 925 | scene0517_00 926 | scene0517_01 927 | scene0517_02 928 | scene0519_00 929 | scene0520_00 930 | scene0520_01 931 | scene0521_00 932 | scene0522_00 933 | scene0523_00 934 | scene0523_01 935 | scene0523_02 936 | scene0524_00 937 | scene0524_01 938 | scene0525_00 939 | scene0525_01 940 | scene0525_02 941 | scene0526_00 942 | scene0526_01 943 | scene0528_00 944 | scene0528_01 945 | scene0529_00 946 | scene0529_01 947 | scene0529_02 948 | scene0530_00 949 | scene0531_00 950 | scene0532_00 951 | scene0532_01 952 | scene0533_00 953 | scene0533_01 954 | scene0534_00 955 | scene0534_01 956 | scene0536_00 957 | scene0536_01 958 | scene0536_02 959 | scene0537_00 960 | scene0538_00 961 | scene0539_00 962 | scene0539_01 963 | scene0539_02 964 | scene0540_00 965 | scene0540_01 966 | scene0540_02 967 | scene0541_00 968 | scene0541_01 969 | scene0541_02 970 | scene0542_00 971 | scene0543_00 972 | scene0543_01 973 | scene0543_02 974 | scene0544_00 975 | scene0545_00 976 | scene0545_01 977 | scene0545_02 978 | scene0546_00 979 | scene0547_00 980 | scene0547_01 981 | scene0547_02 982 | scene0548_00 983 | scene0548_01 984 | scene0548_02 985 | scene0551_00 986 | scene0554_00 987 | scene0554_01 988 | scene0555_00 989 | scene0556_00 990 | scene0556_01 991 | scene0557_00 992 | scene0557_01 993 | scene0557_02 994 | scene0560_00 995 | scene0561_00 996 | scene0561_01 997 | scene0562_00 998 | scene0563_00 999 | scene0564_00 1000 | scene0566_00 1001 | scene0567_00 1002 | scene0567_01 1003 | scene0569_00 1004 | scene0569_01 1005 | scene0570_00 1006 | scene0570_01 1007 | scene0570_02 1008 | scene0571_00 1009 | scene0571_01 1010 | scene0572_00 1011 | scene0572_01 1012 | scene0572_02 1013 | scene0573_00 1014 | scene0573_01 1015 | scene0576_00 1016 | scene0576_01 1017 | scene0576_02 1018 | scene0577_00 1019 | scene0579_00 1020 | scene0579_01 1021 | scene0579_02 1022 | scene0581_00 1023 | scene0581_01 1024 | scene0581_02 1025 | scene0582_00 1026 | scene0582_01 1027 | scene0582_02 1028 | scene0584_00 1029 | scene0584_01 1030 | scene0584_02 1031 | scene0585_00 1032 | scene0585_01 1033 | scene0586_00 1034 | scene0586_01 1035 | scene0586_02 1036 | scene0587_00 1037 | scene0587_01 1038 | scene0587_02 1039 | scene0587_03 1040 | scene0588_00 1041 | scene0588_01 1042 | scene0588_02 1043 | scene0588_03 1044 | scene0589_00 1045 | scene0589_01 1046 | scene0589_02 1047 | scene0590_00 1048 | scene0590_01 1049 | scene0592_00 1050 | scene0592_01 1051 | scene0594_00 1052 | scene0596_00 1053 | scene0596_01 1054 | scene0596_02 1055 | scene0597_00 1056 | scene0597_01 1057 | scene0597_02 1058 | scene0600_00 1059 | scene0600_01 1060 | scene0600_02 1061 | scene0601_00 1062 | scene0601_01 1063 | scene0602_00 1064 | scene0603_00 1065 | scene0603_01 1066 | scene0604_00 1067 | scene0604_01 1068 | scene0604_02 1069 | scene0605_00 1070 | scene0605_01 1071 | scene0610_00 1072 | scene0610_01 1073 | scene0610_02 1074 | scene0611_00 1075 | scene0611_01 1076 | scene0612_00 1077 | scene0612_01 1078 | scene0613_00 1079 | scene0613_01 1080 | scene0613_02 1081 | scene0614_00 1082 | scene0614_01 1083 | scene0614_02 1084 | scene0615_00 1085 | scene0615_01 1086 | scene0617_00 1087 | scene0619_00 1088 | scene0620_00 1089 | scene0620_01 1090 | scene0622_00 1091 | scene0622_01 1092 | scene0623_00 1093 | scene0623_01 1094 | scene0624_00 1095 | scene0625_00 1096 | scene0625_01 1097 | scene0626_00 1098 | scene0626_01 1099 | scene0626_02 1100 | 
scene0627_00 1101 | scene0627_01 1102 | scene0628_00 1103 | scene0628_01 1104 | scene0628_02 1105 | scene0630_00 1106 | scene0630_01 1107 | scene0630_02 1108 | scene0630_03 1109 | scene0630_04 1110 | scene0630_05 1111 | scene0630_06 1112 | scene0631_00 1113 | scene0631_01 1114 | scene0631_02 1115 | scene0632_00 1116 | scene0634_00 1117 | scene0635_00 1118 | scene0635_01 1119 | scene0636_00 1120 | scene0637_00 1121 | scene0638_00 1122 | scene0639_00 1123 | scene0640_00 1124 | scene0640_01 1125 | scene0640_02 1126 | scene0641_00 1127 | scene0642_00 1128 | scene0642_01 1129 | scene0642_02 1130 | scene0642_03 1131 | scene0646_00 1132 | scene0646_01 1133 | scene0646_02 1134 | scene0649_00 1135 | scene0649_01 1136 | scene0650_00 1137 | scene0654_00 1138 | scene0654_01 1139 | scene0656_00 1140 | scene0656_01 1141 | scene0656_02 1142 | scene0656_03 1143 | scene0657_00 1144 | scene0659_00 1145 | scene0659_01 1146 | scene0661_00 1147 | scene0662_00 1148 | scene0662_01 1149 | scene0662_02 1150 | scene0666_00 1151 | scene0666_01 1152 | scene0666_02 1153 | scene0667_00 1154 | scene0667_01 1155 | scene0667_02 1156 | scene0668_00 1157 | scene0669_00 1158 | scene0669_01 1159 | scene0672_00 1160 | scene0672_01 1161 | scene0673_00 1162 | scene0673_01 1163 | scene0673_02 1164 | scene0673_03 1165 | scene0673_04 1166 | scene0673_05 1167 | scene0674_00 1168 | scene0674_01 1169 | scene0675_00 1170 | scene0675_01 1171 | scene0676_00 1172 | scene0676_01 1173 | scene0677_00 1174 | scene0677_01 1175 | scene0677_02 1176 | scene0679_00 1177 | scene0679_01 1178 | scene0680_00 1179 | scene0680_01 1180 | scene0681_00 1181 | scene0682_00 1182 | scene0683_00 1183 | scene0687_00 1184 | scene0688_00 1185 | scene0691_00 1186 | scene0691_01 1187 | scene0692_00 1188 | scene0692_01 1189 | scene0692_02 1190 | scene0692_03 1191 | scene0692_04 1192 | scene0694_00 1193 | scene0694_01 1194 | scene0698_00 1195 | scene0698_01 1196 | scene0703_00 1197 | scene0703_01 1198 | scene0705_00 1199 | scene0705_01 1200 | scene0705_02 1201 | scene0706_00 -------------------------------------------------------------------------------- /data_process/scannet_process/scannetv2_val.txt: -------------------------------------------------------------------------------- 1 | scene0011_00 2 | scene0011_01 3 | scene0015_00 4 | scene0019_00 5 | scene0019_01 6 | scene0025_00 7 | scene0025_01 8 | scene0025_02 9 | scene0030_00 10 | scene0030_01 11 | scene0030_02 12 | scene0046_00 13 | scene0046_01 14 | scene0046_02 15 | scene0050_00 16 | scene0050_01 17 | scene0050_02 18 | scene0063_00 19 | scene0064_00 20 | scene0064_01 21 | scene0077_00 22 | scene0077_01 23 | scene0081_00 24 | scene0081_01 25 | scene0081_02 26 | scene0084_00 27 | scene0084_01 28 | scene0084_02 29 | scene0086_00 30 | scene0086_01 31 | scene0086_02 32 | scene0088_00 33 | scene0088_01 34 | scene0088_02 35 | scene0088_03 36 | scene0095_00 37 | scene0095_01 38 | scene0100_00 39 | scene0100_01 40 | scene0100_02 41 | scene0131_00 42 | scene0131_01 43 | scene0131_02 44 | scene0139_00 45 | scene0144_00 46 | scene0144_01 47 | scene0146_00 48 | scene0146_01 49 | scene0146_02 50 | scene0149_00 51 | scene0153_00 52 | scene0153_01 53 | scene0164_00 54 | scene0164_01 55 | scene0164_02 56 | scene0164_03 57 | scene0169_00 58 | scene0169_01 59 | scene0187_00 60 | scene0187_01 61 | scene0193_00 62 | scene0193_01 63 | scene0196_00 64 | scene0203_00 65 | scene0203_01 66 | scene0203_02 67 | scene0207_00 68 | scene0207_01 69 | scene0207_02 70 | scene0208_00 71 | scene0217_00 72 | scene0221_00 73 | scene0221_01 74 | 
scene0222_00 75 | scene0222_01 76 | scene0231_00 77 | scene0231_01 78 | scene0231_02 79 | scene0246_00 80 | scene0249_00 81 | scene0251_00 82 | scene0256_00 83 | scene0256_01 84 | scene0256_02 85 | scene0257_00 86 | scene0277_00 87 | scene0277_01 88 | scene0277_02 89 | scene0278_00 90 | scene0278_01 91 | scene0300_00 92 | scene0300_01 93 | scene0304_00 94 | scene0307_00 95 | scene0307_01 96 | scene0307_02 97 | scene0314_00 98 | scene0316_00 99 | scene0328_00 100 | scene0329_00 101 | scene0329_01 102 | scene0329_02 103 | scene0334_00 104 | scene0334_01 105 | scene0334_02 106 | scene0338_00 107 | scene0338_01 108 | scene0338_02 109 | scene0342_00 110 | scene0343_00 111 | scene0351_00 112 | scene0351_01 113 | scene0353_00 114 | scene0353_01 115 | scene0353_02 116 | scene0354_00 117 | scene0355_00 118 | scene0355_01 119 | scene0356_00 120 | scene0356_01 121 | scene0356_02 122 | scene0357_00 123 | scene0357_01 124 | scene0377_00 125 | scene0377_01 126 | scene0377_02 127 | scene0378_00 128 | scene0378_01 129 | scene0378_02 130 | scene0382_00 131 | scene0382_01 132 | scene0389_00 133 | scene0406_00 134 | scene0406_01 135 | scene0406_02 136 | scene0412_00 137 | scene0412_01 138 | scene0414_00 139 | scene0423_00 140 | scene0423_01 141 | scene0423_02 142 | scene0426_00 143 | scene0426_01 144 | scene0426_02 145 | scene0426_03 146 | scene0427_00 147 | scene0430_00 148 | scene0430_01 149 | scene0432_00 150 | scene0432_01 151 | scene0435_00 152 | scene0435_01 153 | scene0435_02 154 | scene0435_03 155 | scene0441_00 156 | scene0458_00 157 | scene0458_01 158 | scene0461_00 159 | scene0462_00 160 | scene0474_00 161 | scene0474_01 162 | scene0474_02 163 | scene0474_03 164 | scene0474_04 165 | scene0474_05 166 | scene0488_00 167 | scene0488_01 168 | scene0490_00 169 | scene0494_00 170 | scene0496_00 171 | scene0500_00 172 | scene0500_01 173 | scene0518_00 174 | scene0527_00 175 | scene0535_00 176 | scene0549_00 177 | scene0549_01 178 | scene0550_00 179 | scene0552_00 180 | scene0552_01 181 | scene0553_00 182 | scene0553_01 183 | scene0553_02 184 | scene0558_00 185 | scene0558_01 186 | scene0558_02 187 | scene0559_00 188 | scene0559_01 189 | scene0559_02 190 | scene0565_00 191 | scene0568_00 192 | scene0568_01 193 | scene0568_02 194 | scene0574_00 195 | scene0574_01 196 | scene0574_02 197 | scene0575_00 198 | scene0575_01 199 | scene0575_02 200 | scene0578_00 201 | scene0578_01 202 | scene0578_02 203 | scene0580_00 204 | scene0580_01 205 | scene0583_00 206 | scene0583_01 207 | scene0583_02 208 | scene0591_00 209 | scene0591_01 210 | scene0591_02 211 | scene0593_00 212 | scene0593_01 213 | scene0595_00 214 | scene0598_00 215 | scene0598_01 216 | scene0598_02 217 | scene0599_00 218 | scene0599_01 219 | scene0599_02 220 | scene0606_00 221 | scene0606_01 222 | scene0606_02 223 | scene0607_00 224 | scene0607_01 225 | scene0608_00 226 | scene0608_01 227 | scene0608_02 228 | scene0609_00 229 | scene0609_01 230 | scene0609_02 231 | scene0609_03 232 | scene0616_00 233 | scene0616_01 234 | scene0618_00 235 | scene0621_00 236 | scene0629_00 237 | scene0629_01 238 | scene0629_02 239 | scene0633_00 240 | scene0633_01 241 | scene0643_00 242 | scene0644_00 243 | scene0645_00 244 | scene0645_01 245 | scene0645_02 246 | scene0647_00 247 | scene0647_01 248 | scene0648_00 249 | scene0648_01 250 | scene0651_00 251 | scene0651_01 252 | scene0651_02 253 | scene0652_00 254 | scene0653_00 255 | scene0653_01 256 | scene0655_00 257 | scene0655_01 258 | scene0655_02 259 | scene0658_00 260 | scene0660_00 261 | scene0663_00 262 | 
scene0663_01 263 | scene0663_02 264 | scene0664_00 265 | scene0664_01 266 | scene0664_02 267 | scene0665_00 268 | scene0665_01 269 | scene0670_00 270 | scene0670_01 271 | scene0671_00 272 | scene0671_01 273 | scene0678_00 274 | scene0678_01 275 | scene0678_02 276 | scene0684_00 277 | scene0684_01 278 | scene0685_00 279 | scene0685_01 280 | scene0685_02 281 | scene0686_00 282 | scene0686_01 283 | scene0686_02 284 | scene0689_00 285 | scene0690_00 286 | scene0690_01 287 | scene0693_00 288 | scene0693_01 289 | scene0693_02 290 | scene0695_00 291 | scene0695_01 292 | scene0695_02 293 | scene0695_03 294 | scene0696_00 295 | scene0696_01 296 | scene0696_02 297 | scene0697_00 298 | scene0697_01 299 | scene0697_02 300 | scene0697_03 301 | scene0699_00 302 | scene0700_00 303 | scene0700_01 304 | scene0700_02 305 | scene0701_00 306 | scene0701_01 307 | scene0701_02 308 | scene0702_00 309 | scene0702_01 310 | scene0702_02 311 | scene0704_00 312 | scene0704_01 -------------------------------------------------------------------------------- /docs/flat_patternmaking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/flat_patternmaking.png -------------------------------------------------------------------------------- /docs/icon/avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/icon/avatar.png -------------------------------------------------------------------------------- /docs/icon/avatar1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/icon/avatar1.png -------------------------------------------------------------------------------- /docs/main_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/main_result.png -------------------------------------------------------------------------------- /docs/pipeline_and_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/pipeline_and_case.png -------------------------------------------------------------------------------- /docs/pipline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZJU-REAL/ViewSpatial-Bench/b99ddf54d05a791a0e0af0d1d13f4379aae36821/docs/pipline.png -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import os, re, csv, json, torch, base64 2 | import random, argparse 3 | import numpy as np 4 | from PIL import Image 5 | from random import seed 6 | from openai import OpenAI 7 | from tqdm.auto import tqdm 8 | from collections import defaultdict 9 | from transformers import AutoModelForCausalLM, AutoProcessor 10 | # Llama-3.2-11B-Vision 11 | from transformers import MllamaForConditionalGeneration 12 | # Qwen2-VL 13 | from transformers import Qwen2VLForConditionalGeneration 14 | # Qwen2.5-VL 15 | from qwen_vl_utils import 
process_vision_info 16 | from transformers import Qwen2_5_VLForConditionalGeneration 17 | # LlavaOnevision 18 | from transformers import LlavaOnevisionForConditionalGeneration 19 | # Intern2.5/3 20 | from lmdeploy.vl import load_image 21 | from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig 22 | # LlavaNextVideo 23 | from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration 24 | 25 | 26 | seed(1234) 27 | np.random.seed(1234) 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--model_path", type=str, default="gpt-4o") 31 | # parser.add_argument("--device", type=int, default=-1) 32 | args = parser.parse_args() 33 | 34 | model_path = args.model_path 35 | model_name = model_path.split("/")[-1] 36 | # device = torch.device(f"cuda:{args.device}" if args.device >= 0 else "cpu") 37 | prompt_format = "\nReply only to the corresponding option.\nAnswer:" 38 | 39 | # Set the size of the incoming image for qwen 40 | min_pixels = 256*28*28 41 | max_pixels = 1280*28*28 42 | 43 | 44 | # Set up the model 45 | if model_name == 'gemini-2.0-flash-001': 46 | API_KEY = "" # your api key 47 | base_url = "" # Change to your own base_url 48 | client = OpenAI(api_key=API_KEY, base_url=base_url) 49 | print(f"Model gemini-2.0-flash series:{model_name} is running!") 50 | 51 | elif model_name == 'gpt-4o': 52 | client = OpenAI(api_key="") # your api key 53 | client.base_url = "" # Change to your own base_url 54 | print(f"Model gpt-4o series:{model_name} is running!") 55 | 56 | elif "Qwen2.5-VL" in model_name: 57 | model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 58 | model_path, torch_dtype="auto", device_map="auto" 59 | ) 60 | processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels) 61 | print(f"Model Qwen2.5-VL series:{model_name} is running!") 62 | 63 | elif "Qwen2-VL" in model_name : 64 | model = Qwen2VLForConditionalGeneration.from_pretrained( 65 | model_path, torch_dtype="auto", device_map="auto" 66 | ) 67 | processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels) 68 | print(f"Model Qwen2-VL series:{model_name} is running!") 69 | 70 | elif "InternVL2_5" in model_name: 71 | model = model_path 72 | pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=1000000)) 73 | print(f"Model InternVL2_5 series:{model_name} is running!") 74 | 75 | elif "LLaVA-NeXT" in model_name: 76 | model = LlavaNextVideoForConditionalGeneration.from_pretrained( 77 | model_path, 78 | torch_dtype=torch.float16, 79 | low_cpu_mem_usage=True, 80 | device_map="auto" 81 | ) 82 | processor = LlavaNextVideoProcessor.from_pretrained(model_path) 83 | print(f"Model LLaVA-NeXT series:{model_name} is running!") 84 | 85 | elif model_name == "llava-onevision-qwen2-7b-ov-hf": 86 | model = LlavaOnevisionForConditionalGeneration.from_pretrained( 87 | model_path, 88 | torch_dtype=torch.float16, 89 | low_cpu_mem_usage=True, 90 | device_map="auto" 91 | ) 92 | processor = AutoProcessor.from_pretrained(model_path) 93 | print(f"Model llava-onevision series:{model_name} is running!") 94 | 95 | elif model_name == "Llama-3.2-11B-Vision-Instruct": 96 | model = MllamaForConditionalGeneration.from_pretrained( 97 | model_path, 98 | torch_dtype=torch.bfloat16, 99 | device_map="auto", 100 | ) 101 | processor = AutoProcessor.from_pretrained(model_path) 102 | print(f"Model Llama-3.2-11B-Vision series:{model_name} is running!") 103 | 104 | elif model_name == "Kimi-VL-A3B-Instruct": 105 | model = 
AutoModelForCausalLM.from_pretrained( 106 | model_path, 107 | torch_dtype=torch.bfloat16, 108 | device_map="auto", 109 | trust_remote_code=True, 110 | ) 111 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 112 | print(f"Model Kimi-VL series:{model_name} is running!") 113 | 114 | elif model_name == "InternVL3-14B": 115 | model = model_path 116 | pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=1000000,tp=1), chat_template_config=ChatTemplateConfig(model_name='internvl2_5')) 117 | print(f"Model InternVL3 series:{model_name} is running!") 118 | elif model_name == "random": 119 | model = None 120 | processor = None 121 | else: 122 | model = None 123 | processor = None 124 | 125 | def extract_option(text): 126 | match = re.search(r"\b([A-D])\b", text, re.IGNORECASE) 127 | return match.group(1).upper() if match else None 128 | 129 | def url_to_base64(url): 130 | if os.path.exists(url): 131 | with open(url, "rb") as f: 132 | return "data:image/jpeg;base64," + base64.b64encode(f.read()).decode("utf-8") 133 | else: 134 | print(f"该图片{url}不存在!") 135 | return False 136 | 137 | 138 | 139 | def get_output(image_path, question): 140 | image_url = [url_to_base64(image) for image in image_path] 141 | 142 | if model_name == 'gemini-2.0-flash-001': 143 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 144 | 145 | chat_completion = client.chat.completions.create( 146 | model='google/gemini-2.0-flash-001', 147 | messages=[ 148 | { 149 | "role": "user", 150 | "content": [ 151 | { 152 | "type": "text", 153 | "text": question 154 | }, 155 | *content 156 | ] 157 | } 158 | ] 159 | ) 160 | pred = chat_completion.choices[0].message.content 161 | 162 | elif model_name == 'gpt-4o': 163 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 164 | 165 | chat_completion = client.chat.completions.create( 166 | model="gpt-4o", 167 | messages=[ 168 | { 169 | "role": "user", 170 | "content": [ 171 | { 172 | "type": "text", 173 | "text": question 174 | }, 175 | *content 176 | ] 177 | } 178 | ] 179 | ) 180 | pred = chat_completion.choices[0].message.content 181 | 182 | 183 | elif "Qwen2.5-VL" in model_name: 184 | content = [{"type": "image", "image": path,"resized_height": 280,"resized_width": 420} for path in image_path] 185 | 186 | messages = [ 187 | { 188 | "role": "user", 189 | "content": [ 190 | *content, 191 | { 192 | "type": "text", 193 | "text": question 194 | }, 195 | ], 196 | } 197 | ] 198 | text = processor.apply_chat_template( 199 | messages, tokenize=False, add_generation_prompt=True 200 | ) 201 | image_inputs, video_inputs = process_vision_info(messages) 202 | inputs = processor( 203 | text=[text], 204 | images=image_inputs, 205 | videos=video_inputs, 206 | padding=True, 207 | return_tensors="pt", 208 | ) 209 | inputs = inputs.to("cuda") 210 | generated_ids = model.generate(**inputs, max_new_tokens=128) 211 | generated_ids_trimmed = [ 212 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 213 | ] 214 | output_text = processor.batch_decode( 215 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 216 | ) 217 | torch.cuda.empty_cache() 218 | torch.cuda.ipc_collect() 219 | pred = str(output_text[0]) 220 | 221 | elif "LLaVA-NeXT" in model_name: 222 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 223 | 224 | conversation = [ 225 | { 226 | "role": "user", 227 | "content": [ 228 | {"type": "text", 
"text": question}, 229 | *content 230 | ], 231 | }, 232 | ] 233 | inputs = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True, 234 | return_dict=True, padding=True, return_tensors="pt").to("cuda") 235 | generate_ids = model.generate(**inputs, max_new_tokens=100, eos_token_id=2, pad_token_id=2) 236 | pred = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0] 237 | match = re.search(r'ASSISTANT:\s*(.*)', pred, re.DOTALL) 238 | pred = match.group(1) 239 | 240 | elif "InternVL2_5" in model_name: 241 | images = [load_image(image) for image in image_url] 242 | formatted_lines = '' 243 | for i, item in enumerate(images, start=1): 244 | formatted_lines = formatted_lines + "Image-" + str(i) + ": {IMAGE_TOKEN}\n" 245 | response = pipe((f'{formatted_lines}{question}', images)) 246 | pred = response.text 247 | 248 | elif model_name == "llava-onevision-qwen2-7b-ov-hf": 249 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 250 | 251 | conversation = [ 252 | { 253 | "role": "user", 254 | "content": [ 255 | {"type": "text", "text": question}, 256 | *content 257 | ], 258 | }, 259 | ] 260 | inputs = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True, 261 | return_dict=True, padding=True, return_tensors="pt").to("cuda") 262 | generate_ids = model.generate(**inputs, max_new_tokens=100) 263 | pred = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0] 264 | 265 | 266 | elif model_name == "Llama-3.2-11B-Vision-Instruct": 267 | content = [{"type": "image_url", "image_url": {"url": path}} for path in image_url] 268 | 269 | conversation = [ 270 | { 271 | "role": "user", 272 | "content": [ 273 | {"type": "text", "text": question}, 274 | *content 275 | ], 276 | }, 277 | ] 278 | inputs = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True, 279 | return_dict=True, padding=True, return_tensors="pt").to("cuda") 280 | generate_ids = model.generate(**inputs, max_new_tokens=100) 281 | pred = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 282 | 283 | elif model_name == "Kimi-VL-A3B-Instruct": 284 | images_ = [Image.open(path) for path in image_path] 285 | images = [path.resize((path.width // 4, path.height // 4), Image.Resampling.LANCZOS) for path in images_] 286 | content = [{"type": "image", "image": path} for path in images] 287 | messages = [ 288 | { 289 | "role": "user", 290 | "content": [ {"type": "text","text": question},*content] 291 | }, 292 | ] 293 | text = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") 294 | inputs = processor(images=images, text=text, return_tensors="pt", padding=True, truncation=True).to("cuda") 295 | generated_ids = model.generate(**inputs, max_new_tokens=2048) 296 | generated_ids_trimmed = [ 297 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 298 | ] 299 | pred = processor.batch_decode( 300 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 301 | )[0] 302 | 303 | elif model_name == "InternVL3-14B": 304 | images = [load_image(image) for image in image_url] 305 | formatted_lines = '' 306 | for i, item in enumerate(images, start=1): 307 | formatted_lines = formatted_lines + "Image-" + str(i) + ": {IMAGE_TOKEN}\n" 308 | response = pipe((f'{formatted_lines}{question}', images)) 309 | pred = 
response.text 310 | 311 | else: 312 | pred = '' 313 | 314 | return pred 315 | 316 | def evaluate_vlm(benchmark_file): 317 | with open(benchmark_file, "r", encoding="utf-8") as f: 318 | benchmark_data = json.load(f) 319 | 320 | stats = defaultdict(lambda: {"correct": 0, "total": 0}) 321 | total_correct = 0 322 | total_questions = 0 323 | 324 | output_path = f"result/{model_name}" 325 | if not os.path.exists(output_path): 326 | os.makedirs(output_path) 327 | 328 | result_file = f"{output_path}/result_{model_name}.csv" 329 | with open(result_file, "w", newline="", encoding="utf-8") as csvfile: 330 | writer = csv.writer(csvfile) 331 | writer.writerow(["ID", "Question", "Question_Type", "Predicted Answer", "Correct Answer", "IsCorrect"]) 332 | 333 | for i, item in enumerate(tqdm(benchmark_data)): 334 | try: 335 | image_path = item['image_path'] 336 | question = item["question"] + prompt_format 337 | correct_answer = item["answer"] 338 | question_type = item["question_type"] 339 | stats[question_type]["total"] += 1 340 | total_questions += 1 341 | 342 | predicted_answer = get_output(image_path, question) 343 | predicted_answer_ = predicted_answer.split("\n")[-1] 344 | is_correct = extract_option(predicted_answer_) == extract_option(correct_answer) 345 | 346 | if is_correct: 347 | stats[question_type]["correct"] += 1 348 | total_correct += 1 349 | writer.writerow([i, question, question_type, predicted_answer, correct_answer, is_correct]) 350 | except Exception as e: 351 | print(f"Error on item {i}: {e}") 352 | continue 353 | 354 | print("Benchmark Evaluation Results:") 355 | print("----------------------------------------------------------") 356 | for qtype, values in stats.items(): 357 | correct = values["correct"] 358 | total = values["total"] 359 | accuracy = correct / total 360 | print(f"{qtype}: {correct}/{total} = {accuracy:.2%}") 361 | overall_accuracy = total_correct / total_questions 362 | print("----------------------------------------------------------") 363 | print(f"The accuracy rate of {model_name} on the benchmark test set: {overall_accuracy:.2%} Correct quantity:{total_correct} Total quantity:{total_questions}") 364 | print("----------------------------------------------------------") 365 | print(f"The result has been saved to {result_file}") 366 | 367 | if __name__ == '__main__': 368 | benchmark_file = "eval/ViewSpatial-Bench.json" 369 | evaluate_vlm(benchmark_file) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | inference~=0.50.1 2 | lmdeploy~=0.7.3 3 | numpy~=2.2.6 4 | openai~=1.82.0 5 | opencv_python~=4.10.0.84 6 | Pillow~=11.2.1 7 | plyfile~=1.1 8 | pycocotools~=2.0.4 9 | qwen_vl_utils~=0.0.11 10 | Requests~=2.32.3 11 | torch~=2.5.1 12 | tqdm~=4.66.6 13 | transformers~=4.51.1 --------------------------------------------------------------------------------
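A minimal invocation sketch for the evaluation script above, assuming the image paths referenced in eval/ViewSpatial-Bench.json are available locally and that the checkpoint path (Qwen/Qwen2.5-VL-7B-Instruct is only an example) matches one of the model branches in evaluate.py:

    pip install -r requirements.txt
    python evaluate.py --model_path Qwen/Qwen2.5-VL-7B-Instruct
    # per-question predictions are written to result/<model_name>/result_<model_name>.csv;
    # per-question-type and overall accuracy are printed to stdout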