├── .gitignore
├── LICENSE
├── README.md
├── data_engine
├── all_seed
│ ├── 00001_image_style.json
│ ├── 00002_image_scene.json
│ ├── 00003_image_emotion.json
│ ├── 00004_image_quality.json
│ ├── 00005_image_description.json
│ ├── 00006_object_localization.json
│ ├── 00007_attribute_recognition.json
│ ├── 00008_celebrity_recognition.json
│ ├── 00009_ocr.json
│ ├── 00010_object_relation.json
│ ├── 00011_image_comparison.json
│ ├── 00012_structuralized_imagetext_understanding.json
│ ├── 00013_commonsense_reasoning.json
│ ├── 00014_complex_reasoning.json
│ ├── 00015_social_relation.json
│ ├── 00016_future_prediction.json
│ ├── 00017_artwork.json
│ ├── 00018_landmark.json
│ ├── 00019_numerical_calculation.json
│ ├── 00020_spatial_relationship.json
│ ├── 00021_posters.json
│ ├── 00022_meme_comprehension.json
│ ├── 00023_writing.json
│ ├── 00024_brand_recognition.json
│ └── 00025_species_recognition.json
├── end_prompt.json
├── gpt35_qa.py
├── gpt4v_caption.py
├── image_retrieval_bing_spider.py
└── image_retrieval_clip.py
├── figs
├── data-engine.png
├── example_in_domain.pdf
└── example_in_domain.png
└── train_dataset_for_llava.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__
3 | *.pyc
4 | *.egg-info
5 | dist
6 |
7 | # Log
8 | *.log
9 | *.log.*
10 |
11 | # Data
12 | !**/alpaca-data-conversation.json
13 |
14 | # Editor
15 | .idea
16 | *.swp
17 |
18 | # Other
19 | .DS_Store
20 | wandb
21 | output
22 |
23 | checkpoints
24 | ckpts*
25 |
26 | .ipynb_checkpoints
27 | *.ipynb
28 |
29 | # DevContainer
30 | !.devcontainer/*
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MMInstruct
2 |
3 | The official implementation of the paper "[MMInstruct: A High-Quality Multi-Modal Instruction Tuning Dataset with Extensive Diversity](http://arxiv.org/abs/2407.15838)".
4 |
5 | The dataset is available on Hugging Face at [🤗 yuecao0119/MMInstruct](https://huggingface.co/datasets/yuecao0119/MMInstruct-GPT4V).
6 |
7 | ## 📣 News
8 |
9 | - **[Oct 14, 2024]** Our paper has been accepted by SCIENCE CHINA Information Sciences!
10 | - **[Aug 6, 2024]** The dataset is already accessible on Hugging Face at [🤗 yuecao0119/MMInstruct](https://huggingface.co/datasets/yuecao0119/MMInstruct-GPT4V).
11 | - **[Jul 22, 2024]** The paper has been released on [arXiv](https://arxiv.org/abs/2407.15838)!
12 | - **[Jul 22, 2024]** Code has been released.
13 |
14 | ## Todo List
15 |
16 | - [x] Data Engine.
17 | - [x] Open Source Datasets.
18 | - [ ] Release the checkpoint.
19 |
20 | ## Introduction
21 |
22 | Vision-language supervised fine-tuning effectively enhances VLLM performance, but existing visual instruction tuning datasets have limitations:
23 |
24 | 1. **Instruction Annotation Quality**: Despite strong performance, advanced VLLMs may generate instructions with inaccuracies, such as hallucinations.
25 | 2. **Instruction and Image Diversity**: Limited instruction types and lack of diverse image data impact the model's ability to generate varied and realistic outputs.
26 |
27 |
28 | ### MMInstruct Dataset
29 |
30 | To address these challenges, we created the MMInstruct dataset, featuring:
31 | - **973K instructions** from **24 domains**
32 | - Four instruction types: Judgement, Multiple-Choice, Long Visual Question Answering, and Short Visual Question Answering.
33 |
34 |
35 |
36 | The open source datasets on Hugging Face [🤗 yuecao0119/MMInstruct](https://huggingface.co/datasets/yuecao0119/MMInstruct-GPT4V) include:
37 |
38 | * `caption_en`: 144K English detailed image caption data generated using *gpt-4-vision-preview*.
39 | * `caption_cn`: 18.2K Chinese detailed image caption data generated using *gpt-4-vision-preview*.
40 | * `qa_en`: 216K instruction data generated using *GPT-3.5-turbo*, including 161K multi-round long questions and answers and 55K manually corrected instruction data from 23 fields, as shown in the figure below.
41 |
42 | We also expand MMInstruct with other open-source data, including:
43 |
44 | | Domain | Dataset |
45 | | -------------------- | ------------------------------------------------------------ |
46 | | mathematics datasets | [GEOS](https://aclanthology.org/D15-1171.pdf); [UniGeo](https://arxiv.org/abs/2212.02746); [GeoQA+](https://aclanthology.org/2022.coling-1.130/); [Geometry3k](https://arxiv.org/abs/2105.04165); [CLEVR-Math](https://arxiv.org/abs/2208.05358); [Super-CLEVR](https://openaccess.thecvf.com/content/CVPR2023/html/Li_Super-CLEVR_A_Virtual_Benchmark_To_Diagnose_Domain_Robustness_in_Visual_CVPR_2023_paper.html); [TabMWP](https://arxiv.org/abs/2209.14610) |
47 | | charts and plots | [DVQA (100K)](https://openaccess.thecvf.com/content_cvpr_2018/html/Kafle_DVQA_Understanding_Data_CVPR_2018_paper.html); [FigureQA](https://arxiv.org/abs/1710.07300) |
48 | | scientific figure | [TQA](https://openaccess.thecvf.com/content_cvpr_2017/html/Kembhavi_Are_You_Smarter_CVPR_2017_paper.html) |
49 | | map chart | [MapQA](https://arxiv.org/abs/2211.08545) |
50 |
51 | ### Data Engine
52 |
53 | We developed an instruction generation data engine leveraging GPT-4V, GPT-3.5, and manual correction. This engine allows semi-automatic, low-cost, multi-domain instruction generation at 1/6 the cost of manual construction.
54 |
55 |
56 |
57 | As described in [our paper](http://arxiv.org/abs/2407.15838), we mainly proposed a semi-automatic and low-cost instruction generation data engine using GPT-4V, GPT-3.5 and manual correction. Our data engine consists of six steps: (a) image collection, (b) image caption generation, (c) seed question collection, (d) automatic instruction generation, (e) dataset expansion and (f) manual correction.
58 |
59 | (a) First, we collect a large number of diverse images from various sources: starting from a set of selected source images, we retrieve additional images with a web crawler and CLIP-based image retrieval, as shown in [image_retrieval_bing_spider.py](https://github.com/yuecao0119/MMInstruct/blob/main/data_engine/image_retrieval_bing_spider.py) and [image_retrieval_clip.py](https://github.com/yuecao0119/MMInstruct/blob/main/data_engine/image_retrieval_clip.py).
60 |
61 | (b) Next, we use GPT-4V to generate detailed image captions, as shown in [gpt4v_caption.py](https://github.com/yuecao0119/MMInstruct/blob/main/data_engine/gpt4v_caption.py).
62 |
63 | (c) Then experts designed corresponding [seed questions](https://github.com/yuecao0119/MMInstruct/tree/main/data_engine/all_seed) for different fields.
64 |
65 | (d) We use image captions and seed questions to automatically generate a rich and diverse set of instruction data through GPT-3.5, as shown in [gpt35_qa.py](https://github.com/yuecao0119/MMInstruct/blob/main/data_engine/gpt35_qa.py).
66 |
67 | (e), (f) In addition, we also use various methods to expand our dataset. Finally, manual correction is performed to ensure data quality and accuracy.
68 |
69 |
70 | ### Performance
71 |
72 |
73 |
74 | ## Citation
75 |
76 | If this work is helpful for your research, please consider citing the following BibTeX entry.
77 |
78 | ```
79 | @article{liu2024mminstruct,
80 | title={MMInstruct: A High-Quality Multi-Modal Instruction Tuning Dataset with Extensive Diversity},
81 | author={Liu, Yangzhou and Cao, Yue and Gao, Zhangwei and Wang, Weiyun and Chen, Zhe and Wang, Wenhai and Tian, Hao and Lu, Lewei and Zhu, Xizhou and Lu, Tong and others},
82 | journal={arXiv preprint arXiv:2407.15838},
83 | year={2024}
84 | }
85 | ```
86 |
--------------------------------------------------------------------------------
/data_engine/all_seed/00001_image_style.json:
--------------------------------------------------------------------------------
1 | {
2 | "judge": {
3 | "Chinese": [
4 | "这张图片的艺术风格是?"
5 | ],
6 | "English": [
7 | "Is the style of this image a ?"
8 | ]
9 | },
10 | "choice":{
11 | "Chinese": [
12 | "这张图片展示了什么艺术风格?",
13 | "识别此图像的艺术风格。"
14 | ],
15 | "English": [
16 | "What art style is showcased in this image?",
17 | "Identify the art style of this image."
18 | ]
19 | },
20 | "sentence": {
21 | "Chinese": [
22 | "这张图片展示了什么艺术风格?",
23 | "识别此图像的艺术风格。"
24 | ],
25 | "English": [
26 | "What art style is showcased in this image?",
27 | "Identify the art style of this image."
28 | ]
29 | }
30 | }
--------------------------------------------------------------------------------
/data_engine/all_seed/00002_image_scene.json:
--------------------------------------------------------------------------------
1 | {
2 | "judge": {
3 | "Chinese": [
4 | "这张图片拍摄于吗?",
5 | "这张图像的场景类别是吗?",
6 | "这张图像的环境类型是吗?",
7 | "这张图像的季节是吗?",
8 | "这张图像的温度状态是吗?",
9 | "图片显示的是吗?"
10 | ],
11 | "English": [
12 | "Was this picture taken in ?",
13 | "Is the scene category of this image ?",
14 | "Is the environment type of this image ?",
15 | "Is the season of this image ?",
16 | "Is the temperature status of this image ?",
17 | "Does the picture show ?"
18 | ]
19 | },
20 | "choice": {
21 | "Chinese": [
22 | "哪个场景类别与此图像最匹配?",
23 | "图中描绘了什么样的环境类型?",
24 | "图中描绘的是哪个季节?",
25 | "图片中描绘了什么样的温度状态?",
26 | "图片显示的是什么场景?"
27 | ],
28 | "English": [
29 | "Which scene category best matches this image?",
30 | "What type of environment is depicted in the image?",
31 | "What season is depicted in the picture?",
32 | "What temperature state is depicted in the picture?",
33 | "What scene does the picture show?"
34 | ]
35 | },
36 | "sentence": {
37 | "Chinese": [
38 | "请告诉我照片中拍摄的环境。",
39 | "解释此图中可见的环境类型。",
40 | "解释图中所示的温度状态。",
41 | "照片中显示的是哪个季节?",
42 | "图片显示的是什么场景?"
43 | ],
44 | "English": [
45 | "Please tell me about the environment in which the photos were taken.",
46 | "Explain the type of environment visible in this image.",
47 | "Explain the temperature conditions shown in the picture.",
48 | "Which season is shown in the photo?",
49 | "What scene is shown in the picture?"
50 | ]
51 | }
52 | }
--------------------------------------------------------------------------------
/data_engine/all_seed/00003_image_emotion.json:
--------------------------------------------------------------------------------
1 | {
2 | "judge": {
3 | "Chinese": [
4 | "图片中的的情绪是积极的吗?",
5 | "图片中的的情绪是消极的吗?",
6 | "这张图片传达了的情绪?"
7 | ],
8 | "English": [
9 | "Is the emotion of in the picture positive?",
10 | "Is the emotion of in the picture negative?",
11 | "Does this image convey the emotion of ?"
12 | ]
13 | },
14 | "choice": {
15 | "Chinese": [
16 | "这张照片描绘了什么样的情感?",
17 | "识别此图像中表达的情感。",
18 | "这张图片传达了什么样的情绪?"
19 | ],
20 | "English": [
21 | "What emotion does this photo depict?",
22 | "Identify the emotion expressed in this image.",
23 | "Which mood does this image convey?"
24 | ]
25 | },
26 | "sentence": {
27 | "Chinese": [
28 | "这张照片描绘了什么样的情感?",
29 | "识别此图像中表达的情感。",
30 | "这张图片传达了什么样的情绪?"
31 | ],
32 | "English": [
33 | "What emotion does this photo depict?",
34 | "Identify the emotion expressed in this image.",
35 | "Which mood does this image convey?"
36 | ]
37 | }
38 | }
--------------------------------------------------------------------------------
/data_engine/all_seed/00004_image_quality.json:
--------------------------------------------------------------------------------
1 | {
2 | "judge": {
3 | "Chinese": [
4 | "第一张图像的对比度是否高于第二张图像?",
5 | "第二个图像的亮度比第一个图像的低吗?",
6 | "第一个图像比第二个图像更清晰吗?",
7 | "第二张图像的对比度是否高于第一张图像?",
8 | "第一个图像的清晰度是否低于第二个图像?",
9 | "第一个图像比第二个图像暗吗?"
10 | ],
11 | "English": [
12 | "Does the first image show higher contrast than the second image?",
13 | "Is the brightness of the second image lower than that of the first image?",
14 | "Is the first image clearer than the second image?",
15 | "Does the second image show higher contrast than the first image?",
16 | "Is the clarity of the first image lower than that of the second image?",
17 | "Is the first image darker than the second image?"
18 | ]
19 | },
20 | "choice": {
21 | "Chinese": [
22 | "哪个图像的亮度最<高/低>?",
23 | "这两张图像中哪个清晰度最<高/低>?",
24 | "在哪幅图像中,颜色对比度最<高/低>?"
25 | ],
26 | "English": [
27 | "Which image has the highest brightness?",
28 | "Which image shows the highest sharpness?",
29 | "In which image do the colors stand out most from each other?"
30 | ]
31 | },
32 | "sentence": {
33 | "Chinese": [
34 | "哪个图像的亮度最<高/低>?",
35 | "这两张图像中哪个清晰度最<高/低>?",
36 | "在哪幅图像中,颜色对比度最<高/低>?",
37 | "描述第张图像的清晰度。",
38 | "描述第张图像的对比度。",
39 | "描述第张图像的亮度。"
40 | ],
41 | "English": [
42 | "Which image has the highest brightness?",
43 | "Which image shows the highest sharpness?",
44 | "In which image do the colors stand out most from each other?",
45 | "Describe the clarity of the image.",
46 | "Describe the contrast of the image.",
47 | "Describe the brightness of the image."
48 | ]
49 | }
50 | }
--------------------------------------------------------------------------------
/data_engine/all_seed/00005_image_description.json:
--------------------------------------------------------------------------------
1 | {
2 | "judge": {
3 | "Chinese": [
4 | "这张图片发生了的事件吗?",
5 | "关于这个图片的具体内容,是这样的吗?",
6 | "这个图片的主题是吗?",
7 | "照片中的场景是吗?",
8 | "这张照片中的关键元素是吗?"
9 | ],
10 | "English": [
11 | "Did the incident of occur in this picture?",
12 | "Is the specific content of this picture like ?",
13 | "Is the theme of this picture ?",
14 | "Is the scene in the photo ?",
15 | "Is the key element in this photo ?"
16 | ]
17 | },
18 | "choice": {
19 | "Chinese": [
20 | "请描述此图片的详细内容。",
21 | "你能描述一下这张照片的焦点吗?",
22 | "适合这个图片的标题是什么?",
23 | "请描述一下图中的场景信息。",
24 | "请列出图片中的主要元素。",
25 | "你认为这张图片中发生了什么?"
26 | ],
27 | "English": [
28 | "Please describe the details of this image.",
29 | "Can you describe the focus of this photo?",
30 | "What would be a suitable title for this image?",
31 | "Please describe the scene information in the picture.",
32 | "Please list the main elements in the image.",
33 | "What do you think is happening in this picture?"
34 | ]
35 | },
36 | "sentence": {
37 | "Chinese": [
38 | "请描述此图片的详细内容。",
39 | "你能描述一下这张照片的焦点吗?",
40 | "适合这个图片的标题是什么?",
41 | "请描述一下图中的场景信息。",
42 | "请列出图片中的主要元素。",
43 | "你认为这张图片中发生了什么?"
44 | ],
45 | "English": [
46 | "Please describe the details of this image.",
47 | "Can you describe the focus of this photo?",
48 | "What would be a suitable title for this image?",
49 | "Please describe the scene information in the picture.",
50 | "Please list the main elements in the image.",
51 | "What do you think is happening in this picture?"
52 | ]
53 | }
54 | }
--------------------------------------------------------------------------------
/data_engine/all_seed/00006_object_localization.json:
--------------------------------------------------------------------------------
1 | {
2 | "judge": {
3 | "Chinese": [
4 | "