├── .gitignore
├── LICENSE
├── README.md
├── data_engine
    ├── all_seed
    │   ├── 00001_image_style.json
    │   ├── 00002_image_scene.json
    │   ├── 00003_image_emotion.json
    │   ├── 00004_image_quality.json
    │   ├── 00005_image_description.json
    │   ├── 00006_object_localization.json
    │   ├── 00007_attribute_recognition.json
    │   ├── 00008_celebrity_recognition.json
    │   ├── 00009_ocr.json
    │   ├── 00010_object_relation.json
    │   ├── 00011_image_comparison.json
    │   ├── 00012_structuralized_imagetext_understanding.json
    │   ├── 00013_commonsense_reasoning.json
    │   ├── 00014_complex_reasoning.json
    │   ├── 00015_social_relation.json
    │   ├── 00016_future_prediction.json
    │   ├── 00017_artwork.json
    │   ├── 00018_landmark.json
    │   ├── 00019_numerical_calculation.json
    │   ├── 00020_spatial_relationship.json
    │   ├── 00021_posters.json
    │   ├── 00022_meme_comprehension.json
    │   ├── 00023_writing.json
    │   ├── 00024_brand_recognition.json
    │   └── 00025_species_recognition.json
    ├── end_prompt.json
    ├── gpt35_qa.py
    ├── gpt4v_caption.py
    ├── image_retrieval_bing_spider.py
    └── image_retrieval_clip.py
├── figs
    ├── data-engine.png
    ├── example_in_domain.pdf
    └── example_in_domain.png
└── train_dataset_for_llava.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__
 3 | *.pyc
 4 | *.egg-info
 5 | dist
 6 | 
 7 | # Log
 8 | *.log
 9 | *.log.*
10 | 
11 | # Data
12 | !**/alpaca-data-conversation.json
13 | 
14 | # Editor
15 | .idea
16 | *.swp
17 | 
18 | # Other
19 | .DS_Store
20 | wandb
21 | output
22 | 
23 | checkpoints
24 | ckpts*
25 | 
26 | .ipynb_checkpoints
27 | *.ipynb
28 | 
29 | # DevContainer
30 | !.devcontainer/*
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MMInstruct
 2 | 
 3 | The official implementation of the paper "[MMInstruct: A High-Quality Multi-Modal Instruction Tuning Dataset with Extensive Diversity](http://arxiv.org/abs/2407.15838)".
 4 | 
 5 | The dataset is available on Hugging Face at [🤗 yuecao0119/MMInstruct](https://huggingface.co/datasets/yuecao0119/MMInstruct-GPT4V).
 6 | 
 7 | ## 📣 News
 8 | 
 9 | - **[Oct 14, 2024]** Our paper is accepted by SCIENCE CHINA Information Sciences!
10 | - **[Aug 6, 2024]**  The dataset is already accessible on Hugging Face at [🤗 yuecao0119/MMInstruct](https://huggingface.co/datasets/yuecao0119/MMInstruct-GPT4V).
11 | - **[Jul 22, 2024]** The paper has been released on [arXiv](https://arxiv.org/abs/2407.15838)!
12 | - **[Jul 22, 2024]** Code has been released.
13 | 
14 | ## Todo List
15 | 
16 | - [x] Data Engine.
17 | - [x] Open Source Datasets.
18 | - [ ] Release the checkpoint.
19 | 
20 | ## Introduction
21 | 
22 | Vision-language supervised fine-tuning effectively enhances VLLM performance, but existing visual instruction tuning datasets have limitations:
23 | 
24 | 1. **Instruction Annotation Quality**: Despite strong performance, advanced VLLMs may generate instructions with inaccuracies, such as hallucinations.
25 | 2. **Instruction and Image Diversity**: Limited instruction types and lack of diverse image data impact the model's ability to generate varied and realistic outputs.
26 | 
27 | 
28 | ### MMInstruct Dataset
29 | 
30 | To address these challenges, we created the MMInstruct dataset, featuring:
31 | - **973K instructions** from **24 domains**
32 | - Four instruction types: Judgement, Multiple-Choice, Long Visual Question Answering, and Short Visual Question Answering.
33 | 
34 | <img width="1117" alt="image" src="https://github.com/user-attachments/assets/92ef8128-89e3-4891-9dad-6c64da2c9de3">
35 | 
36 | The open source datasets on Hugging Face [🤗 yuecao0119/MMInstruct](https://huggingface.co/datasets/yuecao0119/MMInstruct-GPT4V) include:
37 | 
38 | * `caption_cn`: 144K Chinese detailed image caption data generated using *gpt-4-vision-preview*.
39 | * `caption_en`: 18.2K English detailed image caption data generated using *gpt-4-vision-preview*.
40 | * `qa_en`: 216K instruction data generated using *GPT-3.5-turbo*, including 161K multi-round long questions and answers and 55K manually corrected instruction data from 23 fields, as shown in the figure below.
41 | 
42 | We also expand MMInstruct with other open-source data, including:
43 | 
44 | | Domain                 | Dataset                                                      |
45 | | -------------------- | ------------------------------------------------------------ |
46 | | mathematics datasets | [GEOS](https://aclanthology.org/D15-1171.pdf); [UniGeo](https://arxiv.org/abs/2212.02746); [GeoQA+](https://aclanthology.org/2022.coling-1.130/); [Geometry3k](https://arxiv.org/abs/2105.04165); [CLEVR-Math](https://arxiv.org/abs/2208.05358); [Super-CLEVR](https://openaccess.thecvf.com/content/CVPR2023/html/Li_Super-CLEVR_A_Virtual_Benchmark_To_Diagnose_Domain_Robustness_in_Visual_CVPR_2023_paper.html); [TabMWP](https://arxiv.org/abs/2209.14610) |
47 | | charts and plots     | [DVQA (100K)](https://openaccess.thecvf.com/content_cvpr_2018/html/Kafle_DVQA_Understanding_Data_CVPR_2018_paper.html); [FigureQA](https://arxiv.org/abs/1710.07300) |
48 | | scientific figure    | [TQA](https://openaccess.thecvf.com/content_cvpr_2017/html/Kembhavi_Are_You_Smarter_CVPR_2017_paper.html) |
49 | | map chart            | [MapQA](https://arxiv.org/abs/2211.08545)                    |
50 | 
51 | ### Data Engine
52 | 
53 | We developed an instruction generation data engine leveraging GPT-4V, GPT-3.5, and manual correction. This engine allows semi-automatic, low-cost, multi-domain instruction generation at 1/6 the cost of manual construction.
54 | 
55 | <img width="1589" alt="image" src="https://github.com/user-attachments/assets/8513df0f-f3d3-4145-bc81-baa1db656a4e">
56 | 
57 | As described in [our paper](http://arxiv.org/abs/2407.15838), we mainly proposed a semi-automatic and low-cost instruction generation data engine using GPT-4V, GPT-3.5 and manual correction. Our data engine consists of six steps: (a) image collection, (b) image caption generation, (c) seed question collection, (d) automatic instruction generation, (e) dataset expansion and (f) manual correction.
58 | 
59 | (a) First, we collect a large number of diverse images from various sources: starting from a set of selected source images, we retrieve additional images using web crawlers and CLIP-based retrieval, as shown in [image_retrieval_bing_spider.py](https://github.com/yuecao0119/MMInstruct/blob/main/data_engine/image_retrieval_bing_spider.py) and [image_retrieval_clip.py](https://github.com/yuecao0119/MMInstruct/blob/main/data_engine/image_retrieval_clip.py).
60 | 
61 | (b) We then use GPT-4V to generate detailed image captions, as shown in [gpt4v_caption.py](https://github.com/yuecao0119/MMInstruct/blob/main/data_engine/gpt4v_caption.py).
62 | 
63 | (c) Then experts designed corresponding [seed questions](https://github.com/yuecao0119/MMInstruct/tree/main/data_engine/all_seed) for different fields. 
64 | 
65 | (d) We use image captions and seed questions to automatically generate a rich and diverse set of instruction data through GPT-3.5, as shown in [gpt35_qa.py](https://github.com/yuecao0119/MMInstruct/blob/main/data_engine/gpt35_qa.py).
66 | 
67 | (e), (f) In addition, we also use various methods to expand our dataset. Finally, manual correction is performed to ensure data quality and accuracy.
68 | 
69 | 
70 | ### Performance
71 | 
72 | <img width="1220" alt="image" src="https://github.com/user-attachments/assets/eca16ea4-8e73-4e92-8a5b-3036557abb94">
73 | 
74 | ## Citation
75 | 
76 | If this work is helpful for your research, please consider citing the following BibTeX entry.
77 | 
78 | ```
79 | @article{liu2024mminstruct,
80 |   title={MMInstruct: A High-Quality Multi-Modal Instruction Tuning Dataset with Extensive Diversity},
81 |   author={Liu, Yangzhou and Cao, Yue and Gao, Zhangwei and Wang, Weiyun and Chen, Zhe and Wang, Wenhai and Tian, Hao and Lu, Lewei and Zhu, Xizhou and Lu, Tong and others},
82 |   journal={arXiv preprint arXiv:2407.15838},
83 |   year={2024}
84 | }
85 | ```
86 | 


--------------------------------------------------------------------------------
/data_engine/all_seed/00001_image_style.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这张图片的艺术风格是<a certain artistic style>吗？"
 5 |         ],
 6 |         "English": [
 7 |             "Is the style of this image <a certain artistic style>?"
 8 |         ]
 9 |     },
10 |     "choice":{
11 |         "Chinese": [
12 |             "这张图片展示了什么艺术风格？",
13 |             "识别此图像的艺术风格。"
14 |         ],
15 |         "English": [
16 |             "What art style is showcased in this image?",    
17 |             "Identify the art style of this image."
18 |         ]
19 |     },
20 |     "sentence": {
21 |         "Chinese": [
22 |             "这张图片展示了什么艺术风格？",
23 |             "识别此图像的艺术风格。"
24 |         ],
25 |         "English": [
26 |             "What art style is showcased in this image?",    
27 |             "Identify the art style of this image."
28 |         ]
29 |     }
30 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00002_image_scene.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这张图片拍摄于<a certain scene>吗？",
 5 |             "这张图像的场景类别是<a certain scene category>吗？",
 6 |             "这张图像的环境类型是<a certain environment type>吗？",
 7 |             "这张图像的季节是<a certain season>吗？",
 8 |             "这张图像的温度状态是<a certain temperature status>吗？",
 9 |             "图片显示的是<a certain scene>吗？"
10 |         ],
11 |         "English": [
12 |             "Was this picture taken in <a certain scene>?",
13 |             "Is the scene category of this image <a certain scene category>?",
14 |             "Is the environment type of this image <a certain environment type>?",
15 |             "Is the season of this image <a certain season>?",
16 |             "Is the temperature status of this image <a certain temperature status>?",
17 |             "Does the picture show <a certain scene>?"
18 |         ]
19 |     },
20 |     "choice": {
21 |         "Chinese": [
22 |             "哪个场景类别与此图像最匹配？",
23 |             "图中描绘了什么样的环境类型？",
24 |             "图中描绘的是哪个季节？",
25 |             "图片中描绘了什么样的温度状态？",
26 |             "图片显示的是什么场景？"
27 |         ],
28 |         "English": [
29 |             "Which scene category best matches this image?",
30 |             "What type of environment is depicted in the image?",
31 |             "What season is depicted in the picture?",
32 |             "What temperature state is depicted in the picture?",
33 |             "What scene does the picture show?"
34 |         ]
35 |     },
36 |     "sentence": {
37 |         "Chinese": [
38 |             "请告诉我照片中拍摄的环境。",
39 |             "解释此图中可见的环境类型。",
40 |             "解释图中所示的温度状态。",
41 |             "照片中显示的是哪个季节？",
42 |             "图片显示的是什么场景？"
43 |         ],
44 |         "English": [
45 |             "Please tell me about the environment in which the photos were taken.",
46 |             "Explain the type of environment visible in this image.",
47 |             "Explain the temperature conditions shown in the picture.",
48 |             "Which season is shown in the photo?",
49 |             "What scene is shown in the picture?"
50 |         ]
51 |     }
52 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00003_image_emotion.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "图片中的<some object>的情绪是积极的吗？",
 5 |             "图片中的<some object>的情绪是消极的吗？",
 6 |             "这张图片传达了<specific emotion>的情绪吗？"
 7 |         ],
 8 |         "English": [
 9 |             "Is the emotion of <some object> in the picture positive?",
10 |             "Is the emotion of <some object> in the picture negative?",
11 |             "Does this image convey the emotion of <specific emotion>?"
12 |         ]
13 |     },
14 |     "choice": {
15 |         "Chinese": [
16 |             "这张照片描绘了什么样的情感？",
17 |             "识别此图像中表达的情感。",
18 |             "这张图片传达了什么样的情绪？"
19 |         ],
20 |         "English": [
21 |             "What emotion does this photo depict?",
22 |             "Identify the emotion expressed in this image.",
23 |             "Which mood does this image convey?"
24 |         ]
25 |     },
26 |     "sentence": {
27 |         "Chinese": [
28 |             "这张照片描绘了什么样的情感？",
29 |             "识别此图像中表达的情感。",
30 |             "这张图片传达了什么样的情绪？"
31 |         ],
32 |         "English": [
33 |             "What emotion does this photo depict?",
34 |             "Identify the emotion expressed in this image.",
35 |             "Which mood does this image convey?"
36 |         ]
37 |     }
38 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00004_image_quality.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "第一张图像的对比度是否高于第二张图像？",
 5 |             "第二个图像的亮度比第一个图像的低吗？",
 6 |             "第一个图像比第二个图像更清晰吗？",
 7 |             "第二张图像的对比度是否高于第一张图像？",
 8 |             "第一个图像的清晰度是否低于第二个图像？",
 9 |             "第一个图像比第二个图像暗吗？"
10 |         ],
11 |         "English": [
12 |             "Does the first image show higher contrast than the second image?",
13 |             "Is the brightness of the second image lower than that of the first image?",
14 |             "Is the first image clearer than the second image?",
15 |             "Does the second image show higher contrast than the first image?",
16 |             "Is the clarity of the first image lower than that of the second image?",
17 |             "Is the first image darker than the second image?"
18 |         ]
19 |     },
20 |     "choice": {
21 |         "Chinese": [
22 |             "哪个图像的亮度最<高/低>？",
23 |             "这两张图像中哪个清晰度最<高/低>？",
24 |             "在哪幅图像中，颜色对比度最<高/低>？"
25 |         ],
26 |         "English": [
27 |             "Which image has the highest brightness?",
28 |             "Which image shows the highest sharpness?",
29 |             "In which image do the colors stand out most from each other?"
30 |         ]
31 |     },
32 |     "sentence": {
33 |         "Chinese": [
34 |             "哪个图像的亮度最<高/低>？",
35 |             "这两张图像中哪个清晰度最<高/低>？",
36 |             "在哪幅图像中，颜色对比度最<高/低>？",
37 |             "描述第<order number>张图像的清晰度。",
38 |             "描述第<order number>张图像的对比度。",
39 |             "描述第<order number>张图像的亮度。"
40 |         ],
41 |         "English": [
42 |             "Which image has the highest brightness?",
43 |             "Which image shows the highest sharpness?",
44 |             "In which image do the colors stand out most from each other?",
45 |             "Describe the clarity of the <order number> image.",
46 |             "Describe the contrast of the <order number> image.",
47 |             "Describe the brightness of the <order number> image."
48 |         ]
49 |     }
50 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00005_image_description.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这张图片发生了<special event>的事件吗？",
 5 |             "关于这个图片的具体内容，是<picture content>这样的吗？",
 6 |             "这个图片的主题是<specific theme>吗？",
 7 |             "照片中的场景是<specific scene>吗？",
 8 |             "这张照片中的关键元素是<key element>吗？"
 9 |         ],
10 |         "English": [
11 |             "Did the incident of <special event> occur in this picture?",
12 |             "Is the specific content of this picture like <picture content>?",
13 |             "Is the theme of this picture <specific theme>?",
14 |             "Is the scene in the photo <specific scene>?",
15 |             "Is the key element in this photo <key element>?"
16 |         ]
17 |     },
18 |     "choice": {
19 |         "Chinese": [
20 |             "请描述此图片的详细内容。",
21 |             "你能描述一下这张照片的焦点吗？",
22 |             "适合这个图片的标题是什么？",
23 |             "请描述一下图中的场景信息。",
24 |             "请列出图片中的主要元素。",
25 |             "你认为这张图片中发生了什么？"
26 |         ],
27 |         "English": [
28 |             "Please describe the details of this image.",
29 |             "Can you describe the focus of this photo?",
30 |             "What would be a suitable title for this image?",
31 |             "Please describe the scene information in the picture.",
32 |             "Please list the main elements in the image.",
33 |             "What do you think is happening in this picture?"
34 |         ]
35 |     },
36 |     "sentence": {
37 |         "Chinese": [
38 |             "请描述此图片的详细内容。",
39 |             "你能描述一下这张照片的焦点吗？",
40 |             "适合这个图片的标题是什么？",
41 |             "请描述一下图中的场景信息。",
42 |             "请列出图片中的主要元素。",
43 |             "你认为这张图片中发生了什么？"
44 |         ],
45 |         "English": [
46 |             "Please describe the details of this image.",
47 |             "Can you describe the focus of this photo?",
48 |             "What would be a suitable title for this image?",
49 |             "Please describe the scene information in the picture.",
50 |             "Please list the main elements in the image.",
51 |             "What do you think is happening in this picture?"
52 |         ]
53 |     }
54 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00006_object_localization.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "<object>在图片中的精确位置是<position>吗？",
 5 |             "在这张图片中，<object>朝向的是<direction>吗？",
 6 |             "这张图片中<object>的可见数量是<number>吗？"
 7 |         ],
 8 |         "English": [
 9 |             "Is the precise position of <object> in the picture <position>?",
10 |             "In this picture, is <object> facing <direction>?",
11 |             "Is the visible number of <object> in this image <number>?"
12 |         ]
13 |     },
14 |     "choice": {
15 |         "Chinese": [
16 |             "<object>在图片中的精确位置是什么？",
17 |             "在这张图片中，<object>朝向哪个方向？",
18 |             "请估算出这张图片中<object>的可见数量？"
19 |         ],
20 |         "English": [
21 |             "What is the precise position of <object> in the picture?",
22 |             "In this picture, which direction does <object> face?",
23 |             "Please estimate the visible number of <object>s in this image?"
24 |         ]
25 |     },
26 |     "sentence": {
27 |         "Chinese": [
28 |             "<object>在图片中的精确位置是什么？",
29 |             "在这张图片中，<object>朝向哪个方向？",
30 |             "请估算出这张图片中<object>的可见数量？"
31 |         ],
32 |         "English": [
33 |             "What is the precise position of <object> in the picture?",
34 |             "In this picture, which direction does <object> face?",
35 |             "Please estimate the visible number of <object>s in this image?"
36 |         ]
37 |     }
38 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00007_attribute_recognition.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuecao0119/MMInstruct/eb6aacbcc64b1880092de22142eeec0ad7f2422d/data_engine/all_seed/00007_attribute_recognition.json


--------------------------------------------------------------------------------
/data_engine/all_seed/00008_celebrity_recognition.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "照片上的这个人是<name>吗？",
 5 |             "这个人的名字是<name>吗？",
 6 |             "图片中这个人来自<specific country>吗？",
 7 |             "照片上这个人的职业是<specific occupations>吗？"
 8 |         ],
 9 |         "English": [
10 |             "Is this person <name> in the photo?",
11 |             "Is this person's name <name>?",
12 |             "Is the person in the picture from <specific country>?",
13 |             "Is the occupation of the person in the photo <specific occupations>?"
14 |         ]
15 |     },
16 |     "choice": {
17 |         "English": [
18 |             "Is this person in the photo <name>?",
19 |             "What is the occupation of the person in the photo?",
20 |             "Describe personal information about the person in the image.",
21 |             "What is the name of the person in this photo?",
22 |             "What country is the person in the picture from?"
23 |         ],
24 |         "Chinese": [
25 |             "照片上的这个人是<name>吗？",
26 |             "照片上这个人的职业是什么？",
27 |             "描述图片中这个人的个人信息。",
28 |             "这张照片中的人的名字是什么？",
29 |             "图片中这个人来自哪个国家？"
30 |         ]
31 |     },
32 |     "sentence": {
33 |         "Chinese": [
34 |             "照片上的这个人是<name>吗？",
35 |             "照片上这个人的职业是什么？",
36 |             "描述图片中这个人的个人信息。",
37 |             "这张照片中的人的名字是什么？",
38 |             "图片中这个人来自哪个国家？"
39 |         ],
40 |         "English": [
41 |             "Is this person in the photo <name>?",
42 |             "What is the occupation of the person in the photo?",
43 |             "Describe personal information about the person in the image.",
44 |             "What is the name of the person in this photo?",
45 |             "What country is the person in the picture from?"
46 |         ]
47 |     }
48 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00009_ocr.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "图像是否包含<特定文本/字符>？",
 5 |             "图像中的<所有/上方/中间/下方/左边/右边等>文本信息是否是<specific text>？",
 6 |             "图像中的文本信息的<中文/英文/日语等>翻译结果是否是<translation results>？"
 7 |         ],
 8 |         "English": [
 9 |             "Does the image contain <specific text/character>?",
10 |             "Is the <all/above/middle/bottom/left/right, etc.> text information in the image <specific text>?",
11 |             "Is the <Chinese/English/Japanese, etc.> translation result of the text information in the image <translation results>?"
12 |         ]
13 |     },
14 |     "choice": {
15 |         "Chinese": [
16 |             "识别图片中的<所有/上方/中间/下方/左边/右边等>文本信息。",
17 |             "解释此图片中的文本信息。",
18 |             "翻译此图片中的文本信息为<中文/英文/日语等>。"
19 |         ],
20 |         "English": [
21 |             "Recognize <all/above/middle/below/left/right, etc.> text information in the picture.",
22 |             "Explain the textual information in this image.",
23 |             "Translate the text information in this image to <Chinese/English/Japanese, etc.>."
24 |         ]
25 |     },
26 |     "sentence": {
27 |         "Chinese": [
28 |             "识别图片中的<所有/上方/中间/下方/左边/右边等>文本信息。",
29 |             "解释此图片中的文本信息。",
30 |             "翻译此图片中的文本信息为<中文/英文/日语等>。"
31 |         ],
32 |         "English": [
33 |             "Recognize <all/above/middle/below/left/right, etc.> text information in the picture.",
34 |             "Explain the textual information in this image.",
35 |             "Translate the text information in this image to <Chinese/English/Japanese, etc.>."
36 |         ]
37 |     }
38 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00010_object_relation.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "图中生物之间的自然关系是否是<specific relationship>？",
 5 |             "图中所示的生物与人类的自然关系是否是<specific relationship>？",
 6 |             "图中所示的生物与<specific creatures>的自然关系是否是<specific relationship>？",
 7 |             "<Object 1>相对于<Object 2>的位置是<specific position>吗？",
 8 |             "<Location 1>是否在<Location 2>的<specific direction>？",
 9 |             "<Location 1>是否位于图像的<East/South/West/North, etc.>？"
10 |         ],
11 |         "English": [
12 |             "Is the natural relationship between the creatures in the picture a <specific relationship>?",
13 |             "Is the natural relationship between the creatures shown in the picture and humans a <specific relationship>?",
14 |             "Is the natural relationship between the creatures shown in the picture and <specific creatures> a <specific relationship>?",
15 |             "Is the position of <Object 1> relative to <Object 2> a <specific position>?",
16 |             "Is <Location 1> in the <specific direction> of <Location 2>?",
17 |             "Is <Location 1> located in <East/South/West/North, etc.> of the image?"
18 |         ]
19 |     },
20 |     "choice": {
21 |         "Chinese": [
22 |             "图中生物之间存在什么样的自然关系？",
23 |             "在自然界中，图中所示的生物与人类之间有什么关系？",
24 |             "在自然界中，图中所示的生物与<其他特定生物>之间有什么关系？",
25 |             "就二维平面上的角度而言，<物体1>相对于<物体2>的位置是什么？",
26 |             "图中，<物体1>和<物体2>之间的相对位置是什么？",
27 |             "<地点1>在<地点2>的什么方向？",
28 |             "哪个<地点>位于<东部/南部/西部/北部等>？"
29 |         ],
30 |         "English": [
31 |             "What kind of natural relationships exist between creatures in the picture?",
32 |             "What is the relationship between the creatures shown in the picture and humans in nature?",
33 |             "What is the relationship between the creatures shown in the figure and <other specific creatures> in nature?",
34 |             "In terms of angles on a two-dimensional plane, what is the position of <Object 1> relative to <Object 2>?",
35 |             "What is the relative position between <Object 1> and <Object 2> in the figure?",
36 |             "In which direction is <Location 1> relative to <Location 2>?",
37 |             "Which <Location 1> is located in <East/South/West/North, etc.>?"
38 |         ]
39 |     },
40 |     "sentence": {
41 |         "Chinese": [
42 |             "图中生物之间存在什么样的自然关系？",
43 |             "在自然界中，图中所示的生物与人类之间有什么关系？",
44 |             "在自然界中，图中所示的生物与<其他特定生物>之间有什么关系？",
45 |             "就二维平面上的角度而言，<物体1>相对于<物体2>的位置是什么？",
46 |             "图中，<物体1>和<物体2>之间的相对位置是什么？",
47 |             "<地点1>在<地点2>的什么方向？",
48 |             "哪个<地点>位于<东部/南部/西部/北部等>？"
49 |         ],
50 |         "English": [
51 |             "What kind of natural relationships exist between creatures in the picture?",
52 |             "What is the relationship between the creatures shown in the picture and humans in nature?",
53 |             "What is the relationship between the creatures shown in the figure and <other specific creatures> in nature?",
54 |             "In terms of angles on a two-dimensional plane, what is the position of <Object 1> relative to <Object 2>?",
55 |             "What is the relative position between <Object 1> and <Object 2> in the figure?",
56 |             "In which direction is <Location 1> relative to <Location 2>?",
57 |             "Which <Location 1> is located in <East/South/West/North, etc.>?"
58 |         ]
59 |     }
60 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00011_image_comparison.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "图中的<物体>一样大吗？",
 5 |             "图中的<物体>颜色一样吗？",
 6 |             "图中的商品<特定商品属性>一样吗？",
 7 |             "图中的商品适用场合一样吗？",
 8 |             "图中的商品适用人群一样吗？",
 9 |             "图片中是否阐明了<specific theory>这样的道理？"
10 |         ],
11 |         "English": [
12 |             "Are the <objects> in the picture the same size?",
13 |             "Are the <objects> in the picture the same color?",
14 |             "Are the products in the picture <specific product attributes> the same?",
15 |             "Are the products in the picture suitable for the same situations?",
16 |             "Are the products in the picture suitable for the same people?",
17 |             "Does the picture illustrate <specific theory>?"
18 |         ]
19 |     },
20 |     "choice": {
21 |         "Chinese": [
22 |             "图中的<物体>一样大吗？",
23 |             "图中的<物体>颜色一样吗？",
24 |             "对比图中<物体>的<特定属性>。",
25 |             "图中的商品<特定商品属性>一样吗？",
26 |             "图中的商品适用场合一样吗？",
27 |             "图中的商品适用人群一样吗？",
28 |             "请详细描述画面，并告诉我图片中发生了什么事情。",
29 |             "请详细描述图片，并告诉我图片阐明了什么道理。",
30 |             "请详细解释图片，并说明图片想表达的核心思想。"
31 |         ],
32 |         "English": [
33 |             "Is the <object> in the picture the same size?",
34 |             "Is the color of the <object> in the picture the same?",
35 |             "Compare the <specific attributes> of the <object> in the figure.",
36 |             "Is the product <specific product attributes> in the picture the same?",
37 |             "Is the product in the picture suitable for the same occasion?",
38 |             "Is the product in the picture suitable for the same audience?",
39 |             "Please describe the scene in detail and tell me what happened in the picture.",
40 |             "Please provide a detailed description of the image and tell me what it illustrates.",
41 |             "Please provide a detailed explanation of the image and explain the core idea it intends to convey."
42 |         ]
43 |     },
44 |     "sentence": {
45 |         "Chinese": [
46 |             "图中的<物体>一样大吗？",
47 |             "图中的<物体>颜色一样吗？",
48 |             "对比图中<物体>的<特定属性>。",
49 |             "图中的商品<特定商品属性>一样吗？",
50 |             "图中的商品适用场合一样吗？",
51 |             "图中的商品适用人群一样吗？",
52 |             "请详细描述画面，并告诉我图片中发生了什么事情。",
53 |             "请详细描述图片，并告诉我图片阐明了什么道理。",
54 |             "请详细解释图片，并说明图片想表达的核心思想。"
55 |         ],
56 |         "English": [
57 |             "Is the <object> in the picture the same size?",
58 |             "Is the color of the <object> in the picture the same?",
59 |             "Compare the <specific attributes> of the <object> in the figure.",
60 |             "Is the product <specific product attributes> in the picture the same?",
61 |             "Is the product in the picture suitable for the same occasion?",
62 |             "Is the product in the picture suitable for the same audience?",
63 |             "Please describe the scene in detail and tell me what happened in the picture.",
64 |             "Please provide a detailed description of the image and tell me what it illustrates.",
65 |             "Please provide a detailed explanation of the image and explain the core idea it intends to convey."
66 |         ]
67 |     }
68 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00012_structuralized_imagetext_understanding.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuecao0119/MMInstruct/eb6aacbcc64b1880092de22142eeec0ad7f2422d/data_engine/all_seed/00012_structuralized_imagetext_understanding.json


--------------------------------------------------------------------------------
/data_engine/all_seed/00013_commonsense_reasoning.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuecao0119/MMInstruct/eb6aacbcc64b1880092de22142eeec0ad7f2422d/data_engine/all_seed/00013_commonsense_reasoning.json


--------------------------------------------------------------------------------
/data_engine/all_seed/00014_complex_reasoning.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuecao0119/MMInstruct/eb6aacbcc64b1880092de22142eeec0ad7f2422d/data_engine/all_seed/00014_complex_reasoning.json


--------------------------------------------------------------------------------
/data_engine/all_seed/00015_social_relation.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这两位人物可能是合作伙伴吗？",
 5 |             "他们之间的联系可能是家庭关系吗？",
 6 |             "他们可能是情侣吗？",
 7 |             "这两人可能是同事吗？",
 8 |             "这两位人物可能是同一家公司的员工吗？",
 9 |             "他们之间可能存在师生关系吗？",
10 |             "这两人可能是一对兄弟姐妹吗？",
11 |             "在这个环境中，他们可能是陌生人吗？",
12 |             "他们可能是彼此的竞争对手吗？",
13 |             "你认为这两人可能是一对亲子吗？",
14 |             "你认为这两人可能是邻居吗？",
15 |             "你认为这两人可能是一对旅行伴侣吗？",
16 |             "这两位人物之间有可能是亲戚吗？",
17 |             "这张图片中的两位人物可能是朋友吗？"
18 |         ],
19 |         "English": [
20 |             "Could these two individuals be partners?",
21 |             "Is the connection between them possibly a family relationship?",
22 |             "Could they be a couple?",
23 |             "Are these two individuals possibly colleagues?",
24 |             "Could these two characters be employees of the same company?",
25 |             "Is there a possibility of a teacher-student relationship between them?",
26 |             "Could these two people be siblings?",
27 |             "In this setting, could they be strangers to each other?",
28 |             "Is it possible that they are competitors?",
29 |             "Do you think these two individuals could be parent and child?",
30 |             "Do you think these two people could be neighbors?",
31 |             "Do you think these two individuals could be travel companions?",
32 |             "Is there a possibility of them being relatives?",
33 |             "Could the two individuals in this picture be friends?"
34 |         ]
35 |     },
36 |     "choice": {
37 |         "Chinese": [
38 |             "这张图片中的人物之间存在什么社会关系？",
39 |             "你觉得这几个人之间的亲密度是什么样的？",
40 |             "你认为这几个人是因为什么而相识的？",
41 |             "在这张图片中，他们之间可能存在着怎样的互动？",
42 |             "你认为这几人之间的互动可能有何特点？",
43 |             "这张图片中的人物之间信任程度是怎么样的？"
44 |         ],
45 |         "English": [
46 |             "What social relationships exist between the people in this picture?",
47 |             "What do you think the intimacy between these people is like?",
48 |             "Why do you think these people got to know each other?",
49 |             "What kind of interaction might there be between them in this picture?",
50 |             "What do you think might be the characteristics of the interaction between these people?",
51 |             "What is the level of trust between the people in this picture?"
52 |         ]
53 |     },
54 |     "sentence": {
55 |         "Chinese": [
56 |             "这张图片中的人物之间存在什么社会关系？",
57 |             "你觉得这几个人之间的亲密度是什么样的？",
58 |             "你认为这几个人是因为什么而相识的？",
59 |             "在这张图片中，他们之间可能存在着怎样的互动？",
60 |             "你认为这几人之间的互动可能有何特点？",
61 |             "这张图片中的人物之间信任程度是怎么样的？"
62 |         ],
63 |         "English": [
64 |             "What social relationships exist between the people in this picture?",
65 |             "What do you think the intimacy between these people is like?",
66 |             "Why do you think these people got to know each other?",
67 |             "What kind of interaction might there be between them in this picture?",
68 |             "What do you think might be the characteristics of the interaction between these people?",
69 |             "What is the level of trust between the people in this picture?"
70 |         ]
71 |     }
72 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00016_future_prediction.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "根据图像，<some people>在<doing something>时，<something>是他可能面临的一个关键问题吗？",
 5 |             "根据这张图，接下来可能会发生<will happen something>这样的事吗？",
 6 |             "这张图片的未来结果可能是<specific result>吗？",
 7 |             "这张图片未来可能会是<specific result>的积极结果吗？",
 8 |             "这张图片未来可能会是<specific result>的不幸结果吗？",
 9 |             "这张图片未来的天气可能会是<weather>吗？"
10 |         ],
11 |         "English": [
12 |             "Based on the image, is <something> one key issue that <some people> may face when <doing something>?",
13 |             "Based on this picture, is it possible that <will happen something> next?",
14 |             "Is the expected result of this image <specific result>?",
15 |             "Is the expected positive result of this image <specific result>?",
16 |             "Is the expected unfortunate outcome of this image <specific result>?",
17 |             "Will the weather in this picture be <weather> next?"
18 |         ]
19 |     },
20 |     "choice": {
21 |         "Chinese": [
22 |             "根据图像，<some people>在<doing something>时可能面临的一个关键问题是什么？",
23 |             "根据这张图片，请预测接下来会发生什么？",
24 |             "这张图片的预期结果是什么？",
25 |             "这张图片预期的积极结果是什么？",
26 |             "这张图片预期的不幸结果是什么？",
27 |             "预测这张图片中的天气之后会是怎么样的。"
28 |         ],
29 |         "English": [
30 |             "Based on the image, what is one key issue that <some people> might face when <doing something>?",
31 |             "Based on this image, please predict what will happen next?",
32 |             "What is the expected outcome in this image?",
33 |             "What is the positive result in this image?",
34 |             "What is the unfortunate outcome in this image?",
35 |             "Predict what the weather in this picture will be like next."
36 |         ]
37 |     },
38 |     "sentence": {
39 |         "Chinese": [
40 |             "根据图像，<some people>在<doing something>时可能面临的一个关键问题是什么？",
41 |             "根据这张图片，请预测接下来会发生什么？",
42 |             "这张图片的预期结果是什么？",
43 |             "这张图片预期的积极结果是什么？",
44 |             "这张图片预期的不幸结果是什么？",
45 |             "预测这张图片中的天气之后会是怎么样的。"
46 |         ],
47 |         "English": [
48 |             "Based on the image, what is one key issue that <some people> might face when <doing something>?",
49 |             "Based on this image, please predict what will happen next?",
50 |             "What is the expected outcome in this image?",
51 |             "What is the positive result in this image?",
52 |             "What is the unfortunate outcome in this image?",
53 |             "Predict what the weather in this picture will be like next."
54 |         ]
55 |     }
56 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00017_artwork.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "judge": {
  3 |         "Chinese": [
  4 |             "这个图片是不是描述了<content>？",
  5 |             "<content>。这段对于这张图片的赏析是否恰当？",
  6 |             "“<poem>”这首诗是否适合这幅画？",
  7 |             "这件艺术品属于<genre>体裁吗？",
  8 |             "这件艺术品是以<category>的形式存在的吗？",
  9 |             "这东西看起来像<something>吗？",
 10 |             "这件艺术品是<artist>创作的吗？",
 11 |             "这件艺术品是否曾在<location>展览？",
 12 |             "这件艺术品的标题是<title>吗？",
 13 |             "这幅画的构图是否具有<characteristics>这样的特征？",
 14 |             "这幅画属于<style>的艺术风格吗？",
 15 |             "这个<artwork>中的<object>代表了<meaning>的含义吗？",
 16 |             "这张图看起来像<object/sentence>吗？",
 17 |             "这个<object>的设计风格是<style>这样的吗？",
 18 |             "这幅画的主题是<theme>吗？",
 19 |             "你对这件艺术品的色彩组合的印象是<impression>的吗？",
 20 |             "这张图片中的<object>属于<style>的风格吗？"
 21 |         ],
 22 |         "English": [
 23 |             "Does this image depict <content>?",
 24 |             "<content>. Is this appreciation of this picture appropriate?",
 25 |             "Is the poem '<poem>' suitable for this painting?",
 26 |             "Does this artwork belong to the type of <genre>?",
 27 |             "Does this artwork exist in the form of <category>?",
 28 |             "Does this thing look like <something>?",
 29 |             "Is this artwork created by <artist>?",
 30 |             "Has this artwork been exhibited in <location>?",
 31 |             "Is this artwork titled <title>?",
 32 |             "Is the composition of this painting characterized by <characteristics>?",
 33 |             "Does this painting belong to the art style of <style>?",
 34 |             "Does the <object> in this <artwork> represent the meaning of <meaning>?",
 35 |             "Does this image look like <object/sentence>?",
 36 |             "Is the design style of this <object> like <style>?",
 37 |             "Is the theme of this painting <theme>?",
 38 |             "Is your impression of the color combination of this artwork <impression>?",
 39 |             "Does the <object> in this picture belong to the style of <style>?"
 40 |         ]
 41 |     },
 42 |     "choice": {
 43 |         "Chinese": [
 44 |             "请对图中艺术品进行简要的描述。",
 45 |             "赏析这个艺术品。",
 46 |             "根据这幅画，你能把它写成诗歌吗？",
 47 |             "这个艺术品是<绘画、雕塑、还是其他>什么存在形式？",
 48 |             "这个东西看起来像什么？",
 49 |             "这件艺术品是谁创作的？",
 50 |             "这件艺术品曾在哪里展览？",
 51 |             "这件艺术品的标题是什么？",
 52 |             "这幅画的构图有何特点？",
 53 |             "这幅画属于什么艺术风格？",
 54 |             "这个艺术品中的<object>代表什么？",
 55 |             "这张图看起来像什么？",
 56 |             "这个<object>的设计风格是什么？",
 57 |             "这幅画的主题是什么？",
 58 |             "你对这件艺术品的色彩组合有什么印象？"
 59 |         ],
 60 |         "English": [
 61 |             "Please provide a brief description of the artwork pictured.",
 62 |             "Appreciate this work of art.",
 63 |             "Based on this painting, can you write it as a poem?",
 64 |             "Is this artwork <a painting, a sculpture, or some other form> of existence?",
 65 |             "What does this object look like?",
 66 |             "Who created this artwork?",
 67 |             "Where has this artwork been exhibited?",
 68 |             "What is the title of this artwork?",
 69 |             "What are the characteristics of the composition of this painting?",
 70 |             "What artistic style does this painting belong to?",
 71 |             "What does <object> in this artwork represent?",
 72 |             "What does this picture look like?",
 73 |             "What is the design style of this <object>?",
 74 |             "What is the theme of this painting?",
 75 |             "What are your impressions of the color combinations in this artwork?"
 76 |         ]
 77 |     },
 78 |     "sentence": {
 79 |         "Chinese": [
 80 |             "请对图中艺术品进行简要的描述。",
 81 |             "赏析这个艺术品。",
 82 |             "根据这幅画，你能把它写成诗歌吗？",
 83 |             "这个艺术品是<绘画、雕塑、还是其他>什么存在形式？",
 84 |             "这个东西看起来像什么？",
 85 |             "这件艺术品是谁创作的？",
 86 |             "这件艺术品曾在哪里展览？",
 87 |             "这件艺术品的标题是什么？",
 88 |             "这幅画的构图有何特点？",
 89 |             "这幅画属于什么艺术风格？",
 90 |             "这个艺术品中的<object>代表什么？",
 91 |             "这张图看起来像什么？",
 92 |             "这个<object>的设计风格是什么？",
 93 |             "这幅画的主题是什么？",
 94 |             "你对这件艺术品的色彩组合有什么印象？"
 95 |         ],
 96 |         "English": [
 97 |             "Please provide a brief description of the artwork pictured.",
 98 |             "Appreciate this work of art.",
 99 |             "Based on this painting, can you write it as a poem?",
100 |             "Is this artwork <a painting, a sculpture, or some other form> of existence?",
101 |             "What does this object look like?",
102 |             "Who created this artwork?",
103 |             "Where has this artwork been exhibited?",
104 |             "What is the title of this artwork?",
105 |             "What are the characteristics of the composition of this painting?",
106 |             "What artistic style does this painting belong to?",
107 |             "What does <object> in this artwork represent?",
108 |             "What does this picture look like?",
109 |             "What is the design style of this <object>?",
110 |             "What is the theme of this painting?",
111 |             "What are your impressions of the color combinations in this artwork?"
112 |         ]
113 |     }
114 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00018_landmark.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这是<landmark_building/attraction>的照片吗？",
 5 |             "图片的<above, below, left or right>的建筑物是<landmark_building/attraction>吗？",
 6 |             "图片中的建筑是<landmark_building/attraction>吗？",
 7 |             "这张图中的地标属于<city/country>吗？"
 8 |         ],
 9 |         "English": [
10 |             "Is this a photo of <landmark_building/attraction>?",
11 |             "Is the building <above, below, left or right> in the picture <landmark_building/attraction>?",
12 |             "Is the building in the picture <landmark_building/attraction>?",
13 |             "Does the landmark in this picture belong to <city/country>?"
14 |         ]
15 |     },
16 |     "choice": {
17 |         "Chinese": [
18 |             "这个照片显示的是哪个<标志性建筑/景点>？",
19 |             "请指定此处显示的<标志性建筑/景点>的名称。",
20 |             "位于图片<above, below, left or right of the image>的<标志性建筑/景点>是什么？",
21 |             "这张图中的<标志性建筑/景点>属于哪个<城市/国家>？",
22 |             "请简要介绍一下图中的<标志性建筑/景点>。"
23 |         ],
24 |         "English": [
25 |             "Which <landmark_building/attraction> does this photo show?",
26 |             "Please specify the name of the <landmark_building/attraction> shown here.",
27 |             "What is the <landmark_building/attraction> located <above, below, left or right of the image>?",
28 |             "Which <city/country> does the <landmark_building/attraction> in this picture belong to?",
29 |             "Please briefly introduce the <landmark_building/attraction> in the picture."
30 |         ]
31 |     },
32 |     "sentence": {
33 |         "Chinese": [
34 |             "这个照片显示的是哪个<标志性建筑/景点>？",
35 |             "请指定此处显示的<标志性建筑/景点>的名称。",
36 |             "位于图片<above, below, left or right of the image>的<标志性建筑/景点>是什么？",
37 |             "这张图中的<标志性建筑/景点>属于哪个<城市/国家>？",
38 |             "请简要介绍一下图中的<标志性建筑/景点>。"
39 |         ],
40 |         "English": [
41 |             "Which <landmark_building/attraction> does this photo show?",
42 |             "Please specify the name of the <landmark_building/attraction> shown here.",
43 |             "What is the <landmark_building/attraction> located <above, below, left or right of the image>?",
44 |             "Which <city/country> does the <landmark_building/attraction> in this picture belong to?",
45 |             "Please briefly introduce the <landmark_building/attraction> in the picture."
46 |         ]
47 |     }
48 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00019_numerical_calculation.json:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | {
 4 |     "judge": {
 5 |         "Chinese": [
 6 |             "图片中的计算过程正确吗？",
 7 |             "图中的公式按照下面步骤计算是否正确？<calculation process>",
 8 |             "图中直角三角形的斜边长度是<result>吗？",
 9 |             "使用图中所示的字母计算图中电路的功率。图中电路的功率是<result>吗？",
10 |             "图片中<the geometric shapes>的面积是否等于<result>？",
11 |             "图片中<the name of the variable>的值是否应等于<result>？"
12 |         ],
13 |         "English": [
14 |             "Is the calculation process in the picture correct?",
15 |             "Is the formula in the figure calculated correctly according to the following steps? <calculation process>",
16 |             "Is the length of the hypotenuse of the right triangle in the figure <result>?",
17 |             "Calculate the power of the circuit in the diagram using the letters indicated in the diagram. Is the power of the circuit in the diagram <result>?",
18 |             "Is the area of the <the geometric shapes> in the picture equal to <result>?",
19 |             "Should the value of <the name of the variable> in the picture equal <result>?"
20 |         ]
21 |     },
22 |     "choice": {
23 |         "Chinese": [
24 |             "图片中的计算过程正确吗？为什么？",
25 |             "计算图中的公式。",
26 |             "图中的操作是否恰当？",
27 |             "使用图中所示的字母计算图中电路的功率。",
28 |             "图片中<the geometric shapes>的面积等于多少？",
29 |             "图片中<the name of the variable>的值应该等于多少？"
30 |         ],
31 |         "English": [
32 |             "Is the calculation process in the picture correct? Why?",
33 |             "Calculate the formulas in the picture.",
34 |             "Is the operation in the picture appropriate?",
35 |             "Calculate the power of the circuit in the picture using the letters shown in the picture.",
36 |             "What is the area of <the geometric shapes> in the picture?",
37 |             "What should the value of <the name of the variable> in the picture be equal to?"
38 |         ]
39 |     },
40 |     "sentence": {
41 |         "Chinese": [
42 |             "图片中的计算过程正确吗？为什么？",
43 |             "计算图中的公式。",
44 |             "图中的操作是否恰当？",
45 |             "使用图中所示的字母计算图中电路的功率。",
46 |             "图片中<the geometric shapes>的面积等于多少？",
47 |             "图片中<the name of the variable>的值应该等于多少？"
48 |         ],
49 |         "English": [
50 |             "Is the calculation process in the picture correct? Why?",
51 |             "Calculate the formulas in the picture.",
52 |             "Is the operation in the picture appropriate?",
53 |             "Calculate the power of the circuit in the picture using the letters shown in the picture.",
54 |             "What is the area of <the geometric shapes> in the picture?",
55 |             "What should the value of <the name of the variable> in the picture be equal to?"
56 |         ]
57 |     }
58 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00020_spatial_relationship.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |     "judge": {
 4 |         "Chinese": [
 5 |             "<object1>是否在图像中<object2>的左侧？",
 6 |             "<object1>是否在图像中<object2>的右侧？",
 7 |             "<object1>是否在图像中<object2>的上方？",
 8 |             "<object1>是否在图像中<object2>的下方？",
 9 |             "<object1>是否在图像中<object2>的前面？",
10 |             "<object1>是否在图像中<object2>的后面？",
11 |             "<object1>是否在图像中<object2>的内部？",
12 |             "<object1>是否在图像中<object2>的外部？",
13 |             "图像中<object2>中间部位是否有<object1>？"
14 |         ],
15 |         "English": [
16 |             "Is the <object1> on the left of the <object2> in the image?",
17 |             "Is the <object1> on the right of the <object2> in the image?",
18 |             "Is the <object1> on the top of the <object2> in the image?",
19 |             "Is the <object1> under the <object2> in the image?",
20 |             "Is the <object1> in front of the <object2> in the image?",
21 |             "Is the <object1> behind the <object2> in the image?",
22 |             "Is the <object1> inside the <object2> in the image?",
23 |             "Is the <object1> outside the <object2> in the image?",
24 |             "Is there <object1> in the middle of <object2> in the image?"
25 |         ]
26 |     },
27 |     "choice": {
28 |         "Chinese": [
29 |             "在图像中<object>的左侧的是什么东西？",
30 |             "在图像中<object>的右侧的是什么东西？",
31 |             "在图像中<object>的上方的是什么东西？",
32 |             "在图像中<object>的下方的是什么东西？",
33 |             "在图像中<object>的前面的是什么东西？",
34 |             "在图像中<object>的后面的是什么东西？",
35 |             "图像中，<object>的内部装有什么东西？",
36 |             "图像中，<object>在什么东西的内部？",
37 |             "在图像中<object>中间部位的是什么东西？",
38 |             "图像中<object1>在<object2>的什么方位？"
39 |         ],
40 |         "English": [
41 |             "What is on the left side of <object2> in the image?",
42 |             "What is on the right side of <object2> in the image?",
43 |             "What is on the top of <object2> in the image?",
44 |             "What is under <object2> in the image?",
45 |             "What is in front of <object2> in the image?",
46 |             "What is behind <object2> in the image?",
47 |             "What is inside <object2> in the image?",
48 |             "What is outside <object2> in the image?",
49 |             "What is in the middle of <object2> in the image?",
50 |             "In the image, where is <object1> located relative to <object2>?"
51 |         ]
52 |     },
53 |     "sentence": {
54 |         "Chinese": [
55 |             "在图像中<object>的左侧的是什么东西？",
56 |             "在图像中<object>的右侧的是什么东西？",
57 |             "在图像中<object>的上方的是什么东西？",
58 |             "在图像中<object>的下方的是什么东西？",
59 |             "在图像中<object>的前面的是什么东西？",
60 |             "在图像中<object>的后面的是什么东西？",
61 |             "图像中，<object>的内部装有什么东西？",
62 |             "图像中，<object>在什么东西的内部？",
63 |             "在图像中<object>中间部位的是什么东西？",
64 |             "图像中<object1>在<object2>的什么方位？"
65 |         ],
66 |         "English": [
67 |             "What is on the left side of <object2> in the image?",
68 |             "What is on the right side of <object2> in the image?",
69 |             "What is on the top of <object2> in the image?",
70 |             "What is under <object2> in the image?",
71 |             "What is in front of <object2> in the image?",
72 |             "What is behind <object2> in the image?",
73 |             "What is inside <object2> in the image?",
74 |             "What is outside <object2> in the image?",
75 |             "What is in the middle of <object2> in the image?",
76 |             "In the image, where is <object1> located relative to <object2>?"
77 |         ]
78 |     }
79 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00021_posters.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这部<movie/TV series/animation/game, etc.>的导演是<specific name>吗？",
 5 |             "这部<movie/TV series/animation/game, etc.>的标题是<specific title>吗？",
 6 |             "这部<movie/TV series/animation/game, etc.>来自<specific country or region>吗？",
 7 |             "这个人是<specific movie/TV series/animation/game, etc.>的角色吗？",
 8 |             "这张图片中的角色是在这部<movie/TV series/animation/game, etc.>里的名字是<specific name>吗？",
 9 |             "这张<海报/照片/插图等>描述的是<specific movie/TV series/animation/game, etc.>吗？",
10 |             "这幅插图是<specific movie/TV series/animation/game, etc.>的宣传图片吗？",
11 |             "这个场景属于<specific movie/TV series/animation/game, etc.>的吗？",
12 |             "图片中<movie/TV series/animation/game, etc.>的主要角色是<specific name>吗？"
13 |         ],
14 |         "English": [
15 |             "Is the director of this <movie/TV series/animation/game, etc.> <specific name>?",
16 |             "Is the title of this <movie/TV series/animation/game, etc.> <specific title>?",
17 |             "Does this <movie/TV series/animation/game, etc.> come from <specific country or region>?",
18 |             "Is this person the character of <specific movie/TV series/animation/game, etc.>?",
19 |             "Is the character in this picture named <specific name> in this <movie/TV series/animation/game, etc.>?",
20 |             "Does this <poster/photo/illustration> describe <specific movie/TV series/animation/game, etc.>?",
21 |             "Is this illustration a promotional image for <specific movie/TV series/animation/game, etc.>?",
22 |             "Does this scene belong to <specific movie/TV series/animation/game, etc.>?",
23 |             "Is the main character of <movie/TV series/animation/game, etc.> in the picture <specific name>?"
24 |         ]
25 |     },
26 |     "choice": {
27 |         "Chinese": [
28 |             "这部<movie/TV series/animation/game, etc.>的导演是谁？",
29 |             "这部<movie/TV series/animation/game, etc.>的标题是什么？",
30 |             "这部<movie/TV series/animation/game, etc.>来自哪个国家或地区？",
31 |             "这个人是哪个<movie/TV series/animation/game, etc.>角色？",
32 |             "这张图片中的角色是在这部<movie/TV series/animation/game, etc.>里的名字是什么？",
33 |             "这张<海报/照片/插图等>描述的是哪部<movie/TV series/animation/game, etc.>？",
34 |             "这张<海报/照片/插图等>是从哪里获取的？",
35 |             "这幅插图是什么<movie/TV series/animation/game, etc.>的宣传图片？",
36 |             "这个场景属于哪部<movie/TV series/animation/game, etc.>？",
37 |             "这是哪部<movie/TV series/animation/game, etc.>的剧照？",
38 |             "图片中<movie/TV series/animation/game, etc.>的主要角色是什么？"
39 |         ],
40 |         "English": [
41 |             "Who is the director of this <movie/TV series/animation/game, etc.>?",
42 |             "What is the title of this <movie/TV series/animation/game, etc.>?",
43 |             "Which country or region does this <movie/TV series/animation/game, etc.> come from?",
44 |             "Which <movie/TV series/animation/game, etc.> character is this person?",
45 |             "What is the name of the character in this picture in this <movie/TV series/animation/game, etc.>?",
46 |             "Which <movie/TV series/animation/game, etc.> does this <poster/photo/illustration, etc.> describe?",
47 |             "Where was this <poster/photo/illustration, etc.> obtained from?",
48 |             "What <movie/TV series/animation/game, etc.> promotional image is this illustration?",
49 |             "Which <movie/TV series/animation/game, etc.> does this scene belong to?",
50 |             "What is the main character in the <movie/TV series/animation/game, etc.> in the picture?"
51 |         ]
52 |     },
53 |     "sentence": {
54 |         "Chinese": [
55 |             "这部<movie/TV series/animation/game, etc.>的导演是谁？",
56 |             "这部<movie/TV series/animation/game, etc.>的标题是什么？",
57 |             "这部<movie/TV series/animation/game, etc.>来自哪个国家或地区？",
58 |             "这个人是哪个<movie/TV series/animation/game, etc.>角色？",
59 |             "这张图片中的角色是在这部<movie/TV series/animation/game, etc.>里的名字是什么？",
60 |             "这张<海报/照片/插图等>描述的是哪部<movie/TV series/animation/game, etc.>？",
61 |             "这张<海报/照片/插图等>是从哪里获取的？",
62 |             "这幅插图是什么<movie/TV series/animation/game, etc.>的宣传图片？",
63 |             "这个场景属于哪部<movie/TV series/animation/game, etc.>？",
64 |             "这是哪部<movie/TV series/animation/game, etc.>的剧照？",
65 |             "图片中<movie/TV series/animation/game, etc.>的主要角色是什么？"
66 |         ],
67 |         "English": [
68 |             "Who is the director of this <movie/TV series/animation/game, etc.>?",
69 |             "What is the title of this <movie/TV series/animation/game, etc.>?",
70 |             "Which country or region does this <movie/TV series/animation/game, etc.> come from?",
71 |             "Which <movie/TV series/animation/game, etc.> character is this person?",
72 |             "What is the name of the character in this picture in this <movie/TV series/animation/game, etc.>?",
73 |             "Which <movie/TV series/animation/game, etc.> does this <poster/photo/illustration, etc.> describe?",
74 |             "Where was this <poster/photo/illustration, etc.> obtained from?",
75 |             "What <movie/TV series/animation/game, etc.> promotional image is this illustration?",
76 |             "Which <movie/TV series/animation/game, etc.> does this scene belong to?",
77 |             "What is the main character in the <movie/TV series/animation/game, etc.> in the picture?"
78 |         ]
79 |     }
80 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00022_meme_comprehension.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这个<emoticon/screenshot/picture>之所以<Funny, interesting, thought-provoking or sad, etc>是因为<Specific reasons>吗？",
 5 |             "这个<emoticon/screenshot/picture>背后的内涵是<Specific connotations>吗？",
 6 |             "这个<emoticon/screenshot/picture>的故事与<Specific Story>有关吗？",
 7 |             "这个<emoticon/screenshot/picture>传达了<specific information>的信息吗？",
 8 |             "这个<emoticon/screenshot/picture>背后的故事与<Specific stories/people, etc>相关联吗？",
 9 |             "<specific characteristic>是否是这个<emoticon/screenshot/picture>的一个显著特征？",
10 |             "这个<emoticon/screenshot/picture>通常被用来表达<Funny, interesting, thought-provoking or sad, etc>吗？",
11 |             "<Specific stories/people, etc>是否与这个<emoticon/screenshot/picture>有关联？"
12 |         ],
13 |         "English": [
14 |             "Is this <emoticon/screenshot/picture> <Funny, interesting, thought-provoking or sad, etc> because of <Specific reasons>?",
15 |             "Is the connotation behind this <emoticon/screenshot/picture> <Specific connotations>?",
16 |             "Is the story of this <emoticon/screenshot/picture> related to <Specific Story>?",
17 |             "Does this <emoticon/screenshot/picture> convey the message of <specific information>?",
18 |             "Is the story behind this <emoticon/screenshot/picture> related to <Specific stories/people, etc.>?",
19 |             "Is <specific characteristic> a prominent feature of this <emoticon/screenshot/picture>?",
20 |             "Is this <emoticon/screenshot/picture> commonly used to express <Funny, interesting, thought-provoking or sad, etc.>?",
21 |             "Is <Specific stories/people, etc.> related to this <emoticon/screenshot/picture>?"
22 |         ]
23 |     },
24 |     "choice": {
25 |         "Chinese": [
26 |             "这个<emoticon/screenshot/picture>传达了什么信息？",
27 |             "这个<emoticon/screenshot/picture>通常用于表达什么？",
28 |             "这个<emoticon/screenshot/picture>的出处是什么？",
29 |             "这个<emoticon/screenshot/picture>讲述了什么样的故事？",
30 |             "这个<emoticon/screenshot/picture>的主要目的是什么？",
31 |             "如何理解这个<emoticon/screenshot/picture>？",
32 |             "这个<emoticon/screenshot/picture>背后有什么重要含义？",
33 |             "这个<emoticon/screenshot/picture>中的梗是什么？",
34 |             "这个<emoticon/screenshot/picture>有什么<好笑/悲伤/引人深思>之处？",
35 |             "这个<emoticon/screenshot/picture>有什么独特之处？"
36 |         ],
37 |         "English": [
38 |             "What message does this <emoticon/screenshot/picture> convey?",
39 |             "What is this <emoticon/screenshot/picture> usually used to express?",
40 |             "What is the source of this <emoticon/screenshot/picture>?",
41 |             "What story does this <emoticon/screenshot/picture> tell?",
42 |             "What is the main purpose of this <emoticon/screenshot/picture>?",
43 |             "How to understand this <emoticon/screenshot/picture>?",
44 |             "What is the important meaning behind this <emoticon/screenshot/picture>?",
45 |             "What is the meme in this <emoticon/screenshot/picture>?",
46 |             "What's <funny/sad/thought-provoking> about this <emoticon/screenshot/picture>?",
47 |             "What's unique about this <emoticon/screenshot/picture>?"
48 |         ]
49 |     },
50 |     "sentence": {
51 |         "Chinese": [
52 |             "这个<emoticon/screenshot/picture>传达了什么信息？",
53 |             "这个<emoticon/screenshot/picture>通常用于表达什么？",
54 |             "这个<emoticon/screenshot/picture>的出处是什么？",
55 |             "这个<emoticon/screenshot/picture>讲述了什么样的故事？",
56 |             "这个<emoticon/screenshot/picture>的主要目的是什么？",
57 |             "如何理解这个<emoticon/screenshot/picture>？",
58 |             "这个<emoticon/screenshot/picture>背后有什么重要含义？",
59 |             "这个<emoticon/screenshot/picture>中的梗是什么？",
60 |             "这个<emoticon/screenshot/picture>有什么<好笑/悲伤/引人深思>之处？",
61 |             "这个<emoticon/screenshot/picture>有什么独特之处？"
62 |         ],
63 |         "English": [
64 |             "What message does this <emoticon/screenshot/picture> convey?",
65 |             "What is this <emoticon/screenshot/picture> usually used to express?",
66 |             "What is the source of this <emoticon/screenshot/picture>?",
67 |             "What story does this <emoticon/screenshot/picture> tell?",
68 |             "What is the main purpose of this <emoticon/screenshot/picture>?",
69 |             "How to understand this <emoticon/screenshot/picture>?",
70 |             "What is the important meaning behind this <emoticon/screenshot/picture>?",
71 |             "What is the meme in this <emoticon/screenshot/picture>?",
72 |             "What's <funny/sad/thought-provoking> about this <emoticon/screenshot/picture>?",
73 |             "What's unique about this <emoticon/screenshot/picture>?"
74 |         ]
75 |     }
76 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00023_writing.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这个<img>是否适合在<Specific Purpose/Scenario>下使用？",
 5 |             "使用<标语/宣传口号/广告>来描述这个<img>恰当吗？",
 6 |             "这个<img>是否与<标语/宣传口号/广告>的描述相符合？",
 7 |             "<标语/宣传口号/广告>是否适合用来表达这个<img>的特点？",
 8 |             "这首古诗是否适合用来描述这个<img>？",
 9 |             "<故事/童话/科普介绍/推荐信/电子邮件>是否适合用来描述这个<img>？"
10 |         ],
11 |         "English": [
12 |             "Is this <img> suitable for use under <Specific Purpose/Scenario>?",
13 |             "Is it appropriate to use <slogan/promotional slogan/advertisement> to describe this <img>?",
14 |             "Does this <img> match the description of the <slogan/promotional slogan/advertisement>?",
15 |             "Is the <slogan/promotional slogan/advertisement> suitable for expressing the characteristics of this <img>?",
16 |             "Is this ancient poem suitable for this <img>?",
17 |             "Is <Story/Fairy Tale/Science Popularization Introduction/Recommendation Letter/Email> suitable for this <img>?"
18 |         ]
19 |     },
20 |     "choice": {
21 |         "Chinese": [
22 |             "根据这张<img>，写一篇<故事/童话/科普介绍/推荐信/电子邮件>。",
23 |             "关于<img>中的<object>，写一篇<text_type>。",
24 |             "撰写一篇<text_type>，推荐图中的<object>。",
25 |             "撰写一首描述<img>中<object>的诗。",
26 |             "为这个<object>撰写一则推荐语。",
27 |             "推荐在<img>中看到的<object>，并提供购买原因。",
28 |             "给<somebody>写封电子邮件，介绍图片中的内容。",
29 |             "根据<img>内容，我想与朋友分享这个<img>，该如何推荐。",
30 |             "这个<img>适用于什么<用途/场景>？",
31 |             "为图中产品写一个符合产品特点的<标语/宣传口号/广告>？"
32 |         ],
33 |         "English": [
34 |             "Based on this <img>, write a <story/fairy_tale/popular_science_introduction/recommendation_letter/email>.",
35 |             "Write a <text_type> about the <object> in <img>.",
36 |             "Write a <text_type> recommending the <object> in the picture.",
37 |             "Write a poem describing the <object> in <img>.",
38 |             "Write a recommendation for this <object>.",
39 |             "Recommend the <object> seen in <img> and provide a reason for purchase.",
40 |             "Write an email to <somebody> describing the content in the image.",
41 |             "Based on the content of <img>, I want to share this <img> with my friends. How should I recommend it?",
42 |             "What <purpose/scenario> is this <img> suitable for?",
43 |             "Write a <slogan/promotional slogan/advertisement> for the product in the picture that fits the characteristics of the product."
44 |         ]
45 |     },
46 |     "sentence": {
47 |         "Chinese": [
48 |             "根据这张<img>，写一篇<故事/童话/科普介绍/推荐信/电子邮件>。",
49 |             "关于<img>中的<object>，写一篇<text_type>。",
50 |             "撰写一篇<text_type>，推荐图中的<object>。",
51 |             "撰写一首描述<img>中<object>的诗。",
52 |             "为这个<object>撰写一则推荐语。",
53 |             "推荐在<img>中看到的<object>，并提供购买原因。",
54 |             "给<somebody>写封电子邮件，介绍图片中的内容。",
55 |             "根据<img>内容，我想与朋友分享这个<img>，该如何推荐。",
56 |             "这个<img>适用于什么<用途/场景>？",
57 |             "为图中产品写一个符合产品特点的<标语/宣传口号/广告>？"
58 |         ],
59 |         "English": [
60 |             "Based on this <img>, write a <story/fairy_tale/popular_science_introduction/recommendation_letter/email>.",
61 |             "Write a <text_type> about the <object> in <img>.",
62 |             "Write a <text_type> recommending the <object> in the picture.",
63 |             "Write a poem describing the <object> in <img>.",
64 |             "Write a recommendation for this <object>.",
65 |             "Recommend the <object> seen in <img> and provide a reason for purchase.",
66 |             "Write an email to <somebody> describing the content in the image.",
67 |             "Based on the content of <img>, I want to share this <img> with my friends. How should I recommend it?",
68 |             "What <purpose/scenario> is this <img> suitable for?",
69 |             "Write a <slogan/promotional slogan/advertisement> for the product in the picture that fits the characteristics of the product."
70 |         ]
71 |     }
72 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00024_brand_recognition.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这个<object>来自<Specific brand>吗？",
 5 |             "这张图片中<object>的品牌是<Specific brand>吗？",
 6 |             "这个<object>的制造商是<Specific producer>吗？",
 7 |             "这个<object>的公司是<Specific company>吗？",
 8 |             "这个<object>的国家是<Specific country>吗？"
 9 |         ],
10 |         "English": [
11 |             "Does this <object> come from <Specific brand>?",
12 |             "Is the brand of <object> in this picture <Specific brand>?",
13 |             "Is the manufacturer of this <object> <Specific producer>?",
14 |             "Is the company behind this <object> <Specific company>?",
15 |             "Is the country of origin of this <object> <Specific country>?"
16 |         ]
17 |     },
18 |     "choice": {
19 |         "Chinese": [
20 |             "您能识别这张图片中<object>的品牌吗？",
21 |             "这张图片中<object>的品牌是什么？",
22 |             "这个<object>的制造商是谁？",
23 |             "哪家公司生产的这个<object>？",
24 |             "这张图片中<object>是哪个国家的产品？"
25 |         ],
26 |         "English": [
27 |             "Can you identify the brand of <object> in this picture?",
28 |             "What is the brand of <object> in this picture?",
29 |             "Who is the producer of this <object>?",
30 |             "Which company produces this <object>?",
31 |             "Which country's product is the <object> in this picture?"
32 |         ]
33 |     },
34 |     "sentence": {
35 |         "Chinese": [
36 |             "您能识别这张图片中<object>的品牌吗？",
37 |             "这张图片中<object>的品牌是什么？",
38 |             "这个<object>的制造商是谁？",
39 |             "哪家公司生产的这个<object>？",
40 |             "这张图片中<object>是哪个国家的产品？"
41 |         ],
42 |         "English": [
43 |             "Can you identify the brand of <object> in this picture?",
44 |             "What is the brand of <object> in this picture?",
45 |             "Who is the producer of this <object>?",
46 |             "Which company produces this <object>?",
47 |             "Which country's product is the <object> in this picture?"
48 |         ]
49 |     }
50 | }


--------------------------------------------------------------------------------
/data_engine/all_seed/00025_species_recognition.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "judge": {
 3 |         "Chinese": [
 4 |             "这是<Specific species>吗？",
 5 |             "<Specific species>是否在这张<img>中？",
 6 |             "这个<object>的俗名是<name>吗？",
 7 |             "这个<object>的学名是<name>吗？",
 8 |             "在<img>中的<object>物种是<Specific types>吗？",
 9 |             "在<img>中的<object>品种是<Specific varieties>吗？"
10 |         ],
11 |         "English": [
12 |             "Is this <Specific species>?",
13 |             "Is the <Specific species> in this <img>?",
14 |             "Is the common name of this <object> <name>?",
15 |             "Is the scientific name of this <object> <name>?",
16 |             "Is the <object> species in <img> <specific types>?",
17 |             "Is the <object> variety in <img> <Specific varieties>?"
18 |         ]
19 |     },
20 |     "choice": {
21 |         "Chinese": [
22 |             "这张图中的主要物种是什么？",
23 |             "在这张图中有哪些物种？",
24 |             "识别图中的物种。",
25 |             "这个物种的俗名是什么？",
26 |             "这个物种的学名是什么？",
27 |             "这是一个<object>，属于哪个物种？",
28 |             "这张图片中的<object>属于哪个品种？"
29 |         ],
30 |         "English": [
31 |             "What are the main species in this picture?",
32 |             "What species are there in this picture?",
33 |             "Identify the species in the picture.",
34 |             "What is the common name of this species?",
35 |             "What is the scientific name of this species?",
36 |             "This is an <object>, which species does it belong to?",
37 |             "Which variety does the <object> in this picture belong to?"
38 |         ]
39 |     },
40 |     "sentence": {
41 |         "Chinese": [
42 |             "这张图中的主要物种是什么？",
43 |             "在这张图中有哪些物种？",
44 |             "识别图中的物种。",
45 |             "这个物种的俗名是什么？",
46 |             "这个物种的学名是什么？",
47 |             "这是一个<object>，属于哪个物种？",
48 |             "这张图片中的<object>属于哪个品种？"
49 |         ],
50 |         "English": [
51 |             "What are the main species in this picture?",
52 |             "What species are there in this picture?",
53 |             "Identify the species in the picture.",
54 |             "What is the common name of this species?",
55 |             "What is the scientific name of this species?",
56 |             "This is an <object>, which species does it belong to?",
57 |             "Which variety does the <object> in this picture belong to?"
58 |         ]
59 |     }
60 | }


--------------------------------------------------------------------------------
/data_engine/end_prompt.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "longsentence": {
  3 |         "Chinese": [
  4 |             "请确保你的回答尽可能详细。",
  5 |             "提供详细的解释。",
  6 |             "你的回答需要尽可能详细。",
  7 |             "确保你的回答充分详细。",
  8 |             "在你的答案中深入探讨。",
  9 |             "在你的回答中详细阐述。",
 10 |             "务必提供详细的解答。",
 11 |             "提供详尽的阐释。",
 12 |             "详细解释一下。",
 13 |             "详细阐述你的回答。"
 14 |         ],
 15 |         "English": [
 16 |             " Provide an extensive response.",
 17 |             " Give a comprehensive reply.",
 18 |             " Present a detailed explanation.",
 19 |             " Offer a detailed explanation.",
 20 |             " Be as detailed as possible in your response.",
 21 |             " Be as thorough as you can in your response.",
 22 |             " Go into great depth in your answer.",
 23 |             " Please ensure that your answer is as detailed as possible.",
 24 |             " Your response needs to be as detailed as possible.",
 25 |             " Make sure to provide a detailed answer."
 26 |         ]
 27 |     },
 28 |     "shortsentence": {
 29 |         "Chinese": [
 30 |             "请尽可能简明扼要地回答问题。",
 31 |             "保持你的回复简短明了。",
 32 |             "简明扼要地回答。",
 33 |             "请简明扼要地回答问题。",
 34 |             "你的回答需要尽可能简短。",
 35 |             "在传达必要信息的同时，简明扼要地回应。",
 36 |             "尽可能保持简洁。",
 37 |             "你的答案尽量保持简洁。",
 38 |             "回答需简洁明了。",
 39 |             "请用简明扼要的语言回答。"
 40 |         ],
 41 |         "English": [
 42 |             " Please answer the question as concisely as possible.",
 43 |             " Keep your reply short and clear.",
 44 |             " Keep your reply as brief and clear as possible.",
 45 |             " Provide a brief and clear answer, please.",
 46 |             " Answer concisely.",
 47 |             " Be as brief as you can in your response.",
 48 |             " Respond with brevity while conveying the necessary information.",
 49 |             " Keep it as concise as possible.",
 50 |             " Give a concise and straightforward reply.",
 51 |             " Please keep your answer concise and to the point."
 52 |         ]
 53 |     },
 54 |     "judge": {
 55 |         "Chinese": [
 56 |             "我可以要求以'是'或'否'的形式做出回应吗？",
 57 |             "你的回答应该限于'是'或'否'。",
 58 |             "你需要回答'是'或'否'。",
 59 |             "你需要用'是'还是'否'来回答。",
 60 |             "请用'是'或'否'来回答。",
 61 |             "回答时请使用'是'或'否'。",
 62 |             "请以'是'或'否'的方式作答。",
 63 |             "回答时请使用'是'或'否'。",
 64 |             "请简单地用'是'或'否'回答。",
 65 |             "在回答时，只需选择'是'或'否'。"
 66 |         ],
 67 |         "English": [
 68 |             " Please answer with 'Yes' or 'No'.",
 69 |             " Use 'Yes' or 'No' to reply.",
 70 |             " Indicate your answer by choosing 'Yes' or 'No'.",
 71 |             " Use 'Yes' or 'No' for your response, thank you.",
 72 |             " Your answer should be either 'Yes' or 'No'.",
 73 |             " Provide your response with a simple 'Yes' or 'No'.",
 74 |             " May I ask for a response in the form of 'Yes' or 'No'?",
 75 |             " Should your reply be limited to 'Yes' or 'No'?",
 76 |             " Will you kindly respond with either 'Yes' or 'No'?",
 77 |             " Are you able to provide a response in either 'Yes' or 'No'?"
 78 |         ]
 79 |     },
 80 |     "select": {
 81 |         "Chinese": [
 82 |             "选择正确选项。",
 83 |             "请选择正确的选项。",
 84 |             "做出适当的选择。",
 85 |             "做出恰当的选择。",
 86 |             "请挑选出正确的选项。",
 87 |             "从提供的选项中选择正确的选项。",
 88 |             "在提供的选项中作出适当的选择。",
 89 |             "从给定的选项中选择正确的答案。",
 90 |             "从给出的选项中进行选择。",
 91 |             "请在选项中做出正确的选择。"
 92 |         ],
 93 |         "English": [
 94 |             " Please select a correct choice.",
 95 |             " Choose the correct option.",
 96 |             " Choose the right option.",
 97 |             " Make the appropriate selection.",
 98 |             " Choose the right option from the provided choices.",
 99 |             " Make a selection from the available options.",
100 |             " Make a selection from the given options.",
101 |             " Make an appropriate selection.",
102 |             " Pick the right choice.",
103 |             " Select the correct option from the provided choices."
104 |         ]
105 |     }
106 | }


--------------------------------------------------------------------------------
/data_engine/gpt35_qa.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import re
  3 | import httpx
  4 | from openai import OpenAI
  5 | import random
  6 | import copy
  7 | import json
  8 | import time
  9 | import os
 10 | import logging
 11 | logging.basicConfig(level=logging.INFO)
 12 | logger = logging.getLogger(__name__)
 13 | root_path = 'MMInstruct'
 14 | 
 15 | gpt_keys = [
 16 |     {"idx":0,"key":"openai-key-1"},
 17 |     {"idx":1,"key":"openai-key-2"},
 18 | ]
 19 | MAX_API_RETRY = len(gpt_keys)
 20 | REQ_TIME_GAP = 1
 21 | proxy_url = 'proxy_url'
 22 | key_id = 0
 23 | 
 24 | def one_ask(client, text):
 25 |     content = []
 26 |     content.append({"type": "text", "text": text})
 27 | 
 28 |     response = client.chat.completions.create(
 29 |         model="gpt-3.5-turbo",
 30 |         messages=[{"role": "system", 'content': 'You are a helpful and precise assistant.'}, 
 31 |                   {"role": "user", "content": content}]
 32 |     )
 33 |     return response.choices[0]
 34 | 
 35 | def get_answer(prompt):
 36 |     global key_id
 37 |     for i in range(3):
 38 |         try:
 39 |             api_key = gpt_keys[key_id]['key']
 40 |             proxy_url = proxy_url
 41 |             proxies = {
 42 |             "http://": f"{proxy_url}",
 43 |             "https://": f"{proxy_url}",
 44 |             }
 45 |             http_c = httpx.Client(proxies=proxies)
 46 |             client = OpenAI(api_key=api_key, http_client=http_c)
 47 |             response = one_ask(client, prompt)
 48 |             content = response.message.content
 49 |             return content
 50 |         except Exception as e:
 51 |             key_id += 1
 52 |             key_id = key_id % MAX_API_RETRY
 53 |             logger.info(e)
 54 |             time.sleep(2)
 55 |     logger.info(f"Failed after {MAX_API_RETRY} retries.")
 56 |     return "error"
 57 | 
 58 | 
 59 | choice_prompt = 'Giving the description of an image and a question list including five questions, you need to desigin three multiple choice questions related to the <domain>.\n\
 60 | For each sample, the meaning of generated question MUST be similar to the question in the provided question list, and you need to output four choices as candidates.\n\
 61 | There should be only one choice that is the answer to the question, and this correct choice should be generated according to the description of the image. \n\
 62 | These choices should be indexed by captital letters.\n\
 63 | The description of the image and question list for you are as follows:\n\
 64 | Description: <caption>. \n Question: <original_question_list>. \n  \
 65 | You MUST output the generated question, choices and answer in the following format:\n\
 66 | <Q1> {the generated question 1} </Q1> <C1> {the choices you give} </C1> <A1> {the right choice of the question 1} </A1>\n\
 67 | <Q2> {the generated question 2} </Q2> <C2> {the choices you give} </C2> <A2> {the right choice of the question 2} </A2>\n\
 68 | <Q3> {the generated question 3} </Q3> <C3> {the choices you give} </C3> <A3> {the right choice of the question 3} </A3>\n'
 69 | 
 70 | choice_prompt = '给出图像的描述和问题列表，你需要设计三个与<domain>相关的中文单项选择问题。\n\
 71 | 对于每个样本，生成的问题的含义必须与提供的问题列表中的问题相似，并且你需要输出四个选项作为候选者。\n\
 72 | 并且只有一个选择是问题的正确答案，这个正确答案应该根据图像的描述生成。\n\
 73 | 这些选择应该通过A、B、C、D四个大写字母进行索引。\n\
 74 | 图像相关信息（"Empty"表示没有信息）：<prior_knowledge> \n\
 75 | 描述：<caption>\n\n问题：<question_templates>\n\
 76 | 我给你的问题<>里的内容是占位符，你只需要选择一个最合适的即可，不需要保留两个或者更多。\
 77 | 最后，一定保证你生成的问题符合主题，一定不要生成一些和我提供给你的问题列表中含义差别很大的问题。\
 78 | 你必须以以下格式输出生成的问题、选项和答案：\n\
 79 | <Q1> {the generated question 1} </Q1> <C1> {the choices you give: A. xxx B. xxx C. xxx D. xxx} </C1> <A1> {the right choice of the question 1} </A1>\n\
 80 | <Q2> {the generated question 2} </Q2> <C2> {the choices you give: A. xxx B. xxx C. xxx D. xxx} </C2> <A2> {the right choice of the question 2} </A2>\n\
 81 | <Q3> {the generated question 3} </Q3> <C3> {the choices you give: A. xxx B. xxx C. xxx D. xxx} </C3> <A3> {the right choice of the question 3} </A3>\n'
 82 | 
 83 | def generate_choice(domain, begin_ix):
 84 |     captions_path = f'{root_path}/{domain}/{domain}_caption.jsonl'
 85 |     generated_queations_path = f'{root_path}/{domain}/{domain}_choice.jsonl'
 86 |     seed_json = f'{root_path}/all_seed/{domain}.json'
 87 | 
 88 |     questions_model = []
 89 |     with open(seed_json, "r", encoding='utf-8') as file:
 90 |         try:
 91 |             json_data = json.load(file)
 92 |             questions_model = json_data["select"]["Chinese"]
 93 |         except:
 94 |             logger.info('读取问题种子失败')
 95 | 
 96 |     ix = 0
 97 |     with open(captions_path, 'r', encoding='utf-8') as f:
 98 |         for line in f:
 99 |             ix += 1
100 |             if ix < begin_ix:
101 |                 continue
102 |             
103 |             questions_model_list = random.sample(questions_model, min(3, len(questions_model)))
104 |             caption_dict = json.loads(line)
105 |             prompt = choice_prompt
106 | 
107 |             prior_knowledge = str(caption_dict.get("bing_tag", 'Empty'))
108 |             if prior_knowledge == "":
109 |                 prior_knowledge = "Empty"
110 | 
111 |             prompt = prompt.replace("<domain>", domain[6:])
112 |             prompt = prompt.replace('<prior_knowledge>', prior_knowledge)
113 |             prompt = prompt.replace("<caption>", caption_dict['gpt4v_caption_interface'].replace("\n\n","\n"))
114 |             prompt = prompt.replace('<question_templates>', str(questions_model_list))
115 |             try:
116 |                 out = get_answer(prompt)
117 |                 logger.info("[prompt]\n" + prompt)
118 |                 logger.info("[image_path]:" + caption_dict['image_path'] + "\n[GPT OUT]: \n" + str(out))
119 | 
120 |                 question_dict = {
121 |                     "image_path": caption_dict['image_path'], 
122 |                     "qa_raw": str(out),
123 |                     "gpt_prompt": prompt    
124 |                 }
125 |                 open(generated_queations_path, 'a', encoding='utf-8').write(
126 |                     json.dumps(question_dict, ensure_ascii=False)+'\n'
127 |                 )
128 | 
129 |             except Exception as e:
130 |                 logger.info(str(ix) + "  [ERROR]")
131 |                 logger.info("error info:" + str(repr(e)))
132 |                 caption_dict['err'] = str(repr(e))
133 |                 logger.info("error image path:" + caption_dict['image_path'])
134 |                 open(generated_queations_path, 'a', encoding='utf-8').write(
135 |                     json.dumps(caption_dict, ensure_ascii=False)+'\n'
136 |                 )
137 | 
138 |     logger.info('****done****')
139 |     logger.info("total generate " + str(ix) + " {}  pairs. ")
140 | 
141 | 
142 | 
# English long-answer prompt template. It is rebound by the Chinese template
# directly below, so it is never used at runtime.
# NOTE(review): it uses the <original_question_list> placeholder and has no
# <prior_knowledge> slot, whereas generate_long_qa() substitutes
# <question_templates> and <prior_knowledge> -- adjust before re-enabling it.
lqa_prompt = 'Provide a description of an image and a list of multiple questions, you need to desigin three long question answering questions related to the <domain>.\n\
For each sample, the meaning of generated question MUST be similar to the question in the provided question list, and you need to output a detailed answer to the question.\n\
The detailed answer to this question should be generated based on the description of the image.\n\
The description of the image and question list for you are as follows:\n\
Description: <caption>. \n Question: <original_question_list>. \n  \
You MUST output the generated questions and answers in the following format:\n\
<Q1> {the generated question 1} </Q1> <A1> {the long answer of the question 1} </A1>\n\
<Q2> {the generated question 2} </Q2> <A2> {the long answer of the question 2} </A2>\n\
<Q3> {the generated question 3} </Q3> <A3> {the long answer of the question 3} </A3>\n'

# Chinese long-answer prompt template; this is the value generate_long_qa()
# actually reads. Placeholders: <domain>, <prior_knowledge>, <caption>,
# <question_templates>.
lqa_prompt = '给出图像的描述和问题列表，你需要设计三个与<domain>相关的中文长问答问题。\n\
对于每个样本，生成的问题的含义必须与提供的问题列表中的问题相似，并且你需要输出该问题的详细答案。\n\
这个问题的详细答案应该根据图像的描述生成。\n\
图像相关信息（"Empty"表示没有信息）：<prior_knowledge> \n\
描述：<caption>\n 问题：<question_templates>\n\
你必须以以下格式输出生成的问题和答案：\n\
<Q1> {the generated question 1} </Q1> <A1> {the long answer of the question 1} </A1>\n\
<Q2> {the generated question 2} </Q2> <A2> {the long answer of the question 2} </A2>\n\
<Q3> {the generated question 3} </Q3> <A3> {the long answer of the question 3} </A3>\n'
162 | 
def generate_long_qa(domain, begin_ix=0):
    """Generate long question/answer pairs for every caption of ``domain``.

    Reads ``{root_path}/{domain}/{domain}_caption.jsonl`` line by line, fills
    ``lqa_prompt`` and sends it to ``get_answer``. Successful results are
    appended to ``{domain}_lqa.jsonl``; captions whose request raised are
    appended, with an added ``err`` field, to ``{domain}_lqa_err.jsonl``.

    The caption text is taken from the ``caption`` field when present and
    otherwise from ``gpt4v_caption_interface`` (the field generate_choice and
    generate_judge read from the same file). Likewise the prior knowledge
    comes from ``prior``, then ``bing_tag``, then the literal ``Empty``.

    Args:
        domain: Name of the domain directory / seed file; inserted into the
            prompt unchanged.
        begin_ix: 1-based number of the first caption line to process; lines
            with a smaller number are skipped.
    """
    print("\n\n****start lqa and answer working****\n\n")
    captions_path = f'{root_path}/{domain}/{domain}_caption.jsonl'
    generated_queations_path = f'{root_path}/{domain}/{domain}_lqa.jsonl'
    error_dest_path = f'{root_path}/{domain}/{domain}_lqa_err.jsonl'
    seed_json = f'{root_path}/all_seed/{domain}.json'

    questions_model = []
    with open(seed_json, "r", encoding='utf-8') as file:
        try:
            questions_model = json.load(file)["select"]["Chinese"]
        except (ValueError, KeyError, TypeError):
            # Best effort: keep going with an empty seed list, as before.
            print('读取问题种子失败')

    ix = 0
    with open(captions_path, 'r', encoding='utf-8') as f:
        for line in f:
            ix += 1
            if ix < begin_ix:
                continue

            questions_model_list = random.sample(questions_model, min(3, len(questions_model)))
            print("questions_model_list: " + str(questions_model_list))

            caption_dict = json.loads(line)

            # Prefer the original field names; fall back to the names used by
            # the other generators so the same caption file works everywhere.
            caption_text = caption_dict.get('caption')
            if caption_text is None:
                caption_text = caption_dict['gpt4v_caption_interface']
            prior_knowledge = caption_dict.get('prior', caption_dict.get('bing_tag', 'Empty'))

            prompt = lqa_prompt
            prompt = prompt.replace("<domain>", domain)
            prompt = prompt.replace('<prior_knowledge>', str(prior_knowledge))
            prompt = prompt.replace("<caption>", caption_text)
            prompt = prompt.replace('<question_templates>', str(questions_model_list))
            try:
                out = get_answer(prompt)
                print("[image_path]: \n" + caption_dict['image_path'] + "\n\n[GPT OUT]: \n" + str(out))
                question_dict = {
                    "image_path": caption_dict['image_path'],
                    "qa_raw": str(out),
                    "gpt_prompt": prompt
                }
                # Open per record so the handle is closed and data flushed.
                with open(generated_queations_path, 'a', encoding='utf-8') as out_file:
                    out_file.write(json.dumps(question_dict, ensure_ascii=False) + '\n')
            except Exception as e:
                print(str(ix) + "  [ERROR]")
                print("error info:" + str(repr(e)))
                caption_dict['err'] = str(repr(e))
                print("error image path:" + caption_dict['image_path'])
                with open(error_dest_path, 'a', encoding='utf-8') as err_file:
                    err_file.write(json.dumps(caption_dict, ensure_ascii=False) + '\n')

    print('****done****')
    # ix counts caption lines read, including skipped and failed ones.
    print("total generate " + str(ix) + " pairs. ")
216 | 
217 | 
# English short-answer prompt template. It is rebound by the Chinese template
# directly below, so it is never used at runtime.
# NOTE(review): it uses the <original_question_list> placeholder and has no
# <prior_knowledge> slot, whereas generate_short_qa() substitutes
# <question_templates> and <prior_knowledge> -- adjust before re-enabling it.
sqa_prompt = 'Provide a description of an image and a list of multiple questions, you need to desigin three short question answering questions related to the <domain>.\n\
For each sample, the meaning of generated question MUST be similar to the question in the provided question list, and you need to output a few words or short sentences as a short answer to the question.\n\
The answer to this question should be generated based on the description of the image.\n\
The description of the image and question list for you are as follows:\n\
Description: <caption>. \n Question: <original_question_list>. \n  \
You MUST output the generated questions and answers in the following format:\n\
<Q1> {the generated question 1} </Q1> <A1> {the short answer of the question 1} </A1>\n\
<Q2> {the generated question 2} </Q2> <A2> {the short answer of the question 2} </A2>\n\
<Q3> {the generated question 3} </Q3> <A3> {the short answer of the question 3} </A3>\n'

# Chinese short-answer prompt template; this is the value generate_short_qa()
# actually reads. Placeholders: <domain>, <prior_knowledge>, <caption>,
# <question_templates>.
sqa_prompt = '给出图像的描述和问题列表，你需要设计三个与<domain>相关的中文短问答问题。\n\
对于每个样本，生成的问题的含义必须与提供的问题列表中的问题相似，并且你需要输出几个单词或短句作为问题的简短答案。\n\
这个问题的短答案应该根据图像的描述生成。\n\
图像相关信息（"Empty"表示没有信息）：<prior_knowledge> \n\
描述：<caption>\n 问题：<question_templates>\n\
你必须以以下格式输出生成的问题和答案：\n\
<Q1> {the generated question 1} </Q1> <A1> {the short answer of the question 1} </A1>\n\
<Q2> {the generated question 2} </Q2> <A2> {the short answer of the question 2} </A2>\n\
<Q3> {the generated question 3} </Q3> <A3> {the short answer of the question 3} </A3>\n'
237 | 
def generate_short_qa(domain, begin_ix=0):
    """Generate short question/answer pairs for every caption of ``domain``.

    Reads ``{root_path}/{domain}/{domain}_caption.jsonl`` line by line, fills
    ``sqa_prompt`` and sends it to ``get_answer``. Successful results are
    appended to ``{domain}_sqa.jsonl``; captions whose request raised are
    appended, with an added ``err`` field, to ``{domain}_sqa_err.jsonl``.

    The caption text is taken from the ``caption`` field when present and
    otherwise from ``gpt4v_caption_interface`` (the field generate_choice and
    generate_judge read from the same file). Likewise the prior knowledge
    comes from ``prior``, then ``bing_tag``, then the literal ``Empty``.

    Args:
        domain: Name of the domain directory / seed file; inserted into the
            prompt unchanged.
        begin_ix: 1-based number of the first caption line to process; lines
            with a smaller number are skipped.
    """
    print("\n\n****start sqa and answer working****\n\n")
    captions_path = f'{root_path}/{domain}/{domain}_caption.jsonl'
    generated_queations_path = f'{root_path}/{domain}/{domain}_sqa.jsonl'
    error_dest_path = f'{root_path}/{domain}/{domain}_sqa_err.jsonl'
    seed_json = f'{root_path}/all_seed/{domain}.json'

    questions_model = []
    with open(seed_json, "r", encoding='utf-8') as file:
        try:
            questions_model = json.load(file)["select"]["Chinese"]
        except (ValueError, KeyError, TypeError):
            # Best effort: keep going with an empty seed list, as before.
            print('读取问题种子失败')

    ix = 0
    with open(captions_path, 'r', encoding='utf-8') as f:
        for line in f:
            ix += 1
            if ix < begin_ix:
                continue

            questions_model_list = random.sample(questions_model, min(3, len(questions_model)))
            caption_dict = json.loads(line)

            # Prefer the original field names; fall back to the names used by
            # the other generators so the same caption file works everywhere.
            caption_text = caption_dict.get('caption')
            if caption_text is None:
                caption_text = caption_dict['gpt4v_caption_interface']
            prior_knowledge = caption_dict.get('prior', caption_dict.get('bing_tag', 'Empty'))

            prompt = sqa_prompt
            prompt = prompt.replace("<domain>", domain)
            prompt = prompt.replace('<prior_knowledge>', str(prior_knowledge))
            prompt = prompt.replace("<caption>", caption_text)
            prompt = prompt.replace('<question_templates>', str(questions_model_list))
            try:
                out = get_answer(prompt)
                print("[image_path]: \n" + caption_dict['image_path'] + "\n\n[GPT OUT]: \n" + str(out))
                question_dict = {
                    "image_path": caption_dict['image_path'],
                    "qa_raw": str(out),
                    "gpt_prompt": prompt
                }
                # Open per record so the handle is closed and data flushed.
                with open(generated_queations_path, 'a', encoding='utf-8') as out_file:
                    out_file.write(json.dumps(question_dict, ensure_ascii=False) + '\n')
            except Exception as e:
                print(str(ix) + "  [ERROR]")
                print("error info:" + str(repr(e)))
                caption_dict['err'] = str(repr(e))
                print("error image path:" + caption_dict['image_path'])
                with open(error_dest_path, 'a', encoding='utf-8') as err_file:
                    err_file.write(json.dumps(caption_dict, ensure_ascii=False) + '\n')

    print('****done****')
    # ix counts caption lines read, including skipped and failed ones.
    print("total generate " + str(ix) + " pairs. ")
290 | 
291 | 
292 | 
# English true/false prompt template. It is rebound by the Chinese template
# directly below, so it is never used at runtime.
# NOTE(review): it uses the <original_question_list> placeholder and has no
# <prior_knowledge> slot, whereas generate_judge() substitutes
# <question_templates> and <prior_knowledge> -- adjust before re-enabling it.
judge_prompt = 'Provide a description of an image and a list of multiple questions, you need to desigin three true or false questions related to the <domain>.\n\
For each sample, the meaning of generated question MUST be similar to the question in the provided question list, and you need to output "Yes" or "No" as the answer to the question.\n\
The answer to this question should be generated based on the description of the image.\n\
The description of the image and question list for you are as follows:\n\
Description: <caption>. \n Question: <original_question_list>. \n  \
You MUST output the generated questions and answers in the following format:\n\
<Q1> {the generated question 1} </Q1> <C1> {"Yes",  "No"} </C1> <A1> {the right choice of the question 1} </A1>\n\
<Q2> {the generated question 2} </Q2> <C2> {"Yes",  "No"} </C2> <A2> {the right choice of the question 2} </A2>\n\
<Q3> {the generated question 3} </Q3> <C3> {"Yes",  "No"} </C3> <A3> {the right choice of the question 3} </A3>\n'

# Chinese true/false prompt template; this is the value generate_judge()
# actually reads. It asks for four questions (two answered "是", two "否").
# The <Q4> format line now refers to "question 4"; it previously repeated
# "question 3", a copy-paste slip in the format the model is told to follow.
judge_prompt = '给出图像的描述和问题列表，你需要设计四个与<domain>相关的中文判断题。\n\
对于每个样本，生成的问题的含义必须与提供的问题列表中的问题相似，并且你需要输出“是”或“否”作为问题的答案。\n\
注意答案只能是“是”或“否”的其中之一，这个正确答案应该根据图像的描述生成。\n\
图像相关信息（"Empty"表示没有信息）：<prior_knowledge> \n\
描述：<caption>\n 问题：<question_templates>\n\
我给你的问题<>里的内容是占位符，你需要进行根据图像相关信息和描述来生成。\n\
你生成的四个判断题题目，应该保证其根据图像的描述生成的对应正确答案中的两个为“是”，另外两个为“否”。\n\
答案为“否”的判断题题目，你可以随机生成一些错误但与图像相关信息和描述相关的词语。\n\
最后，一定保证你生成的问题逻辑通顺、符合主题且与图像相关信息和描述相关。\n\
你必须以下格式输出生成的问题、选项和答案：\n\
<Q1> {the generated question 1} </Q1> <C1> {"是",  "否"} </C1> <A1> {"是" or "否"} </A1>\n\
<Q2> {the generated question 2} </Q2> <C2> {"是",  "否"} </C2> <A2> {"是" or "否"} </A2>\n\
<Q3> {the generated question 3} </Q3> <C3> {"是",  "否"} </C3> <A3> {"是" or "否"} </A3>\n\
<Q4> {the generated question 4} </Q4> <C4> {"是",  "否"} </C4> <A4> {"是" or "否"} </A4>\n\
'
318 | 
def generate_judge(domain, begin_ix=0):
    """Generate yes/no (judge) questions for every caption of ``domain``.

    Reads ``{root_path}/{domain}/{domain}_caption.jsonl`` line by line, fills
    ``judge_prompt`` with the caption, the Bing tag and up to three randomly
    sampled seed questions, sends it to ``get_answer`` and appends one JSON
    line per caption to ``{root_path}/{domain}/{domain}_judge.jsonl``.

    When the request raises, the caption record, extended with an ``err``
    field, is appended to the same output file and processing continues.
    If the seed questions cannot be read the function logs that and returns
    without processing any caption.

    Args:
        domain: Name of the domain directory / seed file. The first six
            characters are stripped (``domain[6:]``) before the name is
            inserted into the prompt.
        begin_ix: 1-based number of the first caption line to process; lines
            with a smaller number are skipped.
    """
    print("\n\n****start judge and answer working****\n\n")
    captions_path = f'{root_path}/{domain}/{domain}_caption.jsonl'
    generated_queations_path = f'{root_path}/{domain}/{domain}_judge.jsonl'
    seed_json = f'{root_path}/all_seed/{domain}.json'

    with open(seed_json, "r", encoding='utf-8') as file:
        try:
            questions_model = json.load(file)["judge"]["Chinese"]
        except (ValueError, KeyError, TypeError):
            logger.info('读取问题种子失败')
            return

    ix = 0
    with open(captions_path, 'r', encoding='utf-8') as f:
        for line in f:
            ix += 1
            if ix < begin_ix:
                continue

            questions_model_list = random.sample(questions_model, min(3, len(questions_model)))
            caption_dict = json.loads(line)

            prior_knowledge = str(caption_dict.get("bing_tag", 'Empty'))
            if prior_knowledge == "":
                prior_knowledge = "Empty"

            prompt = judge_prompt
            prompt = prompt.replace("<domain>", domain[6:])
            prompt = prompt.replace('<prior_knowledge>', prior_knowledge)
            prompt = prompt.replace("<caption>", caption_dict['gpt4v_caption_interface'].replace("\n\n", "\n"))
            prompt = prompt.replace('<question_templates>', str(questions_model_list))
            try:
                out = get_answer(prompt)
                question_dict = {
                    "image_path": caption_dict['image_path'],
                    "qa_raw": str(out),
                    "gpt_prompt": prompt
                }
                # Open per record so the handle is closed and data flushed.
                with open(generated_queations_path, 'a', encoding='utf-8') as out_file:
                    out_file.write(json.dumps(question_dict, ensure_ascii=False) + '\n')

            except Exception as e:
                logger.info(str(ix) + "  [ERROR]")
                logger.info("error info:" + str(repr(e)))
                caption_dict['err'] = str(repr(e))
                logger.info("error image path:" + caption_dict['image_path'])
                with open(generated_queations_path, 'a', encoding='utf-8') as out_file:
                    out_file.write(json.dumps(caption_dict, ensure_ascii=False) + '\n')

    logger.info('****done****')
    # ix counts caption lines read, including skipped and failed ones.
    logger.info("total generate " + str(ix) + " pairs. ")
374 | 
if __name__ == "__main__":
    # Script entry point: run single-choice generation for one domain from the
    # first caption line.
    # NOTE(review): generate_choice inserts domain[6:] into the prompt, which
    # is an empty string for "poster" -- confirm whether a prefixed domain
    # name is expected here.
    domain = "poster"
    generate_choice(domain=domain, begin_ix=0)
378 | 


--------------------------------------------------------------------------------
/data_engine/gpt4v_caption.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import json
  3 | from openai import OpenAI
  4 | from PIL import Image
  5 | import imghdr
  6 | import base64
  7 | import io
  8 | import httpx
  9 | import logging
 10 | import time
 11 | import os
 12 | 
 13 | logging.basicConfig(level=logging.INFO)
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | gpt_keys = [
 17 |     {"idx":0,"key":"openai-key-1"},
 18 |     {"idx":1,"key":"openai-key-2"},
 19 | ]
 20 | 
 21 | MAX_API_RETRY = len(gpt_keys)
 22 | key_id = 0
 23 | proxy_url = 'proxy_url'
 24 | 
 25 | def list_to_str(tmp):
 26 |     res = ''
 27 |     for item in tmp:
 28 |         res += '\n' + str(item)
 29 |     return res
 30 | 
 31 | def one_ask(text, image_paths, image_size=(512, 512), detail='low'):
 32 |     global key_id
 33 |     for i in range(MAX_API_RETRY):
 34 |         try:
 35 |             api_key = gpt_keys[key_id]['key']
 36 |             proxy_url = proxy_url
 37 |             proxies = {
 38 |                 "http://": f"{proxy_url}",
 39 |                 "https://": f"{proxy_url}",
 40 |             }
 41 |             http_c = httpx.Client(proxies=proxies)
 42 |             client = OpenAI(api_key=api_key, http_client=http_c)
 43 | 
 44 |             content = []
 45 |             content.append({"type": "text", "text": text})
 46 |             for image in image_paths:
 47 |                 image_type = imghdr.what(image)
 48 | 
 49 |                 with Image.open(image) as img:
 50 |                     # 缩略图
 51 |                     if img.size[0] > image_size[0] or img.size[1] > image_size[1]:
 52 |                         img.thumbnail(image_size, Image.LANCZOS)
 53 |                     byte_stream = io.BytesIO()
 54 |                     img.save(byte_stream, format=image_type)
 55 |                     encoded_string = base64.b64encode(byte_stream.getvalue()).decode('utf-8')
 56 | 
 57 |                 img_src_attr_value = f'data:image/{image_type};base64,{encoded_string}'
 58 |                 content.append({"type": "image_url", "image_url": {"url": img_src_attr_value, "detail": detail}})
 59 | 
 60 |             response = client.chat.completions.create(
 61 |                 model="gpt-4-vision-preview",
 62 |                 messages=[{"role": "user", "content": content}],
 63 |                 max_tokens=4096,
 64 |             )
 65 |             content = response.choices[0].message.content
 66 |             logger.info(text)
 67 |             logger.info(content)
 68 |             key_id += 1
 69 |             key_id = key_id % MAX_API_RETRY
 70 | 
 71 |             return content
 72 |         except Exception as e:
 73 |             key_id += 1
 74 |             key_id = key_id % MAX_API_RETRY
 75 |             logger.error('[error in one ask]:' + repr(e))
 76 |             time.sleep(1.5)
 77 |     logger.error(f"Failed after {MAX_API_RETRY} retries.")
 78 |     return "error"
 79 | 
 80 | 
# Prompt used when the record carries no text. The English version is rebound
# by the Chinese version on the next line, so only the Chinese prompt is used.
caption_prompt = "Please describe the image for me in as much detail as possible. You need to generate a description of at least 120 words. If you can, identify what objects are present in the image."
caption_prompt = "请尽可能详细描述这幅图像。你需要生成至少200字的描述。如果可以的话，识别图像中的物体。"

# Prompt used when the record carries text; get_gpt4v_caption() replaces
# <ocr_text> with that text. Here the Chinese version is rebound by the
# English version on the next line, so only the English prompt is used.
caption_prompt_text = "这是一张图和图中的文字信息，文字信息内容为：<ocr_text>。根据图片本身和其中的文本内容理解这幅图，然后尽可能详细描述这幅图像，生成至少200字的描述。"
caption_prompt_text = "This is an image accompanied by text information, with the content of the text being: <ocr_text>. Based on both the image itself and the text content, understand the image and then describe it as comprehensively as possible, generating a description of at least 200 words."
 86 | 
 87 | def get_gpt4v_caption(img_folder, source_path, dest_path, begin_ix):
 88 |     with open(source_path, 'r', encoding='utf-8') as f:
 89 |         source_json = json.load(f)
 90 |     
 91 |     for ix, data in enumerate(source_json):
 92 |         if ix < begin_ix:
 93 |             continue
 94 | 
 95 |         logger.info("processing " + str(ix) +" total " + str(len(source_json)))
 96 |         try: 
 97 |             prompt = caption_prompt
 98 |             if len(data['text']) >= 1:
 99 |                 # if text in image, new prompt
100 |                 prompt = caption_prompt_text.replace("<ocr_text>",list_to_str(data['text']))
101 | 
102 |             image_file = os.path.join(img_folder, data['image_path'])
103 |             logger.info(image_file)
104 |             gptout = one_ask(prompt, [image_file] ) # max 4 image at one ask
105 |         
106 |             new_item = data.copy()
107 |             new_item['gpt4v_caption_interface'] = gptout
108 |             new_item['gpt4v_prompt'] = prompt
109 | 
110 |             open(dest_path, 'a', encoding='utf-8').write(
111 |                 json.dumps(new_item, ensure_ascii=False)+'\n'
112 |             )
113 |             time.sleep(1.5)
114 |         except Exception as e:
115 |             logger.error("[error]: " + str(repr(e)))
116 | 
117 | 
if __name__ == '__main__':
    # Script entry point: caption the images of the "poster" set, starting
    # with the first record.
    img_folder, source_path, dest_path = 'poster', 'poster.jsonl', 'poster_caption.jsonl'
    begin_ix = 0
    get_gpt4v_caption(
        img_folder,
        source_path,
        dest_path,
        begin_ix,
    )
    logger.info("done.")
125 | 


--------------------------------------------------------------------------------
/data_engine/image_retrieval_bing_spider.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from lxml import etree
  3 | import os
  4 | from multiprocessing.dummy import Pool
  5 | import json
  6 | import time
  7 | from PIL import Image
  8 | 
  9 | class BingImagesSpider:
 10 |     thread_amount = 1000
 11 |     per_page_images = 30
 12 |     count = 0
 13 |     success_count = 0
 14 |     ignore_chars = ['|', '.', '，', ',', '', '',
 15 |                     '/', '@', ':', '：', ';', '；', '[', ']', '+', ' - ']
 16 |     image_types = ['jpg', 'png','jpeg']
 17 |     headers = {
 18 |         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
 19 |     }
 20 |     bing_image_url_pattern = 'https://www.bing.com/images/async?q={}&first={}&count={}&mmasync=1'
 21 | 
 22 |     def __init__(self, domain, keywords, amount, save_url, json_path):
 23 |         self.domain = domain
 24 |         self.json_path = json_path
 25 |         self.keywords = keywords
 26 |         self.keyword = None
 27 |         self.amount = amount
 28 |         self.path = save_url
 29 |         self.item_list = []
 30 |         self.thread_pool = Pool(self.thread_amount)
 31 | 
 32 |     def __del__(self):
 33 |         self.thread_pool.close()
 34 |         self.thread_pool.join()
 35 | 
 36 |     def request_homepage(self, url):
 37 |         return requests.get(url, headers=self.headers)
 38 | 
 39 |     def parse_homepage_response(self, response):
 40 |         tree = etree.HTML(response.text)
 41 |         m_list = tree.xpath('//*[@class="imgpt"]/a/@m')
 42 | 
 43 |         info_list = []
 44 |         for m in m_list:
 45 |             dic = json.loads(m)
 46 |             image_title = dic['t']
 47 |             for char in self.ignore_chars:
 48 |                 image_title = image_title.replace(char, ' ')
 49 |             image_title = image_title.replace(
 50 |                 "   ", " ").replace("  ", " ").strip()
 51 | 
 52 |             image_type = dic['murl'].split('.')[-1]
 53 |             if image_type not in self.image_types:
 54 |                 image_type = 'jpg'
 55 | 
 56 |             info = dict()
 57 |             info['image_title'] = image_title
 58 |             info['image_type'] = image_type
 59 |             info['image_md5'] = dic['md5']
 60 |             info['image_url'] = dic['murl']
 61 | 
 62 |             info_list.append(info)
 63 |         return info_list
 64 | 
 65 |     def request_and_save_image(self, info):
 66 |         try:
 67 |             bing_tag = info['image_title']
 68 |             filename = '{}.{}'.format(
 69 |                 self.domain + '_' + str(int(time.time() * 1e6))[-10:], info['image_type']) 
 70 |             filepath = os.path.join(self.path, filename)
 71 | 
 72 |             response = requests.get(info['image_url'], headers=self.headers, timeout=1.5)
 73 |             with open(filepath, 'wb') as fp:
 74 |                 fp.write(response.content)
 75 |             
 76 |             self.count += 1
 77 |             self.success_count += 1
 78 |             self.item_list.append({
 79 |                 'image_path': filename,
 80 |                 'bing_tag': bing_tag,
 81 |                 'retrieval_keyword': self.keyword,
 82 |                 'source': "bing",
 83 |             })
 84 | 
 85 |         except Exception as e:
 86 |             self.count += 1
 87 | 
 88 |     def deduplication(self, info_list):
 89 |         result = []
 90 |         md5_set = set()
 91 |         for info in info_list:
 92 |             if info['image_md5'] not in md5_set:
 93 |                 result.append(info)
 94 |                 md5_set.add(info['image_md5'])
 95 |         return result
 96 | 
 97 |     def run_all(self):
 98 |         print("*** spider ***")
 99 |         if not os.path.exists(self.path):
100 |             os.mkdir(self.path)
101 | 
102 |         self.keyword = None
103 |         self.item_list = []
104 |         for keyword in self.keywords:
105 |             self.keyword = keyword
106 |             print(f'keyword: {keyword}')
107 |             self.run()
108 |             time.sleep(5)
109 | 
110 |         print('done, save total ' +
111 |               str(len(self.item_list)) + ' images.')
112 |         with open(self.json_path, 'a', encoding='utf-8') as output_file:
113 |             for item in self.item_list:
114 |                 output_file.write(json.dumps(item, ensure_ascii=False) + '\n')
115 | 
116 |     def run(self):
117 |         homepage_urls = []
118 |         for i in range(int(self.amount/self.per_page_images * 3) + 1):
119 |             url = self.bing_image_url_pattern.format(
120 |                 self.keyword, i*self.per_page_images, self.per_page_images)
121 |             homepage_urls.append(url)
122 |         print('homepage_urls len {}'.format(len(homepage_urls)))
123 | 
124 |         homepage_responses = self.thread_pool.map(
125 |             self.request_homepage, homepage_urls)
126 | 
127 |         info_list = []
128 |         for response in homepage_responses:
129 |             try:
130 |                 result = self.parse_homepage_response(response)
131 |                 info_list += result
132 |             except Exception as e:
133 |                 pass
134 |         print('info amount before deduplication', len(info_list))
135 | 
136 |         info_list = self.deduplication(info_list)
137 |         print('info amount after deduplication', len(info_list))
138 |         info_list = info_list[: self.amount]
139 |         print('info amount after split', len(info_list))
140 | 
141 |         self.thread_pool.map(self.request_and_save_image, info_list)
142 | 
143 |         print('{} done. Total {} successfully downloaded, {} failed.'.format(self.keyword,
144 |                                                                              self.success_count, self.count - self.success_count))
145 | 
146 | 
def read_keywords(file_path):
    """Read search keywords from a text file, one keyword per line.

    Surrounding whitespace is stripped, blank lines are dropped (they would
    otherwise become an empty search keyword) and duplicates are removed
    while keeping the order of first appearance.

    Args:
        file_path: Path of the UTF-8 encoded keyword file.

    Returns:
        List of unique, non-empty keywords.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        # dict.fromkeys de-duplicates and preserves insertion order.
        return list(dict.fromkeys(keyword for keyword in stripped if keyword))
152 | 
def remove_broken(image_path):
    """Delete every file in ``image_path`` that PIL cannot open as an image.

    Sub-directories are left untouched. The number of removed files is
    printed at the end.

    Args:
        image_path: Directory whose files are checked.
    """
    removed = 0
    for image_name in os.listdir(image_path):
        full_path = os.path.join(image_path, image_name)
        # os.remove cannot delete directories, so only look at regular files.
        if not os.path.isfile(full_path):
            continue
        try:
            # The context manager releases the file handle right away.
            with Image.open(full_path):
                pass
        except Exception:
            removed += 1
            os.remove(full_path)
    print("remove ", removed, " images")
164 | 
165 | 
if __name__ == "__main__":
    # Script entry point: crawl Bing images for every domain in domain_list
    # and afterwards delete downloads that cannot be opened as images.
    root_path = 'MMInstruct'
    domain_list = ["poster"]
    os.makedirs(os.path.join(root_path, 'bing_images'), exist_ok=True)
    os.makedirs(os.path.join(root_path, 'bing_images/json'), exist_ok=True)
    for domain in domain_list:
        # Join with a separator: plain concatenation produced
        # 'MMInstructkeywords/<domain>.txt'.
        keywords = read_keywords(os.path.join(root_path, 'keywords', domain + '.txt'))
        print(f'keywords: {keywords}')
        count = 15  # images to download per keyword
        save_path = os.path.join(root_path, 'bing_images', domain)
        os.makedirs(save_path, exist_ok=True)
        json_path = root_path + '/bing_images/json/' + domain + '.jsonl'
        spider = BingImagesSpider(domain, keywords, count, save_path, json_path)
        spider.run_all()

        remove_broken(save_path)

    print('done all.')
184 | 


--------------------------------------------------------------------------------
/data_engine/image_retrieval_clip.py:
--------------------------------------------------------------------------------
 1 |       
 2 | import os
 3 | import json
 4 | import tqdm
 5 | from clip_retrieval.clip_client import ClipClient, Modality
 6 | import requests
 7 | 
 8 | root_path = 'MMInstruct'  
 9 | domain_list = os.listdir(root_path + '/images')
10 | domain_list = sorted(domain_list, key=str.lower) 
11 | istart_list = [0]*len(domain_list)
12 | os.makedirs(root_path + '/clip_retrieval_images', exist_ok=True)
13 | os.makedirs(root_path + '/clip_retrieval_images/json', exist_ok=True) 
14 | 
15 | for ix, domain in enumerate(domain_list):
16 |     in_images_path = os.path.join(root_path, "source_domain", domain, "images" )
17 |     in_images_list = [i for i in os.listdir(in_images_path) if i.endswith('.jpg') or i.endswith('.png')]
18 |     out_images_dir = os.path.join(root_path, "clip_retrieval_images", domain)
19 |     out_json_path = os.path.join(root_path, "clip_retrieval_images/json", domain + ".jsonl")
20 |     err_json_path = os.path.join(root_path, "clip_retrieval_images/json", domain + "_err.jsonl")
21 | 
22 |     client = ClipClient(url="https://knn.laion.ai/knn-service", indice_name="laion5B-L-14", num_images=200)
23 | 
24 |     if not os.path.exists(out_images_dir):
25 |         os.makedirs(out_images_dir)
26 |         message = f"The directory '{out_images_dir}' has been created."
27 |     else:
28 |         message = f"The directory '{out_images_dir}' already exists."
29 | 
30 |     for i in tqdm.tqdm(range(len(in_images_list))):
31 |         if i < istart_list[ix]:
32 |             continue
33 | 
34 |         aug_item = {'image_path': in_images_list[i]}
35 |         image_path = os.path.join(in_images_path, in_images_list[i])
36 |         aug_item["retrieval"] = []
37 | 
38 |         try:
39 |             results = client.query(image=image_path) 
40 |         except Exception as e: 
41 |             print(repr(e))
42 |             open(err_json_path, 'a', encoding='utf-8').write(json.dumps(aug_item, ensure_ascii=False)+'\n')
43 |             continue
44 | 
45 |         count = 0
46 |         for i, item in enumerate(results):
47 |             try:
48 |                 url = item['url'] 
49 |                 file_path = out_images_dir + '/{}.jpg'.format(item['id']) 
50 |                 response = requests.get(url, timeout=5)
51 |             except:
52 |                 print("Skip fig {}".format(item['id']))
53 |                 continue
54 |             if response.status_code == 200:
55 |                 with open(file_path, 'wb') as file:
56 |                     file.write(response.content)
57 |                 print(f'download {file_path}')
58 |                 aug_item["retrieval"].append({"image_path":file_path, "caption":item['caption'], "similarity":item['similarity']})
59 |                 count += 1
60 |             else:
61 |                 print('HTTP Error:', response.status_code)
62 |         aug_item["count"] = count
63 |         open(out_json_path, 'a', encoding='utf-8').write(json.dumps(aug_item, ensure_ascii=False)+'\n')
64 | 


--------------------------------------------------------------------------------
/figs/data-engine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuecao0119/MMInstruct/eb6aacbcc64b1880092de22142eeec0ad7f2422d/figs/data-engine.png


--------------------------------------------------------------------------------
/figs/example_in_domain.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuecao0119/MMInstruct/eb6aacbcc64b1880092de22142eeec0ad7f2422d/figs/example_in_domain.pdf


--------------------------------------------------------------------------------
/figs/example_in_domain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuecao0119/MMInstruct/eb6aacbcc64b1880092de22142eeec0ad7f2422d/figs/example_in_domain.png


--------------------------------------------------------------------------------
/train_dataset_for_llava.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import copy
  3 | 
  4 | import json
  5 | import logging
  6 | import pathlib
  7 | import torch
  8 | import random
  9 | import transformers
 10 | import tokenizers
 11 | from PIL import Image
 12 | from typing import Dict, Optional, Sequence, List
 13 | from dataclasses import dataclass, field
 14 | from torch.utils.data import Dataset
 15 | local_rank = None
 16 | 
 17 | 
 18 | def rank0_print(*args):
 19 |     if local_rank == 0:
 20 |         print(*args)
 21 | 
 22 | @dataclass
 23 | class DataArguments:
 24 | 	data_path: str = field(default=None,
 25 | 	                       metadata={"help": "Path to the training data."})
 26 | 	lazy_preprocess: bool = False
 27 | 	is_multimodal: bool = False
 28 | 	image_folder: Optional[str] = field(default=None)
 29 | 	image_aspect_ratio: str = 'square'
 30 | 	more_data: Optional[str] = field(default=None) # new add
 31 | 
 32 | 
 33 | class LazySupervisedDataset(Dataset):
 34 | 	"""Dataset for supervised fine-tuning."""
 35 | 
 36 | 	def __init__(self, data_path: str,
 37 | 	             tokenizer: transformers.PreTrainedTokenizer,
 38 | 	             data_args: DataArguments):
 39 | 		super(LazySupervisedDataset, self).__init__()
 40 | 		list_data_dict = json.load(open(data_path, "r"))
 41 | 		rank0_print(f"Total count of list_data_dict load from {data_path}: {len(list_data_dict)}")
 42 | 
 43 |         # new add
 44 | 		if data_args.more_data is not None and data_args.more_data != "":
 45 | 			rank0_print("Append more data.")
 46 | 			more_data_dict = self.load_self_defined_data(data_args.more_data)
 47 | 			list_data_dict += more_data_dict
 48 | 			rank0_print(f"Total count of list_data_dict after append data.: {len(list_data_dict)}")
 49 | 
 50 | 		rank0_print("Formatting inputs...Skip in lazy mode")
 51 | 
 52 | 		self.tokenizer = tokenizer
 53 | 		self.list_data_dict = list_data_dict
 54 | 		self.data_args = data_args
 55 | 
 56 | 	def __len__(self):
 57 | 		return len(self.list_data_dict)
 58 | 
 59 | 	@property
 60 | 	def lengths(self):
 61 | 		length_list = []
 62 | 		for sample in self.list_data_dict:
 63 | 			img_tokens = 128 if 'image' in sample else 0
 64 | 			length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
 65 | 		return length_list
 66 | 
 67 | 	@property
 68 | 	def modality_lengths(self):
 69 | 		length_list = []
 70 | 		for sample in self.list_data_dict:
 71 | 			try:
 72 | 				cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
 73 | 				cur_len = cur_len if 'image' in sample else -cur_len
 74 | 				length_list.append(cur_len)
 75 | 			except Exception as e:
 76 | 				rank0_print(f'modality_lengths line 701 {repr(e)}')
 77 | 				rank0_print(sample)
 78 | 				raise e
 79 | 
 80 | 		return length_list
 81 | 
 82 | 	def __getitem__(self, i) -> Dict[str, torch.Tensor]:
 83 | 		image_folder = None
 84 | 		image_file = None
 85 | 		flag = False
 86 | 		while not flag:
 87 | 			try:
 88 | 				sources = self.list_data_dict[i]
 89 | 				if isinstance(i, int):
 90 | 					sources = [sources]
 91 | 				assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
 92 | 				if 'image' in sources[0]:
 93 | 					image_file = self.list_data_dict[i]['image']
 94 | 					image_folder = self.data_args.image_folder
 95 | 					processor = self.data_args.image_processor
 96 | 					image = Image.open(os.path.join(image_folder, image_file)).convert('RGB')
 97 | 					if self.data_args.image_aspect_ratio == 'pad':
 98 | 						def expand2square(pil_img, background_color):
 99 | 							width, height = pil_img.size
100 | 							if width == height:
101 | 								return pil_img
102 | 							elif width > height:
103 | 								result = Image.new(pil_img.mode, (width, width), background_color)
104 | 								result.paste(pil_img, (0, (width - height) // 2))
105 | 								return result
106 | 							else:
107 | 								result = Image.new(pil_img.mode, (height, height), background_color)
108 | 								result.paste(pil_img, ((height - width) // 2, 0))
109 | 								return result
110 | 
111 | 						image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
112 | 						image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
113 | 					else:
114 | 						image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
115 | 					sources = preprocess_multimodal(
116 | 						copy.deepcopy([e["conversations"] for e in sources]),
117 | 						self.data_args
118 | 					)
119 | 				else:
120 | 					sources = copy.deepcopy([e["conversations"] for e in sources])
121 | 
122 | 				data_dict = preprocess(
123 | 					sources,
124 | 					self.tokenizer,
125 | 					has_image=('image' in self.list_data_dict[i])
126 | 				)
127 | 				if isinstance(i, int):
128 | 					data_dict = dict(
129 | 						input_ids=data_dict["input_ids"][0],
130 | 						labels=data_dict["labels"][0]
131 | 					)
132 | 
133 | 				# image exist in the data
134 | 				if 'image' in self.list_data_dict[i]:
135 | 					data_dict['image'] = image
136 | 				elif self.data_args.is_multimodal:
137 | 					# image does not exist in the data, but the model is multimodal
138 | 					crop_size = self.data_args.image_processor.crop_size
139 | 					data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
140 | 
141 | 				flag = True
142 | 			except Exception as e:
143 | 				rank0_print(f"{repr(e)} image file can't open {image_folder} {image_file}")
144 | 				i = random.randint(0, len(self.list_data_dict) - 1)
145 | 
146 | 		return data_dict
147 | 
148 | 	def get_json_files(self, data_dir):
149 | 		json_files = []
150 | 		# 递归遍历目录，可以读到链接文件
151 | 		for root, dirs, files in os.walk(data_dir, followlinks=True):
152 | 			for file in files:
153 | 				if file.endswith('.json'):
154 | 					json_files.append(os.path.join(root, file))
155 | 		return json_files
156 | 
157 | 	def load_self_defined_data(self, data_dir):
158 | 		more_data_dict = []
159 | 		json_files = None
160 | 		if data_dir.endswith('.json'):
161 | 			json_files = [data_dir]
162 | 		else:
163 | 			json_files = self.get_json_files(data_dir)
164 | 			
165 | 		for more_data_path in json_files:
166 | 			more_data = json.load(open(more_data_path, "r"))
167 | 			rank0_print(f"Count of {more_data_path}: {len(more_data)}")
168 | 			# rank0_print(more_data[0])
169 | 
170 | 			more_data_dict += more_data
171 | 
172 | 		rank0_print(f"Total json file {len(json_files)}")
173 | 		rank0_print(f"Total Count {len(more_data_dict)}")
174 | 		rank0_print(type(more_data_dict))
175 | 		rank0_print(more_data_dict[0])
176 | 		return more_data_dict
177 | 


--------------------------------------------------------------------------------