├── README.md
├── dataset_vis_1.png
├── dataset_vis_2.png
├── generate_description_chatgpt3_5_text.py
└── generate_description_chatgpt4v.py
/README.md:
--------------------------------------------------------------------------------
1 | # ChatEarthNet: A Global-Scale Image-Text Dataset Empowering Vision-Language Geo-Foundation Models
2 | ### Access the dataset
3 | The ChatEarthNet dataset can be downloaded from https://doi.org/10.5281/zenodo.11003436
4 |
5 | ### Introduction
6 | The Python scripts in this repository use the ChatGPT API to generate the captions; see the Usage section below for how they are run.
7 |
8 | [ChatEarthNet](https://arxiv.org/abs/2402.11325) is a new image-text dataset, providing high-quality natural language descriptions for global-scale satellite data. Specifically, we utilize Sentinel-2 data for its global coverage as the foundational image source, employing semantic segmentation labels from the European Space Agency's WorldCover project to enrich the descriptions of land covers. By conducting in-depth semantic analysis, we formulate detailed prompts to elicit rich descriptions from ChatGPT. We then include a manual verification process to enhance the dataset's quality further. Finally, we offer the community ChatEarthNet, a large-scale image-text dataset characterized by global coverage, high quality, wide-ranging diversity, and detailed descriptions. ChatEarthNet consists of 163,488 image-text pairs with captions generated by ChatGPT-3.5 and an additional 10,000 image-text pairs with captions generated by ChatGPT-4V(ision). This dataset has significant potential for both training and evaluating vision-language geo-foundation models for remote sensing.
9 |
10 | 
11 |
12 | 
13 |
14 |
15 | If you find this helpful, please give us a STAR ⭐. Thank you, and have a nice day:)
16 |
17 | ### License
18 | This repository is released under the Apache 2.0 license. The dataset and pretrained model weights are released under the CC-BY-4.0 license.
19 |
20 |
21 | ### Citation
22 | ```
23 | @article{yuan2024chatearthnet,
24 | title={ChatEarthNet: A Global-Scale Image-Text Dataset Empowering Vision-Language Geo-Foundation Models},
25 | author={Yuan, Zhenghang and Xiong, Zhitong and Mou, Lichao and Zhu, Xiao Xiang},
26 | journal={arXiv preprint arXiv:2402.11325},
27 | year={2024}
28 | }
29 | ```
30 |
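31 | ### Usage
32 | The caption-generation scripts call the ChatGPT API directly. Below is a minimal sketch of how they are intended to be run; it assumes the placeholder `api_key = "openai api key"` inside each script has been replaced with a valid OpenAI API key, and that the text files listing the land cover label maps (`chatgpt_3_5_label_file_path.txt` and `chatgpt_4_v.txt`, one path per line) have been prepared.
33 | ```
34 | # captions generated with ChatGPT-3.5 (reads chatgpt_3_5_label_file_path.txt)
35 | python generate_description_chatgpt3_5_text.py
36 | # captions generated with ChatGPT-4V (reads chatgpt_4_v.txt)
37 | python generate_description_chatgpt4v.py
38 | ```
39 | For every listed scene, the scripts load the four color-coded label patches (`*_patch00.png` to `*_patch11.png`), convert them back to segmentation maps, build the prompts, and save the raw ChatGPT responses as JSON files next to the patch images.
40 |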
--------------------------------------------------------------------------------
/dataset_vis_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhu-xlab/ChatEarthNet/1a3189b0c2416e539a2f327c16e4e545be44451a/dataset_vis_1.png
--------------------------------------------------------------------------------
/dataset_vis_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhu-xlab/ChatEarthNet/1a3189b0c2416e539a2f327c16e4e545be44451a/dataset_vis_2.png
--------------------------------------------------------------------------------
/generate_description_chatgpt3_5_text.py:
--------------------------------------------------------------------------------
1 | # analyze the image and generate the corresponding prompts
2 |
3 | import numpy as np
4 | from PIL import Image
5 | import base64
6 | import requests
7 | import io
8 | import json
9 | import time
10 | import random
11 | import functools
12 | import operator
13 | import copy
14 |
15 | from requests.exceptions import RequestException, Timeout
16 | Threshold = 20  # minimum pixel count for a land cover type to be included in the prompts
17 |
18 |
19 | land_cover_label = {'land_cover': {
20 | 'type': 'segment',
21 | 'BackgroundInvalid': True,
22 | 'categories': [
23 | 'background',
24 | 'water', 'developed area', 'tree', 'shrub', 'grass',
25 | 'crop', 'bare land', 'snow', 'wetland', 'mangroves', 'moss',
26 | ],
27 | 'colors': [
28 | [0, 0, 0], # unknown
29 | [0, 0, 255], # (blue) water
30 | [255, 0, 0], # (red) developed
31 | [0, 192, 0], # (dark green) tree
32 | [200, 170, 120], # (brown) shrub
33 | [0, 255, 0], # (green) grass
34 | [255, 255, 0], # (yellow) crop
35 | [128, 128, 128], # (grey) bare
36 | [255, 255, 255], # (white) snow
37 | [0, 255, 255], # (cyan) wetland
38 | [255, 0, 255], # (pink) mangroves
39 | [128, 0, 128], # (purple) moss
40 | ],
41 | }}
42 |
43 | labels = land_cover_label['land_cover']['categories']
44 | colors = land_cover_label['land_cover']['colors']
45 |
46 | def convert_color_map_to_segmentation(color_map, label_colors):
47 | """
48 |     Convert a color map back to a segmentation map with pixel labels from 0 to 11.
49 |
50 | Args:
51 | color_map (numpy.ndarray): A 3D array where each element represents the color of a pixel.
52 | label_colors (list): A list of color tuples corresponding to each label.
53 |
54 | Returns:
55 | numpy.ndarray: A 2D array representing the segmentation map.
56 | """
57 | # Initialize an empty segmentation map with the same height and width as the color map
58 | segmentation_map = np.zeros((color_map.shape[0], color_map.shape[1]), dtype=np.uint8)
59 |
60 | # Map each color in the color map back to its corresponding label
61 | for label, color in enumerate(label_colors):
62 | # Create a mask where the color matches the current label color
63 | mask = np.all(color_map == color, axis=-1)
64 |
65 | # Assign the label to the matching locations in the segmentation map
66 | segmentation_map[mask] = label
67 |
68 | return segmentation_map
69 |
70 |
71 | import numpy as np
72 | from PIL import Image
73 | from collections import Counter
74 |
75 | def divide_into_patches(image):
76 | rows, cols = image.shape
77 | return {
78 | "top left": image[:rows//2, :cols//2],
79 | "top right": image[:rows//2, cols//2:],
80 | "bottom left": image[rows//2:, :cols//2],
81 | "bottom right": image[rows//2:, cols//2:],
82 | "middle": image[rows//4:3*rows//4, cols//4:3*cols//4]
83 | }
84 |
85 |
86 | def count_pixel_proportions1(num, land_type, patch):
87 | count = np.count_nonzero(patch == land_type)
88 | proportions = count / (256.*256.)
89 | return proportions
90 |
91 | def overall_distribution(image):
92 | all_land_types = Counter(list(image.reshape(-1)))
93 | analyze_prompt = ""
94 | all_land_types = dict(sorted(all_land_types.items(), key=lambda item: item[1], reverse=True))
95 |
96 | for land_type, num in all_land_types.items():
97 | #print(f"{labels[land_type]}:{all_land_types[land_type]}")
98 |         if land_type == 0 or all_land_types[land_type] < Threshold:
99 |             continue
100 |         analyze_prompt += f"{labels[land_type]}, "
101 |     analyze_prompt = analyze_prompt.rstrip(', ') + '.'
102 |     return analyze_prompt
103 |
104 |
105 | def find_most_frequent_types_revised(patch):
106 |     """
107 |     Find the most frequent land cover types in a patch and the proportion
108 |     of the patch covered by each type.
109 |     """
110 |     # Count the pixels of each land cover type in the patch
111 |     values, counts = np.unique(patch, return_counts=True)
112 |     # Proportion of the patch covered by each land cover type
113 |     count_dict = {value: count / float(patch.size) for value, count in zip(values, counts)}
114 |
115 |     ncounts = []
116 |     nvalues = []
117 |     for i in range(len(values)):
118 |         # Keep only land cover types with enough pixels
119 |         if counts[i] >= Threshold:
120 | ncounts.append(counts[i])
121 | nvalues.append(values[i])
122 |
123 | # Combine values and counts and sort by frequency in descending order
124 | frequencies = np.column_stack((nvalues, ncounts))
125 | frequencies = frequencies[frequencies[:, 1].argsort()[::-1]] # Sort by frequency
126 | # Return the three most frequent values or all if less than three
127 | out_idx = frequencies[:3, 0] if frequencies.shape[0] >= 3 else frequencies[:, 0]
128 | return out_idx, count_dict
129 |
130 | def convert_percent_range(percent):
131 | candidates = ['fraction', 'part', 'portion', 'amount', 'quantity']
132 | if percent>=0 and percent<=9:
133 | return f'extra small {random.choice(candidates)}'
134 | elif percent>=10 and percent<=19:
135 | return f'small {random.choice(candidates)}'
136 | elif percent>=20 and percent<=49:
137 | return f'medium {random.choice(candidates)}'
138 | elif percent>=50 and percent<=79:
139 | return f'large {random.choice(candidates)}'
140 | elif percent>=80 and percent<=100:
141 | return f'extra large {random.choice(candidates)}'
142 | else:
143 | raise Exception('Wrong portion')
144 |
145 |
146 | def post_with_retry(url, headers=None, json=None, max_retries=3, delay=2):
147 | """
148 | Makes a POST request to a specified URL with a retry mechanism.
149 |
150 | :param url: URL to which the POST request is made
151 |     :param json: JSON-serializable object to send in the request body
152 | :param headers: Dictionary of HTTP Headers to send with the request
153 | :param max_retries: Maximum number of retries
154 | :param delay: Delay between retries in seconds
155 | :return: Response object
156 | """
157 | for attempt in range(max_retries):
158 | try:
159 | response = requests.post(url, headers=headers, json=json, timeout=60)
160 | response.raise_for_status() # Raises an HTTPError if the HTTP request returned an unsuccessful status code
161 | return response
162 | except Timeout:
163 | time.sleep(delay)
164 | except RequestException as e:
165 | print(f"Request failed: {e}. Attempt {attempt + 1} of {max_retries}. Retrying in {delay} seconds...")
166 | time.sleep(delay)
167 |
168 | raise Exception(f"Failed to POST to {url} after {max_retries} retries")
169 |
170 |
171 | def analyze_segmentation_map(image):
172 | overalld = overall_distribution(image)
173 | # Divide the segmentation map into patches
174 | patches = divide_into_patches(image)
175 | # Find three largest infrequent pixel types for each patch
176 | #frequent_types = {patch_name: find_most_frequent_types_revised(patch) for patch_name, patch in patches.items()}
177 | frequent_types = {}
178 | frequent_percents = {}
179 | percents = {}
180 | for patch_name, patch in patches.items():
181 | out_idx, cdict = find_most_frequent_types_revised(patch)
182 | out_percent = [convert_percent_range(int(round(cdict[it],2)*100)) for it in out_idx]
183 |
184 | frequent_types[patch_name] = out_idx
185 | frequent_percents[patch_name] = out_percent
186 |
187 | context_str = overalld + "\n"
188 | for patch_name, lst in frequent_types.items():
189 | frequent_percent = frequent_percents[patch_name]
190 | print(frequent_percent)
191 | context_str += f"The {patch_name} mainly contains the following land cover types, in descending order of content: "
192 | if len(lst)==3:
193 | context_str += f"{labels[lst[0]]} ({frequent_percent[0]}), {labels[lst[1]]} ({frequent_percent[1]}), and {labels[lst[2]]} ({frequent_percent[2]}).\n"
194 | if len(lst)==2:
195 | context_str += f"{labels[lst[0]]} ({frequent_percent[0]}) and {labels[lst[1]]} ({frequent_percent[1]}).\n"
196 | if len(lst)==1:
197 | context_str += f"{labels[lst[0]]} ({frequent_percent[0]}).\n"
198 |
199 | return context_str
200 |
201 |
202 | def generate_captions(image_paths, segmentation_maps):
203 | user_prompts = []
204 | for image in segmentation_maps:
205 | analyze_prompt = analyze_segmentation_map(image)
206 | user_prompt = "Analyze the provided image as an AI visual assistant. The following contexts are provided.\n"
207 | user_prompt += "The overall land cover distributions from most to least are: "
208 | user_prompt += analyze_prompt
209 | user_prompt += '\n'
210 | user_prompts.append(user_prompt)
211 |
212 |
213 | system_prompt = "You are an AI visual assistant who can help describe images based on the given contexts. Please write the description in a paragraph, and avoid saying other things. The following constraints should be obeyed:\n\
214 | 1) Describe the image in the order of the spatial distributions presented in the given contexts. Link descriptions of different parts to make the overall image description more fluent.\
215 | 2) Describe the dominant land cover type in the image and its spatial locations.\
216 | 3) Describe the land cover types in each part of the image in descending order of their coverage areas.\
217 | 4) Diversify descriptions related to portions in each paragraph. \
218 | 5) Summarize the main theme of the image in the final sentence.\
219 | 6) Describe it objectively; do not use words: 'possibly', 'likely', 'perhaps', 'context', 'segmentation', 'appear', 'change', 'transition', 'dynamic', or any words with similar connotations."
220 |
221 |
222 | api_key = "openai api key"
223 |
224 | url = "https://api.openai.com/v1/chat/completions"
225 |
226 | headers = {
227 | "Authorization": f"Bearer {api_key}",
228 | "Content-Type": "application/json"
229 | }
230 | model = "gpt-3.5-turbo-1106"
231 |
232 | data1 = {
233 | "model": model,
234 | "max_tokens": 300,
235 | "messages": [
236 | {
237 | 'role': 'system',
238 | 'content': system_prompt,
239 | },
240 | {
241 | 'role': 'user',
242 | 'content': user_prompts[0]
243 | },
244 | ]
245 | }
246 | data2 = {
247 | "model": model,
248 | "max_tokens": 300,
249 | "messages": [
250 | {
251 | 'role': 'system',
252 | 'content': system_prompt,
253 | },
254 |
255 | {
256 | 'role': 'user',
257 | 'content': user_prompts[1]
258 | },
259 | ]
260 | }
261 | data3 = {
262 | "model": model,
263 | "max_tokens": 300,
264 | "messages": [
265 | {
266 | 'role': 'system',
267 | 'content': system_prompt,
268 | },
269 |
270 | {
271 | 'role': 'user',
272 | 'content': user_prompts[2]
273 | },
274 | ]
275 | }
276 | data4 = {
277 | "model": model,
278 | "max_tokens": 300,
279 | "messages": [
280 | {
281 | 'role': 'system',
282 | 'content': system_prompt,
283 | },
284 |
285 | {
286 | 'role': 'user',
287 | 'content': user_prompts[3]
288 | },
289 | ]
290 | }
291 |
292 |
293 | response = post_with_retry(url, headers=headers, json=data1)
294 | time.sleep(0.5)
295 | response2 = post_with_retry(url, headers=headers, json=data2)
296 | time.sleep(0.5)
297 | response3 = post_with_retry(url, headers=headers, json=data3)
298 | time.sleep(0.5)
299 | response4 = post_with_retry(url, headers=headers, json=data4)
300 | time.sleep(0.5)
301 | responses = [response,response2,response3,response4]
302 | for i,response in enumerate(responses):
303 | print('-----------------------------------------------------------')
304 | print(response.json())
305 | with open(image_paths[i].replace('.png','_chatgpt_3_5.json'),'w') as jsonf:
306 | json.dump(response.json(),jsonf)
307 | print('-----------------------------------------------------------')
308 |
309 |
310 | if __name__=='__main__':
311 | with open('chatgpt_3_5_label_file_path.txt','r') as lf:
312 | all_labels = lf.readlines()
313 | selects = all_labels
314 | select_3000 = selects
315 |
316 |     for lfile in select_3000:
317 | # Example usage
318 | lfile = lfile.strip()
319 | color_img_path00 = f"{lfile[:-4]}_patch00.png"
320 | color_img_path01 = f"{lfile[:-4]}_patch01.png"
321 | color_img_path10 = f"{lfile[:-4]}_patch10.png"
322 | color_img_path11 = f"{lfile[:-4]}_patch11.png"
323 |
324 | im00 = np.array(Image.open(color_img_path00))
325 | im01 = np.array(Image.open(color_img_path01))
326 | im10 = np.array(Image.open(color_img_path10))
327 | im11 = np.array(Image.open(color_img_path11))
328 | seg_map_00 = convert_color_map_to_segmentation(im00, colors)
329 | seg_map_01 = convert_color_map_to_segmentation(im01, colors)
330 | seg_map_10 = convert_color_map_to_segmentation(im10, colors)
331 | seg_map_11 = convert_color_map_to_segmentation(im11, colors)
332 | segmentation_maps = [seg_map_00,seg_map_01,seg_map_10,seg_map_11]
333 | image_paths = [color_img_path00,color_img_path01,color_img_path10,color_img_path11]
334 | generate_captions(image_paths, segmentation_maps)
335 |
336 |
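337 | # Illustrative example of one per-patch line in the context string built by
338 | # analyze_segmentation_map() and sent to ChatGPT (land cover types and portions are hypothetical):
339 | #   The top left mainly contains the following land cover types, in descending order of content:
340 | #   tree (large portion), crop (medium amount), and water (small fraction).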
--------------------------------------------------------------------------------
/generate_description_chatgpt4v.py:
--------------------------------------------------------------------------------
1 | # analyze the image and generate the corresponding prompts
2 |
3 | import numpy as np
4 | from PIL import Image
5 | import base64
6 | import requests
7 | import io
8 | import json
9 | import time
10 |
11 |
12 |
13 | land_cover_label = {'land_cover': {
14 | 'type': 'segment',
15 | 'BackgroundInvalid': True,
16 | 'categories': [
17 | 'background',
18 | 'water', 'developed', 'tree', 'shrub', 'grass',
19 | 'crop', 'bare', 'snow', 'wetland', 'mangroves', 'moss',
20 | ],
21 | 'colors': [
22 | [0, 0, 0], # unknown
23 | [0, 0, 255], # (blue) water
24 | [255, 0, 0], # (red) developed
25 | [0, 192, 0], # (dark green) tree
26 | [200, 170, 120], # (brown) shrub
27 | [0, 255, 0], # (green) grass
28 | [255, 255, 0], # (yellow) crop
29 | [128, 128, 128], # (grey) bare
30 | [255, 255, 255], # (white) snow
31 | [0, 255, 255], # (cyan) wetland
32 | [255, 0, 255], # (pink) mangroves
33 | [128, 0, 128], # (purple) moss
34 | ],
35 | }}
36 |
37 | labels = land_cover_label['land_cover']['categories']
38 | colors = land_cover_label['land_cover']['colors']
39 |
40 |
41 |
42 | def analyze_segmentation_map(image):
43 | h, w = image.shape
44 | patches = {
45 | "top_left": image[:h//2, :w//2],
46 | "top_right": image[:h//2, w//2:],
47 | "bottom_left": image[h//2:, :w//2],
48 | "bottom_right": image[h//2:, w//2:],
49 | "middle": image[h//4:3*h//4, w//4:3*w//4]
50 | }
51 |
52 | analyze_prompt = ""
53 |
54 | for name, patch in patches.items():
55 | unique, counts = np.unique(patch, return_counts=True)
56 | proportions = counts / counts.sum()
57 | sorted_indices = np.argsort(-proportions) # Sorting in descending order
58 |
59 | statistic_str = f"{name} distribution:"
60 | for idx in sorted_indices:
61 | if unique[idx] == 0:
62 | continue
63 | statistic_str += f" {labels[unique[idx]]}: {proportions[idx]:.2f};"
64 | analyze_prompt += statistic_str
65 | analyze_prompt += '\n'
66 | return analyze_prompt
67 |
68 |
69 |
70 | import numpy as np
71 | from PIL import Image
72 | from collections import Counter
73 |
74 |
75 | def split_into_patches(image):
76 | h, w = image.shape[:2]
77 | patches = {
78 | "top left part": image[:h//2, :w//2],
79 | "top right part": image[:h//2, w//2:],
80 | "bottom left part": image[h//2:, :w//2],
81 | "bottom right part": image[h//2:, w//2:],
82 | "middle part": image[h//4:h*3//4, w//4:w*3//4]
83 | }
84 | return patches
85 |
86 | def count_pixel_proportions(num, land_type, patch):
87 | count = np.count_nonzero(patch == land_type)
88 | proportions = count / (128.*128.)
89 | return proportions
90 |
91 | def analyze_image(image):
92 | patches = split_into_patches(image)
93 | all_land_types = Counter(list(image.reshape(-1)))
94 | analyze_prompt = ""
95 |
96 | for land_type, num in all_land_types.items():
97 | if land_type == 0:
98 | continue
99 | analyze_prompt += f"{labels[land_type]}: "
100 | for patch_name, patch in patches.items():
101 | proportion = count_pixel_proportions(num, land_type, patch)
102 | analyze_prompt += f" {patch_name}: {proportion:.2%} "
103 | analyze_prompt += '\n'
104 | return analyze_prompt
105 |
106 |
107 | import numpy as np
108 |
109 | def convert_color_map_to_segmentation(color_map, label_colors):
110 | """
111 |     Convert a color map back to a segmentation map with pixel labels from 0 to 11.
112 |
113 | Args:
114 | color_map (numpy.ndarray): A 3D array where each element represents the color of a pixel.
115 | label_colors (list): A list of color tuples corresponding to each label.
116 |
117 | Returns:
118 | numpy.ndarray: A 2D array representing the segmentation map.
119 | """
120 | # Initialize an empty segmentation map with the same height and width as the color map
121 | segmentation_map = np.zeros((color_map.shape[0], color_map.shape[1]), dtype=np.uint8)
122 |
123 | # Map each color in the color map back to its corresponding label
124 | for label, color in enumerate(label_colors):
125 | # Create a mask where the color matches the current label color
126 | mask = np.all(color_map == color, axis=-1)
127 |
128 | # Assign the label to the matching locations in the segmentation map
129 | segmentation_map[mask] = label
130 |
131 | return segmentation_map
132 |
133 | def encode_image(image_path):
134 | with open(image_path, "rb") as image_file:
135 | return base64.b64encode(image_file.read()).decode('utf-8')
136 |
137 |
138 | def downsample_image(image, skip_index):
139 | """
140 | Downsample a PIL image by skipping pixels.
141 |
142 | Args:
143 | image (PIL.Image.Image): The source image.
144 | skip_index (int): The number of pixels to skip.
145 |
146 | Returns:
147 | PIL.Image.Image: The downsampled image.
148 | """
149 | # Ensure the input is a PIL Image
150 | if not isinstance(image, Image.Image):
151 | raise ValueError("image must be a PIL.Image.Image object")
152 |
153 | # Get the size of the original image
154 | width, height = image.size
155 |
156 | # Calculate the size of the downsampled image
157 | new_width = (width + skip_index - 1) // skip_index
158 | new_height = (height + skip_index - 1) // skip_index
159 |
160 | # Create a new image of the desired size
161 | downsampled_image = Image.new("RGB", (new_width, new_height))
162 |
163 | # Copy pixels from the original image to the new image, skipping as appropriate
164 | for y in range(0, height, skip_index):
165 | for x in range(0, width, skip_index):
166 | downsampled_image.putpixel((x // skip_index, y // skip_index), image.getpixel((x, y)))
167 |
168 | return downsampled_image
169 |
170 |
171 | def resize_and_encode_image(image_path):
172 | """
173 |     Downsample an image by a factor of 2 (by skipping pixels) and then encode it to base64.
174 |
175 | Args:
176 | image_path (str): The path to the image file.
177 |
178 | Returns:
179 | str: A base64 encoded string of the resized image.
180 | """
181 | # Open the image
182 | with Image.open(image_path) as img:
183 | # Resize the image
184 | #resized_img = img.resize((128, 128), Image.NEAREST)
185 | resized_img = downsample_image(img, 2)
186 |
187 | # Save the resized image to a bytes buffer
188 | buffer = io.BytesIO()
189 | resized_img.save(buffer, format=img.format)
190 |
191 | # Get the byte data from the buffer
192 | byte_data = buffer.getvalue()
193 |
194 | # Encode the byte data to base64
195 | base64_encoded = base64.b64encode(byte_data).decode('utf-8')
196 |
197 | return base64_encoded
198 |
199 |
200 | def generate_captions(image_paths, segmentation_maps):
201 | analyze_prompt1 = analyze_segmentation_map(segmentation_maps[0])
202 | analyze_prompt2 = analyze_segmentation_map(segmentation_maps[1])
203 | analyze_prompt3 = analyze_segmentation_map(segmentation_maps[2])
204 | analyze_prompt4 = analyze_segmentation_map(segmentation_maps[3])
205 |
206 | # Example usage
207 | analyze_prompt21 = analyze_image(segmentation_maps[0])
208 | analyze_prompt22 = analyze_image(segmentation_maps[1])
209 | analyze_prompt23 = analyze_image(segmentation_maps[2])
210 | analyze_prompt24 = analyze_image(segmentation_maps[3])
211 |
212 | prompt = "You are an AI visual assistant that can analyze the given image. In the image, different colors represent different land cover types.\
213 | The color for the land cover dictionary is: '[0, 0, 255] (blue): water; [255, 0, 0](red): developed area; \
214 | [0, 192, 0] (dark green): tree; [200, 170, 120] (brown): shrub; [0, 255, 0] (green): grass; [255, 255, 0] (yellow): crop;\
215 | [128, 128, 128] (grey): bare; [255, 255, 255] (white): snow; [0, 255, 255] (cyan): wetland; [255, 0, 255] (pink): mangroves; [128, 0, 128] (purple): moss.' You will be provided four independent images at once."
216 |
217 | prompt += "For the first image, the distribution of each land cover type is:"
218 | prompt += analyze_prompt21
219 | prompt += "For the first image, the spatial distribution of the image is:"
220 | prompt += analyze_prompt1
221 | prompt += "For the second image, the distribution of each land cover type is:"
222 | prompt += analyze_prompt22
223 | prompt += "For the second image, the spatial distribution of the image is:"
224 | prompt += analyze_prompt2
225 | prompt += "For the third image, the distribution of each land cover type is:"
226 | prompt += analyze_prompt23
227 | prompt += "For the third image, the spatial distribution of the image is:"
228 | prompt += analyze_prompt3
229 | prompt += "For the fourth image, the distribution of each land cover type is:"
230 | prompt += analyze_prompt24
231 | prompt += "For the fourth image, the spatial distribution of the image is:"
232 | prompt += analyze_prompt4
233 |
234 |     prompt += "You are given four independent images, describe in long sentences for each image separately using four paragraphs and avoid saying other things.\
235 | The following constraints should be obeyed: \
236 | 1) Do not use color-related words; treat the color as the land cover type directly.\
237 | 2) Generate the four descriptions separately; do not add connection between them. \
238 | 3) When describing water, developed, and crop areas, incorporate shape descriptors.\
239 | 4) Double-check all the presented land cover types based on the distribution of each land cover type. If some land covers are not presented, do not mention them.\
240 | 5) Describe it objectively; do not use words: 'possibly','likely','perhaps','color dictionary','appear','change','transition', 'dynamic', or any words with similar connotations.\
241 | 6) Double-check the shape and location of the developed area, water course, grass, tree, shrub, wetland, and crop areas based on the given image if they are present.\
242 | 7) Consider the spatial statistics as a unified image without breaking them down into individual spatial distributions and land cover proportions when describing the overall scene.\
243 | 8) Describe each land cover separately for each given image, and then describe the main theme of each given image."
244 |
245 | api_key = "openai api key"
246 |
247 | #base64_image = encode_image(image_path)
248 | base64_image1 = resize_and_encode_image(image_paths[0])
249 | base64_image2 = resize_and_encode_image(image_paths[1])
250 | base64_image3 = resize_and_encode_image(image_paths[2])
251 | base64_image4 = resize_and_encode_image(image_paths[3])
252 |
253 | headers = {
254 | "Content-Type": "application/json",
255 | "Authorization": f"Bearer {api_key}"
256 | }
257 |
258 | payload = {
259 | "model": "gpt-4-vision-preview",
260 | "messages": [
261 | {
262 | "role": "user",
263 | "content": [
264 | {
265 | "type": "text",
266 | "text": prompt,
267 | },
268 | {
269 | "type": "image_url",
270 | "image_url": {
271 | "url": f"data:image/jpeg;base64,{base64_image1}",
272 | },
273 | },
274 | {
275 | "type": "image_url",
276 | "image_url": {
277 | "url": f"data:image/jpeg;base64,{base64_image2}",
278 | },
279 | },
280 | {
281 | "type": "image_url",
282 | "image_url": {
283 | "url": f"data:image/jpeg;base64,{base64_image3}",
284 | },
285 | },
286 | {
287 | "type": "image_url",
288 | "image_url": {
289 | "url": f"data:image/jpeg;base64,{base64_image4}",
290 | },
291 | },
292 | ]
293 | }
294 | ],
295 | "max_tokens": 1200
296 | }
297 |
298 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
299 | with open(image_paths[0].replace('png','json'),'w') as jsonf:
300 | json.dump(response.json(),jsonf)
301 |
302 |
303 | if __name__=='__main__':
304 | with open('chatgpt_4_v.txt','r') as lf:
305 | all_labels = lf.readlines()
306 | for lfile in all_labels:
307 | lfile = lfile.strip()
308 | color_img_path00 = f"{lfile[:-4]}_patch00.png"
309 | color_img_path01 = f"{lfile[:-4]}_patch01.png"
310 | color_img_path10 = f"{lfile[:-4]}_patch10.png"
311 | color_img_path11 = f"{lfile[:-4]}_patch11.png"
312 |
313 | im00 = np.array(Image.open(color_img_path00))
314 | im01 = np.array(Image.open(color_img_path01))
315 | im10 = np.array(Image.open(color_img_path10))
316 | im11 = np.array(Image.open(color_img_path11))
317 | seg_map_00 = convert_color_map_to_segmentation(im00, colors)
318 | seg_map_01 = convert_color_map_to_segmentation(im01, colors)
319 | seg_map_10 = convert_color_map_to_segmentation(im10, colors)
320 | seg_map_11 = convert_color_map_to_segmentation(im11, colors)
321 | segmentation_maps = [seg_map_00,seg_map_01,seg_map_10,seg_map_11]
322 | image_paths = [color_img_path00,color_img_path01,color_img_path10,color_img_path11]
323 | generate_captions(image_paths, segmentation_maps)
324 | time.sleep(20)
325 |
326 |
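327 | # Illustrative shape of the statistics strings included in the GPT-4V prompt
328 | # (land cover types and numbers are hypothetical):
329 | #   analyze_segmentation_map -> "top_left distribution: tree: 0.52; crop: 0.31; water: 0.10;"
330 | #   analyze_image            -> "tree:  top left part: 52.00%  top right part: 40.50% ..."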
--------------------------------------------------------------------------------