├── README.md ├── dataset_vis_1.png ├── dataset_vis_2.png ├── generate_description_chatgpt3_5_text.py └── generate_description_chatgpt4v.py /README.md: -------------------------------------------------------------------------------- 1 | # ChatEarthNet: A Global-Scale Image-Text Dataset Empowering Vision-Language Geo-Foundation Models 2 | ### Access the dataset 3 | The ChatEarthNet dataset can be downloaded from Zenodo: https://doi.org/10.5281/zenodo.11003436 4 | 5 | ### Introduction 6 | The Python code in this repository uses the ChatGPT API to generate the captions (see the usage sketch further below). 7 | 8 | [ChatEarthNet](https://arxiv.org/abs/2402.11325) is a new image-text dataset providing high-quality natural language descriptions for global-scale satellite data. Specifically, we utilize Sentinel-2 data as the foundational image source for its global coverage, employing semantic segmentation labels from the European Space Agency's WorldCover project to enrich the descriptions of land covers. By conducting in-depth semantic analysis, we formulate detailed prompts to elicit rich descriptions from ChatGPT. We then include a manual verification process to enhance the dataset's quality further. Finally, we offer the community ChatEarthNet, a large-scale image-text dataset characterized by global coverage, high quality, wide-ranging diversity, and detailed descriptions. ChatEarthNet consists of 163,488 image-text pairs with captions generated by ChatGPT-3.5 and an additional 10,000 image-text pairs with captions generated by ChatGPT-4V(ision). This dataset has significant potential for both training and evaluating vision-language geo-foundation models for remote sensing. 9 | 10 | ![Example Image](https://github.com/zhu-xlab/ChatEarthNet/blob/main/dataset_vis_1.png) 11 | 12 | ![Example Image](https://github.com/zhu-xlab/ChatEarthNet/blob/main/dataset_vis_2.png) 13 | 14 | 15 | If you find this helpful, please give us a STAR ⭐. Thank you, and have a nice day :) 16 | 17 | ### License 18 | This repository is released under the Apache 2.0 license. The dataset and pretrained model weights are released under the CC-BY-4.0 license.
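### Generating captions (usage sketch)
A minimal sketch of how the two caption-generation scripts are driven, assuming a hypothetical data layout (the `/data/tile_0001.png` path below is a placeholder): set your OpenAI API key in the `api_key` variable of the script you want to run, and list the label color maps (one path per line) in the corresponding text file. Each listed path is used as a stem; the scripts strip the `.png` suffix and open the four patch files `*_patch00.png`, `*_patch01.png`, `*_patch10.png`, and `*_patch11.png`.
```
echo "/data/tile_0001.png" > chatgpt_3_5_label_file_path.txt
python generate_description_chatgpt3_5_text.py   # writes one *_chatgpt_3_5.json per patch

echo "/data/tile_0001.png" > chatgpt_4_v.txt
python generate_description_chatgpt4v.py         # writes one *.json per group of four patches
```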
19 | 20 | 21 | ### Citation 22 | ``` 23 | @article{yuan2024chatearthnet, 24 | title={ChatEarthNet: A Global-Scale Image-Text Dataset Empowering Vision-Language Geo-Foundation Models}, 25 | author={Yuan, Zhenghang and Xiong, Zhitong and Mou, Lichao and Zhu, Xiao Xiang}, 26 | journal={arXiv preprint arXiv:2402.11325}, 27 | year={2024} 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /dataset_vis_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhu-xlab/ChatEarthNet/1a3189b0c2416e539a2f327c16e4e545be44451a/dataset_vis_1.png -------------------------------------------------------------------------------- /dataset_vis_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhu-xlab/ChatEarthNet/1a3189b0c2416e539a2f327c16e4e545be44451a/dataset_vis_2.png -------------------------------------------------------------------------------- /generate_description_chatgpt3_5_text.py: -------------------------------------------------------------------------------- 1 | # analyze the image and generate the corresponding prompts 2 | 3 | import numpy as np 4 | from PIL import Image 5 | import base64 6 | import requests 7 | import io 8 | import json 9 | import time 10 | import random 11 | import functools 12 | import operator 13 | import copy 14 | 15 | from requests.exceptions import RequestException, Timeout 16 | Threshold = 20  # minimum pixel count for a land cover type to be included in the prompt 17 | 18 | 19 | land_cover_label = {'land_cover': { 20 | 'type': 'segment', 21 | 'BackgroundInvalid': True, 22 | 'categories': [ 23 | 'background', 24 | 'water', 'developed area', 'tree', 'shrub', 'grass', 25 | 'crop', 'bare land', 'snow', 'wetland', 'mangroves', 'moss', 26 | ], 27 | 'colors': [ 28 | [0, 0, 0], # unknown 29 | [0, 0, 255], # (blue) water 30 | [255, 0, 0], # (red) developed 31 | [0, 192, 0], # (dark green) tree 32 | [200, 170, 120], # (brown) shrub 33 | [0, 255, 0], # (green) grass 34 | [255, 255, 0], # (yellow) crop 35 | [128, 128, 128], # (grey) bare 36 | [255, 255, 255], # (white) snow 37 | [0, 255, 255], # (cyan) wetland 38 | [255, 0, 255], # (pink) mangroves 39 | [128, 0, 128], # (purple) moss 40 | ], 41 | }} 42 | 43 | labels = land_cover_label['land_cover']['categories'] 44 | colors = land_cover_label['land_cover']['colors'] 45 | 46 | def convert_color_map_to_segmentation(color_map, label_colors): 47 | """ 48 | Convert a color map back to a segmentation map with pixel labels from 0 to 11. 49 | 50 | Args: 51 | color_map (numpy.ndarray): A 3D array where each element represents the color of a pixel. 52 | label_colors (list): A list of color tuples corresponding to each label. 53 | 54 | Returns: 55 | numpy.ndarray: A 2D array representing the segmentation map.
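Pixels whose color does not match any entry in label_colors keep the default label 0 (background).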
56 | """ 57 | # Initialize an empty segmentation map with the same height and width as the color map 58 | segmentation_map = np.zeros((color_map.shape[0], color_map.shape[1]), dtype=np.uint8) 59 | 60 | # Map each color in the color map back to its corresponding label 61 | for label, color in enumerate(label_colors): 62 | # Create a mask where the color matches the current label color 63 | mask = np.all(color_map == color, axis=-1) 64 | 65 | # Assign the label to the matching locations in the segmentation map 66 | segmentation_map[mask] = label 67 | 68 | return segmentation_map 69 | 70 | 71 | import numpy as np 72 | from PIL import Image 73 | from collections import Counter 74 | 75 | def divide_into_patches(image): 76 | rows, cols = image.shape 77 | return { 78 | "top left": image[:rows//2, :cols//2], 79 | "top right": image[:rows//2, cols//2:], 80 | "bottom left": image[rows//2:, :cols//2], 81 | "bottom right": image[rows//2:, cols//2:], 82 | "middle": image[rows//4:3*rows//4, cols//4:3*cols//4] 83 | } 84 | 85 | 86 | def count_pixel_proportions1(num, land_type, patch): 87 | count = np.count_nonzero(patch == land_type) 88 | proportions = count / (256.*256.) 89 | return proportions 90 | 91 | def overall_distribution(image): 92 | all_land_types = Counter(list(image.reshape(-1))) 93 | analyze_prompt = "" 94 | all_land_types = dict(sorted(all_land_types.items(), key=lambda item: item[1], reverse=True)) 95 | 96 | for land_type, num in all_land_types.items(): 97 | #print(f"{labels[land_type]}:{all_land_types[land_type]}") 98 | if land_type == 0 or all_land_types[land_type]=Threshold: 120 | ncounts.append(counts[i]) 121 | nvalues.append(values[i]) 122 | 123 | # Combine values and counts and sort by frequency in descending order 124 | frequencies = np.column_stack((nvalues, ncounts)) 125 | frequencies = frequencies[frequencies[:, 1].argsort()[::-1]] # Sort by frequency 126 | # Return the three most frequent values or all if less than three 127 | out_idx = frequencies[:3, 0] if frequencies.shape[0] >= 3 else frequencies[:, 0] 128 | return out_idx, count_dict 129 | 130 | def convert_percent_range(percent): 131 | candidates = ['fraction', 'part', 'portion', 'amount', 'quantity'] 132 | if percent>=0 and percent<=9: 133 | return f'extra small {random.choice(candidates)}' 134 | elif percent>=10 and percent<=19: 135 | return f'small {random.choice(candidates)}' 136 | elif percent>=20 and percent<=49: 137 | return f'medium {random.choice(candidates)}' 138 | elif percent>=50 and percent<=79: 139 | return f'large {random.choice(candidates)}' 140 | elif percent>=80 and percent<=100: 141 | return f'extra large {random.choice(candidates)}' 142 | else: 143 | raise Exception('Wrong portion') 144 | 145 | 146 | def post_with_retry(url, headers=None, json=None, max_retries=3, delay=2): 147 | """ 148 | Makes a POST request to a specified URL with a retry mechanism. 
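Timeouts are retried silently; other request errors are printed before retrying. If all max_retries attempts fail, an exception is raised.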
149 | 150 | :param url: URL to which the POST request is made 151 | :param data: Dictionary, list of tuples, bytes, or file-like object to send in the body 152 | :param headers: Dictionary of HTTP Headers to send with the request 153 | :param max_retries: Maximum number of retries 154 | :param delay: Delay between retries in seconds 155 | :return: Response object 156 | """ 157 | for attempt in range(max_retries): 158 | try: 159 | response = requests.post(url, headers=headers, json=json, timeout=60) 160 | response.raise_for_status() # Raises an HTTPError if the HTTP request returned an unsuccessful status code 161 | return response 162 | except Timeout: 163 | time.sleep(delay) 164 | except RequestException as e: 165 | print(f"Request failed: {e}. Attempt {attempt + 1} of {max_retries}. Retrying in {delay} seconds...") 166 | time.sleep(delay) 167 | 168 | raise Exception(f"Failed to POST to {url} after {max_retries} retries") 169 | 170 | 171 | def analyze_segmentation_map(image): 172 | overalld = overall_distribution(image) 173 | # Divide the segmentation map into patches 174 | patches = divide_into_patches(image) 175 | # Find three largest infrequent pixel types for each patch 176 | #frequent_types = {patch_name: find_most_frequent_types_revised(patch) for patch_name, patch in patches.items()} 177 | frequent_types = {} 178 | frequent_percents = {} 179 | percents = {} 180 | for patch_name, patch in patches.items(): 181 | out_idx, cdict = find_most_frequent_types_revised(patch) 182 | out_percent = [convert_percent_range(int(round(cdict[it],2)*100)) for it in out_idx] 183 | 184 | frequent_types[patch_name] = out_idx 185 | frequent_percents[patch_name] = out_percent 186 | 187 | context_str = overalld + "\n" 188 | for patch_name, lst in frequent_types.items(): 189 | frequent_percent = frequent_percents[patch_name] 190 | print(frequent_percent) 191 | context_str += f"The {patch_name} mainly contains the following land cover types, in descending order of content: " 192 | if len(lst)==3: 193 | context_str += f"{labels[lst[0]]} ({frequent_percent[0]}), {labels[lst[1]]} ({frequent_percent[1]}), and {labels[lst[2]]} ({frequent_percent[2]}).\n" 194 | if len(lst)==2: 195 | context_str += f"{labels[lst[0]]} ({frequent_percent[0]}) and {labels[lst[1]]} ({frequent_percent[1]}).\n" 196 | if len(lst)==1: 197 | context_str += f"{labels[lst[0]]} ({frequent_percent[0]}).\n" 198 | 199 | return context_str 200 | 201 | 202 | def generate_captions(image_paths, segmentation_maps): 203 | user_prompts = [] 204 | for image in segmentation_maps: 205 | analyze_prompt = analyze_segmentation_map(image) 206 | user_prompt = "Analyze the provided image as an AI visual assistant. The following contexts are provided.\n" 207 | user_prompt += "The overall land cover distributions from most to least are: " 208 | user_prompt += analyze_prompt 209 | user_prompt += '\n' 210 | user_prompts.append(user_prompt) 211 | 212 | 213 | system_prompt = "You are an AI visual assistant who can help describe images based on the given contexts. Please write the description in a paragraph, and avoid saying other things. The following constraints should be obeyed:\n\ 214 | 1) Describe the image in the order of the spatial distributions presented in the given contexts. 
Link descriptions of different parts to make the overall image description more fluent.\ 215 | 2) Describe the dominant land cover type in the image and its spatial locations.\ 216 | 3) Describe the land cover types in each part of the image in descending order of their coverage areas.\ 217 | 4) Diversify descriptions related to portions in each paragraph. \ 218 | 5) Summarize the main theme of the image in the final sentence.\ 219 | 6) Describe it objectively; do not use words: 'possibly', 'likely', 'perhaps', 'context', 'segmentation', 'appear', 'change', 'transition', 'dynamic', or any words with similar connotations." 220 | 221 | 222 | api_key = "openai api key"  # replace with your OpenAI API key 223 | 224 | url = "https://api.openai.com/v1/chat/completions" 225 | 226 | headers = { 227 | "Authorization": f"Bearer {api_key}", 228 | "Content-Type": "application/json" 229 | } 230 | model = "gpt-3.5-turbo-1106" 231 | 232 | data1 = { 233 | "model": model, 234 | "max_tokens": 300, 235 | "messages": [ 236 | { 237 | 'role': 'system', 238 | 'content': system_prompt, 239 | }, 240 | { 241 | 'role': 'user', 242 | 'content': user_prompts[0] 243 | }, 244 | ] 245 | } 246 | data2 = { 247 | "model": model, 248 | "max_tokens": 300, 249 | "messages": [ 250 | { 251 | 'role': 'system', 252 | 'content': system_prompt, 253 | }, 254 | 255 | { 256 | 'role': 'user', 257 | 'content': user_prompts[1] 258 | }, 259 | ] 260 | } 261 | data3 = { 262 | "model": model, 263 | "max_tokens": 300, 264 | "messages": [ 265 | { 266 | 'role': 'system', 267 | 'content': system_prompt, 268 | }, 269 | 270 | { 271 | 'role': 'user', 272 | 'content': user_prompts[2] 273 | }, 274 | ] 275 | } 276 | data4 = { 277 | "model": model, 278 | "max_tokens": 300, 279 | "messages": [ 280 | { 281 | 'role': 'system', 282 | 'content': system_prompt, 283 | }, 284 | 285 | { 286 | 'role': 'user', 287 | 'content': user_prompts[3] 288 | }, 289 | ] 290 | } 291 | 292 | 293 | response = post_with_retry(url, headers=headers, json=data1) 294 | time.sleep(0.5) 295 | response2 = post_with_retry(url, headers=headers, json=data2) 296 | time.sleep(0.5) 297 | response3 = post_with_retry(url, headers=headers, json=data3) 298 | time.sleep(0.5) 299 | response4 = post_with_retry(url, headers=headers, json=data4) 300 | time.sleep(0.5) 301 | responses = [response,response2,response3,response4] 302 | for i,response in enumerate(responses): 303 | print('-----------------------------------------------------------') 304 | print(response.json()) 305 | with open(image_paths[i].replace('.png','_chatgpt_3_5.json'),'w') as jsonf: 306 | json.dump(response.json(),jsonf) 307 | print('-----------------------------------------------------------') 308 | 309 | 310 | if __name__=='__main__': 311 | with open('chatgpt_3_5_label_file_path.txt','r') as lf: 312 | all_labels = lf.readlines() 313 | 314 | 315 | 316 | for lfile in all_labels: 317 | # Example usage 318 | lfile = lfile.strip() 319 | color_img_path00 = f"{lfile[:-4]}_patch00.png" 320 | color_img_path01 = f"{lfile[:-4]}_patch01.png" 321 | color_img_path10 = f"{lfile[:-4]}_patch10.png" 322 | color_img_path11 = f"{lfile[:-4]}_patch11.png" 323 | 324 | im00 = np.array(Image.open(color_img_path00)) 325 | im01 = np.array(Image.open(color_img_path01)) 326 | im10 = np.array(Image.open(color_img_path10)) 327 | im11 = np.array(Image.open(color_img_path11)) 328 | seg_map_00 = convert_color_map_to_segmentation(im00, colors) 329 | seg_map_01 = convert_color_map_to_segmentation(im01, colors) 330 | seg_map_10 = 
convert_color_map_to_segmentation(im10, colors) 331 | seg_map_11 = convert_color_map_to_segmentation(im11, colors) 332 | segmentation_maps = [seg_map_00,seg_map_01,seg_map_10,seg_map_11] 333 | image_paths = [color_img_path00,color_img_path01,color_img_path10,color_img_path11] 334 | generate_captions(image_paths, segmentation_maps) 335 | 336 | -------------------------------------------------------------------------------- /generate_description_chatgpt4v.py: -------------------------------------------------------------------------------- 1 | # analyze the image and generate the corresponding prompts 2 | 3 | import numpy as np 4 | from PIL import Image 5 | import base64 6 | import requests 7 | import io 8 | import json 9 | import time 10 | 11 | 12 | 13 | land_cover_label = {'land_cover': { 14 | 'type': 'segment', 15 | 'BackgroundInvalid': True, 16 | 'categories': [ 17 | 'background', 18 | 'water', 'developed', 'tree', 'shrub', 'grass', 19 | 'crop', 'bare', 'snow', 'wetland', 'mangroves', 'moss', 20 | ], 21 | 'colors': [ 22 | [0, 0, 0], # unknown 23 | [0, 0, 255], # (blue) water 24 | [255, 0, 0], # (red) developed 25 | [0, 192, 0], # (dark green) tree 26 | [200, 170, 120], # (brown) shrub 27 | [0, 255, 0], # (green) grass 28 | [255, 255, 0], # (yellow) crop 29 | [128, 128, 128], # (grey) bare 30 | [255, 255, 255], # (white) snow 31 | [0, 255, 255], # (cyan) wetland 32 | [255, 0, 255], # (pink) mangroves 33 | [128, 0, 128], # (purple) moss 34 | ], 35 | }} 36 | 37 | labels = land_cover_label['land_cover']['categories'] 38 | colors = land_cover_label['land_cover']['colors'] 39 | 40 | 41 | 42 | def analyze_segmentation_map(image): 43 | h, w = image.shape 44 | patches = { 45 | "top_left": image[:h//2, :w//2], 46 | "top_right": image[:h//2, w//2:], 47 | "bottom_left": image[h//2:, :w//2], 48 | "bottom_right": image[h//2:, w//2:], 49 | "middle": image[h//4:3*h//4, w//4:3*w//4] 50 | } 51 | 52 | analyze_prompt = "" 53 | 54 | for name, patch in patches.items(): 55 | unique, counts = np.unique(patch, return_counts=True) 56 | proportions = counts / counts.sum() 57 | sorted_indices = np.argsort(-proportions) # Sorting in descending order 58 | 59 | statistic_str = f"{name} distribution:" 60 | for idx in sorted_indices: 61 | if unique[idx] == 0: 62 | continue 63 | statistic_str += f" {labels[unique[idx]]}: {proportions[idx]:.2f};" 64 | analyze_prompt += statistic_str 65 | analyze_prompt += '\n' 66 | return analyze_prompt 67 | 68 | 69 | 70 | import numpy as np 71 | from PIL import Image 72 | from collections import Counter 73 | 74 | 75 | def split_into_patches(image): 76 | h, w = image.shape[:2] 77 | patches = { 78 | "top left part": image[:h//2, :w//2], 79 | "top right part": image[:h//2, w//2:], 80 | "bottom left part": image[h//2:, :w//2], 81 | "bottom right part": image[h//2:, w//2:], 82 | "middle part": image[h//4:h*3//4, w//4:w*3//4] 83 | } 84 | return patches 85 | 86 | def count_pixel_proportions(num, land_type, patch): 87 | count = np.count_nonzero(patch == land_type) 88 | proportions = count / (128.*128.) 
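# NOTE: this assumes 256x256 label maps, so each quadrant patch (and the middle patch) is 128x128 = 16384 pixels; adjust the denominator if the patch size differs.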
89 | return proportions 90 | 91 | def analyze_image(image): 92 | patches = split_into_patches(image) 93 | all_land_types = Counter(list(image.reshape(-1))) 94 | analyze_prompt = "" 95 | 96 | for land_type, num in all_land_types.items(): 97 | if land_type == 0: 98 | continue 99 | analyze_prompt += f"{labels[land_type]}: " 100 | for patch_name, patch in patches.items(): 101 | proportion = count_pixel_proportions(num, land_type, patch) 102 | analyze_prompt += f" {patch_name}: {proportion:.2%} " 103 | analyze_prompt += '\n' 104 | return analyze_prompt 105 | 106 | 107 | import numpy as np 108 | 109 | def convert_color_map_to_segmentation(color_map, label_colors): 110 | """ 111 | Convert a color map back to a segmentation map with pixel labels from 0 to 11. 112 | 113 | Args: 114 | color_map (numpy.ndarray): A 3D array where each element represents the color of a pixel. 115 | label_colors (list): A list of color tuples corresponding to each label. 116 | 117 | Returns: 118 | numpy.ndarray: A 2D array representing the segmentation map. 119 | """ 120 | # Initialize an empty segmentation map with the same height and width as the color map 121 | segmentation_map = np.zeros((color_map.shape[0], color_map.shape[1]), dtype=np.uint8) 122 | 123 | # Map each color in the color map back to its corresponding label 124 | for label, color in enumerate(label_colors): 125 | # Create a mask where the color matches the current label color 126 | mask = np.all(color_map == color, axis=-1) 127 | 128 | # Assign the label to the matching locations in the segmentation map 129 | segmentation_map[mask] = label 130 | 131 | return segmentation_map 132 | 133 | def encode_image(image_path): 134 | with open(image_path, "rb") as image_file: 135 | return base64.b64encode(image_file.read()).decode('utf-8') 136 | 137 | 138 | def downsample_image(image, skip_index): 139 | """ 140 | Downsample a PIL image by skipping pixels. 141 | 142 | Args: 143 | image (PIL.Image.Image): The source image. 144 | skip_index (int): The sampling stride; every skip_index-th pixel is kept. 145 | 146 | Returns: 147 | PIL.Image.Image: The downsampled image. 148 | """ 149 | # Ensure the input is a PIL Image 150 | if not isinstance(image, Image.Image): 151 | raise ValueError("image must be a PIL.Image.Image object") 152 | 153 | # Get the size of the original image 154 | width, height = image.size 155 | 156 | # Calculate the size of the downsampled image 157 | new_width = (width + skip_index - 1) // skip_index 158 | new_height = (height + skip_index - 1) // skip_index 159 | 160 | # Create a new image of the desired size 161 | downsampled_image = Image.new("RGB", (new_width, new_height)) 162 | 163 | # Copy pixels from the original image to the new image, skipping as appropriate 164 | for y in range(0, height, skip_index): 165 | for x in range(0, width, skip_index): 166 | downsampled_image.putpixel((x // skip_index, y // skip_index), image.getpixel((x, y))) 167 | 168 | return downsampled_image 169 | 170 | 171 | def resize_and_encode_image(image_path): 172 | """ 173 | Downsample an image by keeping every second pixel and then encode it to base64. 174 | 175 | Args: 176 | image_path (str): The path to the image file. 177 | 178 | Returns: 179 | str: A base64 encoded string of the resized image. 
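Note: the bytes are re-saved in the source image's own format (img.format), while the request payload labels them as image/jpeg.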
180 | """ 181 | # Open the image 182 | with Image.open(image_path) as img: 183 | # Resize the image 184 | #resized_img = img.resize((128, 128), Image.NEAREST) 185 | resized_img = downsample_image(img, 2) 186 | 187 | # Save the resized image to a bytes buffer 188 | buffer = io.BytesIO() 189 | resized_img.save(buffer, format=img.format) 190 | 191 | # Get the byte data from the buffer 192 | byte_data = buffer.getvalue() 193 | 194 | # Encode the byte data to base64 195 | base64_encoded = base64.b64encode(byte_data).decode('utf-8') 196 | 197 | return base64_encoded 198 | 199 | 200 | def generate_captions(image_paths, segmentation_maps): 201 | analyze_prompt1 = analyze_segmentation_map(segmentation_maps[0]) 202 | analyze_prompt2 = analyze_segmentation_map(segmentation_maps[1]) 203 | analyze_prompt3 = analyze_segmentation_map(segmentation_maps[2]) 204 | analyze_prompt4 = analyze_segmentation_map(segmentation_maps[3]) 205 | 206 | # Example usage 207 | analyze_prompt21 = analyze_image(segmentation_maps[0]) 208 | analyze_prompt22 = analyze_image(segmentation_maps[1]) 209 | analyze_prompt23 = analyze_image(segmentation_maps[2]) 210 | analyze_prompt24 = analyze_image(segmentation_maps[3]) 211 | 212 | prompt = "You are an AI visual assistant that can analyze the given image. In the image, different colors represent different land cover types.\ 213 | The color for the land cover dictionary is: '[0, 0, 255] (blue): water; [255, 0, 0](red): developed area; \ 214 | [0, 192, 0] (dark green): tree; [200, 170, 120] (brown): shrub; [0, 255, 0] (green): grass; [255, 255, 0] (yellow): crop;\ 215 | [128, 128, 128] (grey): bare; [255, 255, 255] (white): snow; [0, 255, 255] (cyan): wetland; [255, 0, 255] (pink): mangroves; [128, 0, 128] (purple): moss.' You will be provided four independent images at once." 216 | 217 | prompt += "For the first image, the distribution of each land cover type is:" 218 | prompt += analyze_prompt21 219 | prompt += "For the first image, the spatial distribution of the image is:" 220 | prompt += analyze_prompt1 221 | prompt += "For the second image, the distribution of each land cover type is:" 222 | prompt += analyze_prompt22 223 | prompt += "For the second image, the spatial distribution of the image is:" 224 | prompt += analyze_prompt2 225 | prompt += "For the third image, the distribution of each land cover type is:" 226 | prompt += analyze_prompt23 227 | prompt += "For the third image, the spatial distribution of the image is:" 228 | prompt += analyze_prompt3 229 | prompt += "For the fourth image, the distribution of each land cover type is:" 230 | prompt += analyze_prompt24 231 | prompt += "For the fourth image, the spatial distribution of the image is:" 232 | prompt += analyze_prompt4 233 | 234 | prompt += "You are given four independent images, describe in long sentences for each image seperately using four paragraphs and avoid saying other things.\ 235 | The following constraints should be obeyed: \ 236 | 1) Do not use color-related words; treat the color as the land cover type directly.\ 237 | 2) Generate the four descriptions seperately; do not add connection between them. \ 238 | 3) When describing water, developed, and crop areas, incorporate shape descriptors.\ 239 | 4) Double-check all the presented land cover types based on the distribution of each land cover type. 
If some land covers are not presented, do not mention them.\ 240 | 5) Describe it objectively; do not use words: 'possibly','likely','perhaps','color dictionary','appear','change','transition', 'dynamic', or any words with similar connotations.\ 241 | 6) Double-check the shape and location of the developed area, water course, grass, tree, shrub, wetland, and crop areas based on the given image if they are present.\ 242 | 7) Consider the spatial statistics as a unified image without breaking them down into individual spatial distributions and land cover proportions when describing the overall scene.\ 243 | 8) Describe each land cover separately for each given image, and then describe the main theme of each given image." 244 | 245 | api_key = "openai api key" 246 | 247 | #base64_image = encode_image(image_path) 248 | base64_image1 = resize_and_encode_image(image_paths[0]) 249 | base64_image2 = resize_and_encode_image(image_paths[1]) 250 | base64_image3 = resize_and_encode_image(image_paths[2]) 251 | base64_image4 = resize_and_encode_image(image_paths[3]) 252 | 253 | headers = { 254 | "Content-Type": "application/json", 255 | "Authorization": f"Bearer {api_key}" 256 | } 257 | 258 | payload = { 259 | "model": "gpt-4-vision-preview", 260 | "messages": [ 261 | { 262 | "role": "user", 263 | "content": [ 264 | { 265 | "type": "text", 266 | "text": prompt, 267 | }, 268 | { 269 | "type": "image_url", 270 | "image_url": { 271 | "url": f"data:image/jpeg;base64,{base64_image1}", 272 | }, 273 | }, 274 | { 275 | "type": "image_url", 276 | "image_url": { 277 | "url": f"data:image/jpeg;base64,{base64_image2}", 278 | }, 279 | }, 280 | { 281 | "type": "image_url", 282 | "image_url": { 283 | "url": f"data:image/jpeg;base64,{base64_image3}", 284 | }, 285 | }, 286 | { 287 | "type": "image_url", 288 | "image_url": { 289 | "url": f"data:image/jpeg;base64,{base64_image4}", 290 | }, 291 | }, 292 | ] 293 | } 294 | ], 295 | "max_tokens": 1200 296 | } 297 | 298 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 299 | with open(image_paths[0].replace('png','json'),'w') as jsonf: 300 | json.dump(response.json(),jsonf) 301 | 302 | 303 | if __name__=='__main__': 304 | with open('chatgpt_4_v.txt','r') as lf: 305 | all_labels = lf.readlines() 306 | for lfile in all_labels: 307 | lfile = lfile.strip() 308 | color_img_path00 = f"{lfile[:-4]}_patch00.png" 309 | color_img_path01 = f"{lfile[:-4]}_patch01.png" 310 | color_img_path10 = f"{lfile[:-4]}_patch10.png" 311 | color_img_path11 = f"{lfile[:-4]}_patch11.png" 312 | 313 | im00 = np.array(Image.open(color_img_path00)) 314 | im01 = np.array(Image.open(color_img_path01)) 315 | im10 = np.array(Image.open(color_img_path10)) 316 | im11 = np.array(Image.open(color_img_path11)) 317 | seg_map_00 = convert_color_map_to_segmentation(im00, colors) 318 | seg_map_01 = convert_color_map_to_segmentation(im01, colors) 319 | seg_map_10 = convert_color_map_to_segmentation(im10, colors) 320 | seg_map_11 = convert_color_map_to_segmentation(im11, colors) 321 | segmentation_maps = [seg_map_00,seg_map_01,seg_map_10,seg_map_11] 322 | image_paths = [color_img_path00,color_img_path01,color_img_path10,color_img_path11] 323 | generate_captions(image_paths, segmentation_maps) 324 | time.sleep(20) 325 | 326 | --------------------------------------------------------------------------------
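Both scripts store the raw Chat Completions responses as JSON files next to the image patches. Below is a minimal, hypothetical post-processing sketch for pulling the generated caption text out of those files; the glob pattern and the /data directory are placeholder assumptions, and the JSON layout is the standard Chat Completions response (choices[0].message.content).
```
import glob
import json

# Collect every response saved by generate_description_chatgpt3_5_text.py
# (placeholder directory; adjust the pattern for the GPT-4V outputs).
for json_path in glob.glob("/data/*_chatgpt_3_5.json"):
    with open(json_path, "r") as f:
        response = json.load(f)
    # Standard Chat Completions layout: the caption is the assistant message content.
    caption = response["choices"][0]["message"]["content"]
    with open(json_path.replace(".json", ".txt"), "w") as f:
        f.write(caption)
```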