├── data ├── classes.txt ├── spacenet_eval.csv └── labels.csv ├── geovl.png ├── pipeline.png ├── class_clustering.png ├── src ├── __init__.py ├── clip.py ├── classifier.py ├── display_chunks.py ├── vision.py ├── cluster.py └── main.py ├── AUTHORS.rst ├── CODE_OF_CONDUCT.md ├── environment.yml ├── setup.py ├── LICENSE ├── LICENSE-THIRD-PARTY.md ├── SECURITY.md └── README.md /data/classes.txt: -------------------------------------------------------------------------------- 1 | Buildings 2 | No Buildings -------------------------------------------------------------------------------- /geovl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/geo-vision-labeler/HEAD/geovl.png -------------------------------------------------------------------------------- /pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/geo-vision-labeler/HEAD/pipeline.png -------------------------------------------------------------------------------- /class_clustering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/geo-vision-labeler/HEAD/class_clustering.png -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Gilles Quentin Hacheme 9 | 10 | Contributors 11 | ------------ 12 | 13 | * Caleb Robinson 14 | * Akram Zaytar 15 | * Girmaw Abebe Tadesse 16 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: geo_vision_labeler 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.10 7 | - pip 8 | - pip: 9 | - numpy==2.2.4 10 | - tqdm==4.67.1 11 | - pillow==11.1.0 12 | - rasterio==1.4.3 13 | - torchvision==0.22.0 14 | - transformers==4.50.0 15 | - openai==1.65.2 16 | - pytest==8.3.5 17 | - torch==2.7.0 18 | - dotenv==0.9.9 19 | - pandas==2.2.3 20 | - pyyaml==6.0.2 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="geo_vision_labeler", 5 | version="0.1.0", 6 | description="A flexible tool to label images using vision LLMs and text classification via OpenAI or open-source LLMs.", 7 | author="Gilles Quentin Hacheme", 8 | packages=find_packages(), 9 | install_requires=[ 10 | "Pillow==11.1.0", 11 | "rasterio==1.4.3", 12 | "transformers==4.50.0", 13 | "torch==2.7.0", 14 | "tqdm==4.67.1", 15 | "numpy==2.2.4", 16 | "openai==1.65.2", 17 | "torchvision==0.22.0", 18 | "python-dotenv==0.9.9", 19 | "pytest==8.3.5", 20 | "pandas==2.2.3", 21 | "pyyaml==6.0.2" 22 | ], 23 | entry_points={ 24 | "console_scripts": [ 25 | "label-images=src.main:main" 26 | ] 27 | }, 28 | python_requires="==3.10", 29 | ) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /src/clip.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
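# NOTE (illustrative): this module wraps a Hugging Face CLIP checkpoint for
# zero-shot image classification. Each class label is turned into a text prompt
# (optionally prefixed with a context string), and the image is assigned to the
# label with the highest image-text similarity. A minimal usage sketch, assuming
# a PIL image; the file path and class list below are placeholders:
#
#   classifier = CLIPClassifier()  # openai/clip-vit-large-patch14 by default
#   label, probs = classifier.classify_image(
#       Image.open("path/to/chunk.png"),
#       ["Buildings", "No Buildings"],
#       "This is a satellite image.",
#   )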
3 | 4 | import os 5 | import torch 6 | from PIL import Image 7 | from transformers import CLIPProcessor, CLIPModel 8 | 9 | 10 | class CLIPClassifier: 11 | def __init__(self, model_name="openai/clip-vit-large-patch14", device=None): 12 | """ 13 | Initialize the CLIP model and processor. 14 | """ 15 | self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") 16 | self.model = CLIPModel.from_pretrained(model_name).to(self.device) 17 | self.processor = CLIPProcessor.from_pretrained(model_name) 18 | 19 | def classify_image(self, image, classes, context): 20 | """ 21 | Classify an image using CLIP based on image-text similarity. 22 | 23 | Args: 24 | image (PIL.Image): The image to classify. 25 | classes (list of str): Class labels (e.g., ["buildings", "no buildings"]). 26 | context (str): Contextual information to prepend to class text. 27 | 28 | Returns: 29 | tuple: (predicted class label, list of class probabilities) 30 | """ 31 | 32 | # Build class prompts 33 | text_inputs = [ 34 | f"{context.strip()} Label '{label}'.".strip() for label in classes 35 | ] 36 | 37 | # Prepare inputs and run model 38 | inputs = self.processor( 39 | text=text_inputs, 40 | images=image.convert("RGB"), 41 | return_tensors="pt", 42 | padding=True, 43 | ).to(self.device) 44 | 45 | outputs = self.model(**inputs) 46 | probs = outputs.logits_per_image.softmax(dim=1).squeeze() 47 | 48 | pred_idx = torch.argmax(probs).item() 49 | return classes[pred_idx], probs.tolist() 50 | -------------------------------------------------------------------------------- /data/spacenet_eval.csv: -------------------------------------------------------------------------------- 1 | scene,period 2 | L15-0506E-1204N.tif,2018_03 3 | L15-1615E-1205N.tif,2019_09 4 | L15-1049E-1370N.tif,2019_12 5 | L15-0760E-0887N.tif,2019_07 6 | L15-1289E-1169N.tif,2019_10 7 | L15-1848E-0793N.tif,2018_05 8 | L15-0487E-1246N.tif,2019_03 9 | L15-1669E-1160N.tif,2019_09 10 | L15-0361E-1300N.tif,2019_12 11 | L15-0368E-1245N.tif,2019_07 12 | L15-0614E-0946N.tif,2018_12 13 | L15-0566E-1185N.tif,2018_03 14 | L15-1296E-1198N.tif,2018_11 15 | L15-1615E-1206N.tif,2019_02 16 | L15-1716E-1211N.tif,2018_12 17 | L15-0577E-1243N.tif,2019_09 18 | L15-1203E-1203N.tif,2018_12 19 | L15-0434E-1218N.tif,2019_06 20 | L15-1204E-1202N.tif,2017_10 21 | L15-1672E-1207N.tif,2018_09 22 | L15-1209E-1113N.tif,2018_04 23 | L15-1138E-1216N.tif,2019_09 24 | L15-1276E-1107N.tif,2018_08 25 | L15-1172E-1306N.tif,2018_01 26 | L15-0683E-1006N.tif,2019_06 27 | L15-1389E-1284N.tif,2019_11 28 | L15-1335E-1166N.tif,2019_10 29 | L15-1185E-0935N.tif,2018_10 30 | L15-0387E-1276N.tif,2019_08 31 | L15-1703E-1219N.tif,2019_07 32 | L15-0331E-1257N.tif,2018_01 33 | L15-1204E-1204N.tif,2018_07 34 | L15-0595E-1278N.tif,2018_01 35 | L15-0977E-1187N.tif,2018_09 36 | L15-1481E-1119N.tif,2018_11 37 | L15-0457E-1135N.tif,2018_04 38 | L15-0632E-0892N.tif,2018_09 39 | L15-1200E-0847N.tif,2018_08 40 | L15-0357E-1223N.tif,2019_12 41 | L15-1538E-1163N.tif,2018_12 42 | L15-1210E-1025N.tif,2019_02 43 | L15-0924E-1108N.tif,2019_03 44 | L15-1691E-1211N.tif,2018_12 45 | L15-0571E-1075N.tif,2019_04 46 | L15-1438E-1134N.tif,2018_06 47 | L15-1709E-1112N.tif,2019_03 48 | L15-1617E-1207N.tif,2019_09 49 | L15-1669E-1153N.tif,2018_07 50 | L15-0544E-1228N.tif,2018_02 51 | L15-1015E-1062N.tif,2018_02 52 | L15-1690E-1211N.tif,2019_09 53 | L15-1298E-1322N.tif,2018_03 54 | L15-1748E-1247N.tif,2019_12 55 | L15-0586E-1127N.tif,2019_08 56 | L15-1439E-1134N.tif,2019_12 57 | L15-1025E-1366N.tif,2018_10 58 | 
L15-1479E-1101N.tif,2019_09 59 | L15-0358E-1220N.tif,2020_01 60 | L15-1014E-1375N.tif,2019_01 61 | -------------------------------------------------------------------------------- /LICENSE-THIRD-PARTY.md: -------------------------------------------------------------------------------- 1 | # Third-Party Licenses 2 | 3 | This project includes several third-party open-source libraries, each with its own license. This file lists the licenses of the dependencies we use. 4 | 5 | ## Python Dependencies 6 | 7 | ### YAML Parsing 8 | - **pyyaml**: MIT License 9 | - https://github.com/yaml/pyyaml/blob/master/LICENSE 10 | 11 | ### PyTorch and Related Libraries 12 | - **PyTorch**: BSD-style license 13 | - https://github.com/pytorch/pytorch/blob/master/LICENSE 14 | - **torchvision**: BSD-style license 15 | - https://github.com/pytorch/vision/blob/main/LICENSE 16 | 17 | ### Python Dotenv 18 | - **python-dotenv**: BSD 3-Clause "New" or "Revised" License 19 | - https://github.com/theskumar/python-dotenv/blob/main/LICENSE 20 | 21 | ### Scientific Computing 22 | - **numpy**: BSD 3-Clause License 23 | - https://github.com/numpy/numpy/blob/main/LICENSE.txt 24 | 25 | ### Image 26 | - **Pillow (PIL Fork)**: MIT-CMU License 27 | - https://github.com/python-pillow/Pillow/blob/main/LICENSE 28 | - **rasterio**: BSD 3-Clause License 29 | - https://github.com/mapbox/rasterio/blob/main/LICENSE.txt 30 | 31 | ### Progress Bar 32 | - **tqdm**: Mozilla Public License (MPL) v. 2.0 33 | - https://github.com/tqdm/tqdm/blob/master/LICENCE 34 | 35 | ### Deep Learning Utilities 36 | - **openai-python**: Apache License 2.0 37 | - https://github.com/openai/openai-python/blob/main/LICENSE 38 | - **transformers**: Apache License 2.0 39 | - https://github.com/huggingface/transformers/blob/main/LICENSE 40 | 41 | ## Dataset Licenses 42 | 43 | ### SpaceNet 7 44 | - **License**: Creative Commons Attribution-ShareAlike 4.0 International License 45 | - **Source**: SN7: Multi-Temporal Urban Development Challenge 46 | - **Citation Requirements**: 47 | - Van Etten, A., Hogan, D., Manso, J. M., Shermeyer, J., Weir, N., & Lewis, R. (2021). The multi-temporal urban development spacenet dataset. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 6398-6407). 48 | 49 | --- 50 | 51 | **Note:** This file is provided as a convenient summary but does not replace the original licenses. Please refer to the original license files of each project for the exact terms. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | 5 | def classify_with_openai( 6 | description, include_filename, filename, classes, openai_client, model_name="gpt-4" 7 | ): 8 | """ 9 | Uses the OpenAI API to classify the image description into one of the classes. 10 | The prompt instructs the model to output exactly one of the classes. 11 | 12 | Args: 13 | description (str): The description of the image. 14 | filename (str): The filename of the image. 15 | classes (list): The list of classes to classify into. 16 | openai_client: The OpenAI API client instance. 17 | model_name (str): The name of the OpenAI model to use. 18 | Returns: 19 | str: The classification result. 20 | """ 21 | 22 | if include_filename: 23 | filename_str = f"Use the filename [{filename}] to locate the place and use that in your prediction. " 24 | else: 25 | filename_str = "" 26 | 27 | messages = [ 28 | {"role": "system", "content": "You are a classifier."}, 29 | { 30 | "role": "user", 31 | "content": ( 32 | f"Given a text describing a place, output one of the following classes: {', '.join(classes)}. " 33 | f"{filename_str}" 34 | f"Don't output anything else but one of the classes. Description: {description}. 
Class: " 35 | ), 36 | }, 37 | ] 38 | response = openai_client.chat.completions.create( 39 | model=model_name, messages=messages 40 | ) 41 | classification = response.choices[0].message.content.strip() 42 | return classification 43 | 44 | 45 | def classify_with_huggingface( 46 | description, 47 | include_filename, 48 | filename, 49 | classes, 50 | pipeline_instance, 51 | tokenizer, 52 | max_new_tokens=10, 53 | ): 54 | """ 55 | Uses an open-source language model via a Hugging Face text-generation pipeline to classify the image description. 56 | 57 | Args: 58 | description (str): The description of the image. 59 | filename (str): The filename of the image. 60 | classes (list): The list of classes to classify into. 61 | pipeline_instance: The Hugging Face pipeline instance. 62 | tokenizer: The tokenizer for the model. 63 | max_new_tokens (int): The maximum number of tokens to generate. 64 | Returns: 65 | str: The classification result. 66 | """ 67 | 68 | if include_filename: 69 | filename_str = f"Use the filename [{filename}] to locate the place and use that in your prediction. " 70 | else: 71 | filename_str = "" 72 | 73 | messages = [ 74 | {"role": "system", "content": "You are a classifier."}, 75 | { 76 | "role": "user", 77 | "content": ( 78 | f"Given a text describing a place, output one of the following classes: {', '.join(classes)}. " 79 | f"{filename_str}" 80 | f"Don't output anything else but one of the classes. Description: {description}. Class: " 81 | ), 82 | }, 83 | ] 84 | response = pipeline_instance( 85 | messages, 86 | max_new_tokens=max_new_tokens, 87 | do_sample=False, 88 | temperature=None, 89 | top_p=1.0, 90 | return_full_text=False, 91 | pad_token_id=tokenizer.eos_token_id, 92 | eos_token_id=tokenizer.eos_token_id, 93 | ) 94 | classification = response[0]["generated_text"] 95 | 96 | if classification[-1] in [".", ",", "!", "?"]: 97 | classification = classification[:-1] 98 | classification = classification.strip() 99 | 100 | return classification 101 | -------------------------------------------------------------------------------- /src/display_chunks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | import math 7 | 8 | import pandas as pd 9 | from PIL import Image, ImageDraw, ImageFont 10 | 11 | 12 | def parse_arguments(): 13 | parser = argparse.ArgumentParser( 14 | description="Display image chunks with metadata and predicted labels from a CSV labels file." 15 | ) 16 | parser.add_argument( 17 | "--labels_path", type=str, required=True, help="Path to the CSV labels file." 18 | ) 19 | 20 | group = parser.add_mutually_exclusive_group(required=True) 21 | group.add_argument( 22 | "--images_dir", 23 | type=str, 24 | help="Directory containing the original images.", 25 | ) 26 | group.add_argument( 27 | "--img_path", 28 | type=str, 29 | help="Display only chunks for the specified image img_path.", 30 | ) 31 | 32 | parser.add_argument( 33 | "--num_files", type=int, default=None, help="Number of chunks to display." 
34 | ) 35 | parser.add_argument("--random", action="store_true", help="Select chunks randomly.") 36 | parser.add_argument( 37 | "--output_path", 38 | type=str, 39 | default="data/outputs/collage.png", 40 | help="Output file for the collage image.", 41 | ) 42 | return parser.parse_args() 43 | 44 | 45 | def load_chunk(row, images_dir=None, img_path=None): 46 | if img_path: 47 | image_path = img_path 48 | else: 49 | filename = row["filename"] 50 | image_path = os.path.join(images_dir, filename) 51 | 52 | if not os.path.exists(image_path): 53 | raise FileNotFoundError(f"Image file {image_path} not found.") 54 | image = Image.open(image_path).convert("RGB") 55 | left = int(row["left"]) 56 | lower = int(row["lower"]) 57 | right = int(row["right"]) 58 | upper = int(row["upper"]) 59 | chunk = image.crop((left, lower, right, upper)) 60 | return chunk 61 | 62 | 63 | def annotate_chunk(chunk, row): 64 | chunk = chunk.resize((256, 256), Image.NEAREST) 65 | draw = ImageDraw.Draw(chunk) 66 | text = ( 67 | f"{row['filename']}\n" 68 | f"({row['row']}, {row['col']})\n" 69 | f"Label: {row['classification']}" 70 | ) 71 | font = ImageFont.load_default() 72 | text_x = 5 73 | text_y = 5 74 | draw.text((text_x, text_y), text, fill="yellow", font=font) 75 | return chunk 76 | 77 | 78 | def create_collage(chunks, output_path): 79 | n = len(chunks) 80 | cols = math.ceil(math.sqrt(n)) 81 | rows = math.ceil(n / cols) 82 | w, h = chunks[0].size 83 | line_thickness = 5 84 | collage_width = cols * w + (cols - 1) * line_thickness 85 | collage_height = rows * h + (rows - 1) * line_thickness 86 | collage = Image.new("RGB", (collage_width, collage_height), color="black") 87 | 88 | for idx, chunk in enumerate(chunks): 89 | col = idx % cols 90 | row = idx // cols 91 | x = col * (w + line_thickness) 92 | y = row * (h + line_thickness) 93 | collage.paste(chunk, (x, y)) 94 | 95 | draw = ImageDraw.Draw(collage) 96 | for col in range(1, cols): 97 | x = col * w + (col - 1) * line_thickness 98 | draw.rectangle([x, 0, x + line_thickness - 1, collage_height], fill="red") 99 | for row in range(1, rows): 100 | y = row * h + (row - 1) * line_thickness 101 | draw.rectangle([0, y, collage_width, y + line_thickness - 1], fill="red") 102 | 103 | collage.save(output_path) 104 | print(f"Collage saved to {output_path}") 105 | 106 | 107 | def main(): 108 | args = parse_arguments() 109 | df = pd.read_csv(args.labels_path) 110 | 111 | if args.img_path: 112 | filename = os.path.basename(args.img_path) 113 | df = df[df["filename"] == filename] 114 | 115 | if df.empty: 116 | print("No entries found for the given criteria.") 117 | return 118 | 119 | n_selected = args.num_files if args.num_files is not None else len(df) 120 | if args.random: 121 | selected = df.sample(n=min(n_selected, len(df))) 122 | else: 123 | selected = df.head(n_selected) 124 | selected = selected.reset_index(drop=True) 125 | 126 | chunks = [] 127 | for _, row in selected.iterrows(): 128 | chunk = load_chunk(row, images_dir=args.images_dir, img_path=args.img_path) 129 | chunk = annotate_chunk(chunk, row) 130 | chunks.append((chunk, row)) 131 | 132 | chunks = sorted(chunks, key=lambda x: (x[1]["row"], x[1]["col"])) 133 | chunks = [chunk for chunk, _ in chunks] 134 | create_collage(chunks, args.output_path) 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | -------------------------------------------------------------------------------- /src/vision.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft 
Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import re 6 | 7 | from PIL import Image 8 | 9 | 10 | def apply_test_time_augmentation(image, test_time_augmentation): 11 | """ 12 | Applies test time augmentation to the image if specified. 13 | Test time augmentation strategies include flipping on the x axis, the y axis, or both axes [x, y, xy]. 14 | """ 15 | 16 | images = [image] 17 | for strategy in test_time_augmentation: 18 | if strategy == "x": 19 | images.append(image.transpose(Image.FLIP_LEFT_RIGHT)) 20 | elif strategy == "y": 21 | images.append(image.transpose(Image.FLIP_TOP_BOTTOM)) 22 | elif strategy == "xy": 23 | images.append(image.transpose(Image.ROTATE_180)) 24 | else: 25 | raise ValueError(f"Unknown test time augmentation strategy: {strategy}") 26 | return images 27 | 28 | 29 | def describe_image( 30 | image_chunk, 31 | image_path, 32 | context, 33 | prompt, 34 | classes, 35 | include_filename, 36 | include_classes, 37 | test_time_augmentation, 38 | processor, 39 | model, 40 | apply_template, 41 | device, 42 | ): 43 | """ 44 | Opens an image, processes it through the vision LLM to generate a detailed description, 45 | and returns a processed text label. 46 | """ 47 | if test_time_augmentation: 48 | images = apply_test_time_augmentation(image_chunk, test_time_augmentation) 49 | else: 50 | images = [image_chunk] 51 | 52 | max_new_tokens = 128 53 | descriptions = [] 54 | for image in images: 55 | image = image.convert("RGB") 56 | filename = os.path.basename(image_path).split(".")[0] 57 | 58 | if apply_template: 59 | messages = [ 60 | { 61 | "role": "user", 62 | "content": [ 63 | {"type": "image"}, 64 | {"type": "text", "text": f"{context}. "}, 65 | ], 66 | } 67 | ] 68 | if include_filename: 69 | messages[0]["content"].append( 70 | { 71 | "type": "text", 72 | "text": f"The file name with geo-information is {filename}. ", 73 | } 74 | ) 75 | if include_classes: 76 | messages[0]["content"].append( 77 | { 78 | "type": "text", 79 | "text": f"The image belongs to one of the following classes: {', '.join(classes)}. ", 80 | } 81 | ) 82 | messages[0]["content"].append({"type": "text", "text": prompt}) 83 | 84 | input_text = processor.apply_chat_template( 85 | messages, add_generation_prompt=True 86 | ) 87 | 88 | inputs = processor( 89 | image, input_text, add_special_tokens=False, return_tensors="pt" 90 | ).to(model.device) 91 | generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens) 92 | else: 93 | input_text = context + ". " 94 | if include_filename: 95 | input_text += f"The file name with geo-information is {filename}. " 96 | 97 | if include_classes: 98 | input_text += f"The image belongs to one of the following classes: {', '.join(classes)}. " 99 | 100 | input_text += prompt 101 | inputs = processor(text=[input_text], images=[image], return_tensors="pt") 102 | inputs = {key: value.to(device) for key, value in inputs.items()} 103 | 104 | try: 105 | generated_ids = model.generate( 106 | pixel_values=inputs["pixel_values"], 107 | input_ids=inputs["input_ids"], 108 | attention_mask=inputs["attention_mask"], 109 | image_embeds=None, 110 | image_embeds_position_mask=inputs.get( 111 | "image_embeds_position_mask", None 112 | ), 113 | use_cache=True, 114 | max_new_tokens=max_new_tokens, 115 | ) 116 | except ValueError as e: 117 | raise ValueError( 118 | f"Error generating text: {e}. " 119 | f"Retry with the --apply_vision_template flag."
120 | ) 121 | 122 | generated_text = processor.batch_decode( 123 | generated_ids, skip_special_tokens=True 124 | )[0] 125 | # Cleaning the generated text from any tags 126 | generated_text = re.sub(r"<.*?>.*?<.*?>", "", generated_text) 127 | generated_text = generated_text.replace("user\n\n", "").replace( 128 | "assistant\n\n", "" 129 | ) 130 | # Remove the prompt from the description 131 | description = re.sub( 132 | rf"^.*?{re.escape(prompt.lower())}", "", generated_text.lower() 133 | ).strip() 134 | descriptions.append(description) 135 | 136 | # Remove duplicates and join the descriptions 137 | descriptions = list(set(descriptions)) 138 | return ";".join(descriptions) 139 | -------------------------------------------------------------------------------- /data/labels.csv: -------------------------------------------------------------------------------- 1 | filename,row,col,left,upper,right,lower,description,classification 2 | 2020_01_L15-0566E-1185N.tif,0,0,0,341,341,0,"this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large field with a few buildings in the middle. the buildings are located in the upper part of the field. there are two buildings in particular, one on the left side and the other on the right side of the picture. the field is quite large, covering a large part of it.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large field with a few buildings in the middle. the buildings are located in the upper part of the field. there are two buildings in particular, one on the left side and the other on the right side of the picture. the field is quite large, covering a large part of it.",Buildings 3 | 2020_01_L15-0566E-1185N.tif,0,1,341,341,682,0,"this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image is a close-up of a satellite view of a suburban neighborhood. the image shows a large number of houses, some of which are located close to each other. there are also a few cars parked in the area. the houses are arranged in a grid pattern, with some houses located closer to the center and others towards the edges. the cars are parked in a row, with one car located near the center of the scene and another one towards the right side.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image is a close-up of a satellite view of a suburban neighborhood. the image shows a large number of houses, some of which are located close to each other. there are also a few cars parked in the area. the houses are arranged in a grid pattern, with some houses located closer to the center and others towards the edges. the cars are parked in a row, with one car located near the center of the scene and another one towards the right side.",Buildings 4 | 2020_01_L15-0566E-1185N.tif,0,2,682,341,1023,0,"this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large area of land with a few buildings and a road. there are several trees in the area, some of which are close to the buildings. the trees are located in different areas of the land, some closer to the road and others further away. the image also shows a pond in the middle of the area.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large area of land with a few buildings and a road. 
there are several trees in the area, some of which are close to the buildings. the trees are located in different areas of the land, some closer to the road and others further away. the image also shows a pond in the middle of the area.",Buildings 5 | 2020_01_L15-0566E-1185N.tif,1,0,0,682,341,341,"this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large brick wall with a small window in the middle. the brick wall is brown and has a rough texture. there are several small windows in the wall, some of which are open. the image is taken from a satellite.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large brick wall with a small window in the middle. the brick wall is brown and has a rough texture. there are several small windows in the wall, some of which are open. the image is taken from a satellite.",No Buildings 6 | 2020_01_L15-0566E-1185N.tif,1,1,341,682,682,341,"this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image is a black and white aerial view of a farm field with a small white building in the middle. the building is surrounded by a fence and a dirt road. there are several other buildings scattered around the field. the field is quite large, covering a large portion of the picture.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image is a black and white aerial view of a farm field with a small white building in the middle. the building is surrounded by a fence and a dirt road. there are several other buildings scattered around the field. the field is quite large, covering a large portion of the picture.",Buildings 7 | 2020_01_L15-0566E-1185N.tif,1,2,682,682,1023,341,this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image is a close-up of a large construction site with a pond in the middle. the pond is surrounded by a dirt field. there are several trucks and a car visible in the image.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image is a close-up of a large construction site with a pond in the middle. the pond is surrounded by a dirt field. there are several trucks and a car visible in the image.,Buildings 8 | 2020_01_L15-0566E-1185N.tif,2,0,0,1023,341,682,"this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large field with many small squares, some of which are green and others brown. the field is located in the middle of a desert. the image is taken from a satellite.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large field with many small squares, some of which are green and others brown. the field is located in the middle of a desert. the image is taken from a satellite.",No Buildings 9 | 2020_01_L15-0566E-1185N.tif,2,1,341,1023,682,682,this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a field with a few trees in the middle. there are two small trees in front of the field. the field is brown in color. there is a small black dot in the center of the picture.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a field with a few trees in the middle. there are two small trees in front of the field. the field is brown in color. 
there is a small black dot in the center of the picture.,No Buildings 10 | 2020_01_L15-0566E-1185N.tif,2,2,682,1023,1023,682,"this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large area of land with a large body of water in the middle of it. there are several trees scattered around the area, some of which are close to the water. the trees are of different sizes and shapes, some are closer to the body of the water and others are further away. the water is quite deep, and the trees are located near the water's edge.;this is a satellite image. the file name with geo-information is 2020_01_l15-0566e-1185n. the image shows a large area of land with a large body of water in the middle of it. there are several trees scattered around the area, some of which are close to the water. the trees are of different sizes and shapes, some are closer to the body of the water and others are further away. the water is quite deep, and the trees are located near the water's edge.",No Buildings 11 | -------------------------------------------------------------------------------- /src/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | import logging 7 | import re 8 | 9 | import openai 10 | import yaml 11 | from dotenv import find_dotenv, load_dotenv 12 | from transformers import pipeline 13 | from openai import AzureOpenAI 14 | from sklearn.metrics.pairwise import cosine_similarity 15 | from sklearn.cluster import KMeans 16 | from tqdm import tqdm 17 | 18 | logging.basicConfig( 19 | level=logging.INFO, 20 | format="%(asctime)s - %(levelname)s - %(message)s", 21 | ) 22 | 23 | 24 | def cluster_classes_with_openai(classes, num_clusters, client, model_name): 25 | """ 26 | Use OpenAI's LLM to cluster the given classes into the specified number of clusters. 27 | """ 28 | # Get clusters' names 29 | prompt = ( 30 | f"Cluster the following list of classes into {num_clusters} clusters based on their semantic similarity. " 31 | f"Provide a name for each cluster and list the classes under each cluster. " 32 | f"Here is the list of classes:\n\n{', '.join(classes)}\n\n" 33 | f"Output the result in the following format:\n" 34 | f"Cluster_1: [Suggested name]\n" 35 | f"Cluster_2: [Suggested name]\n" 36 | f"...\n" 37 | f"Don't output anything else but the clusters' names." 38 | ) 39 | response = client.chat.completions.create( 40 | model=model_name, 41 | messages=[{"role": "user", "content": prompt}], 42 | temperature=None, 43 | ) 44 | clusters = [] 45 | content = response.choices[0].message.content.strip() 46 | for line in content.split("\n"): 47 | if line.startswith("Cluster_"): 48 | cluster = line.split(":")[1] 49 | cluster = re.sub(r"[^a-zA-Z0-9_\s]", "", cluster) 50 | cluster = cluster.strip() 51 | clusters.append(cluster) 52 | logging.info(f"Detected clusters: {clusters}") 53 | 54 | # Match classes to clusters 55 | clusters_dict = {cluster: [] for cluster in clusters} 56 | clusters_dict["Unknown"] = [] 57 | for class_ in tqdm(classes, desc="Matching classes to clusters"): 58 | prompt = ( 59 | f"Classify the following class into one of the categories: {', '.join(clusters)}. " 60 | f"Here is the class:\n\n{class_}\n\n" 61 | f"Output the result in the following format:\n" 62 | f"Cat: [Category name]\n" 63 | f"...\n" 64 | f"Don't output anything else but the categories." 
65 | ) 66 | response = client.chat.completions.create( 67 | model=model_name, 68 | messages=[{"role": "user", "content": prompt}], 69 | temperature=None, 70 | ) 71 | content = response.choices[0].message.content.strip() 72 | class_cluster = content.split(":")[1].strip() 73 | if class_cluster in clusters_dict: 74 | clusters_dict[class_cluster].append(class_) 75 | else: 76 | matched_cluster = [ 77 | cluster 78 | for cluster in clusters 79 | if (class_cluster.lower() in cluster.lower()) 80 | or (cluster.lower() in class_cluster.lower()) 81 | ] 82 | 83 | if matched_cluster: 84 | class_cluster = matched_cluster[0] 85 | clusters_dict[class_cluster].append(class_) 86 | else: 87 | logging.warning( 88 | f"Class '{class_}' does not match any cluster name (Suggested cluster: {class_cluster}). It will be added to the 'unknown' cluster." 89 | ) 90 | clusters_dict["Unknown"].append(class_) 91 | # Removing empty clusters 92 | for cluster in list(clusters_dict.keys()): 93 | if not clusters_dict[cluster]: 94 | logging.warning(f"Cluster '{cluster}' is empty. It will be removed.") 95 | del clusters_dict[cluster] 96 | # Final number of clusters 97 | if len(clusters_dict) < num_clusters: 98 | logging.warning( 99 | f"Requested {num_clusters} clusters, but only {len(clusters_dict)} clusters available. " 100 | ) 101 | return clusters_dict 102 | 103 | 104 | def cluster_classes_with_huggingface( 105 | classes, num_clusters, pipeline_instance, tokenizer 106 | ): 107 | """ 108 | Use Hugging Face's LLM to cluster the given classes into the specified number of clusters. 109 | """ 110 | # Get clusters' names 111 | messages = [ 112 | { 113 | "role": "system", 114 | "content": "You are a helpful assistant that clusters classes based on semantic similarity.", 115 | }, 116 | { 117 | "role": "user", 118 | "content": ( 119 | f"Suggest {num_clusters} non-overlaping categories of the following classes based on their semantic similarity. " 120 | f"Provide a name for each category\n" 121 | f"Here is the list of classes:\n\n{', '.join(classes)}\n\n" 122 | f"Output the result in the following format:\n" 123 | f"Cluster_1: [Suggested name]\n" 124 | f"Cluster_2: [Suggested name]\n" 125 | f"...\n" 126 | f"Don't output anything else but the clusters' names." 127 | ), 128 | }, 129 | ] 130 | response = pipeline_instance( 131 | messages, 132 | max_new_tokens=512, 133 | do_sample=False, 134 | temperature=None, 135 | top_p=1.0, 136 | return_full_text=False, 137 | pad_token_id=tokenizer.eos_token_id, 138 | eos_token_id=tokenizer.eos_token_id, 139 | ) 140 | clusters = [] 141 | content = response[0]["generated_text"].strip() 142 | for line in content.split("\n"): 143 | if line.startswith("Cluster_"): 144 | cluster = line.split(":")[1] 145 | cluster = re.sub(r"[^a-zA-Z0-9_\s]", "", cluster) 146 | cluster = cluster.strip() 147 | clusters.append(cluster) 148 | logging.info(f"Detected clusters: {clusters}") 149 | 150 | # Match classes to clusters 151 | clusters_dict = {cluster: [] for cluster in clusters} 152 | clusters_dict["Unknown"] = [] 153 | for class_ in tqdm(classes, desc="Matching classes to clusters"): 154 | messages = [ 155 | { 156 | "role": "system", 157 | "content": "You are a helpful assistant that clusters classes based on semantic similarity.", 158 | }, 159 | { 160 | "role": "user", 161 | "content": ( 162 | f"Classify the following class into one of the categories: {', '.join(clusters)}. 
" 163 | f"Here is the class:\n\n{class_}\n\n" 164 | f"Output the result in the following format:\n" 165 | f"Cat: [Category name]\n" 166 | f"...\n" 167 | f"Don't output anything else but the categories." 168 | ), 169 | }, 170 | ] 171 | response = pipeline_instance( 172 | messages, 173 | max_new_tokens=512, 174 | do_sample=False, 175 | temperature=None, 176 | top_p=1.0, 177 | return_full_text=False, 178 | pad_token_id=tokenizer.eos_token_id, 179 | eos_token_id=tokenizer.eos_token_id, 180 | ) 181 | content = response[0]["generated_text"].strip() 182 | class_cluster = content.split(":")[1].strip() 183 | if class_cluster in clusters_dict: 184 | clusters_dict[class_cluster].append(class_) 185 | else: 186 | matched_cluster = [ 187 | cluster 188 | for cluster in clusters 189 | if (class_cluster.lower() in cluster.lower()) 190 | or (cluster.lower() in class_cluster.lower()) 191 | ] 192 | 193 | if matched_cluster: 194 | class_cluster = matched_cluster[0] 195 | clusters_dict[class_cluster].append(class_) 196 | else: 197 | logging.warning( 198 | f"Class '{class_}' does not match any cluster name (Suggested cluster: {class_cluster}). It will be added to the 'unknown' cluster." 199 | ) 200 | clusters_dict["Unknown"].append(class_) 201 | # Removing empty clusters 202 | for cluster in list(clusters_dict.keys()): 203 | if not clusters_dict[cluster]: 204 | logging.warning(f"Cluster '{cluster}' is empty. It will be removed.") 205 | del clusters_dict[cluster] 206 | # Final number of clusters 207 | if len(clusters_dict) != num_clusters: 208 | logging.warning( 209 | f"Requested {num_clusters} clusters, but only {len(clusters_dict)} clusters available. " 210 | ) 211 | return clusters_dict 212 | 213 | 214 | def recursive_clustering( 215 | classes, num_clusters, steps, clustering_type, client, model_name, pipeline_instance 216 | ): 217 | """ 218 | Perform recursive clustering for a specified number of steps. 219 | """ 220 | if steps == 0 or not classes: 221 | return classes 222 | 223 | if clustering_type == "openai": 224 | clusters = cluster_classes_with_openai( 225 | classes, num_clusters[0], client, model_name 226 | ) 227 | elif clustering_type == "huggingface": 228 | tokenizer = pipeline_instance.tokenizer 229 | clusters = cluster_classes_with_huggingface( 230 | classes, num_clusters[0], pipeline_instance, tokenizer 231 | ) 232 | else: 233 | raise ValueError(f"Unsupported clustering type: {clustering_type}") 234 | 235 | # recursively cluster subclusters 236 | for name, cls_list in clusters.items(): 237 | clusters[name] = recursive_clustering( 238 | cls_list, 239 | num_clusters[1:], 240 | steps - 1, 241 | clustering_type, 242 | client, 243 | model_name, 244 | pipeline_instance, 245 | ) 246 | return clusters 247 | 248 | 249 | def save_clusters_as_yaml(clusters, output_file): 250 | """ 251 | Save the clusters to a YAML file. 252 | """ 253 | with open(output_file, "w") as f: 254 | yaml.dump(clusters, f, default_flow_style=False) 255 | 256 | 257 | def load_classes(classes_file): 258 | """ 259 | Load classes from a file. 
260 | """ 261 | with open(classes_file, "r") as f: 262 | classes = [line.strip() for line in f.readlines()] 263 | return classes 264 | 265 | 266 | def init_openai_client(variant): 267 | if variant.lower() == "azure": 268 | client = AzureOpenAI( 269 | api_version=os.environ.get( 270 | "AZURE_OPENAI_API_VERSION", "2023-03-15-preview" 271 | ), 272 | azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", ""), 273 | api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""), 274 | ) 275 | else: 276 | openai.api_key = os.environ.get("OPENAI_API_KEY", "") 277 | client = openai 278 | return client 279 | 280 | 281 | def main(): 282 | parser = argparse.ArgumentParser( 283 | description="Cluster classes into specified clusters using various models." 284 | ) 285 | parser.add_argument( 286 | "--classes_file", type=str, required=True, help="Path to txt file with classes." 287 | ) 288 | parser.add_argument( 289 | "--num_clusters", 290 | type=lambda x: [int(i) for i in x.split(",")], 291 | required=True, 292 | help="Comma-separated list of number of clusters for each step. If a single value is provided, it will be used for all steps.", 293 | ) 294 | parser.add_argument( 295 | "--steps", type=int, default=1, help="Number of recursive clustering steps." 296 | ) 297 | parser.add_argument( 298 | "--output_file", type=str, default="clusters.yaml", help="YAML output file." 299 | ) 300 | parser.add_argument( 301 | "--clustering_type", 302 | type=str, 303 | choices=["openai", "huggingface"], 304 | required=True, 305 | help="Clustering type to use.", 306 | ) 307 | parser.add_argument( 308 | "--openai_variant", 309 | type=str, 310 | choices=["azure", "openai"], 311 | default="azure", 312 | help="Azure or OpenAI API for openai type.", 313 | ) 314 | parser.add_argument( 315 | "--model_name", type=str, default=None, help="Model name for LLM." 316 | ) 317 | args = parser.parse_args() 318 | 319 | if len(args.num_clusters) == 1: 320 | args.num_clusters = [args.num_clusters[0]] * args.steps 321 | elif len(args.num_clusters) != args.steps: 322 | raise ValueError( 323 | f"If the number of clusters is provided as a list, it must match the number of steps.\n" 324 | f"Got {len(args.num_clusters)} clusters for {args.steps} steps.\n" 325 | f"For example if 'steps' is 3, the number of clusters should be like: --num_clusters 3,5,7" 326 | ) 327 | 328 | dotenv_path = find_dotenv() 329 | if dotenv_path: 330 | load_dotenv(dotenv_path) 331 | else: 332 | logging.warning("No .env file found. Set env vars manually.") 333 | 334 | classes = load_classes(args.classes_file) 335 | logging.info(f"Loaded {len(classes)} classes.") 336 | 337 | client = None 338 | pipeline_instance = None 339 | if args.clustering_type == "openai": 340 | client = init_openai_client(args.openai_variant) 341 | 342 | elif args.clustering_type == "huggingface": 343 | pipeline_instance = pipeline("text-generation", model=args.model_name) 344 | 345 | clusters = recursive_clustering( 346 | classes, 347 | args.num_clusters, 348 | args.steps, 349 | args.clustering_type, 350 | client, 351 | args.model_name, 352 | pipeline_instance, 353 | ) 354 | 355 | save_clusters_as_yaml(clusters, args.output_file) 356 | logging.info(f"Clusters saved to {args.output_file}.") 357 | 358 | 359 | if __name__ == "__main__": 360 | main() 361 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 
2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | import csv 6 | import glob 7 | import logging 8 | import os 9 | 10 | import numpy as np 11 | import torch 12 | import tqdm 13 | import rasterio 14 | import yaml 15 | from PIL import Image 16 | from dotenv import find_dotenv, load_dotenv 17 | from huggingface_hub import login 18 | from transformers import ( 19 | AutoModelForCausalLM, 20 | AutoModelForVision2Seq, 21 | AutoProcessor, 22 | AutoTokenizer, 23 | pipeline, 24 | ) 25 | import openai 26 | from openai import AzureOpenAI 27 | 28 | from vision import describe_image 29 | from classifier import classify_with_openai, classify_with_huggingface 30 | from clip import CLIPClassifier 31 | 32 | logging.basicConfig( 33 | level=logging.INFO, 34 | format="%(asctime)s - %(levelname)s - %(message)s", 35 | handlers=[logging.FileHandler("labeling.log"), logging.StreamHandler()], 36 | ) 37 | 38 | dotenv_path = find_dotenv() 39 | if dotenv_path: 40 | load_dotenv(dotenv_path) 41 | else: 42 | logging.warning( 43 | "No .env file found. Make sure to set environment variables manually." 44 | ) 45 | 46 | login(token=os.environ.get("HF_TOKEN", "")) 47 | CLIP_MODEL_NAME = os.environ.get("CLIP_MODEL_NAME", "openai/clip-vit-large-patch14") 48 | 49 | 50 | def split_image(image_path, split_height_by, split_width_by): 51 | # unchanged... 52 | with rasterio.open(image_path) as src: 53 | image = src.read() 54 | if len(image.shape) == 3: 55 | image = np.moveaxis(image, 0, -1) 56 | else: 57 | raise ValueError("Image must have 3 dimensions (height, width, channels).") 58 | if image.shape[2] != 3: 59 | logging.warning( 60 | f"Image has {image.shape[2]} channels. Keeping the first 3 channels." 61 | ) 62 | image = image[:, :, :3] 63 | 64 | image = (image - image.min()) / (image.max() - image.min()) * 255 65 | image = image.astype(np.uint8) 66 | image = Image.fromarray(image, mode="RGB") 67 | if min(image.size) < 224: 68 | logging.warning( 69 | f"Image size {image.size} is smaller than 224. Resizing to 224x224." 
70 | ) 71 | image = image.resize((224, 224), Image.BILINEAR) 72 | 73 | width, height = image.size 74 | chunk_width = width // split_width_by 75 | chunk_height = height // split_height_by 76 | chunks = [] 77 | coords_dict_list = [] 78 | for i in range(split_height_by): 79 | for j in range(split_width_by): 80 | left = j * chunk_width 81 | lower = i * chunk_height 82 | right = left + chunk_width 83 | upper = lower + chunk_height 84 | coords = (left, lower, right, upper) 85 | chunk = image.crop(coords) 86 | chunks.append(chunk) 87 | coords_dict_list.append( 88 | {"row": i, "col": j, "left": left, "upper": upper, "right": right, "lower": lower} 89 | ) 90 | return chunks, coords_dict_list 91 | 92 | 93 | def init_openai_client(variant): 94 | if variant.lower() == "azure": 95 | client = AzureOpenAI( 96 | api_version=os.environ.get( 97 | "AZURE_OPENAI_API_VERSION", "2023-03-15-preview" 98 | ), 99 | azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", ""), 100 | api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""), 101 | ) 102 | else: 103 | openai.api_key = os.environ.get("OPENAI_API_KEY", "") 104 | client = openai 105 | return client 106 | 107 | 108 | def get_hierarchy_depth(nested_dict): 109 | depth = 0 110 | if isinstance(nested_dict, dict): 111 | for v in nested_dict.values(): 112 | depth = max(depth, 1 + get_hierarchy_depth(v)) 113 | return depth 114 | 115 | def fallback_label(label, classes, chunk, filename, coords, clip_classifier, args): 116 | """ 117 | Fallback mechanism to handle cases where the label is not found in the current hierarchy. 118 | """ 119 | if label not in classes: 120 | # Log a warning and try to find the closest match 121 | logging.warning( 122 | f"Label '{label}' not found in classes for file {filename}_r{coords['row']}_c{coords['col']}. " 123 | "Trying to find the closest match." 124 | ) 125 | matched_classes = sorted( 126 | [ 127 | (class_name, label.find(class_name)) 128 | for class_name in classes 129 | if class_name in label 130 | ] 131 | + [ 132 | (class_name, class_name.find(label)) 133 | for class_name in classes 134 | if label in class_name 135 | ], 136 | key=lambda x: x[1], 137 | ) 138 | 139 | if matched_classes: 140 | # Use the closest match 141 | label = matched_classes[0][0] 142 | else: 143 | # Fall back to CLIP classification 144 | logging.warning( 145 | f"No closest match for label '{label}' for file {filename}_r{coords['row']}_c{coords['col']}. " 146 | "Falling back to CLIP." 147 | ) 148 | label, _ = clip_classifier.classify_image( 149 | image=chunk, 150 | classes=classes, 151 | context=args.context, 152 | ) 153 | return label 154 | 155 | def main(): 156 | parser = argparse.ArgumentParser( 157 | description="Label images using a vision LLM and a text classifier." 158 | ) 159 | parser.add_argument( 160 | "--input_dir", 161 | type=str, 162 | required=True, 163 | help="Directory containing input images.", 164 | ) 165 | parser.add_argument( 166 | "--output_file", 167 | type=str, 168 | default="data/labels.csv", 169 | help="CSV file to write the results.", 170 | ) 171 | parser.add_argument( 172 | "--classes_file", 173 | type=str, 174 | default="data/classes.txt", 175 | help="File containing the classes or hierarchy for classification. Can be a text file with flat classes or a YAML file with hierarchical classes.", 176 | ) 177 | parser.add_argument( 178 | "--mode", 179 | type=str, 180 | choices=["flat", "hierarchical"], 181 | default="flat", 182 | help="Classification mode: flat or hierarchical.", 183 | ) 184 | # existing args... 
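# Illustrative shape of a hierarchical classes YAML consumed with
# --mode hierarchical (one clustering step). The meta-class and class names
# below are hypothetical; the real file is produced by src/cluster.py:
#
#   Natural Landscapes:
#   - Forest
#   - River
#   Urban Areas:
#   - Buildings
#   - Roads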
185 | parser.add_argument( 186 | "--vision_model", 187 | type=str, 188 | default="microsoft/kosmos-2-patch14-224", 189 | help="Vision model to use for image description.", 190 | ) 191 | parser.add_argument( 192 | "--apply_vision_template", 193 | action="store_true", 194 | help="Use 'role'/'content' template for the vision model.", 195 | ) 196 | parser.add_argument( 197 | "--classifier", 198 | type=str, 199 | default="microsoft/Phi-3-mini-4k-instruct", 200 | help="Name of the classifier model.", 201 | ) 202 | parser.add_argument( 203 | "--classifier_type", 204 | type=str, 205 | choices=["openai", "huggingface", "clip"], 206 | default="huggingface", 207 | help="Classifier type: openai, huggingface, or clip.", 208 | ) 209 | parser.add_argument( 210 | "--openai_variant", 211 | type=str, 212 | choices=["azure", "openai"], 213 | default="azure", 214 | help="For OpenAI classifier: choose Azure or OpenAI API.", 215 | ) 216 | parser.add_argument( 217 | "--split_height_by", 218 | type=int, 219 | default=1, 220 | help="Number of vertical splits per image.", 221 | ) 222 | parser.add_argument( 223 | "--split_width_by", 224 | type=int, 225 | default=1, 226 | help="Number of horizontal splits per image.", 227 | ) 228 | parser.add_argument( 229 | "--context", 230 | type=str, 231 | default="This is a satellite image. ", 232 | help="Meta prompt to guide the vision or CLIP model.", 233 | ) 234 | parser.add_argument( 235 | "--prompt", 236 | type=str, 237 | default="Detailed long description of the image: ", 238 | help="Prompt to guide the vision model.", 239 | ) 240 | parser.add_argument( 241 | "--include_filename", 242 | action="store_true", 243 | help="Include the filename in the prompt for the vision/CLIP model.", 244 | ) 245 | parser.add_argument( 246 | "--include_classes", 247 | action="store_true", 248 | help="Include classes in the vision model prompt.", 249 | ) 250 | parser.add_argument( 251 | "--test_time_augmentation", 252 | type=list, 253 | default="", 254 | help="Test time augmentation strategies for rotation [x, y, xy].", 255 | ) 256 | parser.add_argument( 257 | "--device", 258 | type=str, 259 | default="cuda:0" if torch.cuda.is_available() else "cpu", 260 | help="Device to run the Hugging Face models on.", 261 | ) 262 | args = parser.parse_args() 263 | 264 | # Load classes or hierarchy 265 | if not os.path.exists(args.classes_file): 266 | raise FileNotFoundError(f"Classes file {args.classes_file} not found.") 267 | if (args.mode == "hierarchical") and "yaml" in args.classes_file: 268 | with open(args.classes_file, "r") as f: 269 | hierarchy = yaml.safe_load(f) 270 | classes_flat = list(hierarchy.keys()) 271 | depth = get_hierarchy_depth(hierarchy) 272 | else: 273 | with open(args.classes_file, "r") as f: 274 | classes_flat = [line.strip() for line in f.readlines()] 275 | hierarchy = None 276 | depth = 0 277 | 278 | logging.info(f"Detected hierarchy depth = {depth}") 279 | 280 | if not classes_flat: 281 | raise ValueError("No classes found in the classes file.") 282 | 283 | logging.info(f"Loaded {len(classes_flat)} top-level classes.") 284 | 285 | # Initialize models 286 | if args.classifier_type != "clip": 287 | processor = AutoProcessor.from_pretrained(args.vision_model) 288 | model = AutoModelForVision2Seq.from_pretrained(args.vision_model).to( 289 | args.device 290 | ) 291 | clip_classifier = CLIPClassifier( 292 | model_name=CLIP_MODEL_NAME, 293 | device=args.device, 294 | ) 295 | 296 | # Setup classifier pipelines 297 | client = None 298 | gen_pipeline = None 299 | if args.classifier_type == 
"openai": 300 | client = init_openai_client(args.openai_variant) 301 | elif args.classifier_type == "huggingface": 302 | tokenizer = AutoTokenizer.from_pretrained(args.classifier) 303 | llm_model = AutoModelForCausalLM.from_pretrained(args.classifier).to( 304 | args.device 305 | ) 306 | gen_pipeline = pipeline( 307 | "text-generation", 308 | model=llm_model, 309 | tokenizer=tokenizer, 310 | device=args.device, 311 | ) 312 | 313 | filepaths = glob.glob(os.path.join(args.input_dir, "*")) 314 | if not filepaths: 315 | logging.info("No images found in the specified input directory.") 316 | return 317 | 318 | results = [] 319 | for filepath in tqdm.tqdm(filepaths): 320 | chunks, coords_list = split_image( 321 | filepath, args.split_height_by, args.split_width_by 322 | ) 323 | for i, chunk in enumerate(chunks): 324 | filename = os.path.basename(filepath) 325 | coords = coords_list[i] 326 | desc = None 327 | 328 | # get description if not CLIP-only 329 | if args.classifier_type != "clip": 330 | desc = describe_image( 331 | image_chunk=chunk, 332 | image_path=filepath, 333 | context=args.context, 334 | prompt=args.prompt, 335 | classes=classes_flat, 336 | include_classes=args.include_classes, 337 | include_filename=args.include_filename, 338 | test_time_augmentation=args.test_time_augmentation, 339 | processor=processor, 340 | model=model, 341 | apply_template=args.apply_vision_template, 342 | device=args.device, 343 | ) 344 | 345 | # Hierarchical or flat classification 346 | current_candidates = classes_flat 347 | label_tree = [] 348 | temp_hierarchy = hierarchy.copy() if hierarchy else None 349 | for level in range(depth+1): 350 | if args.classifier_type == "clip": 351 | label, _ = clip_classifier.classify_image( 352 | image=chunk, 353 | classes=current_candidates, 354 | context=args.context, 355 | ) 356 | else: 357 | if args.classifier_type == "openai": 358 | label = classify_with_openai( 359 | desc, 360 | args.include_filename, 361 | f"{filename}_r{coords['row']}_c{coords['col']}", 362 | current_candidates, 363 | client, 364 | args.classifier, 365 | ) 366 | else: 367 | label = classify_with_huggingface( 368 | desc, 369 | args.include_filename, 370 | f"{filename}_r{coords['row']}_c{coords['col']}", 371 | current_candidates, 372 | gen_pipeline, 373 | tokenizer, 374 | 28, 375 | ) 376 | # Apply fallback if label is not in the current hierarchy 377 | label = fallback_label( 378 | label, current_candidates, chunk, filename, coords, clip_classifier, args 379 | ) 380 | 381 | label_tree.append(label) 382 | # update candidates for next level 383 | if temp_hierarchy is not None and label in temp_hierarchy: 384 | current_candidates = temp_hierarchy[label] if isinstance(temp_hierarchy, dict) else temp_hierarchy 385 | current_candidates = list(current_candidates.keys()) if isinstance(current_candidates, dict) else current_candidates 386 | temp_hierarchy = temp_hierarchy[label] if isinstance(temp_hierarchy, dict) else None 387 | else: 388 | break 389 | 390 | results.append([ 391 | filename, 392 | coords['row'], coords['col'], coords['left'], coords['upper'], coords['right'], coords['lower'], 393 | args.mode, 394 | desc, 395 | ";".join(label_tree), 396 | label_tree[-1] 397 | ]) 398 | 399 | # write CSV 400 | with open(args.output_file, mode="w", newline="") as csv_file: 401 | writer = csv.writer(csv_file) 402 | writer.writerow([ 403 | "filename","row","col","left","upper","right","lower", "classification_mode", "description", "classification_tree", "classification" 404 | ]) 405 | writer.writerows(results) 406 
| 407 |     logging.info("Done.")
408 | 
409 | 
410 | if __name__ == "__main__":
411 |     main()
412 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Geo Vision Labeler Logo](geovl.png)
2 | 
3 | > 📢 **Paper Released!**
4 | > Read our paper describing GeoVision Labeler on [arXiv:2505.24340](https://arxiv.org/abs/2505.24340).
5 | 
6 | # Geospatial Image Labeling Tool (GeoVision Labeler)
7 | 
8 | This repository contains a flexible image labeling tool that takes any image as input and generates a classification label from a set of user-provided classes. It uses a vision LLM from Hugging Face (e.g. `microsoft/kosmos-2-patch14-224`) to generate a detailed description of the image, and then applies one of the following classification options:
9 | 
10 | - **OpenAI Classification:** Uses OpenAI models such as GPT-4 or GPT-3.5 via either the Azure OpenAI API or the OpenAI API.
11 | - **Open-Source Classification:** Uses a Hugging Face language model (e.g. Llama 3.2) via a text-generation pipeline.
12 | - **CLIP Classification (Fallback):** By default, the CLIP model is used as a fallback when the primary pipeline (vision LLM + classifier) fails to generate a valid label, i.e., it returns a label that is not in the provided list of classes. In such cases, the CLIP model selects the most likely class from the provided list. The CLIP model can also be used as a standalone classifier.
13 | 
14 | ![Pipeline](pipeline.png)
15 | 
16 | *The satellite image is derived from the [SpaceNet v7 dataset](https://spacenet.ai/sn7-challenge/).*
17 | 
18 | To handle a large set of semantically related labels (e.g., the 21 [UC Merced](http://weegee.vision.ucmerced.edu/datasets/landuse.html) land-use categories) more effectively, it can be beneficial to first organize them into a smaller set of *meta-classes* before classification. We introduce a zero-shot, LLM-driven semantic clustering pipeline that recursively merges similar classes into meta-classes (i.e., hierarchically). The pipeline operates in three steps, as follows:
19 | 
20 | ![Hierarchical class clustering](class_clustering.png)
21 | 
22 | This process generates a YAML file that captures the hierarchical structure of the meta-classes. These meta-classes are then leveraged to perform hierarchical classification.
23 | 
24 | Example of meta-classes generated for [UC Merced](http://weegee.vision.ucmerced.edu/datasets/landuse.html):
25 | #### Natural Landscapes
26 | - Agricultural
27 | - Beach
28 | - Chaparral
29 | - Forest
30 | - River
31 | 
32 | #### Recreational Areas
33 | - Baseball Diamond
34 | - Golf Course
35 | - Tennis Court
36 | ...
37 | 
38 | 
39 | ## Features
40 | 
41 | - **Modular Design:** Separated image description and classification modules.
42 | - **Flexible Configuration:** Choose the classifier method (OpenAI vs. open-source) and API variant (Azure OpenAI vs. OpenAI) via command-line arguments.
43 | - **Output:** Results (filenames, predicted labels, and metadata such as region/patch boundaries and descriptions) are saved in a CSV file.
44 | - **Recursive Class Clustering & Hierarchical Classification:** Automatically groups semantically similar classes into meta-classes using a zero-shot, LLM-driven clustering pipeline. This enables hierarchical classification, improving accuracy at the meta-class level when dealing with large or complex label sets.
45 | - **Easy to Extend:** The code is designed so that new models or processing options can be added with minimal changes.
46 | 
47 | ## Installation
48 | 
49 | 1. **Clone the Repository:**
50 | 
51 | ```bash
52 | git clone https://github.com/microsoft/geo-vision-labeler.git
53 | cd geo-vision-labeler
54 | ```
55 | 
56 | 2. **Create the Conda Environment from `environment.yml` (optional but recommended):**
57 | 
58 | ```bash
59 | conda env create -f environment.yml
60 | conda activate geo_vision_labeler
61 | ```
62 | ## Repository Structure
63 | ```plaintext
64 | .
65 | ├── README.md
66 | ├── environment.yml
67 | ├── setup.py
68 | ├── data
69 | │   ├── classes.txt    # File containing classification labels (one label per line):
70 | │   │                  # Buildings
71 | │   └──                # No Buildings
72 | ├── src
73 | │   ├── __init__.py
74 | │   ├── main.py
75 | │   ├── cluster.py
76 | │   ├── vision.py
77 | │   ├── classifier.py
78 | │   ├── clip.py
79 | │   └── display_chunks.py
80 | ```
81 | 
82 | ## Configuration
83 | 
84 | Set up your API keys as environment variables. For example:
85 | 
86 | **For Hugging Face:**
87 | 
88 | ```bash
89 | export HF_TOKEN='your-hugging-face-token'
90 | ```
91 | 
92 | **For Azure OpenAI:**
93 | 
94 | ```bash
95 | export AZURE_OPENAI_API_KEY='your-azure-openai-api-key'
96 | export AZURE_OPENAI_ENDPOINT='your-azure-openai-endpoint'
97 | export AZURE_OPENAI_API_VERSION='your-api-version'
98 | ```
99 | 
100 | **For OpenAI API:**
101 | 
102 | ```bash
103 | export OPENAI_API_KEY='your-openai-api-key'
104 | ```
105 | 
106 | Other settings (such as the input directory, prompt, and device) can be passed as command-line arguments to `src/main.py`.
107 | 
108 | ## Usage
109 | 
110 | Run the tool from the command line. For example:
111 | 
112 | **Using Hugging Face Models as Classifiers:**
113 | 
114 | ```bash
115 | python src/main.py --input_dir path/to/images --output_file path/to/labels.csv --vision_model vision-model-name-from-hugging-face --classifier llm-classifier-name-from-hugging-face --classes_file path/to/classes.txt --classifier_type huggingface --split_height_by split-height-by --split_width_by split-width-by --context "This is a satellite image" --prompt "Detailed long description of the image: " --include_filename
116 | ```
117 | 
118 | **Using OpenAI Models as Classifiers:**
119 | 
120 | ```bash
121 | python src/main.py --input_dir path/to/images --output_file path/to/labels.csv --vision_model vision-model-name-from-hugging-face --classifier llm-classifier-name-from-openai --classes_file path/to/classes.txt --classifier_type openai --openai_variant azure --split_height_by split-height-by --split_width_by split-width-by --context "This is a satellite image" --prompt "Detailed long description of the image: " --include_filename
122 | ```
123 | 
124 | **Using CLIP Models from Hugging Face:**
125 | 
126 | ```bash
127 | python src/main.py --input_dir path/to/images --output_file path/to/labels.csv --classes_file path/to/classes.txt --classifier clip-classifier-name-from-hugging-face --classifier_type clip --split_height_by split-height-by --split_width_by split-width-by --context "This is a satellite image"
128 | ```
129 | 
130 | An example of a CLIP model is `openai/clip-vit-large-patch14`.
131 | 
132 | ### Command-line Options for `main.py`:
133 | - `--input_dir`: (Required) Directory containing input images.
134 | - `--output_file`: CSV file to write the results. Defaults to `data/labels.csv`.
135 | - `--mode`: Classification mode.
Choose between `"flat"` (single-level classification) or `"hierarchical"` (multi-level classification using meta-classes). Defaults to `"flat"`.
136 | - `--classes_file`: File containing the classes for classification. For flat classification (`--mode flat`), provide a `.txt` file with one class per line (default: `data/classes.txt`). For hierarchical classification (`--mode hierarchical`), provide a `.yaml` file describing the meta-class hierarchy. The YAML file can be automatically generated using the `src/cluster.py` script.
137 | - `--vision_model`: Vision model to use for image description. Defaults to `microsoft/kosmos-2-patch14-224`.
138 | - `--apply_vision_template`: Use the 'role'/'content' prompt template for the vision model. This flag is required when using Llama vision models.
139 | - `--classifier`: Name of the classifier model. Defaults to `microsoft/Phi-3-mini-4k-instruct`.
140 | - `--classifier_type`: Classifier type: OpenAI LLMs (`openai`), Hugging Face LLMs (`huggingface`), or CLIP models (`clip`). Defaults to `huggingface`.
141 | - `--openai_variant`: For the OpenAI classifier, choose the Azure OpenAI (`azure`) or OpenAI (`openai`) API. Defaults to `azure`. Only valid when `classifier_type` is `openai`.
142 | - `--split_height_by`: Number of vertical splits per image. Defaults to `1`.
143 | - `--split_width_by`: Number of horizontal splits per image. Defaults to `1`.
144 | - `--context`: Meta prompt to guide the vision model. Defaults to `"This is a satellite image. "`.
145 | - `--prompt`: Prompt to guide the vision model. Defaults to `"Detailed long description of the image: "`.
146 | - `--include_classes`: Include the list of classes in the prompt of the vision model. Use this flag to enable.
147 | - `--include_filename`: Include the filename in the prompt of the vision model.
148 |   This is particularly useful when the filename contains contextual information,
149 |   such as geographical details, that can enhance the model's understanding. Use this flag to enable.
150 | - `--test_time_augmentation`: Test-time augmentation strategies that rotate the image about the x, y, or both axes (`x`, `y`, `xy`).
151 | - `--device`: Device to run the Hugging Face models on. Defaults to `cuda:0` if available, otherwise `cpu`.
152 | 
153 | ### Generating Meta-Classes for Hierarchical Classification
154 | 
155 | To automatically generate meta-classes for hierarchical classification, use the `src/cluster.py` script. This script clusters your class labels into meta-classes using an LLM (OpenAI or Hugging Face) in one or more recursive steps.
156 | 
157 | **Basic usage:**
158 | 
159 | ```bash
160 | python src/cluster.py --classes_file path/to/classes.txt --num_clusters number-of-clusters --steps recursive-steps-or-hierarchical-depth --clustering_type huggingface --model_name your-hf-model-name --output_file clusters.yaml
161 | ```
162 | 
163 | **Key arguments:**
164 | - `--classes_file`: Path to your `.txt` file containing class labels (one per line).
165 | - `--steps`: Number of recursive clustering steps (default: 1). If greater than 1, `--num_clusters` can be a single value or a comma-separated list with one value per step.
166 | - `--num_clusters`: Number of clusters (meta-classes) to create at each step. For multi-step clustering, you can provide either a single number (which will be used for all steps) or a comma-separated list whose number of elements matches the number of steps (e.g., `3,5` when `steps=2`).
167 | - `--clustering_type`: Choose `"openai"` or `"huggingface"` for the LLM backend.
168 | - `--openai_variant`: (If using `openai`) Choose the `"azure"` or `"openai"` API (default: `"azure"`).
169 | - `--model_name`: (Optional) Specify the LLM model name.
170 | - `--output_file`: Output YAML file for the generated meta-class hierarchy (default: `clusters.yaml`).
171 | 
172 | **Example for two-step clustering:**
173 | 
174 | ```bash
175 | python src/cluster.py --classes_file data/classes.txt --num_clusters 3,5 --steps 2 --clustering_type openai --openai_variant openai --output_file data/meta_classes.yaml
176 | ```
177 | 
178 | The resulting YAML file can be used as input for hierarchical classification by passing it to the `--classes_file` argument in `main.py` with `--mode hierarchical`.
179 | 
180 | ### Example Usage
181 | 
182 | This section provides an example of how to run the GeoVision Labeler tool.
183 | 
184 | To run this example, you can download satellite images from the [SpaceNet v7 dataset](https://spacenet.ai/sn7-challenge/). Once downloaded, place the images in the `data/images` directory and ensure the `classes.txt` file contains the classification labels `Buildings` and `No Buildings`. Then run:
185 | 
186 | ```bash
187 | python src/main.py --input_dir data/images --output_file data/labels.csv --split_height_by 3 --split_width_by 3 --include_filename
188 | ```
189 | 
190 | The output is a CSV file containing detailed descriptions and classifications for each region/patch of the input images, divided into a grid based on the specified split dimensions. Each row in the file corresponds to a specific region/patch, with metadata such as its position, boundaries, description, and classification label.
191 | 
192 | #### Visualizing Labeled Results on Images
193 | 
194 | To overlay the labeled results directly onto a given image, use the `display_chunks.py` script. For example:
195 | 
196 | ```bash
197 | python src/display_chunks.py --labels_path data/labels.csv --img_path data/images/2020_01_L15-0566E-1185N.tif --output_path data/outputs/2020_01_L15-0566E-1185N_3x3.png
198 | ```
199 | 
200 | 
201 | ### Performance Results
202 | 
203 | We evaluated GeoVision Labeler on 59 scenes from the [SpaceNet v7 dataset](https://spacenet.ai/sn7-challenge/), using the configuration from the Example Usage section above. Each image was divided into 9 patches. Based on the SpaceNet v7 ground truth labels, each patch was assigned a label of "Buildings" if at least one pixel from the patch was labeled as a building, or "No Buildings" otherwise. Since the dataset is temporal, we randomly selected a timestamp for each scene. The list of selected scenes and their corresponding periods can be found in `data/spacenet_eval.csv`. Predictions were then performed using the GeoVision Labeler pipeline, and the results were compared against the ground truth to compute the evaluation metrics.
204 | 
205 | The Overall Accuracy (OA) reported in the table below was computed to assess the pipeline's performance; a minimal sketch of this patch-level evaluation is shown first.
206 | 
207 | 
208 | 
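The snippet below is a minimal, illustrative sketch of this patch-level evaluation; it is not part of the repository. It assumes hypothetical ground-truth building masks stored as single-band GeoTIFFs under `data/masks/` with the same filenames as the input scenes, and that the patch bounds written by `src/main.py` are pixel coordinates. Only the CSV column names come from the tool's output.

```python
import os

import numpy as np
import pandas as pd
import rasterio

PRED_CSV = "data/labels.csv"  # output of src/main.py (3x3 split example above)
MASK_DIR = "data/masks"       # hypothetical directory of ground-truth building masks

preds = pd.read_csv(PRED_CSV)
correct = 0

for _, row in preds.iterrows():
    # Open the ground-truth mask that corresponds to this scene (assumed naming).
    with rasterio.open(os.path.join(MASK_DIR, row["filename"])) as src:
        mask = src.read(1)

    # Crop the mask to the patch bounds recorded in the output CSV.
    patch = mask[int(row["upper"]):int(row["lower"]), int(row["left"]):int(row["right"])]

    # A patch counts as "Buildings" if at least one pixel is labeled as a building.
    truth = "Buildings" if np.any(patch > 0) else "No Buildings"
    correct += int(row["classification"] == truth)

print(f"Overall Accuracy: {correct / len(preds):.3f}")
```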

Table: Overall Accuracy (OA) for SpaceNet v7 satellite image patches from 59 scenes, each divided into 9 patches labeled as Buildings or No Buildings. Best OA scores for each classifier and vision model from GeoVision Labeler (GVL) are in bold. The Classes column indicates whether the list of classes is inserted into the vLLM's prompt. Other refers to vision models used standalone.

<details>
<summary>Click to view detailed SpaceNet v7 evaluation results</summary>

| Pipeline | Classifier | Classes | Geo-context | Other | Kosmos 2 | Llama 3.2 |
|---|---|:---:|:---:|:---:|:---:|:---:|
| CLIP | | | | 0.588 | | |
| GVL (Ours) | Llama-3.1 | ✓ | × | | 0.859 | 0.776 |
| | | ✓ | ✓ | | **0.910** | 0.699 |
| | | × | × | | 0.889 | **0.821** |
| | | × | ✓ | | 0.859 | 0.751 |
| | Phi-3 | ✓ | × | | **0.932** | 0.857 |
| | | ✓ | ✓ | | 0.928 | 0.902 |
| | | × | × | | 0.928 | 0.912 |
| | | × | ✓ | | **0.932** | **0.927** |
| | GPT-4o | ✓ | × | | 0.878 | 0.789 |
| | | ✓ | ✓ | | **0.917** | 0.799 |
| | | × | × | | 0.896 | **0.832** |
| | | × | ✓ | | 0.876 | 0.783 |

</details>

330 | 
331 | 
332 | **Model Names on Hugging Face**
333 | 
334 | - ***Vision Models:***
335 |   - Kosmos 2 ([microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224))
336 |   - Llama 3.2 ([meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct))
337 |   - CLIP ([openai/clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14))
338 | - ***LLM Classifiers:***
339 |   - Llama 3.1 ([meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct))
340 |   - Phi-3 ([microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct))
341 | 
342 | These results demonstrate the effectiveness of the tool in generating accurate classifications for geospatial imagery, though performance may vary depending on the specific dataset and pipeline configuration.
343 | 
344 | ## Limitations
345 | 
346 | - **Model Dependency:** The quality of the generated descriptions and classifications depends heavily on the selected vision and language models. Suboptimal models may lead to inaccurate results.
347 | - **Class Confusion:** When dealing with a large number of classes, especially those that are similar, the model may confuse related classes with each other. Hierarchical classification using meta-classes helps to distinguish broader categories, but confusion still occurs among classes within the same meta-class.
348 | - **Latency:** Hierarchical classification introduces additional computational steps, which can increase the overall latency of the labeling process compared to flat classification.
349 | - **Image Patch Size:** Early experiments show that small image patches (e.g., smaller than 224x224) may not contain enough contextual information for this pipeline to work effectively. Conversely, very large patches might reduce the pipeline's ability to focus on important details during the description stage, potentially hindering the classifier's performance.
350 | - **Image Resolution:** Very high-resolution images may need to be split into smaller chunks, which could lead to loss of context for certain regions/patches and slow down the pipeline.
351 | - **Image Variability:** The tool may struggle with images containing unusual environmental conditions, especially as most vision LLMs are not trained on satellite images.
352 | - **Image Format Restriction:** The tool is designed to work with RGB images, as expected by most vision LLMs. However, satellite images often contain additional spectral bands (e.g., near-infrared) that could provide more accurate descriptions but are not utilized in the current implementation. Using vision LLMs specifically designed to handle satellite images with spectral bands beyond RGB can address this limitation.
353 | 
354 | Users should consider these limitations when deploying the tool in production environments.
355 | 
356 | ## Citation
357 | 
358 | If you use GeoVision Labeler in your research or project, please cite our work as follows:
359 | 
360 | ```bibtex
361 | @article{hacheme2025geovision,
362 |   title={GeoVision Labeler: Zero-Shot Geospatial Classification with Vision and Language Models},
363 |   author={Hacheme, Gilles Quentin and Tadesse, Girmaw Abebe and Robinson, Caleb and Zaytar, Akram and Dodhia, Rahul and Lavista Ferres, Juan M.},
364 |   journal={arXiv preprint arXiv:2505.24340},
365 |   year={2025}
366 | }
367 | ```
368 | 
369 | ## Contributing
370 | 
371 | This project welcomes contributions and suggestions.
Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
372 | 
373 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
374 | 
375 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
376 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
377 | 
378 | ## Trademarks
379 | 
380 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
381 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
382 | Any use of third-party trademarks or logos is subject to those third parties' policies.
383 | 
384 | ## License
385 | 
386 | This project is licensed under the [MIT License](LICENSE).
387 | 
--------------------------------------------------------------------------------