├── .github └── workflows │ └── linting.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── pyproject.toml ├── scripts ├── .env.example ├── blurry.jpg ├── client.py ├── server.py ├── tools.py ├── utils.py └── workflow.py └── uv.lock /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | lint: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | 12 | - name: Install uv 13 | uses: astral-sh/setup-uv@v6 14 | 15 | - name: Set up Python 16 | run: uv python install 3.12 17 | 18 | - name: Install pre-commit 19 | shell: bash 20 | run: uv venv && source .venv/bin/activate && uv pip install pre-commit 21 | 22 | - name: Run linter 23 | shell: bash 24 | run: uv run -- pre-commit run -a 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .env 3 | scripts/.env 4 | */__pycache__/ 5 | scripts/output.png 6 | .ruff_cache/ 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | default_language_version: 3 | python: python3 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: check-byte-order-marker 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: detect-private-key 15 | - id: end-of-file-fixer 16 | - id: mixed-line-ending 17 | - id: trailing-whitespace 18 | 19 | - repo: https://github.com/charliermarsh/ruff-pre-commit 20 | rev: v0.11.8 21 | hooks: 22 | - id: ruff 23 | args: [--exit-non-zero-on-fix, --fix] 24 | exclude: ".*poetry.lock|.*_static" 25 | 26 | - repo: https://github.com/pre-commit/mirrors-mypy 27 | rev: v1.0.1 28 | hooks: 29 | - id: mypy 30 | additional_dependencies: 31 | [ 32 | "types-requests", 33 | "types-Deprecated", 34 | "types-redis", 35 | "types-setuptools", 36 | "types-PyYAML", 37 | "types-protobuf==4.24.0.4", 38 | ] 39 | args: 40 | [ 41 | --namespace-packages, 42 | --explicit-package-bases, 43 | --disallow-untyped-defs, 44 | --ignore-missing-imports, 45 | --python-version=3.9, 46 | ] 47 | entry: bash -c "export MYPYPATH=ingest_anything" 48 | 49 | - repo: https://github.com/psf/black-pre-commit-mirror 50 | rev: 23.10.1 51 | hooks: 52 | - id: black-jupyter 53 | name: black-docs-py 54 | alias: black 55 | files: ^(docs/|examples/) 56 | # Using PEP 8's line length in docs prevents excess left/right scrolling 57 | args: [--line-length=79] 58 | 59 | - repo: https://github.com/pre-commit/mirrors-prettier 60 | rev: v3.0.3 61 | hooks: 62 | - id: prettier 63 | 64 | - repo: https://github.com/pappasam/toml-sort 65 | rev: v0.23.1 66 | hooks: 67 | - id: toml-sort-fix 68 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `Image Generation Agent` 2 | 3 | Do you want to contribute to this project? 
Make sure to read these guidelines first :) 4 | 5 | ## Issue 6 | 7 | **When to do it**: 8 | 9 | - You found bugs but you don't know how to solve them, or you don't have the time/will to fix them 10 | - You want new features but you don't know how to implement them, or you don't have the time/will to implement them 11 | 12 | > ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_ 13 | 14 | **How to do it**: 15 | 16 | - Open an issue 17 | - Give the issue a meaningful title (a short but effective description of the problem/feature request) 18 | - Describe the problem/feature request 19 | 20 | ## Traditional contribution 21 | 22 | **When to do it**: 23 | 24 | - You found bugs and corrected them 25 | - You optimized/improved the code 26 | - You added new features that you think could be useful to others 27 | 28 | **How to do it**: 29 | 30 | 1. Fork this repository 31 | 2. Install `pre-commit` and make sure it is registered among the Git hooks for your fork: 32 | 33 | ```bash 34 | pip install pre-commit 35 | pre-commit install 36 | ``` 37 | 38 | 3. Commit your changes 39 | 4. Make sure your changes pass the pre-commit linting/type checking; if not, modify them so that they pass 40 | 5. Submit a pull request (make sure to provide a thorough description of the changes) 41 | 42 | ### Thanks for contributing! 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) Jerry Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Image Generation Agent 2 | 3 | **Image Generation Agent** is an open source project aimed at helping you produce stunning images aligned with your prompts by automating the prompt refinement, generation, and visual feedback loop. 4 | 5 | ## Installation 6 | 7 | This is a **uv project**, so you have to make sure uv is installed on your machine - if not, you can get it with: 8 | 9 | ```bash 10 | pip install uv 11 | ``` 12 | 13 | Or follow the [installation guidelines](https://docs.astral.sh/uv/getting-started/installation/) in the uv docs.
14 | 15 | Once uv is on your machine, you can clone this repository: 16 | 17 | ```bash 18 | git clone https://github.com/run-llama/image-generation-agent 19 | cd image-generation-agent 20 | ``` 21 | 22 | And run: 23 | 24 | ```bash 25 | uv sync 26 | source .venv/bin/activate 27 | ``` 28 | 29 | Congrats, you have successfully installed this project and its dependencies! 30 | 31 | ## Setting up 32 | 33 | Access the `scripts` sub-folder, and modify the [`.env.example`](./scripts/.env.example) file so that it contains a valid `GOOGLE_API_KEY` and `OPENAI_API_KEY`. After that, make sure to rename it to `.env`: 34 | 35 | ```bash 36 | cd scripts/ 37 | mv .env.example .env 38 | ``` 39 | 40 | Alternatively, you can export the keys as environment variables from your terminal: 41 | 42 | ```bash 43 | export GOOGLE_API_KEY="***" 44 | export OPENAI_API_KEY="sk-***" 45 | ``` 46 | 47 | ## Launching 48 | 49 | While still in the `scripts` sub-folder, you can launch the backend of the project with: 50 | 51 | ```bash 52 | python3 server.py 53 | ``` 54 | 55 | You will see a log message telling you that the websocket server has been started on port 8765. 56 | 57 | Once the websocket server is running, launch the frontend from another terminal window (remaining in the `scripts` sub-folder and with the virtual environment active), using: 58 | 59 | ```bash 60 | python3 client.py 61 | ``` 62 | 63 | With this command, you'll have the application running on http://localhost:7860 and you will be able to interact with it! 64 | 65 | ## How does it work? 66 | 67 | The agent uses two tools under the hood: 68 | 69 | - `generate_image`: this uses the OpenAI image generation API to create images starting from textual prompts. 70 | - `evaluate_generated_image`: this uses the advanced vision capabilities of Gemini, employing the model as a judge of the faithfulness and quality of the image. 71 | 72 | Whenever you submit a request, the agent first activates the `generate_image` tool, then it assesses how well the image fits your prompt using the `evaluate_generated_image` tool. If the image is deemed suitable, it is returned to the user; otherwise, the prompt is refined and the generate-evaluate loop is resumed. 73 | 74 | ## Contributing 75 | 76 | Contributions are more than welcome! Follow the [contribution guidelines](CONTRIBUTING.md) to make sure your contribution is compliant with the repo's requirements :) 77 | 78 | ## License and rights of usage 79 | 80 | This is an open-source project distributed under an [MIT License](LICENSE).
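As a reference for the generate-evaluate loop described above, here is a minimal sketch of driving the backend directly over its websocket protocol, without the Gradio frontend. This is not part of the repository: it assumes the server from `scripts/server.py` is already running on its default `ws://localhost:8765` endpoint, and the `request_image` helper name is purely illustrative.

```python
import asyncio
import websockets

async def request_image(prompt: str) -> None:
    # Connect to the websocket backend started with `python3 server.py`
    async with websockets.connect("ws://localhost:8765") as ws:
        await ws.send(prompt)
        # The server streams tool calls, tool results and the final answer,
        # then sends the literal marker "[END]" once the agent is done
        while True:
            message = await ws.recv()
            if message == "[END]":
                break
            print(message)
    # The image itself is written by the agent's tools to scripts/output.png

if __name__ == "__main__":
    asyncio.run(request_image("A watercolor lighthouse at dawn"))
```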
81 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "hatchling.build" 3 | requires = ["hatchling"] 4 | 5 | [lint.flake8-annotations] 6 | mypy-init-return = true 7 | 8 | [lint.pydocstyle] 9 | convention = "google" 10 | 11 | [project] 12 | authors = [{email = "clelia@runllama.ai", name = "Clelia Astra Bertelli"}] 13 | classifiers = [ 14 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 15 | "Topic :: Software Development :: Libraries :: Application Frameworks", 16 | "Topic :: Software Development :: Libraries :: Python Modules", 17 | ] 18 | dependencies = [ 19 | "fastapi>=0.115.12", 20 | "gradio>=3.36.1", 21 | "llama-index>=0.12.36,<0.13", 22 | "llama-index-core>=0.12.36,<0.13", 23 | "llama-index-llms-google-genai>=0.1.13,<0.2", 24 | "llama-index-llms-openai>=0.3.43", 25 | "openai>=1.81.0", 26 | "orjson>=3.10.18", 27 | "pre-commit>=4.2.0", 28 | "uvicorn>=0.34.2", 29 | "websockets>=15.0.1", 30 | ] 31 | description = "Image Generation Agent" 32 | license = "MIT" 33 | name = "image-generation-agent" 34 | readme = "README.md" 35 | requires-python = ">=3.9,<4.0" 36 | version = "0.1.0" 37 | 38 | [project.urls] 39 | Repository = "https://github.com/run-llama/image-generation-agent" 40 | 41 | [tool.hatch.build.targets.sdist] 42 | include = ["*"] 43 | 44 | [tool.hatch.build.targets.wheel] 45 | include = ["*"] 46 | 47 | [tool.mypy] 48 | disallow_untyped_defs = true 49 | exclude = [".venv"] 50 | explicit_package_bases = true 51 | ignore_missing_imports = true 52 | mypy_path = "llama_index" 53 | namespace_packages = true 54 | plugins = "pydantic.mypy" 55 | python_version = "3.9" 56 | 57 | [tool.ruff] 58 | exclude = [ 59 | "_static", 60 | "examples", 61 | "llama_index/ingestion/client", 62 | "notebooks", 63 | ] 64 | target-version = "py312" 65 | lint.ignore = [ 66 | "COM812", # Too aggressive 67 | "D212", # Using D213 68 | "D417", # Too aggressive 69 | "F541", # Messes with prompts.py 70 | "RUF100", # Allow blanket noqa 71 | "TC002", 72 | "UP", # Remove when we drop Python 3.9 73 | "PT001", 74 | "E501", # Use best judgement for line-length 75 | "E402", # Annoying, use best judgement 76 | "PYI063", 77 | "ANN204", # this is annoying 78 | "D401", # I disagree 79 | "D404", 80 | ] 81 | # Feel free to add more here 82 | lint.select = [ 83 | "ANN204", 84 | "B009", 85 | "B010", 86 | "B011", 87 | "B013", 88 | "B014", 89 | "C4", 90 | "COM812", 91 | "COM819", 92 | "D201", 93 | "D202", 94 | "D204", 95 | "D207", 96 | "D208", 97 | "D209", 98 | "D211", 99 | "D213", 100 | "D214", 101 | "D215", 102 | "D3", 103 | "D4", 104 | "E", 105 | "EXE004", 106 | "F401", 107 | "F504", 108 | "F541", 109 | "F632", 110 | "FLY", 111 | "G010", 112 | "I002", 113 | "PERF1", 114 | "PIE790", 115 | "PIE794", 116 | "PIE808", 117 | "PIE810", 118 | "PLC0414", 119 | "PLE2510", 120 | "PLE2512", 121 | "PLE2513", 122 | "PLE2514", 123 | "PLE2515", 124 | "PLR1711", 125 | "PT001", 126 | "PT003", 127 | "PT006", 128 | "PT02", 129 | "PTH201", 130 | "PYI", 131 | "Q", 132 | "RET501", 133 | "RET502", 134 | "RET503", 135 | "RET504", 136 | "RSE", 137 | "RUF005", 138 | "RUF010", 139 | "RUF015", 140 | "RUF1", 141 | "SIM101", 142 | "SIM103", 143 | "SIM109", 144 | "SIM118", 145 | "SIM2", 146 | "SIM300", 147 | "SIM9", 148 | "TC005", 149 | "TD006", 150 | "TID", 151 | "TRY201", 152 | "W", 153 | ] 154 | lint.unfixable = [ 155 | "ERA001", 156 | ] 157 | 158 | [tool.tomlsort] 159 | all = 
false 160 | in_place = true 161 | spaces_before_inline_comment = 2 # Match Python PEP 8 162 | spaces_indent_inline_array = 4 # Match Python PEP 8 163 | trailing_comma_inline_array = true 164 | 165 | [[tool.uv.index]] 166 | name = "nvidia-pypi" 167 | url = "https://pypi.nvidia.com" 168 | -------------------------------------------------------------------------------- /scripts/.env.example: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY="***" 2 | OPENAI_API_KEY="sk-***" 3 | -------------------------------------------------------------------------------- /scripts/blurry.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/image-generation-agent/843c253ee585761e0d67cf462dbb8007431e8129/scripts/blurry.jpg -------------------------------------------------------------------------------- /scripts/client.py: -------------------------------------------------------------------------------- 1 | import websockets 2 | import gradio as gr 3 | 4 | async def websocket_chat(prompt): 5 | uri = "ws://localhost:8765" 6 | try: 7 | async with websockets.connect(uri) as websocket: 8 | await websocket.send(prompt) 9 | full_response = "" 10 | 11 | while True: 12 | message = await websocket.recv() 13 | if message == "[END]": 14 | break 15 | full_response += message 16 | yield full_response, "blurry.jpg" 17 | yield full_response, "output.png" 18 | 19 | except Exception as e: 20 | yield f"Error: {e}", None 21 | 22 | def launch_interface(): 23 | with gr.Blocks(theme=gr.themes.Citrus(primary_hue="indigo", secondary_hue="teal")) as frontend: 24 | gr.HTML("
<h1 align='center'>Image Generation Agent🎨</h1>
") 25 | gr.HTML("

Get stunning AI-generated images!

") 26 | with gr.Row(): 27 | usr_txt = gr.Textbox(label="Prompt", placeholder="Describe the image you want here...") 28 | with gr.Column(): 29 | resp = gr.Markdown(label="Agent Output", container=True, show_label=True, show_copy_button=True) 30 | gen_img = gr.Image(label="Generated Image") 31 | 32 | 33 | with gr.Row(): 34 | btn = gr.Button("Generate🖌️").click(fn=websocket_chat, inputs=[usr_txt], outputs=[resp, gen_img]) 35 | 36 | frontend.launch() 37 | 38 | if __name__ == "__main__": 39 | launch_interface() 40 | -------------------------------------------------------------------------------- /scripts/server.py: -------------------------------------------------------------------------------- 1 | # server.py 2 | import json 3 | import asyncio 4 | import websockets 5 | from workflow import workflow 6 | from llama_index.core.agent.workflow.workflow_events import ToolCall, ToolCallResult 7 | 8 | async def run_agent(websocket): 9 | async for prompt in websocket: 10 | handler = workflow.run(user_msg=prompt) 11 | async for event in handler.stream_events(): 12 | if isinstance(event, ToolCallResult): 13 | await websocket.send(f"**Result from `{event.tool_name}`**:\n\n{event.tool_output.content}\n\n") 14 | elif isinstance(event, ToolCall): 15 | await websocket.send(f"### Calling tool: `{event.tool_name}`\n\n```json\n{json.dumps(event.tool_kwargs, indent=4)}\n```\n\n") 16 | response = await handler 17 | response = str(response) 18 | await websocket.send("### Final output\n\n" + response) 19 | await websocket.send("[END]") 20 | 21 | async def main(): 22 | print("Starting server on ws://localhost:8765") 23 | async with websockets.serve(run_agent, "localhost", 8765): 24 | await asyncio.Future() # Run forever 25 | 26 | if __name__ == "__main__": 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /scripts/tools.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from pathlib import Path 4 | from utils import get_api_keys 5 | from llama_index.llms.openai import OpenAIResponses 6 | from typing import Literal 7 | from pydantic import BaseModel, Field 8 | from llama_index.llms.google_genai import GoogleGenAI 9 | from llama_index.core.llms import ChatMessage, MessageRole, ImageBlock, TextBlock 10 | 11 | 12 | class ImageEvaluation(BaseModel): 13 | faithfulness: int = Field(description="Faithfulness of the generated image to the generation prompt, from 0 to 100") 14 | quality: Literal["low", "mediocre", "average", "upper-intermediate", "high", "very high"] = Field(description="Quality of the image, expressed as one of: 'low', 'mediocre', 'average', 'upper-intermediate', 'high', 'very high'") 15 | prompt_agnostic_description: str = Field(description="Description of the image, agnostic of the image generation prompt") 16 | 17 | openai_api_key, google_api_key =get_api_keys() 18 | async_openai_client = OpenAIResponses(api_key=openai_api_key, model="gpt-4.1-mini",built_in_tools=[{"type": "image_generation"}]) 19 | llm = GoogleGenAI(model="gemini-2.0-flash", api_key=google_api_key) 20 | llm_struct = llm.as_structured_llm(ImageEvaluation) 21 | 22 | async def generate_image(prompt: str = Field(description="The image generation prompt")) -> str: 23 | """ 24 | This tool useful to generate images. 
25 | 26 | Args: 27 | prompt (str): The image generation prompt 28 | 29 | """ 30 | try: 31 | messages = [ChatMessage.from_str(content=prompt, role="user")] 32 | img = await async_openai_client.achat(messages) 33 | for block in img.message.blocks: 34 | if isinstance(block, ImageBlock): 35 | image_bytes = base64.b64decode(block.image) 36 | with open("output.png", "wb") as f: 37 | f.write(image_bytes) 38 | return "Image successfully generated" 39 | except Exception as e: 40 | return f"An error occurred during image generation: {e.__str__()}" 41 | 42 | async def evaluate_generated_image(prompt: str = Field(description="The original prompt used to generate the image")) -> str: 43 | """ 44 | This tool is useful to evaluate a generated image. 45 | 46 | Args: 47 | prompt (str): The original prompt used to generate the image 48 | 49 | """ 50 | messages = [ChatMessage(role=MessageRole.USER, blocks=[ImageBlock(path=Path("output.png")), TextBlock(text=f"Could you (1) evaluate the faithfulness of the attached image to this prompt: '{prompt}', (2) evaluate the quality of the image and (3) produce a description of the image that is agnostic of the prompt that was used to generate it?")])] 51 | resp = await llm_struct.achat(messages=messages) 52 | struct_output = json.loads(resp.message.blocks[0].text) 53 | return f"The generated image can be described as:\n'''\n{struct_output['prompt_agnostic_description']}\n'''\nThe faithfulness of the generated image to the original prompt is: {struct_output['faithfulness']}%.\nThe quality of the image is {struct_output['quality']}." 54 | -------------------------------------------------------------------------------- /scripts/utils.py: -------------------------------------------------------------------------------- 1 | from os import environ as ENV 2 | from dotenv import load_dotenv 3 | from typing import Tuple 4 | 5 | def get_api_keys() -> Tuple[str, str]: 6 | openai_api_key = ENV.get("OPENAI_API_KEY", None) 7 | if openai_api_key is None: 8 | load_dotenv() 9 | openai_api_key = ENV.get("OPENAI_API_KEY", None) 10 | if not openai_api_key: 11 | raise ValueError("There is no OPENAI_API_KEY declared among the environmental variables") 12 | google_api_key = ENV.get("GOOGLE_API_KEY", None) 13 | if google_api_key is None: 14 | load_dotenv() 15 | google_api_key = ENV.get("GOOGLE_API_KEY", None) 16 | if not google_api_key: 17 | raise ValueError("There is no GOOGLE_API_KEY declared among the environmental variables") 18 | return openai_api_key, google_api_key 19 | -------------------------------------------------------------------------------- /scripts/workflow.py: -------------------------------------------------------------------------------- 1 | from tools import generate_image, evaluate_generated_image 2 | from llama_index.core.agent.workflow import AgentWorkflow, FunctionAgent 3 | 4 | image_generation_agent = FunctionAgent( 5 | name = "ImageGenerationAgent", 6 | description= "An Agent suitable for internal feedback-driven generation of images", 7 | tools = [generate_image, evaluate_generated_image], 8 | system_prompt = "You are the ImageGenerationAgent. Your task is to generate images, evaluate them and, based on the feedback from the evaluation, re-generate them or return them to the user. Specifically, you need to follow these steps:\n" \ 9 | "1. Generate an image starting from the user's prompt with the 'generate_image' tool.\n" \ 10 | "2. Evaluate the generated image using the 'evaluate_generated_image' tool\n" \ 11 | "If you deem the evaluation positive:\n" \ 12 | "3. 
Return the image to the user, telling them what you generated\n" \ 13 | "Else:\n" \ 14 | "3. Refine the prompt for image generation, and go back to step 1\n" \ 15 | "Do not stop unless you generated an image that suits the original prompt from the user.\n", 16 | ) 17 | 18 | workflow = AgentWorkflow( 19 | agents = [image_generation_agent], 20 | root_agent= image_generation_agent.name, 21 | timeout=600, 22 | ) 23 | --------------------------------------------------------------------------------
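For completeness, here is a minimal sketch (not part of the repository) of driving the workflow above directly from a Python script instead of going through the websocket server. It assumes you run it from the `scripts` folder with the API keys set, and the `run_local.py` file name is purely hypothetical.

```python
# run_local.py - hypothetical helper living next to workflow.py in scripts/
import asyncio

from workflow import workflow

async def main() -> None:
    # workflow.run returns a handler; awaiting it yields the agent's final response
    handler = workflow.run(user_msg="A minimalist poster of a mountain range at sunset")
    response = await handler
    print(str(response))
    # The generated image is saved as output.png by the generate_image tool

if __name__ == "__main__":
    asyncio.run(main())
```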