├── .github └── workflows │ └── linting.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── pyproject.toml ├── scripts ├── .env.example ├── blurry.jpg ├── client.py ├── server.py ├── tools.py ├── utils.py └── workflow.py └── uv.lock /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | lint: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | 12 | - name: Install uv 13 | uses: astral-sh/setup-uv@v6 14 | 15 | - name: Set up Python 16 | run: uv python install 3.12 17 | 18 | - name: Install pre-commit 19 | shell: bash 20 | run: uv venv && source .venv/bin/activate && uv pip install pre-commit 21 | 22 | - name: Run linter 23 | shell: bash 24 | run: uv run -- pre-commit run -a 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .env 3 | scripts/.env 4 | */__pycache__/ 5 | scripts/output.png 6 | .ruff_cache/ 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | default_language_version: 3 | python: python3 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: check-byte-order-marker 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: detect-private-key 15 | - id: end-of-file-fixer 16 | - id: mixed-line-ending 17 | - id: trailing-whitespace 18 | 19 | - repo: https://github.com/charliermarsh/ruff-pre-commit 20 | rev: v0.11.8 21 | hooks: 22 | - id: ruff 23 | args: [--exit-non-zero-on-fix, --fix] 24 | exclude: ".*poetry.lock|.*_static" 25 | 26 | - repo: https://github.com/pre-commit/mirrors-mypy 27 | rev: v1.0.1 28 | hooks: 29 | - id: mypy 30 | additional_dependencies: 31 | [ 32 | "types-requests", 33 | "types-Deprecated", 34 | "types-redis", 35 | "types-setuptools", 36 | "types-PyYAML", 37 | "types-protobuf==4.24.0.4", 38 | ] 39 | args: 40 | [ 41 | --namespace-packages, 42 | --explicit-package-bases, 43 | --disallow-untyped-defs, 44 | --ignore-missing-imports, 45 | --python-version=3.9, 46 | ] 47 | entry: bash -c "export MYPYPATH=ingest_anything" 48 | 49 | - repo: https://github.com/psf/black-pre-commit-mirror 50 | rev: 23.10.1 51 | hooks: 52 | - id: black-jupyter 53 | name: black-docs-py 54 | alias: black 55 | files: ^(docs/|examples/) 56 | # Using PEP 8's line length in docs prevents excess left/right scrolling 57 | args: [--line-length=79] 58 | 59 | - repo: https://github.com/pre-commit/mirrors-prettier 60 | rev: v3.0.3 61 | hooks: 62 | - id: prettier 63 | 64 | - repo: https://github.com/pappasam/toml-sort 65 | rev: v0.23.1 66 | hooks: 67 | - id: toml-sort-fix 68 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `Image Generation Agent` 2 | 3 | Do you want to contribute to this project? 
Make sure to read these guidelines first :) 4 | 5 | ## Issue 6 | 7 | **When to do it**: 8 | 9 | - You found bugs but you don't know how to solve them, or you don't have the time/will to fix them 10 | - You want new features but you don't know how to implement them, or you don't have the time/will to implement them 11 | 12 | > ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_ 13 | 14 | **How to do it**: 15 | 16 | - Open an issue 17 | - Give the issue a meaningful title (a short but effective description of the problem/feature request) 18 | - Describe the problem/feature request 19 | 20 | ## Traditional contribution 21 | 22 | **When to do it**: 23 | 24 | - You found bugs and corrected them 25 | - You optimized/improved the code 26 | - You added new features that you think could be useful to others 27 | 28 | **How to do it**: 29 | 30 | 1. Fork this repository 31 | 2. Install `pre-commit` and make sure it is registered among the Git hooks for your fork: 32 | 33 | ```bash 34 | pip install pre-commit 35 | pre-commit install 36 | ``` 37 | 38 | 3. Commit your changes 39 | 4. Make sure your changes pass the pre-commit linting/type checking; if not, modify them so that they pass 40 | 5. Submit a pull request (make sure to provide a thorough description of the changes) 41 | 42 | ### Thanks for contributing! 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) Jerry Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Image Generation Agent 2 | 3 | **Image Generation Agent** is an open source project aimed at helping you produce stunning images aligned with your prompts by automating the prompt refinement, generation, and visual feedback loop. 4 | 5 | ## Installation 6 | 7 | This is a **uv project**, so you have to make sure uv is installed on your machine - if not, you can get it with: 8 | 9 | ```bash 10 | pip install uv 11 | ``` 12 | 13 | Or follow the [installation guidelines](https://docs.astral.sh/uv/getting-started/installation/) in the uv docs.
14 | 15 | Once uv is on your machine, you can clone this repository: 16 | 17 | ```bash 18 | git clone https://github.com/run-llama/image-generation-agent 19 | cd image-generation-agent 20 | ``` 21 | 22 | And run: 23 | 24 | ```bash 25 | uv sync 26 | source .venv/bin/activate 27 | ``` 28 | 29 | Congrats, you have successfully installed this project and its dependencies! 30 | 31 | ## Setting up 32 | 33 | Access the `scripts` sub-folder, and modify the [`.env.example`](./scripts/.env.example) file so that it contains a valid `GOOGLE_API_KEY` and `OPENAI_API_KEY`. After that, make sure to rename it to `.env`: 34 | 35 | ```bash 36 | cd scripts/ 37 | mv .env.example .env 38 | ``` 39 | 40 | Alternatively, you can export the keys as environment variables from your terminal: 41 | 42 | ```bash 43 | export GOOGLE_API_KEY="***" 44 | export OPENAI_API_KEY="sk-***" 45 | ``` 46 | 47 | ## Launching 48 | 49 | While still in the `scripts` sub-folder, you can launch the backend of the project with: 50 | 51 | ```bash 52 | python3 server.py 53 | ``` 54 | 55 | You will see a log message telling you that the websocket server has been started on port 8765. 56 | 57 | Once the websocket server is running, launch the frontend from another terminal window (remaining in the `scripts` sub-folder and with the virtual environment active), using: 58 | 59 | ```bash 60 | python3 client.py 61 | ``` 62 | 63 | With this command, you'll have the application running on http://localhost:7860 and you will be able to interact with it! 64 | 65 | ## How does it work? 66 | 67 | The agent uses two tools under the hood: 68 | 69 | - `generate_image`: this uses the OpenAI image generation API to create images starting from textual prompts. 70 | - `evaluate_generated_image`: this uses the advanced vision capabilities of Gemini, employing the model as a judge of the faithfulness and quality of the image. 71 | 72 | Whenever you submit a request, the agent first activates the `generate_image` tool, then it assesses how well the image fits your prompt using the `evaluate_generated_image` tool. If the image is deemed suitable, it is returned to the user; otherwise, the prompt is refined and the generate-evaluate loop is resumed. 73 | 74 | ## Contributing 75 | 76 | Contributions are more than welcome! Follow the [contribution guidelines](CONTRIBUTING.md) to make sure your contribution is compliant with the repo's requirements :) 77 | 78 | ## License and rights of usage 79 | 80 | This is an open-source project distributed under an [MIT License](LICENSE).
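As a reference for the generate-evaluate loop described above, here is a minimal sketch of driving the backend directly over its websocket protocol, without the Gradio frontend. This is not part of the repository: it assumes the server from `scripts/server.py` is already running on its default `ws://localhost:8765` endpoint, and the `request_image` helper name is purely illustrative.

```python
import asyncio
import websockets

async def request_image(prompt: str) -> None:
    # Connect to the websocket backend started with `python3 server.py`
    async with websockets.connect("ws://localhost:8765") as ws:
        await ws.send(prompt)
        # The server streams tool calls, tool results and the final answer,
        # then sends the literal marker "[END]" once the agent is done
        while True:
            message = await ws.recv()
            if message == "[END]":
                break
            print(message)
    # The image itself is written by the agent's tools to scripts/output.png

if __name__ == "__main__":
    asyncio.run(request_image("A watercolor lighthouse at dawn"))
```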
81 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "hatchling.build" 3 | requires = ["hatchling"] 4 | 5 | [lint.flake8-annotations] 6 | mypy-init-return = true 7 | 8 | [lint.pydocstyle] 9 | convention = "google" 10 | 11 | [project] 12 | authors = [{email = "clelia@runllama.ai", name = "Clelia Astra Bertelli"}] 13 | classifiers = [ 14 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 15 | "Topic :: Software Development :: Libraries :: Application Frameworks", 16 | "Topic :: Software Development :: Libraries :: Python Modules", 17 | ] 18 | dependencies = [ 19 | "fastapi>=0.115.12", 20 | "gradio>=3.36.1", 21 | "llama-index>=0.12.36,<0.13", 22 | "llama-index-core>=0.12.36,<0.13", 23 | "llama-index-llms-google-genai>=0.1.13,<0.2", 24 | "llama-index-llms-openai>=0.3.43", 25 | "openai>=1.81.0", 26 | "orjson>=3.10.18", 27 | "pre-commit>=4.2.0", 28 | "uvicorn>=0.34.2", 29 | "websockets>=15.0.1", 30 | ] 31 | description = "Image Generation Agent" 32 | license = "MIT" 33 | name = "image-generation-agent" 34 | readme = "README.md" 35 | requires-python = ">=3.9,<4.0" 36 | version = "0.1.0" 37 | 38 | [project.urls] 39 | Repository = "https://github.com/run-llama/image-generation-agent" 40 | 41 | [tool.hatch.build.targets.sdist] 42 | include = ["*"] 43 | 44 | [tool.hatch.build.targets.wheel] 45 | include = ["*"] 46 | 47 | [tool.mypy] 48 | disallow_untyped_defs = true 49 | exclude = [".venv"] 50 | explicit_package_bases = true 51 | ignore_missing_imports = true 52 | mypy_path = "llama_index" 53 | namespace_packages = true 54 | plugins = "pydantic.mypy" 55 | python_version = "3.9" 56 | 57 | [tool.ruff] 58 | exclude = [ 59 | "_static", 60 | "examples", 61 | "llama_index/ingestion/client", 62 | "notebooks", 63 | ] 64 | target-version = "py312" 65 | lint.ignore = [ 66 | "COM812", # Too aggressive 67 | "D212", # Using D213 68 | "D417", # Too aggressive 69 | "F541", # Messes with prompts.py 70 | "RUF100", # Allow blanket noqa 71 | "TC002", 72 | "UP", # Remove when we drop Python 3.9 73 | "PT001", 74 | "E501", # Use best judgement for line-length 75 | "E402", # Annoying, use best judgement 76 | "PYI063", 77 | "ANN204", # this is annoying 78 | "D401", # I disagree 79 | "D404", 80 | ] 81 | # Feel free to add more here 82 | lint.select = [ 83 | "ANN204", 84 | "B009", 85 | "B010", 86 | "B011", 87 | "B013", 88 | "B014", 89 | "C4", 90 | "COM812", 91 | "COM819", 92 | "D201", 93 | "D202", 94 | "D204", 95 | "D207", 96 | "D208", 97 | "D209", 98 | "D211", 99 | "D213", 100 | "D214", 101 | "D215", 102 | "D3", 103 | "D4", 104 | "E", 105 | "EXE004", 106 | "F401", 107 | "F504", 108 | "F541", 109 | "F632", 110 | "FLY", 111 | "G010", 112 | "I002", 113 | "PERF1", 114 | "PIE790", 115 | "PIE794", 116 | "PIE808", 117 | "PIE810", 118 | "PLC0414", 119 | "PLE2510", 120 | "PLE2512", 121 | "PLE2513", 122 | "PLE2514", 123 | "PLE2515", 124 | "PLR1711", 125 | "PT001", 126 | "PT003", 127 | "PT006", 128 | "PT02", 129 | "PTH201", 130 | "PYI", 131 | "Q", 132 | "RET501", 133 | "RET502", 134 | "RET503", 135 | "RET504", 136 | "RSE", 137 | "RUF005", 138 | "RUF010", 139 | "RUF015", 140 | "RUF1", 141 | "SIM101", 142 | "SIM103", 143 | "SIM109", 144 | "SIM118", 145 | "SIM2", 146 | "SIM300", 147 | "SIM9", 148 | "TC005", 149 | "TD006", 150 | "TID", 151 | "TRY201", 152 | "W", 153 | ] 154 | lint.unfixable = [ 155 | "ERA001", 156 | ] 157 | 158 | [tool.tomlsort] 159 | all = 
false 160 | in_place = true 161 | spaces_before_inline_comment = 2 # Match Python PEP 8 162 | spaces_indent_inline_array = 4 # Match Python PEP 8 163 | trailing_comma_inline_array = true 164 | 165 | [[tool.uv.index]] 166 | name = "nvidia-pypi" 167 | url = "https://pypi.nvidia.com" 168 | -------------------------------------------------------------------------------- /scripts/.env.example: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY="***" 2 | OPENAI_API_KEY="sk-***" 3 | -------------------------------------------------------------------------------- /scripts/blurry.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/image-generation-agent/843c253ee585761e0d67cf462dbb8007431e8129/scripts/blurry.jpg -------------------------------------------------------------------------------- /scripts/client.py: -------------------------------------------------------------------------------- 1 | import websockets 2 | import gradio as gr 3 | 4 | async def websocket_chat(prompt): 5 | uri = "ws://localhost:8765" 6 | try: 7 | async with websockets.connect(uri) as websocket: 8 | await websocket.send(prompt) 9 | full_response = "" 10 | 11 | while True: 12 | message = await websocket.recv() 13 | if message == "[END]": 14 | break 15 | full_response += message 16 | yield full_response, "blurry.jpg" 17 | yield full_response, "output.png" 18 | 19 | except Exception as e: 20 | yield f"Error: {e}", None 21 | 22 | def launch_interface(): 23 | with gr.Blocks(theme=gr.themes.Citrus(primary_hue="indigo", secondary_hue="teal")) as frontend: 24 | gr.HTML("
<h1 align='center'>Image Generation Agent🎨</h1>
") 25 | gr.HTML("

Get stunning AI-generated images!

") 26 | with gr.Row(): 27 | usr_txt = gr.Textbox(label="Prompt", placeholder="Describe the image you want here...") 28 | with gr.Column(): 29 | resp = gr.Markdown(label="Agent Output", container=True, show_label=True, show_copy_button=True) 30 | gen_img = gr.Image(label="Generated Image") 31 | 32 | 33 | with gr.Row(): 34 | btn = gr.Button("Generate🖌️").click(fn=websocket_chat, inputs=[usr_txt], outputs=[resp, gen_img]) 35 | 36 | frontend.launch() 37 | 38 | if __name__ == "__main__": 39 | launch_interface() 40 | -------------------------------------------------------------------------------- /scripts/server.py: -------------------------------------------------------------------------------- 1 | # server.py 2 | import json 3 | import asyncio 4 | import websockets 5 | from workflow import workflow 6 | from llama_index.core.agent.workflow.workflow_events import ToolCall, ToolCallResult 7 | 8 | async def run_agent(websocket): 9 | async for prompt in websocket: 10 | handler = workflow.run(user_msg=prompt) 11 | async for event in handler.stream_events(): 12 | if isinstance(event, ToolCallResult): 13 | await websocket.send(f"**Result from `{event.tool_name}`**:\n\n{event.tool_output.content}\n\n") 14 | elif isinstance(event, ToolCall): 15 | await websocket.send(f"### Calling tool: `{event.tool_name}`\n\n```json\n{json.dumps(event.tool_kwargs, indent=4)}\n```\n\n") 16 | response = await handler 17 | response = str(response) 18 | await websocket.send("### Final output\n\n" + response) 19 | await websocket.send("[END]") 20 | 21 | async def main(): 22 | print("Starting server on ws://localhost:8765") 23 | async with websockets.serve(run_agent, "localhost", 8765): 24 | await asyncio.Future() # Run forever 25 | 26 | if __name__ == "__main__": 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /scripts/tools.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from pathlib import Path 4 | from utils import get_api_keys 5 | from llama_index.llms.openai import OpenAIResponses 6 | from typing import Literal 7 | from pydantic import BaseModel, Field 8 | from llama_index.llms.google_genai import GoogleGenAI 9 | from llama_index.core.llms import ChatMessage, MessageRole, ImageBlock, TextBlock 10 | 11 | 12 | class ImageEvaluation(BaseModel): 13 | faithfulness: int = Field(description="Faithfulness of the generated image to the generation prompt, from 0 to 100") 14 | quality: Literal["low", "mediocre", "average", "upper-intermediate", "high", "very high"] = Field(description="Quality of the image, expressed as one of: 'low', 'mediocre', 'average', 'upper-intermediate', 'high', 'very high'") 15 | prompt_agnostic_description: str = Field(description="Description of the image, agnostic of the image generation prompt") 16 | 17 | openai_api_key, google_api_key =get_api_keys() 18 | async_openai_client = OpenAIResponses(api_key=openai_api_key, model="gpt-4.1-mini",built_in_tools=[{"type": "image_generation"}]) 19 | llm = GoogleGenAI(model="gemini-2.0-flash", api_key=google_api_key) 20 | llm_struct = llm.as_structured_llm(ImageEvaluation) 21 | 22 | async def generate_image(prompt: str = Field(description="The image generation prompt")) -> str: 23 | """ 24 | This tool useful to generate images. 
25 | 26 | Args: 27 | prompt (str): The image generation prompt 28 | 29 | """ 30 | try: 31 | messages = [ChatMessage.from_str(content=prompt, role="user")] 32 | img = await async_openai_client.achat(messages) 33 | for block in img.message.blocks: 34 | if isinstance(block, ImageBlock): 35 | image_bytes = base64.b64decode(block.image) 36 | with open("output.png", "wb") as f: 37 | f.write(image_bytes) 38 | return "Image successfully generated" 39 | except Exception as e: 40 | return f"An error occurred during image generation: {e.__str__()}" 41 | 42 | async def evaluate_generated_image(prompt: str = Field(description="The original prompt used to generate the image")) -> str: 43 | """ 44 | This tool is useful to evaluate a generated image. 45 | 46 | Args: 47 | prompt (str): The original prompt used to generate the image 48 | 49 | """ 50 | messages = [ChatMessage(role=MessageRole.USER, blocks=[ImageBlock(path=Path("output.png")), TextBlock(text=f"Could you (1) evaluate the faithfulness of the attached image to this prompt: '{prompt}', (2) evaluate the quality of the image and (3) produce a description of the image that is agnostic of the prompt that was used to generate it?")])] 51 | resp = await llm_struct.achat(messages=messages) 52 | struct_output = json.loads(resp.message.blocks[0].text) 53 | return f"The generated image can be described as:\n'''\n{struct_output['prompt_agnostic_description']}\n'''\nThe faithfulness of the generated image to the original prompt is: {struct_output['faithfulness']}%.\nThe quality of the image is {struct_output['quality']}." 54 | -------------------------------------------------------------------------------- /scripts/utils.py: -------------------------------------------------------------------------------- 1 | from os import environ as ENV 2 | from dotenv import load_dotenv 3 | from typing import Tuple 4 | 5 | def get_api_keys() -> Tuple[str, str]: 6 | openai_api_key = ENV.get("OPENAI_API_KEY", None) 7 | if openai_api_key is None: 8 | load_dotenv() 9 | openai_api_key = ENV.get("OPENAI_API_KEY", None) 10 | if not openai_api_key: 11 | raise ValueError("There is no OPENAI_API_KEY declared among the environmental variables") 12 | google_api_key = ENV.get("GOOGLE_API_KEY", None) 13 | if google_api_key is None: 14 | load_dotenv() 15 | google_api_key = ENV.get("GOOGLE_API_KEY", None) 16 | if not google_api_key: 17 | raise ValueError("There is no GOOGLE_API_KEY declared among the environmental variables") 18 | return openai_api_key, google_api_key 19 | -------------------------------------------------------------------------------- /scripts/workflow.py: -------------------------------------------------------------------------------- 1 | from tools import generate_image, evaluate_generated_image 2 | from llama_index.core.agent.workflow import AgentWorkflow, FunctionAgent 3 | 4 | image_generation_agent = FunctionAgent( 5 | name = "ImageGenerationAgent", 6 | description= "An Agent suitable for internal feedback-driven generation of images", 7 | tools = [generate_image, evaluate_generated_image], 8 | system_prompt = "You are the ImageGenerationAgent. Your task is to generate images, evaluate them and, based on the feedback from the evaluation, re-generate them or return them to the user. Specifically, you need to follow these steps:\n" \ 9 | "1. Generate an image starting from the user's prompt with the 'generate_image' tool.\n" \ 10 | "2. Evaluate the generated image using the 'evaluate_generated_image' tool\n" \ 11 | "If you deem the evaluation positive:\n" \ 12 | "3. 
Return the image to the user, telling them what you generated\n" \ 13 | "Else:\n" \ 14 | "3. Refine the prompt for image generation, and go back to step 1\n" \ 15 | "Do not stop unless you generated an image that suits the original prompt from the user.\n", 16 | ) 17 | 18 | workflow = AgentWorkflow( 19 | agents = [image_generation_agent], 20 | root_agent= image_generation_agent.name, 21 | timeout=600, 22 | ) 23 | --------------------------------------------------------------------------------
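For completeness, here is a minimal sketch (not part of the repository) of driving the workflow above directly from a Python script instead of going through the websocket server. It assumes you run it from the `scripts` folder with the API keys set, and the `run_local.py` file name is purely hypothetical.

```python
# run_local.py - hypothetical helper living next to workflow.py in scripts/
import asyncio

from workflow import workflow

async def main() -> None:
    # workflow.run returns a handler; awaiting it yields the agent's final response
    handler = workflow.run(user_msg="A minimalist poster of a mountain range at sunset")
    response = await handler
    print(str(response))
    # The generated image is saved as output.png by the generate_image tool

if __name__ == "__main__":
    asyncio.run(main())
```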