├── .gitignore ├── LICENSE.md ├── README.md ├── agentchain-logo.png ├── architecture.png ├── architecture.svg ├── audio └── .gitignore ├── csv └── .gitignore ├── docker ├── Dockerfile └── deploying-from-docker.md ├── download.sh ├── image └── .gitignore ├── main.py ├── requirements.txt └── twilio_lib.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | .idea/ -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | ![AgentChain logo](agentchain-logo.png) 3 |

4 | 5 | AgentChain uses Large Language Models (LLMs) for planning and orchestrating multiple Agents or Large Models (LMs) to accomplish sophisticated tasks. AgentChain is fully multimodal: it accepts text, image, audio, and tabular data as input and output. 6 | 7 | - **🧠 LLMs as the brain:** AgentChain leverages state-of-the-art Large Language Models to plan and make decisions based on natural language inputs. This makes AgentChain a versatile tool for a wide range of applications, such as executing tasks from natural language instructions, data understanding, and data generation. 8 | - **🌟 Fully Multimodal IO:** AgentChain is fully multimodal, accepting input and output in various modalities, such as text, image, audio, or video (coming soon). This makes AgentChain a versatile tool for applications such as computer vision, speech recognition, and converting from one modality to another. 9 | - **🤝 Orchestrate Versatile Agents:** AgentChain can orchestrate multiple agents to perform complex tasks. Through composability and hierarchical structuring of tools, AgentChain can intelligently choose which tools to use, and when, for a given task. This makes AgentChain a powerful tool for projects that require complex combinations of tools. 10 | - **🔧 Customizable for Ad-hoc Needs:** AgentChain can be customized to fit specific project requirements, making it a versatile tool for a wide range of applications. Specific requirements can be met by extending its capabilities with new agents (with a distributed architecture coming soon). 11 | 12 | 13 | 14 | 15 | 16 | # Get started 17 | 1. Install requirements: `pip install -r requirements.txt` 18 | 2. Download model checkpoints: `bash download.sh` 19 | 3. Depending on which agents you need, export the corresponding environment variables: 20 | 21 | ```shell 22 | export OPENAI_API_KEY={YOUR_OPENAI_API_KEY} # mandatory, since the LLM is central to this application 23 | export SERPAPI_API_KEY={YOUR_SERPAPI_API_KEY} # include a SerpAPI key if you need the agent to be able to search the web 24 | 25 | # These environment variables are needed if you want the agent to be able to make phone calls 26 | export AWS_ACCESS_KEY_ID={YOUR_AWS_ACCESS_KEY_ID} 27 | export AWS_SECRET_ACCESS_KEY={YOUR_AWS_SECRET_ACCESS_KEY} 28 | export TWILIO_ACCOUNT_SID={YOUR_TWILIO_ACCOUNT_SID} 29 | export TWILIO_AUTH_TOKEN={YOUR_TWILIO_AUTH_TOKEN} 30 | export AWS_S3_BUCKET_NAME={YOUR_AWS_S3_BUCKET_NAME} # make sure to create an S3 bucket with public access 31 | ``` 32 | 4. Install the `ffmpeg` library (needed for Whisper): `sudo apt update && sudo apt install ffmpeg` (Ubuntu) 33 | 5. Run the main script: `python main.py` 34 | 35 | 36 | ## System requirements 37 | As of [this commit](https://github.com/jina-ai/agentchain/commit/da588a728c390fb538fd361d4f41dd50aa193751), you need at least 29 GB of GPU memory to run AgentChain. 38 | Also make sure to assign GPU devices correctly in `main.py`. 39 | 40 | You can comment out some tools and models to reduce the GPU memory footprint (at the cost of reduced capabilities). 41 | 42 | 43 | # Demo 44 | 45 | 46 | AgentChain demo 1: transcribing audio and visualizing the result as an image. A video of the AgentChain interface shows an uploaded audio file and the resulting generated image, which is a representation of the audio content.
47 | 48 | https://user-images.githubusercontent.com/4182659/225347932-87298e6c-58d0-4a29-892f-1398b1406c15.mp4 49 | 50 | --- 51 | 52 | AgentChain demo 2: asking questions about an image. A video of the AgentChain interface shows an image and a question being asked about it, with the resulting answer displayed below. 53 | 54 | https://user-images.githubusercontent.com/4182659/225348027-ed30f9d5-d05b-405a-9651-c08f4976cf83.mp4 55 | 56 | --- 57 | 58 | AgentChain demo 3: question answering on tabular data and making a phone call to report the results. A video of the AgentChain interface shows a table of data with a question being asked and the resulting answer displayed, followed by a phone call made using the `CommsAgent`. 59 | 60 | https://user-images.githubusercontent.com/4182659/225348128-6e9bdb3b-78ed-49e8-80f5-fd7c9ad66f28.mp4 61 | 62 | # Agents in AgentChain 63 | 64 | > The content of this document mostly shows **our vision** and **what we aim to achieve** with AgentChain. 65 | Check the Demo section to see what we have achieved so far. 66 | 67 | ![](architecture.svg) 68 | 69 | AgentChain is a sophisticated system designed to solve general problems. It can orchestrate multiple agents to tackle sub-problems. These agents are organized into different groups, each with its own set of capabilities and functionalities. Here are some of the agent groups in AgentChain: 70 | 71 | ### SearchAgents 72 | The `SearchAgents` group is responsible for gathering information from various sources, including search engines, online databases, and APIs. The agents in this group are highly skilled at retrieving up-to-date knowledge about the world. Some examples of agents in this group include the `Google Search API`, `Bing API`, `Wikipedia API`, and `Serp`. 73 | 74 | ### CommsAgents 75 | The `CommsAgents` group is responsible for handling communication between different parties, such as sending emails, making phone calls, or messaging via various platforms. The agents in this group can integrate with a wide range of platforms. Some examples of agents in this group include `TwilioCaller`, `TwilioEmailWriter`, `TwilioMessenger` and `Slack`. 76 | 77 | ### ToolsAgents 78 | The `ToolsAgents` group is responsible for various computational tasks, such as performing calculations, running scripts, or executing commands. The agents in this group can work with a wide range of programming languages and tools. Some examples of agents in this group include `Math`, `Python REPL`, and `Terminal`. 79 | 80 | ### MultiModalAgents 81 | The `MultiModalAgents` group is responsible for handling input and output in various modalities, such as text, image, audio, or video (coming soon). The agents in this group can process and understand different modalities. Some examples of agents in this group include `OpenAI Whisper`, `Blip2`, `Coqui`, and `StableDiffusion`. 82 | 83 | ### ImageAgents 84 | The `ImageAgents` group is responsible for processing and manipulating images, for tasks such as enhancing image quality, detecting objects, or recognizing image content. The agents in this group can perform complex operations on images. Some examples of agents in this group include `Upscaler`, `ControlNet` and `YOLO`. 85 | 86 | ### DBAgents 87 | The `DBAgents` group is responsible for adding data to and fetching data from your database, for example retrieving metrics or aggregations. The agents in this group interact with databases and enrich other agents with information from your data.
Some examples of agents in this group include `SQL`, `MongoDB`, `ElasticSearch`, `Qdrant` and `Notion`. 88 | 89 | 90 | # Potential Applications 91 | 92 | ### Example 1: 🏝️📸🌅 AgentChain Image Generation System for Travel Company 93 | For a travel company promoting a new and exotic destination, it is crucial to have high-quality images that can grab the attention of potential travelers. However, manually creating stunning images can be time-consuming and expensive. That's why the travel company wants to use AgentChain to automate the image generation process and create beautiful visuals with the help of various agents. 94 | 95 | Here is how AgentChain can help by chaining different agents together: 96 | 1. Use `SearchAgent` (`Google Search API`, `Wikipedia API`, `Serp`) to gather information and inspiration about the destination, such as the most popular landmarks, the local cuisine, and the unique features of the location. 97 | 2. Use `ImageAgent` (`Upscaler`) to enhance the quality of images and make them more appealing, using state-of-the-art algorithms to increase the resolution and remove noise from the images. 98 | 3. Use `MultiModalAgent` (`Blip2`) to generate descriptive captions for the images, providing more context and making the images more meaningful. 99 | 4. Use `CommsAgent` (`TwilioEmailWriter`) to send the images to the target audience via email or other messaging platforms, attracting potential travelers with stunning visuals and promoting the new destination. 100 | 101 | ### Example 2: 💼💹📈 AgentChain Financial Analysis Report for Investment Firm 102 | For an investment firm that manages a large portfolio of stocks, it is critical to stay up-to-date with the latest market trends and analyze the performance of different stocks to make informed investment decisions. However, analyzing data from multiple sources can be time-consuming and error-prone. That's why the investment firm wants to use AgentChain to automate the analysis process and generate reports with the help of various agents. 103 | 104 | Here is how AgentChain can help by chaining different agents together: 105 | 1. Use `ToolsAgent` (`Python REPL`, `TableQA`) to analyze data from different sources (e.g., CSV files, stock market APIs) and perform calculations related to financial metrics such as earnings, dividends, and P/E ratios. 106 | 2. Use `SearchAgent` (`Bing API`) to gather news and information related to the stocks in the portfolio, such as recent earnings reports, industry trends, and analyst ratings. 107 | 3. Use `NLPAgent` (`GPT`) to create a summary and bullet points of the news and information gathered, providing insights into market sentiment and potential trends. 108 | 4. Use `CommsAgent` (`TwilioEmailWriter`) to send a summary report of the analysis to the appropriate stakeholders, helping them make informed decisions about their investments. 109 | 110 | ### Example 3: 🛍️💬💻 AgentChain Customer Service Chatbot for E-commerce Site 111 | For an e-commerce site that wants to provide excellent customer service, it is crucial to have a chatbot that can handle customer inquiries and support requests in a timely and efficient manner. However, building a chatbot that can understand and respond to complex customer requests can be challenging. That's why the e-commerce site wants to use AgentChain to automate the chatbot process and provide superior customer service with the help of various agents. 112 | 113 | Here is how AgentChain can help by chaining different agents together: 114 | 1.
Use `MultiModalAgent` (`Blip2`, `Whisper`) to handle input from various modalities (text, image, audio), making it easier for customers to ask questions and make requests in a natural way. 115 | 2. Use `SearchAgent` (`Google Search API`, `Wikipedia API`) or `DBAgent` to provide information about products or services, whether in-house or public, such as specifications, pricing, and availability. 116 | 3. Use `CommsAgent` (`TwilioMessenger`) to communicate with customers via messaging platforms, providing support and answering questions in real time. 117 | 4. Use `ToolsAgent` (`Math`) to perform calculations related to discounts, taxes, or shipping costs, helping customers make informed decisions about their purchases. 118 | 5. Use `MultiModalAgent` (`Coqui`) to generate natural-sounding responses and hold more complex conversations, providing a personalized and engaging experience for customers. 119 | 120 | ### Example 4: 🧑‍⚕️💊💤 AgentChain Personal Health Assistant 121 | Access to personal health assistance can be expensive and limited, so a personal health assistant that helps individuals manage their health and well-being is essential. However, providing personalized health advice and reminders can be challenging, especially for seniors. That's why AgentChain aims to automate the health assistant process and provide personalized support with the help of various agents. 122 | 123 | Here is how AgentChain can help by chaining different agents together (a minimal code sketch of this chaining pattern follows the list): 124 | 1. Use `DBAgent` to handle input from various health monitoring devices (e.g., heart rate monitors, blood pressure monitors, sleep trackers), providing real-time health data and alerts to the health assistant. 125 | 2. Use `SearchAgent` (`Google Search API`, `Wikipedia API`) or any other medical database to provide information about health topics and medications, such as side effects, dosage, and interactions. 126 | 3. Use `NLPAgent` (`GPT`) to generate personalized recommendations for diet, exercise, and medication, taking into account the user's health goals and preferences. 127 | 4. Use `CommsAgent` (`TwilioCaller`, `TwilioMessenger`) to give advice, send reminders, and provide alerts that help users stay on track with their health goals, improving their quality of life and reducing the need for emergency care.
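All of the examples above follow the same chaining pattern that `main.py` implements: each capability is wrapped as a LangChain `Tool` with a natural-language description, and the LLM decides which tools to call and in which order. The sketch below illustrates that pattern with two hypothetical placeholder tools (`answer_table_question` and `send_report`); in this repo the real counterparts are classes such as `TableQA` and `TwilioCaller` in `main.py`.

```python
# Minimal sketch of the chaining pattern used in main.py (LangChain 0.0.x API).
# answer_table_question and send_report are hypothetical placeholders standing in
# for the real TableQA / TwilioCaller wrappers defined in main.py.
from langchain.agents.initialize import initialize_agent
from langchain.agents.tools import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.llms.openai import OpenAI


def answer_table_question(inputs: str) -> str:
    # expects "table_path,question" and returns an answer string
    return "placeholder answer"


def send_report(inputs: str) -> str:
    # expects "report text,phone_number" and returns a confirmation string
    return "Report sent"


llm = OpenAI(temperature=0)
tools = [
    Tool(name="Answer Question About The Table", func=answer_table_question,
         description="Input: a comma separated string of table_path and question."),
    Tool(name="Send Report", func=send_report,
         description="Input: a comma separated string of report text and phone number."),
]
memory = ConversationBufferMemory(memory_key="chat_history")
agent = initialize_agent(tools, llm, agent="conversational-react-description",
                         verbose=True, memory=memory)

# The LLM plans the tool sequence from a single natural-language instruction, e.g.:
# agent.run("Summarize csv/portfolio.csv and send the summary to +15551234567")
```

The same mechanism scales to every example above: adding a capability only means registering another `Tool`; the planning itself stays with the LLM.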
128 | 129 | 130 | ## Acknowledgements 131 | We appreciate the open source of the following projects: 132 | 133 | [Hugging Face](https://github.com/huggingface)   134 | [LangChain](https://github.com/hwchase17/langchain)   135 | [Stable Diffusion](https://github.com/CompVis/stable-diffusion)   136 | [ControlNet](https://github.com/lllyasviel/ControlNet)   137 | [InstructPix2Pix](https://github.com/timothybrooks/instruct-pix2pix)   138 | [CLIPSeg](https://github.com/timojl/clipseg)   139 | [BLIP](https://github.com/salesforce/BLIP)   140 | [Microsoft](https://github.com/microsoft/visual-chatgpt)   141 | -------------------------------------------------------------------------------- /agentchain-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/agentchain/55f2d9fc1bc78a844d4a3313b4753bae433ef775/agentchain-logo.png -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/agentchain/55f2d9fc1bc78a844d4a3313b4753bae433ef775/architecture.png -------------------------------------------------------------------------------- /audio/.gitignore: -------------------------------------------------------------------------------- 1 | *.wav -------------------------------------------------------------------------------- /csv/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image with CUDA 11.3.0 and Ubuntu 20.04 2 | FROM nvidia/cuda:11.3.0-base-ubuntu20.04 3 | 4 | # Install python3 and pip 5 | RUN apt update && apt install -y python3-pip 6 | # Install opencv dependencies 7 | RUN DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends libsm6 libxext6 libxrender-dev ffmpeg 8 | 9 | # Set the working directory to /app 10 | WORKDIR /app 11 | 12 | # Install any needed packages specified in requirements.txt 13 | COPY requirements.txt /app 14 | RUN pip install -r requirements.txt 15 | 16 | ENTRYPOINT bash -------------------------------------------------------------------------------- /docker/deploying-from-docker.md: -------------------------------------------------------------------------------- 1 | # Deploying with docker 2 | 3 | Dependency management can be challenging, but fortunately there is a solution: Docker. Using Docker to deploy AgentChain offers an easy and reproducible way to get started. Since the agents are deployed on GPU, you will need the NVIDIA Container Toolkit. 4 | 5 | If you've never used Docker with a GPU before, follow the Toolkit installation instructions: 6 | https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#setting-up-nvidia-container-toolkit 7 | 8 | ## Building the image 9 | 10 | Building the image is pretty straightforward 11 | 12 | ```bash 13 | cp ./requirements.txt ./docker 14 | cd docker 15 | docker build -t agentchain . 16 | cd .. 17 | ``` 18 | 19 | ## Download model checkpoints 20 | 21 | ```bash 22 | bash download.sh 23 | ``` 24 | 25 | The model checkpoints are 44GB in total so this can take a while. 
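If you want to confirm that the download completed before starting the container, a quick size check is enough. This is a minimal sketch, assuming the default layout created by `download.sh` (checkpoints under `ControlNet/models/`):

```bash
# Each control_sd15_*.pth checkpoint is roughly 5.5 GB; eight of them make up the ~44GB total.
ls ControlNet/models/control_sd15_*.pth | wc -l   # should print 8
du -sh ControlNet/models
```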
26 | 27 | ## Run container 28 | 29 | ```bash 30 | docker run --name agentchain -it -v $(pwd):/app --gpus all -p 7861:7861 agentchain 31 | ``` 32 | 33 | ## Set env variable and start server 34 | 35 | You will now be in a bash shell. Here you need to export the API keys as environment variable for the server. The Open AI API key and the Serp API key are required as they power the main agent and the search agent respectively. 36 | 37 | ```bash 38 | OPENAI_API_KEY= 39 | SERPAPI_API_KEY= 40 | ``` 41 | 42 | (Optional) If you want the CommsAgent to be able to make phone calls you will need to export a few more variables. The AWS_S3_BUCKET_NAME specified needs to be a public access bucket. 43 | 44 | ```bash 45 | AWS_ACCESS_KEY_ID={YOUR_AWS_ACCESS_KEY_ID} 46 | AWS_SECRET_ACCESS_KEY={YOUR_AWS_SECRET_ACCESS_KEY} 47 | TWILIO_ACCOUNT_SID={YOUR_TWILIO_ACCOUNT_SID} 48 | TWILIO_AUTH_TOKEN={YOUR_TWILIO_AUTH_TOKEN} 49 | AWS_S3_BUCKET_NAME={YOUR_AWS_S3_BUCKET_NAME} 50 | ``` 51 | 52 | You can now start the server by running the main script. 53 | 54 | ```bash 55 | python3 main.py 56 | ``` 57 | 58 | The server may take about an hour before serving the first time as there are a few more model checkpoints to install. The installs may also timeout the first time in which case, you can run `python [main.py](http://main.py)` again to resume downloading checkpoints. -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/lllyasviel/ControlNet.git 2 | ln -s ControlNet/ldm ./ldm 3 | ln -s ControlNet/cldm ./cldm 4 | ln -s ControlNet/annotator ./annotator 5 | cd ControlNet/models 6 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_canny.pth 7 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_depth.pth 8 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_hed.pth 9 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_mlsd.pth 10 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_normal.pth 11 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_openpose.pth 12 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_scribble.pth 13 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_seg.pth 14 | cd ../../ 15 | -------------------------------------------------------------------------------- /image/.gitignore: -------------------------------------------------------------------------------- 1 | *.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import re 4 | import sys 5 | import uuid 6 | 7 | import cv2 8 | import einops 9 | import gradio as gr 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | import whisper 14 | from ControlNet.annotator.openpose import OpenposeDetector 15 | from ControlNet.annotator.uniformer import UniformerDetector 16 | from ControlNet.annotator.util import HWC3, resize_image 17 | from ControlNet.cldm.ddim_hacked import DDIMSampler 18 | from ControlNet.cldm.model import create_model, load_state_dict 19 | from diffusers import (EulerAncestralDiscreteScheduler, 20 | StableDiffusionInstructPix2PixPipeline, 21 | 
StableDiffusionPipeline) 22 | from langchain.agents import load_tools 23 | from langchain.agents.initialize import initialize_agent 24 | from langchain.agents.tools import Tool 25 | from langchain.chains.conversation.memory import ConversationBufferMemory 26 | from langchain.llms.openai import OpenAI 27 | from ldm.util import instantiate_from_config 28 | from omegaconf import OmegaConf 29 | from PIL import Image 30 | from pytorch_lightning import seed_everything 31 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 32 | BlipForConditionalGeneration, 33 | BlipForQuestionAnswering, BlipProcessor, 34 | CLIPSegForImageSegmentation, CLIPSegProcessor, 35 | pipeline) 36 | from TTS.api import TTS 37 | 38 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 39 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 40 | 41 | AGENT_CHAIN_PREFIX = """AgentChain is designed to be able to assist with a wide range of text, visual and audio 42 | related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of 43 | topics. AgentChain is able to generate human-like text based on the input it receives, allowing it to engage in 44 | natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. 45 | 46 | AgentChain is able to process and understand large amounts of text, images and audios. As a language model, 47 | AgentChain can not directly read images or audio, but it has a list of tools to finish different visual, text, audio, 48 | math and reasoning tasks. Each image will have a file name formed as "image/xxx.png", and AgentChain can invoke 49 | different tools to indirectly understand pictures. Each audio will have a file name formed as "audio/xxx.wav", 50 | and AgentChain can invoke different tools to indirectly understand audio. When talking about audio, AgentChain is very 51 | strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, 52 | AgentChain is also known that the image may not be the same as the user's demand, and will use other visual 53 | question answering tools or description tools to observe the real image. AgentChain is able to use tools in a sequence, 54 | and is loyal to the tool observation outputs rather than faking the image content and image file name. It will 55 | remember to provide the file name from the last tool observation, if a new image is generated. 56 | 57 | Human may provide new figures to AgentChain with a description. The description helps AgentChain to understand this 58 | image, but AgentChain should use tools to finish following tasks, rather than directly imagine from the description. 59 | 60 | Overall, AgentChain is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide 61 | valuable insights and information on a wide range of topics. 62 | 63 | 64 | TOOLS: 65 | ------ 66 | 67 | AgentChain has access to the following tools:""" 68 | 69 | AGENT_CHAIN_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format: 70 | 71 | ``` 72 | Thought: Do I need to use a tool? Yes 73 | Action: the action to take, should be one of [{tool_names}] 74 | Action Input: the input to the action 75 | Observation: the result of the action 76 | ``` 77 | 78 | When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format: 79 | 80 | ``` 81 | Thought: Do I need to use a tool? 
No 82 | {ai_prefix}: [your response here] 83 | ``` 84 | """ 85 | 86 | AGENT_CHAIN_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if it does 87 | not exist. You will remember to provide the image file name loyally if it's provided in the last tool observation. 88 | 89 | Begin! 90 | 91 | Previous conversation history: 92 | {chat_history} 93 | 94 | New input: {input} 95 | Since AgentChain is a text language model, AgentChain must use tools to observe images or audio rather than 96 | imagination. The thoughts and observations are only visible for AgentChain, AgentChain should remember to repeat 97 | important information in the final response for Human. Thought: Do I need to use a tool? {agent_scratchpad}""" 98 | 99 | 100 | def cut_dialogue_history(history_memory, keep_last_n_words=500): 101 | tokens = history_memory.split() 102 | n_tokens = len(tokens) 103 | print(f"hitory_memory:{history_memory}, n_tokens: {n_tokens}") 104 | if n_tokens < keep_last_n_words: 105 | return history_memory 106 | else: 107 | paragraphs = history_memory.split('\n') 108 | last_n_tokens = n_tokens 109 | while last_n_tokens >= keep_last_n_words: 110 | last_n_tokens = last_n_tokens - len(paragraphs[0].split(' ')) 111 | paragraphs = paragraphs[1:] 112 | return '\n' + '\n'.join(paragraphs) 113 | 114 | 115 | def get_new_image_name(org_img_name, func_name="update"): 116 | head_tail = os.path.split(org_img_name) 117 | head = head_tail[0] 118 | tail = head_tail[1] 119 | name_split = tail.split('.')[0].split('_') 120 | this_new_uuid = str(uuid.uuid4())[0:4] 121 | if len(name_split) == 1: 122 | most_org_file_name = name_split[0] 123 | recent_prev_file_name = name_split[0] 124 | new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) 125 | else: 126 | assert len(name_split) == 4 127 | most_org_file_name = name_split[3] 128 | recent_prev_file_name = name_split[0] 129 | new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) 130 | return os.path.join(head, new_file_name) 131 | 132 | 133 | def create_model(config_path, device): 134 | config = OmegaConf.load(config_path) 135 | OmegaConf.update(config, "model.params.cond_stage_config.params.device", device) 136 | model = instantiate_from_config(config.model).cpu() 137 | print(f'Loaded model config from [{config_path}]') 138 | return model 139 | 140 | 141 | class MaskFormer: 142 | def __init__(self, device): 143 | self.device = device 144 | self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") 145 | self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device) 146 | 147 | def inference(self, image_path, text): 148 | threshold = 0.5 149 | min_area = 0.02 150 | padding = 20 151 | original_image = Image.open(image_path) 152 | image = original_image.resize((512, 512)) 153 | inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt", ).to(self.device) 154 | with torch.no_grad(): 155 | outputs = self.model(**inputs) 156 | mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold 157 | area_ratio = len(np.argwhere(mask)) / (mask.shape[0] * mask.shape[1]) 158 | if area_ratio < min_area: 159 | return None 160 | true_indices = np.argwhere(mask) 161 | mask_array = np.zeros_like(mask, dtype=bool) 162 | for idx in true_indices: 163 | padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx) 164 | mask_array[padded_slice] = 
True 165 | visual_mask = (mask_array * 255).astype(np.uint8) 166 | image_mask = Image.fromarray(visual_mask) 167 | return image_mask.resize(image.size) 168 | 169 | 170 | class Pix2Pix: 171 | def __init__(self, device): 172 | print("Initializing Pix2Pix to %s" % device) 173 | self.device = device 174 | self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", 175 | torch_dtype=torch.float16, 176 | safety_checker=None).to(device) 177 | self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config) 178 | 179 | def inference(self, inputs): 180 | """Change style of image.""" 181 | print("===>Starting Pix2Pix Inference") 182 | image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) 183 | original_image = Image.open(image_path) 184 | image = \ 185 | self.pipe(instruct_text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2, ).images[0] 186 | updated_image_path = get_new_image_name(image_path, func_name="pix2pix") 187 | image.save(updated_image_path) 188 | return updated_image_path 189 | 190 | 191 | class T2I: 192 | def __init__(self, device): 193 | print("Initializing T2I to %s" % device) 194 | self.device = device 195 | self.pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) 196 | self.text_refine_tokenizer = AutoTokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion") 197 | self.text_refine_model = AutoModelForCausalLM.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion") 198 | self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model, 199 | tokenizer=self.text_refine_tokenizer, device=self.device) 200 | self.pipe.to(device) 201 | 202 | def inference(self, text): 203 | image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") 204 | refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"] 205 | print(f'{text} refined to {refined_text}') 206 | image = self.pipe(refined_text).images[0] 207 | image.save(image_filename) 208 | print(f"Processed T2I.run, text: {text}, image_filename: {image_filename}") 209 | return image_filename 210 | 211 | 212 | class ImageCaptioning: 213 | def __init__(self, device): 214 | print("Initializing ImageCaptioning to %s" % device) 215 | self.device = device 216 | self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 217 | self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to( 218 | self.device) 219 | 220 | def inference(self, image_path): 221 | inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device) 222 | out = self.model.generate(**inputs) 223 | captions = self.processor.decode(out[0], skip_special_tokens=True) 224 | return captions 225 | 226 | 227 | class image2pose: 228 | def __init__(self): 229 | print("Direct human pose.") 230 | self.detector = OpenposeDetector() 231 | self.resolution = 512 232 | 233 | def inference(self, inputs): 234 | print("===>Starting image2pose Inference") 235 | image = Image.open(inputs) 236 | image = np.array(image) 237 | image = HWC3(image) 238 | detected_map, _ = self.detector(resize_image(image, self.resolution)) 239 | detected_map = HWC3(detected_map) 240 | image = resize_image(image, self.resolution) 241 | H, W, C = image.shape 242 | detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) 243 | updated_image_path = get_new_image_name(inputs, 
func_name="human-pose") 244 | image = Image.fromarray(detected_map) 245 | image.save(updated_image_path) 246 | return updated_image_path 247 | 248 | 249 | class pose2image: 250 | def __init__(self, device): 251 | print("Initialize the pose2image model...") 252 | model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device) 253 | model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_openpose.pth', location='cpu')) 254 | self.model = model.to(device) 255 | self.device = device 256 | self.ddim_sampler = DDIMSampler(self.model) 257 | self.ddim_steps = 20 258 | self.image_resolution = 512 259 | self.num_samples = 1 260 | self.save_memory = False 261 | self.strength = 1.0 262 | self.guess_mode = False 263 | self.scale = 9.0 264 | self.seed = -1 265 | self.a_prompt = 'best quality, extremely detailed' 266 | self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, ' \ 267 | 'cropped, worst quality, low quality' 268 | 269 | def inference(self, inputs): 270 | print("===>Starting pose2image Inference") 271 | image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) 272 | image = Image.open(image_path) 273 | image = np.array(image) 274 | prompt = instruct_text 275 | img = resize_image(HWC3(image), self.image_resolution) 276 | H, W, C = img.shape 277 | img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST) 278 | control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0 279 | control = torch.stack([control for _ in range(self.num_samples)], dim=0) 280 | control = einops.rearrange(control, 'b h w c -> b c h w').clone() 281 | self.seed = random.randint(0, 65535) 282 | seed_everything(self.seed) 283 | if self.save_memory: 284 | self.model.low_vram_shift(is_diffusing=False) 285 | cond = {"c_concat": [control], "c_crossattn": [ 286 | self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]} 287 | un_cond = {"c_concat": None if self.guess_mode else [control], 288 | "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]} 289 | shape = (4, H // 8, W // 8) 290 | self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in 291 | range(13)] if self.guess_mode else ([self.strength] * 13) 292 | samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, 293 | eta=0., unconditional_guidance_scale=self.scale, 294 | unconditional_conditioning=un_cond) 295 | if self.save_memory: 296 | self.model.low_vram_shift(is_diffusing=False) 297 | x_samples = self.model.decode_first_stage(samples) 298 | x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 299 | 255).astype( 300 | np.uint8) 301 | updated_image_path = get_new_image_name(image_path, func_name="pose2image") 302 | real_image = Image.fromarray(x_samples[0]) # default the index0 image 303 | real_image.save(updated_image_path) 304 | return updated_image_path 305 | 306 | 307 | class image2seg: 308 | def __init__(self): 309 | print("Direct segmentations.") 310 | self.detector = UniformerDetector() 311 | self.resolution = 512 312 | 313 | def inference(self, inputs): 314 | print("===>Starting image2seg Inference") 315 | image = Image.open(inputs) 316 | image = np.array(image) 317 | image = HWC3(image) 318 | detected_map = self.detector(resize_image(image, self.resolution)) 319 | detected_map = HWC3(detected_map) 320 | image = resize_image(image, self.resolution) 
321 | H, W, C = image.shape 322 | detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) 323 | updated_image_path = get_new_image_name(inputs, func_name="segmentation") 324 | image = Image.fromarray(detected_map) 325 | image.save(updated_image_path) 326 | return updated_image_path 327 | 328 | 329 | class seg2image: 330 | def __init__(self, device): 331 | print("Initialize the seg2image model...") 332 | model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device) 333 | model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_seg.pth', location='cpu')) 334 | self.model = model.to(device) 335 | self.device = device 336 | self.ddim_sampler = DDIMSampler(self.model) 337 | self.ddim_steps = 20 338 | self.image_resolution = 512 339 | self.num_samples = 1 340 | self.save_memory = False 341 | self.strength = 1.0 342 | self.guess_mode = False 343 | self.scale = 9.0 344 | self.seed = -1 345 | self.a_prompt = 'best quality, extremely detailed' 346 | self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, ' \ 347 | 'cropped, worst quality, low quality' 348 | 349 | def inference(self, inputs): 350 | print("===>Starting seg2image Inference") 351 | image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) 352 | image = Image.open(image_path) 353 | image = np.array(image) 354 | prompt = instruct_text 355 | img = resize_image(HWC3(image), self.image_resolution) 356 | H, W, C = img.shape 357 | img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST) 358 | control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0 359 | control = torch.stack([control for _ in range(self.num_samples)], dim=0) 360 | control = einops.rearrange(control, 'b h w c -> b c h w').clone() 361 | self.seed = random.randint(0, 65535) 362 | seed_everything(self.seed) 363 | if self.save_memory: 364 | self.model.low_vram_shift(is_diffusing=False) 365 | cond = {"c_concat": [control], "c_crossattn": [ 366 | self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]} 367 | un_cond = {"c_concat": None if self.guess_mode else [control], 368 | "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]} 369 | shape = (4, H // 8, W // 8) 370 | self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in 371 | range(13)] if self.guess_mode else ([self.strength] * 13) 372 | samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, 373 | eta=0., unconditional_guidance_scale=self.scale, 374 | unconditional_conditioning=un_cond) 375 | if self.save_memory: 376 | self.model.low_vram_shift(is_diffusing=False) 377 | x_samples = self.model.decode_first_stage(samples) 378 | x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 379 | 255).astype( 380 | np.uint8) 381 | updated_image_path = get_new_image_name(image_path, func_name="segment2image") 382 | real_image = Image.fromarray(x_samples[0]) # default the index0 image 383 | real_image.save(updated_image_path) 384 | return updated_image_path 385 | 386 | 387 | class BLIPVQA: 388 | def __init__(self, device): 389 | print("Initializing BLIP VQA to %s" % device) 390 | self.device = device 391 | self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") 392 | self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(self.device) 393 | 394 | 
def get_answer_from_question_and_image(self, inputs): 395 | image_path, question = inputs.split(",") 396 | raw_image = Image.open(image_path).convert('RGB') 397 | print(F'BLIPVQA :question :{question}') 398 | inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device) 399 | out = self.model.generate(**inputs) 400 | answer = self.processor.decode(out[0], skip_special_tokens=True) 401 | return answer 402 | 403 | 404 | class Whisper: 405 | def __init__(self, device): 406 | print("Initializing Whisper on device", device) 407 | self.model = whisper.load_model("medium.en", device=device) 408 | 409 | def transcribe(self, inputs): 410 | return self.model.transcribe(inputs)['text'] 411 | 412 | 413 | class coqui_tts: 414 | 415 | def __init__(self, device): 416 | self.device = device 417 | self.tts = TTS('tts_models/multilingual/multi-dataset/your_tts', gpu=self.device) 418 | 419 | def gen_speech_from_text(self, inputs): 420 | print("===>Starting text2speech Inference") 421 | filename = os.path.join('audio', str(uuid.uuid4())[:8] + ".wav") 422 | self.tts.tts_to_file(text=inputs, speaker=self.tts.speakers[0], language=self.tts.languages[0], 423 | file_path=filename) 424 | 425 | return "Audio generated in " + filename 426 | 427 | 428 | class TableQA: 429 | 430 | def __init__(self, device): 431 | self.device = device 432 | self.pipeline = pipeline(task="table-question-answering", model="google/tapas-large-finetuned-wtq", 433 | device=self.device) 434 | 435 | def get_answer_from_question_and_table(self, inputs): 436 | table_path = inputs.split(",")[0] 437 | questions = inputs.split(",")[1:] 438 | table = pd.read_csv(table_path, dtype=str) 439 | 440 | res = self.pipeline(table=table, query=questions) 441 | 442 | return res['answer'] 443 | 444 | 445 | class TwilioCaller: 446 | def parse_input(self, inputs): 447 | try: 448 | if 'and' in inputs: 449 | text: str = inputs.split("and")[0] 450 | phone_number = inputs.split("and")[1] 451 | elif ',' in inputs: 452 | text: str = inputs.split(",")[0] 453 | phone_number = inputs.split(",")[1:] 454 | if isinstance(phone_number, list): 455 | phone_number = ",".join(phone_number) 456 | else: 457 | raise Exception('Could not make the call, the input is not well formatted. Must be a comma separated string') 458 | except: 459 | raise Exception('Could not parse your input. Must be a comma separated string') 460 | text = text.replace('"', '').strip(' ') 461 | phone_number = phone_number.replace('"', '').strip(' ') 462 | if not re.match('\+[0-9]+', text) and not re.match('\+[0-9]+', phone_number): 463 | raise Exception('Could not make the call, no phone number provided') 464 | if re.match('\+[0-9]+', text) and not re.match('\+[0-9]+', phone_number): 465 | text, phone_number = phone_number, text 466 | return text, phone_number 467 | 468 | def call_with_text(self, inputs): 469 | import twilio 470 | try: 471 | text, phone_number = self.parse_input(inputs) 472 | except Exception as e: 473 | return str(e) 474 | from twilio_lib import call_with_text 475 | try: 476 | call_with_text(text, phone_number) 477 | except twilio.base.exceptions.TwilioRestException: 478 | return 'Internal error, could not submit the call.' 
479 | 480 | return 'Call submitted, it should be received soon' 481 | 482 | def call_with_audio(self, inputs): 483 | audio_filename = inputs.split(",")[0] 484 | phone_number = inputs.split(",")[1:] 485 | from twilio_lib import call_with_audio 486 | call_with_audio(audio_filename, phone_number) 487 | 488 | return 'Call submitted, it should be received soon' 489 | 490 | 491 | class ConversationBot: 492 | def __init__(self): 493 | print("Initializing AgentChain") 494 | self.llm = OpenAI(temperature=0) 495 | self.i2t = ImageCaptioning(device="cuda:1") # 1755 496 | self.t2i = T2I(device="cuda:1") # 6677 497 | self.image2pose = image2pose() 498 | self.pose2image = pose2image(device="cuda:1") # 6681 499 | self.BLIPVQA = BLIPVQA(device="cuda:1") # 2709 500 | self.image2seg = image2seg() 501 | self.seg2image = seg2image(device="cuda:1") # 5540 502 | ## up until now, comsuming 23362 MB on GPU 503 | self.pix2pix = Pix2Pix(device="cuda:0") # 2795 504 | self.coqui_tts = coqui_tts(device=False) 505 | self.tableQA = TableQA(device="cuda:0") 506 | self.whisper = Whisper(device="cuda:0") 507 | self.twilio_caller = TwilioCaller() 508 | self.extra_tools = ["serpapi", "llm-math", "python_repl", "requests", "terminal"] 509 | 510 | self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output') 511 | self.tools = [ 512 | Tool(name="Get Photo Description", func=self.i2t.inference, 513 | description="useful when you want to know what is inside the photo. receives image_path as input. " 514 | "The input to this tool should be a string, representing the image_path. "), 515 | Tool(name="Generate Image From User Input Text", func=self.t2i.inference, 516 | description="useful when you want to generate an image from a user input text and save it to a file. " 517 | "like: generate an image of an object or something, or generate an image that includes " 518 | "some objects." 519 | "The input to this tool should be a string, representing the text used to generate image. "), 520 | 521 | Tool(name="Instruct Image Using Text", func=self.pix2pix.inference, 522 | description="useful when you want to the style of the image to be like the text. like: make it look " 523 | "like a painting. or make it like a robot." 524 | "The input to this tool should be a comma separated string of two, representing the " 525 | "image_path and the text. "), 526 | Tool(name="Answer Question About The Image", func=self.BLIPVQA.get_answer_from_question_and_image, 527 | description="useful when you need an answer for a question based on an image. like: what is the " 528 | "background color of the last image, how many cats in this figure, what is in this figure." 529 | "The input to this tool should be a comma separated string of two, representing the " 530 | "image_path and the question"), 531 | Tool(name="Segmentation On Image", func=self.image2seg.inference, 532 | description="useful when you want to detect segmentations of the image. like: segment this image, " 533 | "or generate segmentations on this image, or perform segmentation on this image." 534 | "The input to this tool should be a string, representing the image_path"), 535 | Tool(name="Generate Image Condition On Segmentations", func=self.seg2image.inference, 536 | description="useful when you want to generate a new real image from both the user description and " 537 | "segmentations. like: generate a real image of a object or something from this " 538 | "segmentation image, or generate a new real image of a object or something from these " 539 | "segmentations." 
540 | "The input to this tool should be a comma separated string of two, representing the " 541 | "image_path and the user description"), 542 | Tool(name="Pose Detection On Image", func=self.image2pose.inference, 543 | description="useful when you want to detect the human pose of the image. like: generate human poses " 544 | "of this image, or generate a pose image from this image." 545 | "The input to this tool should be a string, representing the image_path"), 546 | Tool(name="Generate Image Condition On Pose Image", func=self.pose2image.inference, 547 | description="useful when you want to generate a new real image from both the user description and a " 548 | "human pose image. like: generate a real image of a human from this human pose image, " 549 | "or generate a new real image of a human from this pose." 550 | "The input to this tool should be a comma separated string of two, representing the " 551 | "image_path and the user description"), 552 | Tool(name="Generate Text from Audio", func=self.whisper.transcribe, 553 | description="useful when you want to generate text from audio. like: generate text from this audio, " 554 | "or transcribe this audio, or listen to this audio. receives audio_path as input." 555 | "The input to this tool should be a string, representing the audio_path"), 556 | Tool(name="Generate Speech From Text", func=self.coqui_tts.gen_speech_from_text, 557 | description="useful when you want to generate a speech from a text. like: generate a speech from " 558 | "this text, or say this text in audio. " 559 | "The input to this tool should be a string, representing the text to be converted to " 560 | "speech." 561 | ), 562 | Tool(name="Answer Question About The table", func=self.tableQA.get_answer_from_question_and_table, 563 | description="useful when you need an answer for a question based on a table. like: what is the " 564 | "maximum of the column age, or what is the sum of row 5 from the following table." 565 | "The input to this tool should be a comma separated string, representing the " 566 | "table_path and the questions"), 567 | Tool(name="Call a phone number with text", func=self.twilio_caller.call_with_text, 568 | description="useful when you need to call a phone number with a text input. like: call +4917424393190 and" 569 | " tell him \"happy birthday\". The input to this tool should be a comma separate string " 570 | "representing the text_input and the phone_number"), 571 | # Tool(name="Call a phone number with audio", func=self.twilio_caller.call_with_audio, 572 | # description="useful when you need to call a phone number with an audio file. like: call +4917424393190 and" 573 | # " using audio file audio/smth.wav. Only use audio files mentioned by the user." 
574 | # "The input to this tool should be a comma separated string representing the audio file name and the phone_number"), 575 | ] 576 | 577 | self.tools = self.tools + load_tools(self.extra_tools, llm=self.llm) 578 | 579 | self.agent = initialize_agent( 580 | self.tools, 581 | self.llm, 582 | agent="conversational-react-description", 583 | verbose=True, 584 | memory=self.memory, 585 | return_intermediate_steps=True, 586 | agent_kwargs={'prefix': AGENT_CHAIN_PREFIX, 'format_instructions': AGENT_CHAIN_FORMAT_INSTRUCTIONS, 587 | 'suffix': AGENT_CHAIN_SUFFIX}, ) 588 | 589 | def run_text(self, text, state): 590 | print("===============Running run_text =============") 591 | print("Inputs:", text, state) 592 | print("======>Previous memory:\n %s" % self.agent.memory) 593 | self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500) 594 | res = self.agent({"input": text}) 595 | print("======>Current memory:\n %s" % self.agent.memory) 596 | response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output']) 597 | audio_files = re.findall('(audio/\S*wav)', response) 598 | if len(audio_files) > 0: 599 | audio = audio_files[0] 600 | else: 601 | audio = None 602 | state = state + [(text, response)] 603 | print("Outputs:", state) 604 | return state, state, audio 605 | 606 | def run_image(self, image, state, txt): 607 | print("===============Running run_image =============") 608 | print("Inputs:", image, state) 609 | print("======>Previous memory:\n %s" % self.agent.memory) 610 | image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") 611 | print("======>Auto Resize Image...") 612 | img = Image.open(image.name) 613 | width, height = img.size 614 | ratio = min(512 / width, 512 / height) 615 | width_new, height_new = (round(width * ratio), round(height * ratio)) 616 | img = img.resize((width_new, height_new)) 617 | img = img.convert('RGB') 618 | img.save(image_filename, "PNG") 619 | print(f"Resize image form {width}x{height} to {width_new}x{height_new}") 620 | description = self.i2t.inference(image_filename) 621 | Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to " \ 622 | "understand this image, but you should use tools to finish following tasks, " \ 623 | "rather than directly imagine from my description. If you understand, say \"Received\". \n".format( 624 | image_filename, description) 625 | AI_prompt = "Received. " 626 | self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt 627 | print("======>Current memory:\n %s" % self.agent.memory) 628 | state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)] 629 | print("Outputs:", state) 630 | return state, state, txt + ' ' + image_filename + ' ' 631 | 632 | def run_audio(self, audio, state, txt): 633 | print("===============Running run_audio =============") 634 | print("Inputs:", audio, state) 635 | print("======>Previous memory:\n %s" % self.agent.memory) 636 | audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav") 637 | import shutil 638 | shutil.copyfile(audio, audio_filename) 639 | transcribed_text = self.whisper.transcribe(audio_filename) 640 | Human_prompt = "\nHuman: provide audio named {}. The description is: {}. This information helps you to " \ 641 | "understand this audio, but you should use tools to finish following tasks, " \ 642 | "rather than directly imagine from my description. If you understand, say \"Received\". 
\n".format( 643 | audio_filename, transcribed_text) 644 | 645 | AI_prompt = "Received. " 646 | self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt 647 | print("======>Current memory:\n %s" % self.agent.memory) 648 | state = state + [(f"![](/file={audio_filename})*{audio_filename}*", AI_prompt)] 649 | print("Outputs:", state) 650 | return state, audio, state, txt + ' ' + audio_filename + ' ' 651 | 652 | 653 | def run_df(self, df, state, txt): 654 | print("===============Running run_df =============") 655 | print("Inputs:", df, state) 656 | print("======>Previous memory:\n %s" % self.agent.memory) 657 | csv_filename = os.path.join('csv', str(uuid.uuid4())[0:8] + ".csv") 658 | df.to_csv(csv_filename, index=False) 659 | Human_prompt = "\nHuman: provided a csv file named {}. You can specifically use the tool \"Answer Question About The table\" to understand this file. If you understand, say \"Received\". \n".format( 660 | csv_filename) 661 | 662 | AI_prompt = "Received. " 663 | self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt 664 | print("======>Current memory:\n %s" % self.agent.memory) 665 | state = state + [(f"![](/file={csv_filename})*{csv_filename}*", AI_prompt)] 666 | print("Outputs:", state) 667 | return state, state, txt + ' ' + csv_filename + ' ' 668 | 669 | 670 | if __name__ == '__main__': 671 | bot = ConversationBot() 672 | with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo: 673 | chatbot = gr.Chatbot(elem_id="chatbot", label="AgentChain") 674 | state = gr.State([]) 675 | with gr.Row(): 676 | with gr.Column(scale=0.8): 677 | txt = gr.Textbox(placeholder="Enter text and press enter", label='Instruct with text').style( 678 | container=False) 679 | with gr.Column(scale=0.2, min_width=0): 680 | btn = gr.UploadButton("Upload Image", file_types=["image"]) 681 | with gr.Row(): 682 | with gr.Column(scale=0.5, min_width=0): 683 | with gr.Row(): 684 | with gr.Column(scale=0.5, min_width=0): 685 | input_audio = gr.Audio(source="upload", type="filepath", label="Upload Audio Input") 686 | with gr.Column(scale=0.5, min_width=0): 687 | output_audio = gr.Audio(type="filepath", label='Audio output', interactive=False) 688 | with gr.Row(): 689 | clear = gr.Button("Clear Chat History") 690 | with gr.Column(scale=0.5, min_width=0): 691 | with gr.Row(): 692 | df = gr.DataFrame(interactive=True, row_count=1, col_count=1, headers=['Column1'], label="Give a Dataframe as input") 693 | with gr.Row(): 694 | with gr.Column(scale=0.8, min_width=0): 695 | persist_df = gr.Button("Upload the dataframe") 696 | 697 | input_audio.upload(bot.run_audio, [input_audio, state, txt], [chatbot, output_audio, state, txt]) 698 | # audio.upload(bot.run_audio, [audio, state, txt], [chatbot, audio, state, txt]) 699 | txt.submit(bot.run_text, [txt, state], [chatbot, state, output_audio]) 700 | txt.submit(lambda: "", None, txt) 701 | btn.upload(bot.run_image, [btn, state, txt], [chatbot, state, txt]) 702 | persist_df.click(bot.run_df, [df, state, txt], [chatbot, state, txt]) 703 | clear.click(bot.memory.clear) 704 | clear.click(lambda: [], None, chatbot) 705 | clear.click(lambda: [], None, state) 706 | 707 | demo.launch(server_name="0.0.0.0", server_port=7861) 708 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | torchvision 3 | numpy==1.23.1 4 | transformers==4.26.1 5 | 
albumentations==1.3.0 6 | opencv-python==4.7.0.72 7 | imageio==2.9.0 8 | imageio-ffmpeg==0.4.2 9 | pytorch-lightning==1.5.0 10 | omegaconf==2.1.1 11 | test-tube>=0.7.5 12 | streamlit==1.12.1 13 | einops==0.3.0 14 | webdataset==0.2.5 15 | kornia==0.6 16 | open_clip_torch==2.0.2 17 | invisible-watermark>=0.1.5 18 | streamlit-drawable-canvas==0.8.0 19 | torchmetrics==0.6.0 20 | timm==0.6.12 21 | addict==2.4.0 22 | yapf==0.32.0 23 | prettytable==3.6.0 24 | safetensors==0.2.7 25 | basicsr==1.4.2 26 | langchain==0.0.101 27 | diffusers 28 | gradio 29 | openai 30 | accelerate 31 | openai-whisper==20230308 32 | TTS 33 | pandas 34 | twilio 35 | boto3 36 | google-search-results -------------------------------------------------------------------------------- /twilio_lib.py: -------------------------------------------------------------------------------- 1 | import os 2 | from twilio.rest import Client 3 | from twilio.twiml.voice_response import VoiceResponse, Say 4 | 5 | def write_text_twilml(text): 6 | response = VoiceResponse() 7 | response.say(text) 8 | import tempfile 9 | 10 | # Create a temporary file 11 | with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: 12 | # Write a string to the file 13 | temp_file.write(str(response)) 14 | 15 | # Get the file name 16 | file_name = temp_file.name 17 | return file_name 18 | 19 | def write_voice_twilml(audio_url): 20 | response = VoiceResponse() 21 | response.play(audio_url) 22 | import tempfile 23 | 24 | # Create a temporary file 25 | with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: 26 | # Write a string to the file 27 | temp_file.write(str(response)) 28 | 29 | # Get the file name 30 | file_name = temp_file.name 31 | return file_name 32 | 33 | def push_to_s3(file_name, extension, content_type=None): 34 | import boto3 35 | import uuid 36 | 37 | s3 = boto3.client('s3') 38 | 39 | bucket_name = os.getenv('AWS_S3_BUCKET_NAME', 'god-llm') 40 | file_name = file_name 41 | object_key = f'twilml/{str(uuid.uuid4())[0:4]}.{extension}' 42 | extra_args = {'ACL': 'public-read'} 43 | if content_type: 44 | extra_args["ContentType"] = content_type 45 | s3.upload_file(file_name, bucket_name, object_key, ExtraArgs=extra_args) 46 | return f'https://{bucket_name}.s3.eu-central-1.amazonaws.com/{object_key}' 47 | 48 | def call_with_twilml_url(twilml_url, phone_number): 49 | # Find your Account SID and Auth Token at twilio.com/console 50 | # and set the environment variables. See http://twil.io/secure 51 | account_sid = os.environ['TWILIO_ACCOUNT_SID'] 52 | auth_token = os.environ['TWILIO_AUTH_TOKEN'] 53 | client = Client(account_sid, auth_token) 54 | 55 | call = client.calls.create( 56 | method='GET', 57 | url=twilml_url, 58 | to=phone_number, 59 | from_='+15673393771' 60 | ) 61 | return call 62 | 63 | def call_with_text(text, phone_number): 64 | file_name = write_text_twilml(text) 65 | twilml_url = push_to_s3(file_name, extension='xml', content_type="text/xml") 66 | call_with_twilml_url(twilml_url, phone_number) 67 | 68 | 69 | def call_with_audio(audio_file, phone_number): 70 | audio_url = push_to_s3(audio_file, extension='wav', content_type="audio/wav") 71 | file_name = write_voice_twilml(audio_url) 72 | twilml_url = push_to_s3(file_name, extension='xml', content_type="text/xml") 73 | call_with_twilml_url(twilml_url, phone_number) 74 | --------------------------------------------------------------------------------