├── .gitignore ├── LICENSE.md ├── README.md ├── agentchain-logo.png ├── architecture.png ├── architecture.svg ├── audio └── .gitignore ├── csv └── .gitignore ├── docker ├── Dockerfile └── deploying-from-docker.md ├── download.sh ├── image └── .gitignore ├── main.py ├── requirements.txt └── twilio_lib.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | .idea/ -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | ![AgentChain logo](agentchain-logo.png) 3 |

4 | 5 | AgentChain uses Large Language Models (LLMs) for planning and orchestrating multiple Agents or Large Models (LMs) to accomplish sophisticated tasks. AgentChain is fully multimodal: it accepts text, image, audio, and tabular data as input and output. 6 | 7 | - **🧠 LLMs as the brain:** AgentChain leverages state-of-the-art Large Language Models to plan and make decisions based on natural language inputs. This makes AgentChain a versatile tool for a wide range of applications, such as executing tasks from natural language instructions, data understanding, and data generation. 8 | - **🌟 Fully Multimodal IO:** AgentChain is fully multimodal, accepting input and output in various modalities, such as text, image, audio, or video (coming soon). This makes AgentChain a versatile tool for applications such as computer vision, speech recognition, and converting from one modality to another. 9 | - **🤝 Orchestrate Versatile Agents:** AgentChain can orchestrate multiple agents to perform complex tasks. Through composability and hierarchical structuring of tools, AgentChain can intelligently choose which tools to use, and when, for a given task. This makes AgentChain a powerful tool for projects that require complex combinations of tools. 10 | - **🔧 Customizable for Ad-hoc Needs:** AgentChain can be customized to fit specific project requirements, making it a versatile tool for a wide range of applications. Specific requirements can be met by extending its capabilities with new agents (with a distributed architecture coming soon). 11 | 12 | 13 | 14 | 15 | 16 | # Get started 17 | 1. Install requirements: `pip install -r requirements.txt` 18 | 2. Download model checkpoints: `bash download.sh` 19 | 3. Depending on which agents you need, export the corresponding environment variables: 20 | 21 | ```shell 22 | export OPENAI_API_KEY={YOUR_OPENAI_API_KEY} # mandatory, since the LLM is central to this application 23 | export SERPAPI_API_KEY={YOUR_SERPAPI_API_KEY} # include a SerpAPI key if you need the agent to be able to search the web 24 | 25 | # These environment variables are needed if you want the agent to be able to make phone calls 26 | export AWS_ACCESS_KEY_ID={YOUR_AWS_ACCESS_KEY_ID} 27 | export AWS_SECRET_ACCESS_KEY={YOUR_AWS_SECRET_ACCESS_KEY} 28 | export TWILIO_ACCOUNT_SID={YOUR_TWILIO_ACCOUNT_SID} 29 | export TWILIO_AUTH_TOKEN={YOUR_TWILIO_AUTH_TOKEN} 30 | export AWS_S3_BUCKET_NAME={YOUR_AWS_S3_BUCKET_NAME} # make sure to create an S3 bucket with public access 31 | ``` 32 | 4. Install the `ffmpeg` library (needed for Whisper): `sudo apt update && sudo apt install ffmpeg` (Ubuntu) 33 | 5. Run the main script: `python main.py` 34 | 35 | 36 | ## System requirements 37 | As of [this commit](https://github.com/jina-ai/agentchain/commit/da588a728c390fb538fd361d4f41dd50aa193751), you need at least 29 GB of GPU memory to run AgentChain. 38 | Also make sure to assign GPU devices correctly in `main.py`. 39 | 40 | You can comment out some tools and models to reduce the GPU memory footprint (at the cost of reduced capabilities). 41 | 42 | 43 | # Demo 44 | 45 | 46 | AgentChain demo 1: transcribing audio and visualizing the result as an image. A video of the AgentChain interface shows an uploaded audio file and the resulting generated image, which is a representation of the audio content.
47 | 48 | https://user-images.githubusercontent.com/4182659/225347932-87298e6c-58d0-4a29-892f-1398b1406c15.mp4 49 | 50 | --- 51 | 52 | AgentChain demo 2: asking questions about an image. A video of the AgentChain interface shows an image and a question being asked about it, with the resulting answer displayed below. 53 | 54 | https://user-images.githubusercontent.com/4182659/225348027-ed30f9d5-d05b-405a-9651-c08f4976cf83.mp4 55 | 56 | --- 57 | 58 | AgentChain demo 3: question answering on tabular data and making a phone call to report the results. A video of the AgentChain interface shows a table of data with a question being asked and the resulting answer displayed, followed by a phone call made using the `CommsAgent`. 59 | 60 | https://user-images.githubusercontent.com/4182659/225348128-6e9bdb3b-78ed-49e8-80f5-fd7c9ad66f28.mp4 61 | 62 | # Agents in AgentChain 63 | 64 | > The content of this document mostly shows **our vision** and **what we aim to achieve** with AgentChain. 65 | Check the Demo section to see what we have achieved so far. 66 | 67 | ![](architecture.svg) 68 | 69 | AgentChain is a sophisticated system designed to solve general problems. It can orchestrate multiple agents to tackle sub-problems. These agents are organized into different groups, each with its own set of capabilities and functionalities. Here are some of the agent groups in AgentChain: 70 | 71 | ### SearchAgents 72 | The `SearchAgents` group is responsible for gathering information from various sources, including search engines, online databases, and APIs. The agents in this group are highly skilled at retrieving up-to-date knowledge about the world. Some examples of agents in this group include the `Google Search API`, `Bing API`, `Wikipedia API`, and `Serp`. 73 | 74 | ### CommsAgents 75 | The `CommsAgents` group is responsible for handling communication between different parties, such as sending emails, making phone calls, or messaging via various platforms. The agents in this group can integrate with a wide range of platforms. Some examples of agents in this group include `TwilioCaller`, `TwilioEmailWriter`, `TwilioMessenger` and `Slack`. 76 | 77 | ### ToolsAgents 78 | The `ToolsAgents` group is responsible for various computational tasks, such as performing calculations, running scripts, or executing commands. The agents in this group can work with a wide range of programming languages and tools. Some examples of agents in this group include `Math`, `Python REPL`, and `Terminal`. 79 | 80 | ### MultiModalAgents 81 | The `MultiModalAgents` group is responsible for handling input and output in various modalities, such as text, image, audio, or video (coming soon). The agents in this group can process and understand different modalities. Some examples of agents in this group include `OpenAI Whisper`, `Blip2`, `Coqui`, and `StableDiffusion`. 82 | 83 | ### ImageAgents 84 | The `ImageAgents` group is responsible for processing and manipulating images, for tasks such as enhancing image quality, detecting objects, or recognizing image content. The agents in this group can perform complex operations on images. Some examples of agents in this group include `Upscaler`, `ControlNet` and `YOLO`. 85 | 86 | ### DBAgents 87 | The `DBAgents` group is responsible for adding data to and fetching data from your database, for example retrieving metrics or aggregations. The agents in this group interact with databases and enrich other agents with information from your data.
Some examples of agents in this group include `SQL`, `MongoDB`, `ElasticSearch`, `Qdrant` and `Notion`. 88 | 89 | 90 | # Potential Applications 91 | 92 | ### Example 1: 🏝️📸🌅 AgentChain Image Generation System for Travel Company 93 | For a travel company promoting a new and exotic destination, it is crucial to have high-quality images that can grab the attention of potential travelers. However, manually creating stunning images can be time-consuming and expensive. That's why the travel company wants to use AgentChain to automate the image generation process and create beautiful visuals with the help of various agents. 94 | 95 | Here is how AgentChain can help by chaining different agents together: 96 | 1. Use `SearchAgent` (`Google Search API`, `Wikipedia API`, `Serp`) to gather information and inspiration about the destination, such as the most popular landmarks, the local cuisine, and the unique features of the location. 97 | 2. Use `ImageAgent` (`Upscaler`) to enhance the quality of images and make them more appealing, using state-of-the-art algorithms to increase the resolution and remove noise from the images. 98 | 3. Use `MultiModalAgent` (`Blip2`) to generate descriptive captions for the images, providing more context and making the images more meaningful. 99 | 4. Use `CommsAgent` (`TwilioEmailWriter`) to send the images to the target audience via email or other messaging platforms, attracting potential travelers with stunning visuals and promoting the new destination. 100 | 101 | ### Example 2: 💼💹📈 AgentChain Financial Analysis Report for Investment Firm 102 | For an investment firm that manages a large portfolio of stocks, it is critical to stay up-to-date with the latest market trends and analyze the performance of different stocks to make informed investment decisions. However, analyzing data from multiple sources can be time-consuming and error-prone. That's why the investment firm wants to use AgentChain to automate the analysis process and generate reports with the help of various agents. 103 | 104 | Here is how AgentChain can help by chaining different agents together: 105 | 1. Use `ToolsAgent` (`Python REPL`, `TableQA`) to analyze data from different sources (e.g., CSV files, stock market APIs) and perform calculations related to financial metrics such as earnings, dividends, and P/E ratios. 106 | 2. Use `SearchAgent` (`Bing API`) to gather news and information related to the stocks in the portfolio, such as recent earnings reports, industry trends, and analyst ratings. 107 | 3. Use `NLPAgent` (`GPT`) to create a summary and bullet points of the news and information gathered, providing insights into market sentiment and potential trends. 108 | 4. Use `CommsAgent` (`TwilioEmailWriter`) to send a summary report of the analysis to the appropriate stakeholders, helping them make informed decisions about their investments. 109 | 110 | ### Example 3: 🛍️💬💻 AgentChain Customer Service Chatbot for E-commerce Site 111 | For an e-commerce site that wants to provide excellent customer service, it is crucial to have a chatbot that can handle customer inquiries and support requests in a timely and efficient manner. However, building a chatbot that can understand and respond to complex customer requests can be challenging. That's why the e-commerce site wants to use AgentChain to automate the chatbot process and provide superior customer service with the help of various agents. 112 | 113 | Here is how AgentChain can help by chaining different agents together: 114 | 1.
Use `MultiModalAgent` (`Blip2`, `Whisper`) to handle input from various modalities (text, image, audio), making it easier for customers to ask questions and make requests in a natural way. 115 | 2. Use `SearchAgent` (`Google Search API`, `Wikipedia API`) or `DBAgent` to provide information about products or services, whether in-house or public, such as specifications, pricing, and availability. 116 | 3. Use `CommsAgent` (`TwilioMessenger`) to communicate with customers via messaging platforms, providing support and answering questions in real time. 117 | 4. Use `ToolsAgent` (`Math`) to perform calculations related to discounts, taxes, or shipping costs, helping customers make informed decisions about their purchases. 118 | 5. Use `MultiModalAgent` (`Coqui`) to generate natural-sounding responses and hold more complex conversations, providing a personalized and engaging experience for customers. 119 | 120 | ### Example 4: 🧑‍⚕️💊💤 AgentChain Personal Health Assistant 121 | Access to personal health assistance can be expensive and limited, so a personal health assistant that helps individuals manage their health and well-being is essential. However, providing personalized health advice and reminders can be challenging, especially for seniors. That's why AgentChain aims to automate the health assistant process and provide personalized support with the help of various agents. 122 | 123 | Here is how AgentChain can help by chaining different agents together (a minimal code sketch of this chaining pattern follows the list): 124 | 1. Use `DBAgent` to handle input from various health monitoring devices (e.g., heart rate monitors, blood pressure monitors, sleep trackers), providing real-time health data and alerts to the health assistant. 125 | 2. Use `SearchAgent` (`Google Search API`, `Wikipedia API`) or any other medical database to provide information about health topics and medications, such as side effects, dosage, and interactions. 126 | 3. Use `NLPAgent` (`GPT`) to generate personalized recommendations for diet, exercise, and medication, taking into account the user's health goals and preferences. 127 | 4. Use `CommsAgent` (`TwilioCaller`, `TwilioMessenger`) to give advice, send reminders, and provide alerts that help users stay on track with their health goals, improving their quality of life and reducing the need for emergency care.
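All of the examples above follow the same chaining pattern that `main.py` implements: each capability is wrapped as a LangChain `Tool` with a natural-language description, and the LLM decides which tools to call and in which order. The sketch below illustrates that pattern with two hypothetical placeholder tools (`answer_table_question` and `send_report`); in this repo the real counterparts are classes such as `TableQA` and `TwilioCaller` in `main.py`.

```python
# Minimal sketch of the chaining pattern used in main.py (LangChain 0.0.x API).
# answer_table_question and send_report are hypothetical placeholders standing in
# for the real TableQA / TwilioCaller wrappers defined in main.py.
from langchain.agents.initialize import initialize_agent
from langchain.agents.tools import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.llms.openai import OpenAI


def answer_table_question(inputs: str) -> str:
    # expects "table_path,question" and returns an answer string
    return "placeholder answer"


def send_report(inputs: str) -> str:
    # expects "report text,phone_number" and returns a confirmation string
    return "Report sent"


llm = OpenAI(temperature=0)
tools = [
    Tool(name="Answer Question About The Table", func=answer_table_question,
         description="Input: a comma separated string of table_path and question."),
    Tool(name="Send Report", func=send_report,
         description="Input: a comma separated string of report text and phone number."),
]
memory = ConversationBufferMemory(memory_key="chat_history")
agent = initialize_agent(tools, llm, agent="conversational-react-description",
                         verbose=True, memory=memory)

# The LLM plans the tool sequence from a single natural-language instruction, e.g.:
# agent.run("Summarize csv/portfolio.csv and send the summary to +15551234567")
```

The same mechanism scales to every example above: adding a capability only means registering another `Tool`; the planning itself stays with the LLM.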
128 | 129 | 130 | ## Acknowledgements 131 | We appreciate the open source of the following projects: 132 | 133 | [Hugging Face](https://github.com/huggingface)   134 | [LangChain](https://github.com/hwchase17/langchain)   135 | [Stable Diffusion](https://github.com/CompVis/stable-diffusion)   136 | [ControlNet](https://github.com/lllyasviel/ControlNet)   137 | [InstructPix2Pix](https://github.com/timothybrooks/instruct-pix2pix)   138 | [CLIPSeg](https://github.com/timojl/clipseg)   139 | [BLIP](https://github.com/salesforce/BLIP)   140 | [Microsoft](https://github.com/microsoft/visual-chatgpt)   141 | -------------------------------------------------------------------------------- /agentchain-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/agentchain/55f2d9fc1bc78a844d4a3313b4753bae433ef775/agentchain-logo.png -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/agentchain/55f2d9fc1bc78a844d4a3313b4753bae433ef775/architecture.png -------------------------------------------------------------------------------- /audio/.gitignore: -------------------------------------------------------------------------------- 1 | *.wav -------------------------------------------------------------------------------- /csv/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image with CUDA 11.3.0 and Ubuntu 20.04 2 | FROM nvidia/cuda:11.3.0-base-ubuntu20.04 3 | 4 | # Install python3 and pip 5 | RUN apt update && apt install -y python3-pip 6 | # Install opencv dependencies 7 | RUN DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends libsm6 libxext6 libxrender-dev ffmpeg 8 | 9 | # Set the working directory to /app 10 | WORKDIR /app 11 | 12 | # Install any needed packages specified in requirements.txt 13 | COPY requirements.txt /app 14 | RUN pip install -r requirements.txt 15 | 16 | ENTRYPOINT bash -------------------------------------------------------------------------------- /docker/deploying-from-docker.md: -------------------------------------------------------------------------------- 1 | # Deploying with docker 2 | 3 | Dependency management can be challenging, but fortunately there is a solution: Docker. Using Docker to deploy AgentChain offers an easy and reproducible way to get started. Since the agents are deployed on GPU, you will need the NVIDIA Container Toolkit. 4 | 5 | If you've never used Docker with a GPU before, follow the Toolkit installation instructions: 6 | https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#setting-up-nvidia-container-toolkit 7 | 8 | ## Building the image 9 | 10 | Building the image is pretty straightforward 11 | 12 | ```bash 13 | cp ./requirements.txt ./docker 14 | cd docker 15 | docker build -t agentchain . 16 | cd .. 17 | ``` 18 | 19 | ## Download model checkpoints 20 | 21 | ```bash 22 | bash download.sh 23 | ``` 24 | 25 | The model checkpoints are 44GB in total so this can take a while. 
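If you want to confirm that the download completed before starting the container, a quick size check is enough. This is a minimal sketch, assuming the default layout created by `download.sh` (checkpoints under `ControlNet/models/`):

```bash
# Each control_sd15_*.pth checkpoint is roughly 5.5 GB; eight of them make up the ~44GB total.
ls ControlNet/models/control_sd15_*.pth | wc -l   # should print 8
du -sh ControlNet/models
```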
26 | 27 | ## Run container 28 | 29 | ```bash 30 | docker run --name agentchain -it -v $(pwd):/app --gpus all -p 7861:7861 agentchain 31 | ``` 32 | 33 | ## Set env variable and start server 34 | 35 | You will now be in a bash shell. Here you need to export the API keys as environment variable for the server. The Open AI API key and the Serp API key are required as they power the main agent and the search agent respectively. 36 | 37 | ```bash 38 | OPENAI_API_KEY= 39 | SERPAPI_API_KEY= 40 | ``` 41 | 42 | (Optional) If you want the CommsAgent to be able to make phone calls you will need to export a few more variables. The AWS_S3_BUCKET_NAME specified needs to be a public access bucket. 43 | 44 | ```bash 45 | AWS_ACCESS_KEY_ID={YOUR_AWS_ACCESS_KEY_ID} 46 | AWS_SECRET_ACCESS_KEY={YOUR_AWS_SECRET_ACCESS_KEY} 47 | TWILIO_ACCOUNT_SID={YOUR_TWILIO_ACCOUNT_SID} 48 | TWILIO_AUTH_TOKEN={YOUR_TWILIO_AUTH_TOKEN} 49 | AWS_S3_BUCKET_NAME={YOUR_AWS_S3_BUCKET_NAME} 50 | ``` 51 | 52 | You can now start the server by running the main script. 53 | 54 | ```bash 55 | python3 main.py 56 | ``` 57 | 58 | The server may take about an hour before serving the first time as there are a few more model checkpoints to install. The installs may also timeout the first time in which case, you can run `python [main.py](http://main.py)` again to resume downloading checkpoints. -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/lllyasviel/ControlNet.git 2 | ln -s ControlNet/ldm ./ldm 3 | ln -s ControlNet/cldm ./cldm 4 | ln -s ControlNet/annotator ./annotator 5 | cd ControlNet/models 6 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_canny.pth 7 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_depth.pth 8 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_hed.pth 9 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_mlsd.pth 10 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_normal.pth 11 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_openpose.pth 12 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_scribble.pth 13 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_seg.pth 14 | cd ../../ 15 | -------------------------------------------------------------------------------- /image/.gitignore: -------------------------------------------------------------------------------- 1 | *.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import re 4 | import sys 5 | import uuid 6 | 7 | import cv2 8 | import einops 9 | import gradio as gr 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | import whisper 14 | from ControlNet.annotator.openpose import OpenposeDetector 15 | from ControlNet.annotator.uniformer import UniformerDetector 16 | from ControlNet.annotator.util import HWC3, resize_image 17 | from ControlNet.cldm.ddim_hacked import DDIMSampler 18 | from ControlNet.cldm.model import create_model, load_state_dict 19 | from diffusers import (EulerAncestralDiscreteScheduler, 20 | StableDiffusionInstructPix2PixPipeline, 21 | 
StableDiffusionPipeline) 22 | from langchain.agents import load_tools 23 | from langchain.agents.initialize import initialize_agent 24 | from langchain.agents.tools import Tool 25 | from langchain.chains.conversation.memory import ConversationBufferMemory 26 | from langchain.llms.openai import OpenAI 27 | from ldm.util import instantiate_from_config 28 | from omegaconf import OmegaConf 29 | from PIL import Image 30 | from pytorch_lightning import seed_everything 31 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 32 | BlipForConditionalGeneration, 33 | BlipForQuestionAnswering, BlipProcessor, 34 | CLIPSegForImageSegmentation, CLIPSegProcessor, 35 | pipeline) 36 | from TTS.api import TTS 37 | 38 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 39 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 40 | 41 | AGENT_CHAIN_PREFIX = """AgentChain is designed to be able to assist with a wide range of text, visual and audio 42 | related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of 43 | topics. AgentChain is able to generate human-like text based on the input it receives, allowing it to engage in 44 | natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. 45 | 46 | AgentChain is able to process and understand large amounts of text, images and audios. As a language model, 47 | AgentChain can not directly read images or audio, but it has a list of tools to finish different visual, text, audio, 48 | math and reasoning tasks. Each image will have a file name formed as "image/xxx.png", and AgentChain can invoke 49 | different tools to indirectly understand pictures. Each audio will have a file name formed as "audio/xxx.wav", 50 | and AgentChain can invoke different tools to indirectly understand audio. When talking about audio, AgentChain is very 51 | strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, 52 | AgentChain is also known that the image may not be the same as the user's demand, and will use other visual 53 | question answering tools or description tools to observe the real image. AgentChain is able to use tools in a sequence, 54 | and is loyal to the tool observation outputs rather than faking the image content and image file name. It will 55 | remember to provide the file name from the last tool observation, if a new image is generated. 56 | 57 | Human may provide new figures to AgentChain with a description. The description helps AgentChain to understand this 58 | image, but AgentChain should use tools to finish following tasks, rather than directly imagine from the description. 59 | 60 | Overall, AgentChain is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide 61 | valuable insights and information on a wide range of topics. 62 | 63 | 64 | TOOLS: 65 | ------ 66 | 67 | AgentChain has access to the following tools:""" 68 | 69 | AGENT_CHAIN_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format: 70 | 71 | ``` 72 | Thought: Do I need to use a tool? Yes 73 | Action: the action to take, should be one of [{tool_names}] 74 | Action Input: the input to the action 75 | Observation: the result of the action 76 | ``` 77 | 78 | When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format: 79 | 80 | ``` 81 | Thought: Do I need to use a tool? 
No 82 | {ai_prefix}: [your response here] 83 | ``` 84 | """ 85 | 86 | AGENT_CHAIN_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if it does 87 | not exist. You will remember to provide the image file name loyally if it's provided in the last tool observation. 88 | 89 | Begin! 90 | 91 | Previous conversation history: 92 | {chat_history} 93 | 94 | New input: {input} 95 | Since AgentChain is a text language model, AgentChain must use tools to observe images or audio rather than 96 | imagination. The thoughts and observations are only visible for AgentChain, AgentChain should remember to repeat 97 | important information in the final response for Human. Thought: Do I need to use a tool? {agent_scratchpad}""" 98 | 99 | 100 | def cut_dialogue_history(history_memory, keep_last_n_words=500): 101 | tokens = history_memory.split() 102 | n_tokens = len(tokens) 103 | print(f"hitory_memory:{history_memory}, n_tokens: {n_tokens}") 104 | if n_tokens < keep_last_n_words: 105 | return history_memory 106 | else: 107 | paragraphs = history_memory.split('\n') 108 | last_n_tokens = n_tokens 109 | while last_n_tokens >= keep_last_n_words: 110 | last_n_tokens = last_n_tokens - len(paragraphs[0].split(' ')) 111 | paragraphs = paragraphs[1:] 112 | return '\n' + '\n'.join(paragraphs) 113 | 114 | 115 | def get_new_image_name(org_img_name, func_name="update"): 116 | head_tail = os.path.split(org_img_name) 117 | head = head_tail[0] 118 | tail = head_tail[1] 119 | name_split = tail.split('.')[0].split('_') 120 | this_new_uuid = str(uuid.uuid4())[0:4] 121 | if len(name_split) == 1: 122 | most_org_file_name = name_split[0] 123 | recent_prev_file_name = name_split[0] 124 | new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) 125 | else: 126 | assert len(name_split) == 4 127 | most_org_file_name = name_split[3] 128 | recent_prev_file_name = name_split[0] 129 | new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) 130 | return os.path.join(head, new_file_name) 131 | 132 | 133 | def create_model(config_path, device): 134 | config = OmegaConf.load(config_path) 135 | OmegaConf.update(config, "model.params.cond_stage_config.params.device", device) 136 | model = instantiate_from_config(config.model).cpu() 137 | print(f'Loaded model config from [{config_path}]') 138 | return model 139 | 140 | 141 | class MaskFormer: 142 | def __init__(self, device): 143 | self.device = device 144 | self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") 145 | self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device) 146 | 147 | def inference(self, image_path, text): 148 | threshold = 0.5 149 | min_area = 0.02 150 | padding = 20 151 | original_image = Image.open(image_path) 152 | image = original_image.resize((512, 512)) 153 | inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt", ).to(self.device) 154 | with torch.no_grad(): 155 | outputs = self.model(**inputs) 156 | mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold 157 | area_ratio = len(np.argwhere(mask)) / (mask.shape[0] * mask.shape[1]) 158 | if area_ratio < min_area: 159 | return None 160 | true_indices = np.argwhere(mask) 161 | mask_array = np.zeros_like(mask, dtype=bool) 162 | for idx in true_indices: 163 | padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx) 164 | mask_array[padded_slice] = 
True 165 | visual_mask = (mask_array * 255).astype(np.uint8) 166 | image_mask = Image.fromarray(visual_mask) 167 | return image_mask.resize(image.size) 168 | 169 | 170 | class Pix2Pix: 171 | def __init__(self, device): 172 | print("Initializing Pix2Pix to %s" % device) 173 | self.device = device 174 | self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", 175 | torch_dtype=torch.float16, 176 | safety_checker=None).to(device) 177 | self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config) 178 | 179 | def inference(self, inputs): 180 | """Change style of image.""" 181 | print("===>Starting Pix2Pix Inference") 182 | image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) 183 | original_image = Image.open(image_path) 184 | image = \ 185 | self.pipe(instruct_text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2, ).images[0] 186 | updated_image_path = get_new_image_name(image_path, func_name="pix2pix") 187 | image.save(updated_image_path) 188 | return updated_image_path 189 | 190 | 191 | class T2I: 192 | def __init__(self, device): 193 | print("Initializing T2I to %s" % device) 194 | self.device = device 195 | self.pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) 196 | self.text_refine_tokenizer = AutoTokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion") 197 | self.text_refine_model = AutoModelForCausalLM.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion") 198 | self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model, 199 | tokenizer=self.text_refine_tokenizer, device=self.device) 200 | self.pipe.to(device) 201 | 202 | def inference(self, text): 203 | image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") 204 | refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"] 205 | print(f'{text} refined to {refined_text}') 206 | image = self.pipe(refined_text).images[0] 207 | image.save(image_filename) 208 | print(f"Processed T2I.run, text: {text}, image_filename: {image_filename}") 209 | return image_filename 210 | 211 | 212 | class ImageCaptioning: 213 | def __init__(self, device): 214 | print("Initializing ImageCaptioning to %s" % device) 215 | self.device = device 216 | self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 217 | self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to( 218 | self.device) 219 | 220 | def inference(self, image_path): 221 | inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device) 222 | out = self.model.generate(**inputs) 223 | captions = self.processor.decode(out[0], skip_special_tokens=True) 224 | return captions 225 | 226 | 227 | class image2pose: 228 | def __init__(self): 229 | print("Direct human pose.") 230 | self.detector = OpenposeDetector() 231 | self.resolution = 512 232 | 233 | def inference(self, inputs): 234 | print("===>Starting image2pose Inference") 235 | image = Image.open(inputs) 236 | image = np.array(image) 237 | image = HWC3(image) 238 | detected_map, _ = self.detector(resize_image(image, self.resolution)) 239 | detected_map = HWC3(detected_map) 240 | image = resize_image(image, self.resolution) 241 | H, W, C = image.shape 242 | detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) 243 | updated_image_path = get_new_image_name(inputs, 
func_name="human-pose") 244 | image = Image.fromarray(detected_map) 245 | image.save(updated_image_path) 246 | return updated_image_path 247 | 248 | 249 | class pose2image: 250 | def __init__(self, device): 251 | print("Initialize the pose2image model...") 252 | model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device) 253 | model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_openpose.pth', location='cpu')) 254 | self.model = model.to(device) 255 | self.device = device 256 | self.ddim_sampler = DDIMSampler(self.model) 257 | self.ddim_steps = 20 258 | self.image_resolution = 512 259 | self.num_samples = 1 260 | self.save_memory = False 261 | self.strength = 1.0 262 | self.guess_mode = False 263 | self.scale = 9.0 264 | self.seed = -1 265 | self.a_prompt = 'best quality, extremely detailed' 266 | self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, ' \ 267 | 'cropped, worst quality, low quality' 268 | 269 | def inference(self, inputs): 270 | print("===>Starting pose2image Inference") 271 | image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) 272 | image = Image.open(image_path) 273 | image = np.array(image) 274 | prompt = instruct_text 275 | img = resize_image(HWC3(image), self.image_resolution) 276 | H, W, C = img.shape 277 | img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST) 278 | control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0 279 | control = torch.stack([control for _ in range(self.num_samples)], dim=0) 280 | control = einops.rearrange(control, 'b h w c -> b c h w').clone() 281 | self.seed = random.randint(0, 65535) 282 | seed_everything(self.seed) 283 | if self.save_memory: 284 | self.model.low_vram_shift(is_diffusing=False) 285 | cond = {"c_concat": [control], "c_crossattn": [ 286 | self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]} 287 | un_cond = {"c_concat": None if self.guess_mode else [control], 288 | "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]} 289 | shape = (4, H // 8, W // 8) 290 | self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in 291 | range(13)] if self.guess_mode else ([self.strength] * 13) 292 | samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, 293 | eta=0., unconditional_guidance_scale=self.scale, 294 | unconditional_conditioning=un_cond) 295 | if self.save_memory: 296 | self.model.low_vram_shift(is_diffusing=False) 297 | x_samples = self.model.decode_first_stage(samples) 298 | x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 299 | 255).astype( 300 | np.uint8) 301 | updated_image_path = get_new_image_name(image_path, func_name="pose2image") 302 | real_image = Image.fromarray(x_samples[0]) # default the index0 image 303 | real_image.save(updated_image_path) 304 | return updated_image_path 305 | 306 | 307 | class image2seg: 308 | def __init__(self): 309 | print("Direct segmentations.") 310 | self.detector = UniformerDetector() 311 | self.resolution = 512 312 | 313 | def inference(self, inputs): 314 | print("===>Starting image2seg Inference") 315 | image = Image.open(inputs) 316 | image = np.array(image) 317 | image = HWC3(image) 318 | detected_map = self.detector(resize_image(image, self.resolution)) 319 | detected_map = HWC3(detected_map) 320 | image = resize_image(image, self.resolution) 
321 | H, W, C = image.shape 322 | detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) 323 | updated_image_path = get_new_image_name(inputs, func_name="segmentation") 324 | image = Image.fromarray(detected_map) 325 | image.save(updated_image_path) 326 | return updated_image_path 327 | 328 | 329 | class seg2image: 330 | def __init__(self, device): 331 | print("Initialize the seg2image model...") 332 | model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device) 333 | model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_seg.pth', location='cpu')) 334 | self.model = model.to(device) 335 | self.device = device 336 | self.ddim_sampler = DDIMSampler(self.model) 337 | self.ddim_steps = 20 338 | self.image_resolution = 512 339 | self.num_samples = 1 340 | self.save_memory = False 341 | self.strength = 1.0 342 | self.guess_mode = False 343 | self.scale = 9.0 344 | self.seed = -1 345 | self.a_prompt = 'best quality, extremely detailed' 346 | self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, ' \ 347 | 'cropped, worst quality, low quality' 348 | 349 | def inference(self, inputs): 350 | print("===>Starting seg2image Inference") 351 | image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) 352 | image = Image.open(image_path) 353 | image = np.array(image) 354 | prompt = instruct_text 355 | img = resize_image(HWC3(image), self.image_resolution) 356 | H, W, C = img.shape 357 | img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST) 358 | control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0 359 | control = torch.stack([control for _ in range(self.num_samples)], dim=0) 360 | control = einops.rearrange(control, 'b h w c -> b c h w').clone() 361 | self.seed = random.randint(0, 65535) 362 | seed_everything(self.seed) 363 | if self.save_memory: 364 | self.model.low_vram_shift(is_diffusing=False) 365 | cond = {"c_concat": [control], "c_crossattn": [ 366 | self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]} 367 | un_cond = {"c_concat": None if self.guess_mode else [control], 368 | "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]} 369 | shape = (4, H // 8, W // 8) 370 | self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in 371 | range(13)] if self.guess_mode else ([self.strength] * 13) 372 | samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, 373 | eta=0., unconditional_guidance_scale=self.scale, 374 | unconditional_conditioning=un_cond) 375 | if self.save_memory: 376 | self.model.low_vram_shift(is_diffusing=False) 377 | x_samples = self.model.decode_first_stage(samples) 378 | x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 379 | 255).astype( 380 | np.uint8) 381 | updated_image_path = get_new_image_name(image_path, func_name="segment2image") 382 | real_image = Image.fromarray(x_samples[0]) # default the index0 image 383 | real_image.save(updated_image_path) 384 | return updated_image_path 385 | 386 | 387 | class BLIPVQA: 388 | def __init__(self, device): 389 | print("Initializing BLIP VQA to %s" % device) 390 | self.device = device 391 | self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") 392 | self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(self.device) 393 | 394 | 
def get_answer_from_question_and_image(self, inputs): 395 | image_path, question = inputs.split(",") 396 | raw_image = Image.open(image_path).convert('RGB') 397 | print(F'BLIPVQA :question :{question}') 398 | inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device) 399 | out = self.model.generate(**inputs) 400 | answer = self.processor.decode(out[0], skip_special_tokens=True) 401 | return answer 402 | 403 | 404 | class Whisper: 405 | def __init__(self, device): 406 | print("Initializing Whisper on device", device) 407 | self.model = whisper.load_model("medium.en", device=device) 408 | 409 | def transcribe(self, inputs): 410 | return self.model.transcribe(inputs)['text'] 411 | 412 | 413 | class coqui_tts: 414 | 415 | def __init__(self, device): 416 | self.device = device 417 | self.tts = TTS('tts_models/multilingual/multi-dataset/your_tts', gpu=self.device) 418 | 419 | def gen_speech_from_text(self, inputs): 420 | print("===>Starting text2speech Inference") 421 | filename = os.path.join('audio', str(uuid.uuid4())[:8] + ".wav") 422 | self.tts.tts_to_file(text=inputs, speaker=self.tts.speakers[0], language=self.tts.languages[0], 423 | file_path=filename) 424 | 425 | return "Audio generated in " + filename 426 | 427 | 428 | class TableQA: 429 | 430 | def __init__(self, device): 431 | self.device = device 432 | self.pipeline = pipeline(task="table-question-answering", model="google/tapas-large-finetuned-wtq", 433 | device=self.device) 434 | 435 | def get_answer_from_question_and_table(self, inputs): 436 | table_path = inputs.split(",")[0] 437 | questions = inputs.split(",")[1:] 438 | table = pd.read_csv(table_path, dtype=str) 439 | 440 | res = self.pipeline(table=table, query=questions) 441 | 442 | return res['answer'] 443 | 444 | 445 | class TwilioCaller: 446 | def parse_input(self, inputs): 447 | try: 448 | if 'and' in inputs: 449 | text: str = inputs.split("and")[0] 450 | phone_number = inputs.split("and")[1] 451 | elif ',' in inputs: 452 | text: str = inputs.split(",")[0] 453 | phone_number = inputs.split(",")[1:] 454 | if isinstance(phone_number, list): 455 | phone_number = ",".join(phone_number) 456 | else: 457 | raise Exception('Could not make the call, the input is not well formatted. Must be a comma separated string') 458 | except: 459 | raise Exception('Could not parse your input. Must be a comma separated string') 460 | text = text.replace('"', '').strip(' ') 461 | phone_number = phone_number.replace('"', '').strip(' ') 462 | if not re.match('\+[0-9]+', text) and not re.match('\+[0-9]+', phone_number): 463 | raise Exception('Could not make the call, no phone number provided') 464 | if re.match('\+[0-9]+', text) and not re.match('\+[0-9]+', phone_number): 465 | text, phone_number = phone_number, text 466 | return text, phone_number 467 | 468 | def call_with_text(self, inputs): 469 | import twilio 470 | try: 471 | text, phone_number = self.parse_input(inputs) 472 | except Exception as e: 473 | return str(e) 474 | from twilio_lib import call_with_text 475 | try: 476 | call_with_text(text, phone_number) 477 | except twilio.base.exceptions.TwilioRestException: 478 | return 'Internal error, could not submit the call.' 
479 | 480 | return 'Call submitted, it should be received soon' 481 | 482 | def call_with_audio(self, inputs): 483 | audio_filename = inputs.split(",")[0] 484 | phone_number = inputs.split(",")[1:] 485 | from twilio_lib import call_with_audio 486 | call_with_audio(audio_filename, phone_number) 487 | 488 | return 'Call submitted, it should be received soon' 489 | 490 | 491 | class ConversationBot: 492 | def __init__(self): 493 | print("Initializing AgentChain") 494 | self.llm = OpenAI(temperature=0) 495 | self.i2t = ImageCaptioning(device="cuda:1") # 1755 496 | self.t2i = T2I(device="cuda:1") # 6677 497 | self.image2pose = image2pose() 498 | self.pose2image = pose2image(device="cuda:1") # 6681 499 | self.BLIPVQA = BLIPVQA(device="cuda:1") # 2709 500 | self.image2seg = image2seg() 501 | self.seg2image = seg2image(device="cuda:1") # 5540 502 | ## up until now, comsuming 23362 MB on GPU 503 | self.pix2pix = Pix2Pix(device="cuda:0") # 2795 504 | self.coqui_tts = coqui_tts(device=False) 505 | self.tableQA = TableQA(device="cuda:0") 506 | self.whisper = Whisper(device="cuda:0") 507 | self.twilio_caller = TwilioCaller() 508 | self.extra_tools = ["serpapi", "llm-math", "python_repl", "requests", "terminal"] 509 | 510 | self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output') 511 | self.tools = [ 512 | Tool(name="Get Photo Description", func=self.i2t.inference, 513 | description="useful when you want to know what is inside the photo. receives image_path as input. " 514 | "The input to this tool should be a string, representing the image_path. "), 515 | Tool(name="Generate Image From User Input Text", func=self.t2i.inference, 516 | description="useful when you want to generate an image from a user input text and save it to a file. " 517 | "like: generate an image of an object or something, or generate an image that includes " 518 | "some objects." 519 | "The input to this tool should be a string, representing the text used to generate image. "), 520 | 521 | Tool(name="Instruct Image Using Text", func=self.pix2pix.inference, 522 | description="useful when you want to the style of the image to be like the text. like: make it look " 523 | "like a painting. or make it like a robot." 524 | "The input to this tool should be a comma separated string of two, representing the " 525 | "image_path and the text. "), 526 | Tool(name="Answer Question About The Image", func=self.BLIPVQA.get_answer_from_question_and_image, 527 | description="useful when you need an answer for a question based on an image. like: what is the " 528 | "background color of the last image, how many cats in this figure, what is in this figure." 529 | "The input to this tool should be a comma separated string of two, representing the " 530 | "image_path and the question"), 531 | Tool(name="Segmentation On Image", func=self.image2seg.inference, 532 | description="useful when you want to detect segmentations of the image. like: segment this image, " 533 | "or generate segmentations on this image, or perform segmentation on this image." 534 | "The input to this tool should be a string, representing the image_path"), 535 | Tool(name="Generate Image Condition On Segmentations", func=self.seg2image.inference, 536 | description="useful when you want to generate a new real image from both the user description and " 537 | "segmentations. like: generate a real image of a object or something from this " 538 | "segmentation image, or generate a new real image of a object or something from these " 539 | "segmentations." 
540 | "The input to this tool should be a comma separated string of two, representing the " 541 | "image_path and the user description"), 542 | Tool(name="Pose Detection On Image", func=self.image2pose.inference, 543 | description="useful when you want to detect the human pose of the image. like: generate human poses " 544 | "of this image, or generate a pose image from this image." 545 | "The input to this tool should be a string, representing the image_path"), 546 | Tool(name="Generate Image Condition On Pose Image", func=self.pose2image.inference, 547 | description="useful when you want to generate a new real image from both the user description and a " 548 | "human pose image. like: generate a real image of a human from this human pose image, " 549 | "or generate a new real image of a human from this pose." 550 | "The input to this tool should be a comma separated string of two, representing the " 551 | "image_path and the user description"), 552 | Tool(name="Generate Text from Audio", func=self.whisper.transcribe, 553 | description="useful when you want to generate text from audio. like: generate text from this audio, " 554 | "or transcribe this audio, or listen to this audio. receives audio_path as input." 555 | "The input to this tool should be a string, representing the audio_path"), 556 | Tool(name="Generate Speech From Text", func=self.coqui_tts.gen_speech_from_text, 557 | description="useful when you want to generate a speech from a text. like: generate a speech from " 558 | "this text, or say this text in audio. " 559 | "The input to this tool should be a string, representing the text to be converted to " 560 | "speech." 561 | ), 562 | Tool(name="Answer Question About The table", func=self.tableQA.get_answer_from_question_and_table, 563 | description="useful when you need an answer for a question based on a table. like: what is the " 564 | "maximum of the column age, or what is the sum of row 5 from the following table." 565 | "The input to this tool should be a comma separated string, representing the " 566 | "table_path and the questions"), 567 | Tool(name="Call a phone number with text", func=self.twilio_caller.call_with_text, 568 | description="useful when you need to call a phone number with a text input. like: call +4917424393190 and" 569 | " tell him \"happy birthday\". The input to this tool should be a comma separate string " 570 | "representing the text_input and the phone_number"), 571 | # Tool(name="Call a phone number with audio", func=self.twilio_caller.call_with_audio, 572 | # description="useful when you need to call a phone number with an audio file. like: call +4917424393190 and" 573 | # " using audio file audio/smth.wav. Only use audio files mentioned by the user." 
574 | # "The input to this tool should be a comma separated string representing the audio file name and the phone_number"), 575 | ] 576 | 577 | self.tools = self.tools + load_tools(self.extra_tools, llm=self.llm) 578 | 579 | self.agent = initialize_agent( 580 | self.tools, 581 | self.llm, 582 | agent="conversational-react-description", 583 | verbose=True, 584 | memory=self.memory, 585 | return_intermediate_steps=True, 586 | agent_kwargs={'prefix': AGENT_CHAIN_PREFIX, 'format_instructions': AGENT_CHAIN_FORMAT_INSTRUCTIONS, 587 | 'suffix': AGENT_CHAIN_SUFFIX}, ) 588 | 589 | def run_text(self, text, state): 590 | print("===============Running run_text =============") 591 | print("Inputs:", text, state) 592 | print("======>Previous memory:\n %s" % self.agent.memory) 593 | self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500) 594 | res = self.agent({"input": text}) 595 | print("======>Current memory:\n %s" % self.agent.memory) 596 | response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output']) 597 | audio_files = re.findall('(audio/\S*wav)', response) 598 | if len(audio_files) > 0: 599 | audio = audio_files[0] 600 | else: 601 | audio = None 602 | state = state + [(text, response)] 603 | print("Outputs:", state) 604 | return state, state, audio 605 | 606 | def run_image(self, image, state, txt): 607 | print("===============Running run_image =============") 608 | print("Inputs:", image, state) 609 | print("======>Previous memory:\n %s" % self.agent.memory) 610 | image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") 611 | print("======>Auto Resize Image...") 612 | img = Image.open(image.name) 613 | width, height = img.size 614 | ratio = min(512 / width, 512 / height) 615 | width_new, height_new = (round(width * ratio), round(height * ratio)) 616 | img = img.resize((width_new, height_new)) 617 | img = img.convert('RGB') 618 | img.save(image_filename, "PNG") 619 | print(f"Resize image form {width}x{height} to {width_new}x{height_new}") 620 | description = self.i2t.inference(image_filename) 621 | Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to " \ 622 | "understand this image, but you should use tools to finish following tasks, " \ 623 | "rather than directly imagine from my description. If you understand, say \"Received\". \n".format( 624 | image_filename, description) 625 | AI_prompt = "Received. " 626 | self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt 627 | print("======>Current memory:\n %s" % self.agent.memory) 628 | state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)] 629 | print("Outputs:", state) 630 | return state, state, txt + ' ' + image_filename + ' ' 631 | 632 | def run_audio(self, audio, state, txt): 633 | print("===============Running run_audio =============") 634 | print("Inputs:", audio, state) 635 | print("======>Previous memory:\n %s" % self.agent.memory) 636 | audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav") 637 | import shutil 638 | shutil.copyfile(audio, audio_filename) 639 | transcribed_text = self.whisper.transcribe(audio_filename) 640 | Human_prompt = "\nHuman: provide audio named {}. The description is: {}. This information helps you to " \ 641 | "understand this audio, but you should use tools to finish following tasks, " \ 642 | "rather than directly imagine from my description. If you understand, say \"Received\". 
\n".format( 643 | audio_filename, transcribed_text) 644 | 645 | AI_prompt = "Received. " 646 | self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt 647 | print("======>Current memory:\n %s" % self.agent.memory) 648 | state = state + [(f"![](/file={audio_filename})*{audio_filename}*", AI_prompt)] 649 | print("Outputs:", state) 650 | return state, audio, state, txt + ' ' + audio_filename + ' ' 651 | 652 | 653 | def run_df(self, df, state, txt): 654 | print("===============Running run_df =============") 655 | print("Inputs:", df, state) 656 | print("======>Previous memory:\n %s" % self.agent.memory) 657 | csv_filename = os.path.join('csv', str(uuid.uuid4())[0:8] + ".csv") 658 | df.to_csv(csv_filename, index=False) 659 | Human_prompt = "\nHuman: provided a csv file named {}. You can specifically use the tool \"Answer Question About The table\" to understand this file. If you understand, say \"Received\". \n".format( 660 | csv_filename) 661 | 662 | AI_prompt = "Received. " 663 | self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt 664 | print("======>Current memory:\n %s" % self.agent.memory) 665 | state = state + [(f"![](/file={csv_filename})*{csv_filename}*", AI_prompt)] 666 | print("Outputs:", state) 667 | return state, state, txt + ' ' + csv_filename + ' ' 668 | 669 | 670 | if __name__ == '__main__': 671 | bot = ConversationBot() 672 | with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo: 673 | chatbot = gr.Chatbot(elem_id="chatbot", label="AgentChain") 674 | state = gr.State([]) 675 | with gr.Row(): 676 | with gr.Column(scale=0.8): 677 | txt = gr.Textbox(placeholder="Enter text and press enter", label='Instruct with text').style( 678 | container=False) 679 | with gr.Column(scale=0.2, min_width=0): 680 | btn = gr.UploadButton("Upload Image", file_types=["image"]) 681 | with gr.Row(): 682 | with gr.Column(scale=0.5, min_width=0): 683 | with gr.Row(): 684 | with gr.Column(scale=0.5, min_width=0): 685 | input_audio = gr.Audio(source="upload", type="filepath", label="Upload Audio Input") 686 | with gr.Column(scale=0.5, min_width=0): 687 | output_audio = gr.Audio(type="filepath", label='Audio output', interactive=False) 688 | with gr.Row(): 689 | clear = gr.Button("Clear Chat History") 690 | with gr.Column(scale=0.5, min_width=0): 691 | with gr.Row(): 692 | df = gr.DataFrame(interactive=True, row_count=1, col_count=1, headers=['Column1'], label="Give a Dataframe as input") 693 | with gr.Row(): 694 | with gr.Column(scale=0.8, min_width=0): 695 | persist_df = gr.Button("Upload the dataframe") 696 | 697 | input_audio.upload(bot.run_audio, [input_audio, state, txt], [chatbot, output_audio, state, txt]) 698 | # audio.upload(bot.run_audio, [audio, state, txt], [chatbot, audio, state, txt]) 699 | txt.submit(bot.run_text, [txt, state], [chatbot, state, output_audio]) 700 | txt.submit(lambda: "", None, txt) 701 | btn.upload(bot.run_image, [btn, state, txt], [chatbot, state, txt]) 702 | persist_df.click(bot.run_df, [df, state, txt], [chatbot, state, txt]) 703 | clear.click(bot.memory.clear) 704 | clear.click(lambda: [], None, chatbot) 705 | clear.click(lambda: [], None, state) 706 | 707 | demo.launch(server_name="0.0.0.0", server_port=7861) 708 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | torchvision 3 | numpy==1.23.1 4 | transformers==4.26.1 5 | 
albumentations==1.3.0 6 | opencv-python==4.7.0.72 7 | imageio==2.9.0 8 | imageio-ffmpeg==0.4.2 9 | pytorch-lightning==1.5.0 10 | omegaconf==2.1.1 11 | test-tube>=0.7.5 12 | streamlit==1.12.1 13 | einops==0.3.0 14 | webdataset==0.2.5 15 | kornia==0.6 16 | open_clip_torch==2.0.2 17 | invisible-watermark>=0.1.5 18 | streamlit-drawable-canvas==0.8.0 19 | torchmetrics==0.6.0 20 | timm==0.6.12 21 | addict==2.4.0 22 | yapf==0.32.0 23 | prettytable==3.6.0 24 | safetensors==0.2.7 25 | basicsr==1.4.2 26 | langchain==0.0.101 27 | diffusers 28 | gradio 29 | openai 30 | accelerate 31 | openai-whisper==20230308 32 | TTS 33 | pandas 34 | twilio 35 | boto3 36 | google-search-results -------------------------------------------------------------------------------- /twilio_lib.py: -------------------------------------------------------------------------------- 1 | import os 2 | from twilio.rest import Client 3 | from twilio.twiml.voice_response import VoiceResponse, Say 4 | 5 | def write_text_twilml(text): 6 | response = VoiceResponse() 7 | response.say(text) 8 | import tempfile 9 | 10 | # Create a temporary file 11 | with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: 12 | # Write a string to the file 13 | temp_file.write(str(response)) 14 | 15 | # Get the file name 16 | file_name = temp_file.name 17 | return file_name 18 | 19 | def write_voice_twilml(audio_url): 20 | response = VoiceResponse() 21 | response.play(audio_url) 22 | import tempfile 23 | 24 | # Create a temporary file 25 | with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: 26 | # Write a string to the file 27 | temp_file.write(str(response)) 28 | 29 | # Get the file name 30 | file_name = temp_file.name 31 | return file_name 32 | 33 | def push_to_s3(file_name, extension, content_type=None): 34 | import boto3 35 | import uuid 36 | 37 | s3 = boto3.client('s3') 38 | 39 | bucket_name = os.getenv('AWS_S3_BUCKET_NAME', 'god-llm') 40 | file_name = file_name 41 | object_key = f'twilml/{str(uuid.uuid4())[0:4]}.{extension}' 42 | extra_args = {'ACL': 'public-read'} 43 | if content_type: 44 | extra_args["ContentType"] = content_type 45 | s3.upload_file(file_name, bucket_name, object_key, ExtraArgs=extra_args) 46 | return f'https://{bucket_name}.s3.eu-central-1.amazonaws.com/{object_key}' 47 | 48 | def call_with_twilml_url(twilml_url, phone_number): 49 | # Find your Account SID and Auth Token at twilio.com/console 50 | # and set the environment variables. See http://twil.io/secure 51 | account_sid = os.environ['TWILIO_ACCOUNT_SID'] 52 | auth_token = os.environ['TWILIO_AUTH_TOKEN'] 53 | client = Client(account_sid, auth_token) 54 | 55 | call = client.calls.create( 56 | method='GET', 57 | url=twilml_url, 58 | to=phone_number, 59 | from_='+15673393771' 60 | ) 61 | return call 62 | 63 | def call_with_text(text, phone_number): 64 | file_name = write_text_twilml(text) 65 | twilml_url = push_to_s3(file_name, extension='xml', content_type="text/xml") 66 | call_with_twilml_url(twilml_url, phone_number) 67 | 68 | 69 | def call_with_audio(audio_file, phone_number): 70 | audio_url = push_to_s3(audio_file, extension='wav', content_type="audio/wav") 71 | file_name = write_voice_twilml(audio_url) 72 | twilml_url = push_to_s3(file_name, extension='xml', content_type="text/xml") 73 | call_with_twilml_url(twilml_url, phone_number) 74 | --------------------------------------------------------------------------------