4 |
5 |
6 |
7 | HuggingGPT
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/web/src/assets/switch.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/web/src/types/index.ts:
--------------------------------------------------------------------------------
1 | export interface ChatMessage {
2 | role: "user" | "assistant" | "system";
3 | type: "text" | "image" | "audio" | "video" | "code";
4 | first: boolean;
5 | content: string;
6 | }
7 |
8 | export interface CleanChatMessage {
9 | role: "user" | "assistant" | "system";
10 | content: string;
11 | }
12 |
13 | export interface Collection {
14 | chatgpt: {
15 | [key: string]: ChatMessage[];
16 | };
17 | hugginggpt: {
18 | [key: string]: ChatMessage[];
19 | };
20 | }
21 |
--------------------------------------------------------------------------------
/web/vite.config.ts:
--------------------------------------------------------------------------------
1 | import { fileURLToPath, URL } from "node:url";
2 | import path from "path";
3 | import { defineConfig } from "vite";
4 | import vue from "@vitejs/plugin-vue";
5 |
6 | // https://vitejs.dev/config/
7 | export default defineConfig({
8 | server: {
9 | host: "0.0.0.0",
10 | port: 9999,
11 | open: true,
12 | cors: true,
13 | },
14 | plugins: [vue()],
15 | base: "./",
16 | resolve: {
17 | alias: {
18 | "@": path.resolve(__dirname, "./src"),
19 | },
20 | },
21 | });
22 |
--------------------------------------------------------------------------------
/web/src/assets/tailwind.css:
--------------------------------------------------------------------------------
1 | @tailwind base;
2 | @tailwind components;
3 | @tailwind utilities;
4 |
5 | @layer components {
6 | .btn {
7 | @apply px-4 py-2 text-sm font-medium tracking-wide text-white capitalize transition-colors duration-300 transform bg-blue-700 rounded-md hover:bg-blue-600 focus:outline-none focus:bg-blue-600 whitespace-nowrap disabled:bg-blue-300;
8 | }
9 | .input {
10 | @apply px-4 py-2 text-gray-700 bg-white border rounded-md mr-2 sm:mr-4 focus:border-blue-400 focus:outline-none focus:ring focus:ring-blue-300 focus:ring-opacity-40 flex-grow;
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/web/electron/preload.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * The preload script runs before the renderer process is loaded. It has access to web APIs
3 | * as well as Electron's renderer process modules and some
4 | * polyfilled Node.js functions.
5 | *
6 | * https://www.electronjs.org/docs/latest/tutorial/sandbox
7 | */
8 | window.addEventListener('DOMContentLoaded', () => {
9 | const replaceText = (selector, text) => {
10 | const element = document.getElementById(selector)
11 | if (element) element.innerText = text
12 | }
13 |
14 | for (const type of ['chrome', 'node', 'electron']) {
15 | replaceText(`${type}-version`, process.versions[type])
16 | }
17 | })
18 |
--------------------------------------------------------------------------------
/web/public/audio.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/server/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/diffusers.git@8c530fc2f6a76a2aefb6b285dce6df1675092ac6#egg=diffusers
2 | git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda#egg=transformers
3 | git+https://github.com/patrickvonplaten/controlnet_aux@78efc716868a7f5669c288233d65b471f542ce40#egg=controlnet_aux
4 | tiktoken==0.3.3
5 | pydub==0.25.1
6 | espnet==202301
7 | espnet_model_zoo==0.1.7
8 | flask==2.2.3
9 | flask_cors==3.0.10
10 | waitress==2.1.2
11 | datasets==2.11.0
12 | asteroid==0.6.0
13 | speechbrain==0.5.14
14 | timm==0.6.13
15 | typeguard==2.13.3
16 | accelerate==0.18.0
17 | pytesseract==0.3.10
18 | gradio==3.24.1
--------------------------------------------------------------------------------
/web/src/api/hugginggpt.ts:
--------------------------------------------------------------------------------
1 | import type { CleanChatMessage } from "@/types";
2 | import axios, { AxiosError } from "axios";
3 | import { HUGGINGGPT_BASE_URL } from "@/config";
4 |
5 | const model = "gpt-3.5-turbo";
6 |
7 | axios.defaults.headers.post["Content-Type"] = "application/json";
8 |
9 | export async function hugginggpt(messageList: CleanChatMessage[]) {
10 |   const endpoint = `${HUGGINGGPT_BASE_URL}/hugginggpt`;
11 | try {
12 | const response = await axios({
13 | url: endpoint,
14 | method: "post",
15 | data: {
16 | model,
17 | messages: messageList.slice(1)
18 | },
19 | timeout: 180000, // 180 seconds
20 | });
21 | return {
22 | status: "success",
23 | data: response.data.message,
24 | };
25 | } catch (error: any) {
26 | return {
27 | status: "error",
28 | message: error.message
29 | };
30 | }
31 | }
32 |
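
For reference, the request/response contract this client relies on can be exercised from any HTTP client. Below is a minimal sketch in Python, assuming a HuggingGPT server is reachable at the default `http_listen` port from the server configs (8004) and returns a JSON body with a `message` field, as `response.data.message` above implies; the base URL and example prompt are placeholders.

```python
# Minimal sketch of the POST /hugginggpt contract used by this client.
# Assumes the server from /server is running with its default http_listen
# settings (port 8004); adjust HUGGINGGPT_BASE_URL for your deployment.
import requests

HUGGINGGPT_BASE_URL = "http://localhost:8004"  # assumption: mirrors web/src/config

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "user", "content": "Describe the image /examples/a.jpg"},
    ],
}

resp = requests.post(f"{HUGGINGGPT_BASE_URL}/hugginggpt", json=payload, timeout=180)
resp.raise_for_status()
print(resp.json()["message"])  # the client above reads response.data.message
```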
--------------------------------------------------------------------------------
/web/src/api/chatgpt.ts:
--------------------------------------------------------------------------------
1 | import type { CleanChatMessage } from "@/types";
2 | import axios, { AxiosError } from "axios";
3 | import { CHAT_GPT_URL, CHAT_GPT_LLM } from "@/config";
4 |
5 | axios.defaults.headers.post["Content-Type"] = "application/json";
6 |
7 | export async function chatgpt(messageList: CleanChatMessage[], apiKey: string) {
8 |   const endpoint = `${CHAT_GPT_URL}/v1/chat/completions`;
9 |
10 | try {
11 | const completion = await axios({
12 | url: endpoint,
13 | method: "post",
14 | headers: {
15 | Authorization: `Bearer ${apiKey}`,
16 | },
17 | data: {
18 | model: CHAT_GPT_LLM,
19 | messages: messageList
20 | },
21 |       timeout: 60000, // 60 seconds
22 | });
23 | return {
24 | status: "success",
25 | data: completion.data.choices[0].message.content,
26 | };
27 | } catch (error: any) {
28 | return {
29 | status: "error",
30 | message: error.message
31 | };
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/web/src/assets/audio.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/web/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "vue3-ts-vite-router-tailwindcss",
3 | "version": "0.0.0",
4 | "main": "index.html",
5 | "scripts": {
6 | "dev": "vite",
7 | "build": "run-p type-check build-only",
8 | "preview": "vite preview --port 4173",
9 | "build-only": "vite build",
10 | "type-check": "vue-tsc --noEmit",
11 | "e:dev": "yarn build && cp -r electron/. dist && cd dist && yarn && yarn dev",
12 | "e:build": "yarn build && cp -r electron/. dist && cd dist && yarn && yarn build"
13 | },
14 | "dependencies": {
15 | "axios": "^1.3.4",
16 | "vue": "^3.2.38",
17 | "vue-router": "^4.1.5"
18 | },
19 | "devDependencies": {
20 | "@types/node": "^16.11.56",
21 | "@vitejs/plugin-vue": "^3.0.3",
22 | "@vue/tsconfig": "^0.1.3",
23 | "autoprefixer": "^10.4.12",
24 | "npm-run-all": "^4.1.5",
25 | "postcss": "^8.4.18",
26 | "tailwindcss": "^3.2.1",
27 | "typescript": "~4.7.4",
28 | "vite": "^3.2.5",
29 | "vue-tsc": "^0.40.7"
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you find this work useful in your research, you can cite the paper as below."
3 | authors:
4 | - family-names: Shen
5 | given-names: Yongliang
6 | - family-names: Song
7 | given-names: Kaitao
8 | - family-names: Tan
9 | given-names: Xu
10 | - family-names: Li
11 | given-names: Dongsheng
12 | - family-names: Lu
13 | given-names: Weiming
14 | - family-names: Zhuang
15 | given-names: Yueting
16 | title: "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace"
17 | version: 2.0.4
18 | license: MIT
19 | status: preprint
20 | date-released: 2023-03-30
21 | url: https://github.com/microsoft/JARVIS
22 | preferred-citation:
23 | type: article
24 | authors:
25 | - family-names: Shen
26 | given-names: Yongliang
27 | - family-names: Song
28 | given-names: Kaitao
29 | - family-names: Tan
30 | given-names: Xu
31 | - family-names: Li
32 | given-names: Dongsheng
33 | - family-names: Lu
34 | given-names: Weiming
35 | - family-names: Zhuang
36 | given-names: Yueting
37 | journal: "arXiv preprint arXiv:2303.17580"
38 | title: "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace"
39 | year: 2023
40 | url: https://arxiv.org/abs/2303.17580
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # TODO: The maintainer of this repo has not yet edited this file
2 |
3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4 |
5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
8 |
9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 |
11 | # Support
12 |
13 | ## How to file issues and get help
14 |
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
17 | feature request as a new Issue.
18 |
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 |
23 | ## Microsoft Support Policy
24 |
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 |
--------------------------------------------------------------------------------
/web/src/assets/setting.svg:
--------------------------------------------------------------------------------
1 |
5 |
--------------------------------------------------------------------------------
/web/electron/main.js:
--------------------------------------------------------------------------------
1 | // Modules to control application life and create native browser window
2 | const {app, BrowserWindow} = require('electron')
3 | const path = require('path')
4 |
5 | function createWindow () {
6 | // Create the browser window.
7 | const mainWindow = new BrowserWindow({
8 | width: 800,
9 | height: 600,
10 | webPreferences: {
11 | preload: path.join(__dirname, 'preload.js')
12 | }
13 | })
14 |
15 | // and load the index.html of the app.
16 | mainWindow.loadFile('index.html')
17 |
18 | // Open the DevTools.
19 | // mainWindow.webContents.openDevTools()
20 | }
21 |
22 | // This method will be called when Electron has finished
23 | // initialization and is ready to create browser windows.
24 | // Some APIs can only be used after this event occurs.
25 | app.whenReady().then(() => {
26 | createWindow()
27 |
28 | app.on('activate', function () {
29 | // On macOS it's common to re-create a window in the app when the
30 | // dock icon is clicked and there are no other windows open.
31 | if (BrowserWindow.getAllWindows().length === 0) createWindow()
32 | })
33 | })
34 |
35 | // Quit when all windows are closed, except on macOS. There, it's common
36 | // for applications and their menu bar to stay active until the user quits
37 | // explicitly with Cmd + Q.
38 | app.on('window-all-closed', function () {
39 | if (process.platform !== 'darwin') app.quit()
40 | })
41 |
42 | // In this file you can include the rest of your app's specific main process
43 | // code. You can also put them in separate files and require them here.
44 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 |
2 | To contribute to this GitHub project, you can follow these steps:
3 |
4 | 1. Fork the repository you want to contribute to by clicking the "Fork" button on the project page.
5 |
6 | 2. Clone the repository to your local machine and enter the newly created repo using the following commands:
7 |
8 | ```
9 | git clone https://github.com/YOUR-GITHUB-USERNAME/JARVIS
10 | cd JARVIS
11 | ```
12 | 3. Create a new branch for your changes using the following command:
13 |
14 | ```
15 | git checkout -b "branch-name"
16 | ```
17 | 4. Make your changes to the code or documentation.
18 |
19 | 5. Add the changes to the staging area using the following command:
20 | ```
21 | git add .
22 | ```
23 |
24 | 6. Commit the changes with a meaningful commit message using the following command:
25 | ```
26 | git commit -m "your commit message"
27 | ```
28 | 7. Push the changes to your forked repository using the following command:
29 | ```
30 | git push origin branch-name
31 | ```
32 | 8. Go to the GitHub website and navigate to your forked repository.
33 |
34 | 9. Click the "New pull request" button.
35 |
36 | 10. Select the branch you just pushed to and the branch you want to merge into on the original repository.
37 |
38 | 11. Add a description of your changes and click the "Create pull request" button.
39 |
40 | 12. Wait for the project maintainer to review your changes and provide feedback.
41 |
42 | 13. Make any necessary changes based on feedback and repeat steps 5-12 until your changes are accepted and merged into the main project.
43 |
44 | 14. Once your changes are merged, you can update your forked repository and your local copy with the following commands (this assumes the original repository has been added as the `upstream` remote, e.g. with `git remote add upstream https://github.com/microsoft/JARVIS`):
45 |
46 | ```
47 | git fetch upstream
48 | git checkout main
49 | git merge upstream/main
50 | ```
51 | Finally, delete the branch you created with the following command:
52 | ```
53 | git branch -d branch-name
54 | ```
55 | That's it, you made it! 🐣⭐⭐
56 |
--------------------------------------------------------------------------------
/server/models/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Set models and datasets to download
4 | models=(
5 | "nlpconnect/vit-gpt2-image-captioning"
6 | "lllyasviel/ControlNet"
7 | "lllyasviel/sd-controlnet-canny"
8 | "lllyasviel/sd-controlnet-depth"
9 | "lllyasviel/sd-controlnet-hed"
10 | "lllyasviel/sd-controlnet-mlsd"
11 | "lllyasviel/sd-controlnet-openpose"
12 | "lllyasviel/sd-controlnet-scribble"
13 | "lllyasviel/sd-controlnet-seg"
14 | "runwayml/stable-diffusion-v1-5"
15 | "damo-vilab/text-to-video-ms-1.7b"
16 | "microsoft/speecht5_asr"
17 | "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"
18 | "espnet/kan-bayashi_ljspeech_vits"
19 | "facebook/detr-resnet-101"
20 | "microsoft/speecht5_hifigan"
21 | "microsoft/speecht5_vc"
22 | "openai/whisper-base"
23 | "Intel/dpt-large"
24 | "facebook/detr-resnet-50-panoptic"
25 | "facebook/detr-resnet-50"
26 | "google/owlvit-base-patch32"
27 | "impira/layoutlm-document-qa"
28 | "ydshieh/vit-gpt2-coco-en"
29 | "dandelin/vilt-b32-finetuned-vqa"
30 | "lambdalabs/sd-image-variations-diffusers"
31 | "facebook/maskformer-swin-base-coco"
32 | "Intel/dpt-hybrid-midas"
33 | )
34 | datasets=("Matthijs/cmu-arctic-xvectors")
35 |
36 | # Set the current directory
37 | CURRENT_DIR=$(pwd)
38 |
39 | # Download models
40 | for model in "${models[@]}"; do
41 | echo "----- Downloading from https://huggingface.co/${model} -----"
42 | if [ -d "${model}" ]; then
43 | (cd "${model}" && git pull && git lfs pull)
44 | else
45 | git clone --recurse-submodules "https://huggingface.co/${model}" "${model}"
46 | fi
47 | done
48 |
49 | # Download datasets
50 | for dataset in "${datasets[@]}"; do
51 | echo "----- Downloading from https://huggingface.co/datasets/${dataset} -----"
52 | if [ -d "${dataset}" ]; then
53 | (cd "${dataset}" && git pull && git lfs pull)
54 | else
55 | git clone --recurse-submodules "https://huggingface.co/datasets/${dataset}" "${dataset}"
56 | fi
57 | done
--------------------------------------------------------------------------------
/server/models/download.ps1:
--------------------------------------------------------------------------------
1 | $models = @(
2 | "nlpconnect/vit-gpt2-image-captioning",
3 | "lllyasviel/ControlNet",
4 | "lllyasviel/sd-controlnet-canny",
5 | "lllyasviel/sd-controlnet-depth",
6 | "lllyasviel/sd-controlnet-hed",
7 | "lllyasviel/sd-controlnet-mlsd",
8 | "lllyasviel/sd-controlnet-openpose",
9 | "lllyasviel/sd-controlnet-scribble",
10 | "lllyasviel/sd-controlnet-seg",
11 | "runwayml/stable-diffusion-v1-5",
12 | "damo-vilab/text-to-video-ms-1.7b",
13 | "microsoft/speecht5_asr",
14 | "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k",
15 | "espnet/kan-bayashi_ljspeech_vits",
16 | "facebook/detr-resnet-101",
17 | "microsoft/speecht5_hifigan",
18 | "microsoft/speecht5_vc",
19 | "openai/whisper-base",
20 | "Intel/dpt-large",
21 | "facebook/detr-resnet-50-panoptic",
22 | "facebook/detr-resnet-50",
23 | "google/owlvit-base-patch32",
24 | "impira/layoutlm-document-qa",
25 | "ydshieh/vit-gpt2-coco-en",
26 | "dandelin/vilt-b32-finetuned-vqa",
27 | "lambdalabs/sd-image-variations-diffusers",
28 | "facebook/maskformer-swin-base-coco",
29 | "Intel/dpt-hybrid-midas"
30 | )
31 |
32 | $CURRENT_DIR = Get-Location
33 |
34 | foreach ($model in $models) {
35 | Write-Host "----- Downloading from https://huggingface.co/$model -----"
36 | if (Test-Path $model) {
37 | Set-Location $model
38 | git pull
39 | git lfs pull
40 | Set-Location $CURRENT_DIR
41 | } else {
42 | git clone "https://huggingface.co/$model" $model
43 | }
44 | }
45 |
46 | $datasets = @(
47 | "Matthijs/cmu-arctic-xvectors"
48 | )
49 |
50 | foreach ($dataset in $datasets) {
51 | Write-Host "----- Downloading from https://huggingface.co/datasets/$dataset -----"
52 | if (Test-Path $dataset) {
53 | Set-Location $dataset
54 | git pull
55 | git lfs pull
56 | Set-Location $CURRENT_DIR
57 | } else {
58 | git clone "https://huggingface.co/datasets/$dataset" $dataset
59 | }
60 | }
--------------------------------------------------------------------------------
/web/public/video.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/web/src/components/Loading.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
114 |
--------------------------------------------------------------------------------
/server/get_token_ids.py:
--------------------------------------------------------------------------------
1 | import tiktoken
2 |
3 | encodings = {
4 | "gpt-4": tiktoken.get_encoding("cl100k_base"),
5 | "gpt-4-32k": tiktoken.get_encoding("cl100k_base"),
6 | "gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"),
7 | "gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"),
8 | "text-davinci-003": tiktoken.get_encoding("p50k_base"),
9 | "text-davinci-002": tiktoken.get_encoding("p50k_base"),
10 | "text-davinci-001": tiktoken.get_encoding("r50k_base"),
11 | "text-curie-001": tiktoken.get_encoding("r50k_base"),
12 | "text-babbage-001": tiktoken.get_encoding("r50k_base"),
13 | "text-ada-001": tiktoken.get_encoding("r50k_base"),
14 | "davinci": tiktoken.get_encoding("r50k_base"),
15 | "curie": tiktoken.get_encoding("r50k_base"),
16 | "babbage": tiktoken.get_encoding("r50k_base"),
17 | "ada": tiktoken.get_encoding("r50k_base"),
18 | }
19 |
20 | max_length = {
21 | "gpt-4": 8192,
22 | "gpt-4-32k": 32768,
23 | "gpt-3.5-turbo": 4096,
24 | "gpt-3.5-turbo-0301": 4096,
25 | "text-davinci-003": 4096,
26 | "text-davinci-002": 4096,
27 | "text-davinci-001": 2049,
28 | "text-curie-001": 2049,
29 | "text-babbage-001": 2049,
30 | "text-ada-001": 2049,
31 | "davinci": 2049,
32 | "curie": 2049,
33 | "babbage": 2049,
34 | "ada": 2049
35 | }
36 |
37 | def count_tokens(model_name, text):
38 | return len(encodings[model_name].encode(text))
39 |
40 | def get_max_context_length(model_name):
41 | return max_length[model_name]
42 |
43 | def get_token_ids_for_task_parsing(model_name):
44 | text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "-"}'''
45 | res = encodings[model_name].encode(text)
46 | res = list(set(res))
47 | return res
48 |
49 | def get_token_ids_for_choose_model(model_name):
50 | text = '''{"id": "reason"}'''
51 | res = encodings[model_name].encode(text)
52 | res = list(set(res))
53 | return res
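
The helpers above are presumably what the server uses to keep prompts within a model's context window and to build the `logit_bias` maps configured under `logit_bias` in `server/configs/*.yaml`. A minimal sketch of that usage follows; the 0.1 bias value comes from the configs, while the prompt, token margin, and call site are assumptions.

```python
# Sketch only: combine the helpers above into a context-length check and a
# logit_bias map for the task-parsing call. The 512-token margin is an
# assumption; the 0.1 bias mirrors logit_bias.parse_task in the configs.
from get_token_ids import (
    count_tokens,
    get_max_context_length,
    get_token_ids_for_task_parsing,
)

model = "text-davinci-003"
prompt = "#1 Task Planning Stage: ..."  # placeholder for the assembled prompt

# Keep room for the completion itself.
assert count_tokens(model, prompt) + 512 < get_max_context_length(model)

# Nudge the planner toward the task-parsing vocabulary.
parse_task_bias = {tid: 0.1 for tid in get_token_ids_for_task_parsing(model)}
# parse_task_bias would be passed as `logit_bias` in the completion request
# issued elsewhere in the server (not shown in this excerpt).
```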
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # ALL
2 | *.dev.yaml
3 |
4 | # for server
5 | server/models/*
6 | !server/models/download.sh
7 | !server/models/download.ps1
8 | server/logs/
9 | server/models_dev
10 | server/public/*
11 | !server/public/examples/
12 | server/public/examples/*
13 | !server/public/examples/a.jpg
14 | !server/public/examples/b.jpg
15 | !server/public/examples/c.jpg
16 | !server/public/examples/d.jpg
17 | !server/public/examples/e.jpg
18 | !server/public/examples/f.jpg
19 | !server/public/examples/g.jpg
20 |
21 | # docker
22 | Dockerfile
23 | docker-compose.yml
24 |
25 | # for gradio
26 | # server/run_gradio.py
27 |
28 | # for web
29 | web/node_modules
30 | web/package-lock.json
31 | web/dist
32 | web/electron-dist
33 | web/yarn.lock
34 |
35 | # Byte-compiled / optimized / DLL files
36 | __pycache__/
37 | *.py[cod]
38 | *$py.class
39 |
40 | # C extensions
41 | *.so
42 |
43 | # Distribution / packaging
44 | .Python
45 | build/
46 | develop-eggs/
47 | dist/
48 | downloads/
49 | eggs/
50 | .eggs/
51 | lib/
52 | lib64/
53 | parts/
54 | sdist/
55 | var/
56 | wheels/
57 | pip-wheel-metadata/
58 | share/python-wheels/
59 | *.egg-info/
60 | .installed.cfg
61 | *.egg
62 | MANIFEST
63 |
64 | # PyInstaller
65 | # Usually these files are written by a python script from a template
66 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
67 | *.manifest
68 | *.spec
69 |
70 | # Installer logs
71 | pip-log.txt
72 | pip-delete-this-directory.txt
73 |
74 | # Unit test / coverage reports
75 | htmlcov/
76 | .tox/
77 | .nox/
78 | .coverage
79 | .coverage.*
80 | .cache
81 | nosetests.xml
82 | coverage.xml
83 | *.cover
84 | *.py,cover
85 | .hypothesis/
86 | .pytest_cache/
87 |
88 | # Translations
89 | *.mo
90 | *.pot
91 |
92 | # Django stuff:
93 | *.log
94 | local_settings.py
95 | db.sqlite3
96 | db.sqlite3-journal
97 |
98 | # Flask stuff:
99 | instance/
100 | .webassets-cache
101 |
102 | # Scrapy stuff:
103 | .scrapy
104 |
105 | # Sphinx documentation
106 | docs/_build/
107 |
108 | # PyBuilder
109 | target/
110 |
111 | # Jupyter Notebook
112 | .ipynb_checkpoints
113 |
114 | # IPython
115 | profile_default/
116 | ipython_config.py
117 |
118 | # pyenv
119 | .python-version
120 |
121 | # pipenv
122 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
123 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
124 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
125 | # install all needed dependencies.
126 | #Pipfile.lock
127 |
128 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
129 | __pypackages__/
130 |
131 | # Celery stuff
132 | celerybeat-schedule
133 | celerybeat.pid
134 |
135 | # SageMath parsed files
136 | *.sage.py
137 |
138 | # Environments
139 | .env
140 | .venv
141 | env/
142 | venv/
143 | ENV/
144 | env.bak/
145 | venv.bak/
146 |
147 | # Spyder project settings
148 | .spyderproject
149 | .spyproject
150 |
151 | # Rope project settings
152 | .ropeproject
153 |
154 | # mkdocs documentation
155 | /site
156 |
157 | # mypy
158 | .mypy_cache/
159 | .dmypy.json
160 | dmypy.json
161 |
162 | # Pyre type checker
163 | .pyre/
164 |
165 |
--------------------------------------------------------------------------------
/web/src/prompt/index.ts:
--------------------------------------------------------------------------------
1 | import type {Collection, ChatMessage } from "@/types";
2 |
3 | const ChatGPTTerminalMessage:ChatMessage[] = [
4 | {
5 | role: "assistant",
6 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ",
7 | type: "text",
8 | first: true
9 | },
10 | {
11 | role: "user",
12 | content: "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. When I need to tell you something in English, I will do so by putting text inside curly brackets {like this}.",
13 | type: "text",
14 | first: true
15 | },
16 | {
17 | role: "assistant",
18 | content: "Yes, I will do it for you. Please type the command and I will reply with the terminal output.",
19 | type: "text",
20 | first: true
21 | }
22 | ]
23 |
24 | const ChatGPTPolishMessage:ChatMessage[] = [
25 | {
26 | role: "assistant",
27 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ",
28 | type: "text",
29 | first: true
30 | },
31 | {
32 | role: "user",
33 | content: "You are a well-trained AI writing assistant with expertise in writing academic papers for computer conferences. By giving you a draft paragraph, I hope you can help me polish my writing with your knowledge. The language should be concise and consistent with the style of an academic paper.",
34 | type: "text",
35 | first: true
36 | },
37 | {
38 | role: "assistant",
39 | content: "No problem, I will think carefully and polish the paper for you.",
40 | type: "text",
41 | first: true
42 | },
43 | ]
44 |
45 | const ChatGPTTranslationMessage:ChatMessage[] = [
46 | {
47 | role: "assistant",
48 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ",
49 | type: "text",
50 | first: true
51 | },
52 | {
53 | role: "user",
54 | content: "I want you to act as an English translator, spelling corrector and improver. I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in English. I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level English words and sentences. Keep the meaning same, but make them more literary. I want you to only reply the correction, the improvements and nothing else, do not write explanations.",
55 | type: "text",
56 | first: true
57 | },
58 | {
59 | role: "assistant",
60 | content: "Sure, I will act as an English translator and improver.",
61 | type: "text",
62 | first: true
63 | },
64 | ]
65 |
66 |
67 | const defaultChatGPTMessage:ChatMessage[] = [
68 | {
69 | role: "assistant",
70 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ",
71 | type: "text",
72 | first: true
73 | }
74 | ]
75 |
76 | const defaultHuggingGPTMessage:ChatMessage[] = [
77 | {
78 | role: "assistant",
79 | content: "Hi there, I am HuggingGPT empowered by Huggingface family! Yes, I can provide thousands of models for dozens of tasks. For more fun and creativity, I have invited Diffusers family to join our team. Feel free to experience it!",
80 | type: "text",
81 | first: true
82 | }
83 | ]
84 |
85 | const promptCollection: Collection = {
86 | "chatgpt": {
87 | "terminal": ChatGPTTerminalMessage,
88 | "polish": ChatGPTPolishMessage,
89 | "translation": ChatGPTTranslationMessage,
90 | "default": defaultChatGPTMessage,
91 | },
92 | "hugginggpt": {
93 | "default": defaultHuggingGPTMessage
94 | }
95 | }
96 |
97 |
98 | export default promptCollection
--------------------------------------------------------------------------------
/server/configs/config.gradio.yaml:
--------------------------------------------------------------------------------
1 | huggingface:
2 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens
3 | dev: false
4 | debug: true
5 | log_file: logs/debug.log
6 | model: text-davinci-003 # currently only text-davinci-003 is supported; more open-source LLMs will be supported in the future
7 | use_completion: true
8 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid
9 | local_deployment: full # minimal, standard or full, prefer full
10 | device: cuda:0 # cuda:id or cpu
11 | num_candidate_models: 5
12 | max_description_length: 100
13 | proxy: # optional: your proxy server "http://ip:port"
14 | local_inference_endpoint:
15 | host: localhost
16 | port: 8005
17 | logit_bias:
18 | parse_task: 0.1
19 | choose_model: 5
20 | tprompt:
21 | parse_task: >-
22 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON [].
23 | choose_model: >-
24 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability.
25 | response_results: >-
26 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results.
27 | demos_or_presteps:
28 | parse_task: demos/demo_parse_task.json
29 | choose_model: demos/demo_choose_model.json
30 | response_results: demos/demo_response_results.json
31 | prompt:
32 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks.
33 | choose_model: >-
34 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}.
35 | response_results: >-
36 |     Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it.
--------------------------------------------------------------------------------
/server/demos/demo_parse_task.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "role": "user",
4 | "content": "Give you some pictures e1.jpg, e2.png, e3.jpg, help me count the number of sheep?"
5 | },
6 | {
7 | "role": "assistant",
8 |         "content": "[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"-1\", \"text\": \"How many sheep in the picture\"}}, {\"task\": \"image-to-text\", \"id\": 3, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"object-detection\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"visual-question-answering\", \"id\": 5, \"dep\": [4], \"args\": {\"image\": \"-4\", \"text\": \"How many sheep in the picture\"}}, {\"task\": \"image-to-text\", \"id\": 6, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"object-detection\", \"id\": 7, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 8, \"dep\": [7], \"args\": {\"image\": \"-7\", \"text\": \"How many sheep in the picture\"}}]"
9 | },
10 |
11 | {
12 | "role":"user",
13 | "content":"Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one."
14 | },
15 | {
16 | "role":"assistant",
17 | "content":"[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"-1\", \"text\": \"how many objects in the picture?\" }}, {\"task\": \"text-to-image\", \"id\": 3, \"dep\": [0], \"args\": {\"text\": \"\" }}, {\"task\": \"image-to-image\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"text-to-video\", \"id\": 5, \"dep\": [0], \"args\": {\"text\": \"\" }}]"
18 | },
19 |
20 | {
21 | "role":"user",
22 | "content":"given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice"
23 | },
24 | {
25 | "role":"assistant",
26 |         "content":"[{\"task\": \"document-question-answering\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"visual-question-answering\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"image-to-text\", \"id\": 2, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"-2\" }}]"
27 | },
28 |
29 | {
30 | "role": "user",
31 | "content": "Given an image /example.jpg, first generate a hed image, then based on the hed image generate a new image where a girl is reading a book"
32 | },
33 | {
34 | "role": "assistant",
35 | "content": "[{\"task\": \"openpose-control\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"openpose-text-to-image\", \"id\": 1, \"dep\": [0], \"args\": {\"text\": \"a girl is reading a book\", \"image\": \"-0\" }}]"
36 | },
37 |
38 | {
39 | "role": "user",
40 | "content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it"
41 | },
42 | {
43 | "role": "assistant",
44 | "content": "[{\"task\": \"text-to-video\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-speech\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-image\", \"id\": 2, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}]"
45 | },
46 |
47 |
48 | {
49 | "role": "user",
50 | "content": "please show me a joke and an image of cat"
51 | },
52 | {
53 | "role": "assistant",
54 | "content": "[{\"task\": \"conversational\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"please show me a joke of cat\" }}, {\"task\": \"text-to-image\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a photo of cat\" }}]"
55 | }
56 | ]
57 |
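
Each assistant turn above encodes the task plan as a JSON list of objects with `task`, `id`, `dep`, and `args` fields, where `"dep": [-1]` marks a task with no prerequisites and `"-<id>"` in `args` refers to a resource produced by task `<id>`. As a hedged illustration (not the server's actual executor), such a plan can be decoded and walked in dependency order like this:

```python
# Sketch: decode a plan shaped like the assistant messages above and visit
# tasks in dependency order. Field names (task/id/dep/args) come from the
# demos; the scheduling loop itself is illustrative only.
import json

plan_text = (
    '[{"task": "image-to-text", "id": 0, "dep": [-1], "args": {"image": "/e.jpg"}},'
    ' {"task": "text-to-image", "id": 1, "dep": [0], "args": {"text": "-0"}}]'
)
tasks = {t["id"]: t for t in json.loads(plan_text)}

def ready(task, done):
    # dep == [-1] means no prerequisite; otherwise all listed tasks must be done.
    return all(dep == -1 or dep in done for dep in task["dep"])

done = set()
while len(done) < len(tasks):
    for tid, task in tasks.items():
        if tid not in done and ready(task, done):
            print(f"run {task['task']} (id {tid}) with args {task['args']}")
            done.add(tid)
```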
--------------------------------------------------------------------------------
/server/configs/config.lite.yaml:
--------------------------------------------------------------------------------
1 | openai:
2 | api_key: REPLACE_WITH_YOUR_OPENAI_API_KEY_HERE
3 | huggingface:
4 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens
5 | dev: false
6 | debug: false
7 | log_file: logs/debug.log
8 | model: text-davinci-003 # currently only text-davinci-003 and gpt-4 are supported; more open-source LLMs will be supported in the future
9 | use_completion: true
10 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid
11 | local_deployment: minimal # minimal, standard or full, prefer full
12 | num_candidate_models: 5
13 | max_description_length: 100
14 | proxy: # optional: your proxy server "http://ip:port"
15 | http_listen:
16 |   host: 0.0.0.0 # if you use the web client, set `BASE_URL` in `web/src/config/index.ts` to `http://{LAN_IP_of_the_server}:{port}/`.
17 | port: 8004
18 | logit_bias:
19 | parse_task: 0.1
20 | choose_model: 5
21 | tprompt:
22 | parse_task: >-
23 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON [].
24 | choose_model: >-
25 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability.
26 | response_results: >-
27 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results.
28 | demos_or_presteps:
29 | parse_task: demos/demo_parse_task.json
30 | choose_model: demos/demo_choose_model.json
31 | response_results: demos/demo_response_results.json
32 | prompt:
33 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks.
34 | choose_model: >-
35 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}.
36 | response_results: >-
37 |     Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it.
--------------------------------------------------------------------------------
/server/configs/config.azure.yaml:
--------------------------------------------------------------------------------
1 | azure:
2 | api_key: REPLACE_WITH_YOUR_AZURE_API_KEY_HERE
3 | base_url: REPLACE_WITH_YOUR_ENDPOINT_HERE
4 | deployment_name: REPLACE_WITH_YOUR_DEPLOYMENT_NAME_HERE
5 | api_version: "2022-12-01"
6 | huggingface:
7 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens
8 | dev: false
9 | debug: false
10 | log_file: logs/debug.log
11 | model: text-davinci-003 # currently only text-davinci-003 and gpt-4 are supported; more open-source LLMs will be supported in the future
12 | use_completion: true
13 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid
14 | local_deployment: full # minimal, standard or full, prefer full
15 | device: cuda:0 # cuda:id or cpu
16 | num_candidate_models: 5
17 | max_description_length: 100
18 | proxy: # optional: your proxy server "http://ip:port"
19 | http_listen:
20 |   host: 0.0.0.0 # if you use the web client, set `BASE_URL` in `web/src/config/index.ts` to `http://{LAN_IP_of_the_server}:{port}/`.
21 | port: 8004
22 | local_inference_endpoint:
23 | host: localhost
24 | port: 8005
25 | logit_bias:
26 | parse_task: 0.1
27 | choose_model: 5
28 | tprompt:
29 | parse_task: >-
30 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON [].
31 | choose_model: >-
32 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability.
33 | response_results: >-
34 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results.
35 | demos_or_presteps:
36 | parse_task: demos/demo_parse_task.json
37 | choose_model: demos/demo_choose_model.json
38 | response_results: demos/demo_response_results.json
39 | prompt:
40 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks.
41 | choose_model: >-
42 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}.
43 | response_results: >-
44 |     Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it.
--------------------------------------------------------------------------------
/server/configs/config.default.yaml:
--------------------------------------------------------------------------------
1 | openai:
2 | api_key: REPLACE_WITH_YOUR_OPENAI_API_KEY_HERE
3 | # azure:
4 | # api_key: REPLACE_WITH_YOUR_AZURE_API_KEY_HERE
5 | # base_url: REPLACE_WITH_YOUR_ENDPOINT_HERE
6 | # deployment_name: REPLACE_WITH_YOUR_DEPLOYMENT_NAME_HERE
7 | # api_version: "2022-12-01"
8 | huggingface:
9 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens
10 | dev: false
11 | debug: false
12 | log_file: logs/debug.log
13 | model: text-davinci-003 # currently only text-davinci-003 and gpt-4 are supported; more open-source LLMs will be supported in the future
14 | use_completion: true
15 | inference_mode: hybrid # local, huggingface or hybrid, prefer hybrid
16 | local_deployment: full # minimal, standard or full, prefer full
17 | device: cuda:0 # cuda:id or cpu
18 | num_candidate_models: 5
19 | max_description_length: 100
20 | proxy: # optional: your proxy server "http://ip:port"
21 | http_listen:
22 |   host: 0.0.0.0 # if you use the web client, set `BASE_URL` in `web/src/config/index.ts` to `http://{LAN_IP_of_the_server}:{port}/`.
23 | port: 8004
24 | local_inference_endpoint:
25 | host: localhost
26 | port: 8005
27 | logit_bias:
28 | parse_task: 0.1
29 | choose_model: 5
30 | tprompt:
31 | parse_task: >-
32 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON [].
33 | choose_model: >-
34 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability.
35 | response_results: >-
36 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results.
37 | demos_or_presteps:
38 | parse_task: demos/demo_parse_task.json
39 | choose_model: demos/demo_choose_model.json
40 | response_results: demos/demo_response_results.json
41 | prompt:
42 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks.
43 | choose_model: >-
44 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}.
45 | response_results: >-
46 | Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always be correct and require careful consideration when making decisions. Then please detail your workflow, including the used models and inference results for my request, in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete paths or urls of files in the inference results. If there is nothing in the results, please tell me you can't make it.
--------------------------------------------------------------------------------
/server/run_gradio_demo.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | import gradio as gr
3 | import re
4 | from diffusers.utils import load_image
5 | import requests
6 | from awesome_chat import chat_huggingface
7 |
8 | all_messages = []
9 | OPENAI_KEY = ""
10 |
11 | def add_message(content, role):
12 | message = {"role":role, "content":content}
13 | all_messages.append(message)
14 |
15 | def extract_medias(message):
16 | image_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(jpg|jpeg|tiff|gif|png)")
17 | image_urls = []
18 | for match in image_pattern.finditer(message):
19 | if match.group(0) not in image_urls:
20 | image_urls.append(match.group(0))
21 |
22 | audio_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(flac|wav)")
23 | audio_urls = []
24 | for match in audio_pattern.finditer(message):
25 | if match.group(0) not in audio_urls:
26 | audio_urls.append(match.group(0))
27 |
28 | video_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(mp4)")
29 | video_urls = []
30 | for match in video_pattern.finditer(message):
31 | if match.group(0) not in video_urls:
32 | video_urls.append(match.group(0))
33 |
34 | return image_urls, audio_urls, video_urls
35 |
36 | def set_openai_key(openai_key):
37 | global OPENAI_KEY
38 | OPENAI_KEY = openai_key
39 | return OPENAI_KEY
40 |
41 | def add_text(messages, message):
42 | if len(OPENAI_KEY) == 0 or not OPENAI_KEY.startswith("sk-"):
43 | return messages, "Please set your OpenAI API key first."
44 | add_message(message, "user")
45 | messages = messages + [(message, None)]
46 | image_urls, audio_urls, video_urls = extract_medias(message)
47 |
48 | for image_url in image_urls:
49 | if not image_url.startswith("http"):
50 | image_url = "public/" + image_url
51 | image = load_image(image_url)
52 | name = f"public/images/{str(uuid.uuid4())[:4]}.jpg"
53 | image.save(name)
54 | messages = messages + [((f"{name}",), None)]
55 | for audio_url in audio_urls:
56 | if not audio_url.startswith("http"):
57 | audio_url = "public/" + audio_url
58 | ext = audio_url.split(".")[-1]
59 | name = f"public/audios/{str(uuid.uuid4())[:4]}.{ext}"
60 | response = requests.get(audio_url)
61 | with open(name, "wb") as f:
62 | f.write(response.content)
63 | messages = messages + [((f"{name}",), None)]
64 | for video_url in video_urls:
65 | if not video_url.startswith("http"):
66 | video_url = "public/" + video_url
67 | ext = video_url.split(".")[-1]
68 | name = f"public/videos/{str(uuid.uuid4())[:4]}.{ext}"
69 | response = requests.get(video_url)
70 | with open(name, "wb") as f:
71 | f.write(response.content)
72 | messages = messages + [((f"{name}",), None)]
73 | return messages, ""
74 |
75 | def bot(messages):
76 | if len(OPENAI_KEY) == 0 or not OPENAI_KEY.startswith("sk-"):
77 | return messages
78 | message = chat_huggingface(all_messages, OPENAI_KEY, "openai", "https://api.openai.com/v1/completions")["message"]
79 | image_urls, audio_urls, video_urls = extract_medias(message)
80 | add_message(message, "assistant")
81 | messages[-1][1] = message
82 | for image_url in image_urls:
83 | if not image_url.startswith("http"):
84 | image_url = image_url.replace("public/", "")
85 | messages = messages + [((None, (f"public/{image_url}",)))]
86 | for audio_url in audio_urls:
87 | if not audio_url.startswith("http"):
88 | audio_url = audio_url.replace("public/", "")
89 | messages = messages + [((None, (f"public/{audio_url}",)))]
90 | for video_url in video_urls:
91 | if not video_url.startswith("http"):
92 | video_url = video_url.replace("public/", "")
93 | messages = messages + [((None, (f"public/{video_url}",)))]
94 | return messages
95 |
96 | with gr.Blocks() as demo:
97 | gr.Markdown("HuggingGPT (Dev)")
98 | with gr.Row():
99 | openai_api_key = gr.Textbox(
100 | show_label=False,
101 | placeholder="Set your OpenAI API key here and press Enter",
102 | lines=1,
103 | type="password",
104 | )
105 |
106 | chatbot = gr.Chatbot([], elem_id="chatbot").style(height=500)
107 |
108 | with gr.Row():
109 | txt = gr.Textbox(
110 | show_label=False,
111 | placeholder="Enter text and press enter. The url of the multimedia resource must contain the extension name.",
112 | ).style(container=False)
113 |
114 | txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
115 | bot, chatbot, chatbot
116 | )
117 | openai_api_key.submit(set_openai_key, [openai_api_key], [openai_api_key])
118 |
119 | gr.Examples(
120 | examples=["Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras in these picture?",
121 | "Please generate a canny image based on /examples/f.jpg",
122 | "show me a joke and an image of cat",
123 | "what is in the /examples/a.jpg",
124 | "generate a video and audio about a dog is running on the grass",
125 | "based on the /examples/a.jpg, please generate a video and audio",
126 | "based on pose of /examples/d.jpg and content of /examples/e.jpg, please show me a new image",
127 | ],
128 | inputs=txt
129 | )
130 |
131 | demo.launch()
--------------------------------------------------------------------------------
/web/src/assets/chatgpt.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/web/src/assets/huggingface.svg:
--------------------------------------------------------------------------------
1 |
2 |
74 |
--------------------------------------------------------------------------------
/web/src/assets/logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # JARVIS
2 |
3 | **This project is under construction and we will have all the code ready soon.**
4 |
5 |
6 |
7 |
8 |
9 |
10 | ## Updates
11 | + [2023.04.16] Jarvis now supports the OpenAI service on the Azure platform and the GPT-4 model.
12 | + [2023.04.06] We added the Gradio demo and built the web API for `/tasks` and `/results` in `server` mode.
13 | + The Gradio demo is now hosted on Hugging Face Space. (Built with `inference_mode=hybrid` and `local_deployment=standard`)
14 | + The web APIs `/tasks` and `/results` provide access to intermediate results for `Stage #1` (task planning) and `Stages #1-3` (model selection with execution results). See here.
15 | + [2023.04.03] We added the CLI mode and provided parameters for configuring the scale of local endpoints.
16 | + You can enjoy a lightweight experience with Jarvis without deploying the models locally. See here.
17 | + Just run `python awesome_chat.py --config configs/config.lite.yaml` to experience it.
18 | + [2023.04.01] We updated a version of the code for building.
19 |
20 | ## Overview
21 |
22 | Language serves as an interface for LLMs to connect numerous AI models for solving complicated AI tasks!
23 |
24 | See our paper: [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace](http://arxiv.org/abs/2303.17580), Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu and Yueting Zhuang (the first two authors contribute equally)
25 |
26 |
27 |
28 | We introduce a collaborative system that consists of **an LLM as the controller** and **numerous expert models as collaborative executors** (from HuggingFace Hub). The workflow of our system consists of four stages:
29 | + **Task Planning**: ChatGPT analyzes the user's request to understand their intention and decomposes it into solvable tasks (an example of the resulting task structure is sketched after this list).
30 | + **Model Selection**: To solve the planned tasks, ChatGPT selects expert models hosted on Hugging Face based on their descriptions.
31 | + **Task Execution**: Each selected model is invoked and executed, and its results are returned to ChatGPT.
32 | + **Response Generation**: Finally, ChatGPT integrates the predictions of all models and generates a response.
33 |
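As an illustration of the data handed from Task Planning to the later stages, the planner emits a list of task records in the format defined by the prompt in `server/configs/config.default.yaml`. A minimal sketch (the request and argument values below are hypothetical):

```python
# Hypothetical plan for "generate a new image based on the pose of /examples/d.jpg":
# each record carries a task type, an id, the ids of prerequisite tasks ("dep",
# with -1 meaning no dependency), and arguments, where "<GENERATED>-k" refers to
# the resource produced by task k.
plan = [
    {"task": "openpose-control", "id": 0, "dep": [-1],
     "args": {"image": "/examples/d.jpg"}},
    {"task": "openpose-text-to-image", "id": 1, "dep": [0],
     "args": {"image": "<GENERATED>-0", "text": "a person dancing"}},
]
```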
34 | ## System Requirements
35 |
36 | ### Default (Recommended)
37 |
38 | For `configs/config.default.yaml`:
39 |
40 | + Ubuntu 16.04 LTS
41 | + VRAM >= 24GB
42 | + RAM > 12GB (minimal), 16GB (standard), 80GB (full)
43 | + Disk > 284GB
44 | + 42GB for `damo-vilab/text-to-video-ms-1.7b`
45 | + 126GB for `ControlNet`
46 | + 66GB for `stable-diffusion-v1-5`
47 | + 50GB for others
48 |
49 | ### Minimum (Lite)
50 |
51 | For `configs/config.lite.yaml`:
52 |
53 | + Ubuntu 16.04 LTS
54 | + Nothing else
55 |
56 | The configuration `configs/config.lite.yaml` does not require any expert models to be downloaded and deployed locally. However, it means that Jarvis is restricted to models running stably on HuggingFace Inference Endpoints.
57 |
58 | ## Quick Start
59 |
60 | First, replace `openai.key` and `huggingface.token` in `server/configs/config.default.yaml` with **your personal OpenAI Key** and **your Hugging Face Token**, or put them in the environment variables `OPENAI_API_KEY` and `HUGGINGFACE_ACCESS_TOKEN` respectively. Then run the following commands:
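Either source works because the server checks the config file first and then falls back to the environment variable. A simplified sketch of that check, mirroring the logic in `server/awesome_chat.py`:

```python
import os

# Simplified from server/awesome_chat.py: prefer the key from the config file,
# otherwise fall back to the OPENAI_API_KEY environment variable.
def resolve_openai_key(config):
    if config["openai"]["api_key"].startswith("sk-"):
        return config["openai"]["api_key"]
    if os.getenv("OPENAI_API_KEY", "").startswith("sk-"):
        return os.getenv("OPENAI_API_KEY")
    raise ValueError("Incorrect OpenAI key. Please check your config file.")
```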
61 |
62 |
63 |
64 | ### For Server:
65 |
66 | ```bash
67 | # setup env
68 | cd server
69 | conda create -n jarvis python=3.8
70 | conda activate jarvis
71 | conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
72 | pip install -r requirements.txt
73 |
74 | # download models. Make sure that `git-lfs` is installed.
75 | cd models
76 | bash download.sh # required when `inference_mode` is `local` or `hybrid`.
77 |
78 | # run server
79 | cd ..
80 | python models_server.py --config configs/config.default.yaml # required when `inference_mode` is `local` or `hybrid`
81 | python awesome_chat.py --config configs/config.default.yaml --mode server # for text-davinci-003
82 | ```
83 |
84 | Now you can access Jarvis' services via the Web API.
85 |
86 | + `/hugginggpt` --method `POST`, access the full service.
87 | + `/tasks` --method `POST`, access intermediate results for Stage #1.
88 | + `/results` --method `POST`, access intermediate results for Stage #1-3.
89 |
90 | For example:
91 |
92 | ```bash
93 | # request
94 | curl --location 'http://localhost:8004/tasks' \
95 | --header 'Content-Type: application/json' \
96 | --data '{
97 | "messages": [
98 | {
99 | "role": "user",
100 | "content": "based on pose of /examples/d.jpg and content of /examples/e.jpg, please show me a new image"
101 | }
102 | ]
103 | }'
104 |
105 | # response
106 | [{"args":{"image":"/examples/d.jpg"},"dep":[-1],"id":0,"task":"openpose-control"},{"args":{"image":"/examples/e.jpg"},"dep":[-1],"id":1,"task":"image-to-text"},{"args":{"image":"-0","text":"-1"},"dep":[1,0],"id":2,"task":"openpose-text-to-image"}]
107 | ```
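
The same request can also be sent from Python. The sketch below targets the full `/hugginggpt` service; it assumes the server is running locally on port 8004 and that the response JSON contains a `message` field, as consumed by the bundled clients:

```python
import requests

# Hypothetical client for the full HuggingGPT service exposed by awesome_chat.py
# in server mode; adjust the host/port to match `http_listen` in your config.
payload = {
    "messages": [
        {"role": "user", "content": "show me a joke and an image of cat"}
    ]
}
resp = requests.post("http://localhost:8004/hugginggpt", json=payload)
print(resp.json()["message"])
```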
108 |
109 |
110 | ### For Web:
111 |
112 | We provide a user-friendly web page. After starting `awesome_chat.py` in server mode, you can run the following commands to communicate with Jarvis in your browser:
113 |
114 | - you need to install `nodejs` and `npm` first.
115 | - [ IMPORTANT ] if you are running the web client on another machine, you need to set `http://{LAN_IP_of_the_server}:{port}/` to `HUGGINGGPT_BASE_URL` of `web/src/config/index.ts`.
116 | - if you want to use the video generation feature, you need to compile `ffmpeg` manually with H.264.
117 | - you can switch to ChatGPT by double-clicking the setting icon!
118 |
119 | ```bash
120 | cd web
121 | npm install
122 | npm run dev
123 | ```
124 |
125 | ```bash
126 | # Optional: Install ffmpeg
127 | # This command needs to execute without errors.
128 | LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i input.mp4 -vcodec libx264 output.mp4
129 | ```
130 |
131 |
132 |
133 | ### For Gradio
134 |
135 | The Gradio demo is now hosted on Hugging Face Space. You can also run the following commands to start the demo locally:
136 |
137 | ```bash
138 | python models_server.py --config configs/config.gradio.yaml
139 | python run_gradio_demo.py --config configs/config.gradio.yaml
140 |
141 | # or run the HF Space as a Docker image (Built with `inference_mode=hybrid` and `local_deployment=standard`)
142 | docker run -it -p 7860:7860 --platform=linux/amd64 registry.hf.space/microsoft-hugginggpt:latest python app.py
143 | ```
144 |
145 | ### For CLI:
146 |
147 | You can also run Jarvis more easily in CLI mode:
148 |
149 | ```bash
150 | cd server
151 | python awesome_chat.py --config configs/config.default.yaml --mode cli
152 | ```
153 |
154 | Examples of CLI mode:
155 |
156 | ```
157 | Welcome to Jarvis! A collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors. Jarvis can plan tasks, schedule Hugging Face models, generate friendly responses based on your requests, and help you with many things. Please enter your request (`exit` to exit).
158 | [ User ]: Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras in these picture?
159 | [ Jarvis ]: Based on your request, I have used several models to process the images you provided. For the image A: /examples/a.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "a cat sitting on a window sill looking out". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/a3dc.jpg', which contains two objects with labels 'potted plant' and 'cat'. For the image B: /examples/b.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "two zebras standing in a field of dry grass". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/f5df.jpg', which contains three objects with labels 'zebra'. For the image C: /examples/c.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "a zebra standing in a field of tall grass". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/69c7.jpg', which contains one object with label 'zebra'. Finally, I used the canny-control model canny-control to generate the image '/images/7e18.png' based on the image /examples/f.jpg. Therefore, there are four zebras in these pictures. Is there anything else I can help you with?
160 | [ User ]: Please answer all the named entities in the sentence: Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby.
161 | [ Jarvis ]: Sure, I understand your request. Based on the inference results of the models, I will answer your request step by step. Firstly, I used the token-classification model dslim/bert-base-NER to recognize the named entities in the sentence. The inference result is that the named entities are Iron Man, Stan Lee, Larry Lieber, Don Heck and Jack Kirby. Then, I used the question-answering model deepset/roberta-base-squad2 to answer your request. The inference result is that there is no output since the context cannot be empty. Therefore, I cannot make it. I hope this answer your request. Is there anything else I can help you with?
162 | [ User ]: Please generate a canny image based on /examples/f.jpg
163 | [ Jarvis ]: Sure. I have generated a canny image based on /examples/f.jpg. To do this, I first used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text description of the image, which is "a herd of giraffes and zebras grazing in a field". Then I used the canny-control model to generate the canny image of the original image, which is stored in the path "/images/1ea2.png". Finally, I used the canny-text-to-image model lllyasviel/sd-controlnet-canny to generate the canny image based on the text description and the canny image, which is stored in the path "/images/ba66.png". Do you need anything else?
164 | ```
165 |
166 |
167 | ## Configuration
168 |
169 | The server-side configuration file is `server/configs/config.default.yaml`, and some parameters are presented as follows:
170 |
171 | + `model`: LLM, currently supports `text-davinci-003`. We are working on integrating more open-source LLMs.
172 | + `inference_mode`: mode of inference endpoints
173 | + `local`: only use the local inference endpoints
174 | + `huggingface`: only use the Hugging Face Inference Endpoints **(no local inference endpoints required)**
175 | + `hybrid`: both `local` and `huggingface`
176 | + `local_deployment`: scale of locally deployed models, works under `local` or `hybrid` inference mode:
177 | + `minimal` (RAM>12GB, ControlNet only)
178 | + `standard` (RAM>16GB, ControlNet + Standard Pipelines)
179 | + `full` (RAM>42GB, All registered models)
180 |
181 | On a personal laptop, we recommend the configuration of `inference_mode: hybrid` and `local_deployment: minimal`. However, the available models under this setting may be limited due to the instability of remote Hugging Face Inference Endpoints.
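
For reference, these fields are read directly from the YAML file by the server. A minimal sketch of how `models_server.py` consumes them (simplified from the actual script):

```python
import yaml

# Simplified from server/models_server.py: load the config and pick the
# fields that control where and how expert models are served.
config = yaml.load(open("configs/config.default.yaml", "r"), Loader=yaml.FullLoader)

inference_mode = config["inference_mode"]            # local, huggingface or hybrid
local_deployment = config["local_deployment"]        # minimal, standard or full
device = config.get("device", "cuda:0")              # cuda:id or cpu
port = config["local_inference_endpoint"]["port"]    # where models_server.py listens
```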
182 |
183 | ## Screenshots
184 |
185 |
186 |
187 | ## Citation
188 | If you find this work useful in your method, you can cite the paper as below:
189 |
190 | @article{shen2023hugginggpt,
191 | title = {HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace},
192 | author = {Shen, Yongliang and Song, Kaitao and Tan, Xu and Li, Dongsheng and Lu, Weiming and Zhuang, Yueting},
193 | journal = {arXiv preprint arXiv:2303.17580},
194 | year = {2023}
195 | }
196 |
197 | ## Acknowledgement
198 |
199 | - [ChatGPT](https://platform.openai.com/)
200 | - [Hugging Face](https://huggingface.co/)
201 | - [ControlNet](https://github.com/lllyasviel/ControlNet)
202 | - [ChatGPT-vue](https://github.com/lianginx/chatgpt-vue)
203 |
--------------------------------------------------------------------------------
/web/src/views/home.vue:
--------------------------------------------------------------------------------
1 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
320 |
321 |
322 |
323 |
324 | {{title}}
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
339 |
{{roleAlias[item.role]}} :
340 |
344 |
345 |
349 |
351 |
352 |
355 |
356 |
357 |
358 | {{item.content}}
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 | Please input OpenAI key:
371 |
372 |
373 |
383 |
390 |
391 |
394 |
395 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
433 |
--------------------------------------------------------------------------------
/server/models_server.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import random
4 | import uuid
5 | import numpy as np
6 | from transformers import pipeline
7 | from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
8 | from diffusers.utils import load_image
9 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
10 | from diffusers.utils import export_to_video
11 | from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
12 | from transformers import BlipProcessor, BlipForConditionalGeneration
13 | from transformers import TrOCRProcessor, VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
14 | from datasets import load_dataset
15 | from PIL import Image
16 | import flask
17 | from flask import request, jsonify
18 | import waitress
19 | from flask_cors import CORS
20 | import io
21 | from torchvision import transforms
22 | import torch
23 | import torchaudio
24 | from speechbrain.pretrained import WaveformEnhancement
25 | import joblib
26 | from huggingface_hub import hf_hub_url, cached_download
27 | from transformers import AutoImageProcessor, TimesformerForVideoClassification
28 | from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, AutoFeatureExtractor
29 | from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
30 | from controlnet_aux.open_pose.body import Body
31 | from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
32 | from controlnet_aux.hed import Network
33 | from transformers import DPTForDepthEstimation, DPTFeatureExtractor
34 | import warnings
35 | import time
36 | from espnet2.bin.tts_inference import Text2Speech
37 | import soundfile as sf
38 | from asteroid.models import BaseModel
39 | import traceback
40 | import os
41 | import yaml
42 |
43 | warnings.filterwarnings("ignore")
44 |
45 | parser = argparse.ArgumentParser()
46 | parser.add_argument("--config", type=str, default="configs/config.default.yaml")
47 | args = parser.parse_args()
48 |
49 | logger = logging.getLogger(__name__)
50 | logger.setLevel(logging.INFO)
51 | handler = logging.StreamHandler()
52 | handler.setLevel(logging.INFO)
53 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
54 | handler.setFormatter(formatter)
55 | logger.addHandler(handler)
56 |
57 | config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)
58 |
59 | # host = config["local_inference_endpoint"]["host"]
60 | port = config["local_inference_endpoint"]["port"]
61 |
62 | local_deployment = config["local_deployment"]
63 | device = config.get("device", "cuda:0")
64 |
65 | PROXY = None
66 | if config["proxy"]:
67 | PROXY = {
68 | "https": config["proxy"],
69 | }
70 |
71 | app = flask.Flask(__name__)
72 | CORS(app)
73 |
74 | start = time.time()
75 |
76 | local_fold = "models"
77 | # if args.config.endswith(".dev"):
78 | # local_fold = "models_dev"
79 |
80 |
81 | def load_pipes(local_deployment):
82 | other_pipes = {}
83 | standard_pipes = {}
84 | controlnet_sd_pipes = {}
85 | if local_deployment in ["full"]:
86 | other_pipes = {
87 | "nlpconnect/vit-gpt2-image-captioning":{
88 | "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
89 | "feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
90 | "tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
91 | "device": device
92 | },
93 | # "Salesforce/blip-image-captioning-large": {
94 | # "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
95 | # "processor": BlipProcessor.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
96 | # "device": device
97 | # },
98 | "damo-vilab/text-to-video-ms-1.7b": {
99 | "model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
100 | "device": device
101 | },
102 | # "facebook/maskformer-swin-large-ade": {
103 | # "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
104 | # "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"),
105 | # "device": device
106 | # },
107 | # "microsoft/trocr-base-printed": {
108 | # "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
109 | # "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
110 | # "device": device
111 | # },
112 | # "microsoft/trocr-base-handwritten": {
113 | # "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
114 | # "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
115 | # "device": device
116 | # },
117 | "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
118 | "model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
119 | "device": device
120 | },
121 | "espnet/kan-bayashi_ljspeech_vits": {
122 | "model": Text2Speech.from_pretrained(f"espnet/kan-bayashi_ljspeech_vits"),
123 | "device": device
124 | },
125 | "lambdalabs/sd-image-variations-diffusers": {
126 | "model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), #torch_dtype=torch.float16
127 | "device": device
128 | },
129 | # "CompVis/stable-diffusion-v1-4": {
130 | # "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"),
131 | # "device": device
132 | # },
133 | # "stabilityai/stable-diffusion-2-1": {
134 | # "model": DiffusionPipeline.from_pretrained(f"{local_fold}/stabilityai/stable-diffusion-2-1"),
135 | # "device": device
136 | # },
137 | "runwayml/stable-diffusion-v1-5": {
138 | "model": DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"),
139 | "device": device
140 | },
141 | # "microsoft/speecht5_tts":{
142 | # "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
143 | # "model": SpeechT5ForTextToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
144 | # "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
145 | # "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
146 | # "device": device
147 | # },
148 | # "speechbrain/mtl-mimic-voicebank": {
149 | # "model": WaveformEnhancement.from_hparams(source="speechbrain/mtl-mimic-voicebank", savedir="models/mtl-mimic-voicebank"),
150 | # "device": device
151 | # },
152 | "microsoft/speecht5_vc":{
153 | "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
154 | "model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
155 | "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
156 | "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
157 | "device": device
158 | },
159 | # "julien-c/wine-quality": {
160 | # "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib")))
161 | # },
162 | # "facebook/timesformer-base-finetuned-k400": {
163 | # "processor": AutoImageProcessor.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
164 | # "model": TimesformerForVideoClassification.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
165 | # "device": device
166 | # },
167 | "facebook/maskformer-swin-base-coco": {
168 | "feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
169 | "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
170 | "device": device
171 | },
172 | "Intel/dpt-hybrid-midas": {
173 | "model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
174 | "feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"),
175 | "device": device
176 | }
177 | }
178 |
179 | if local_deployment in ["full", "standard"]:
180 | standard_pipes = {
181 | # "superb/wav2vec2-base-superb-ks": {
182 | # "model": pipeline(task="audio-classification", model=f"{local_fold}/superb/wav2vec2-base-superb-ks"),
183 | # "device": device
184 | # },
185 | "openai/whisper-base": {
186 | "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"),
187 | "device": device
188 | },
189 | "microsoft/speecht5_asr": {
190 | "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"),
191 | "device": device
192 | },
193 | "Intel/dpt-large": {
194 | "model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"),
195 | "device": device
196 | },
197 | # "microsoft/beit-base-patch16-224-pt22k-ft22k": {
198 | # "model": pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"),
199 | # "device": device
200 | # },
201 | "facebook/detr-resnet-50-panoptic": {
202 | "model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"),
203 | "device": device
204 | },
205 | "facebook/detr-resnet-101": {
206 | "model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"),
207 | "device": device
208 | },
209 | # "openai/clip-vit-large-patch14": {
210 | # "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"),
211 | # "device": device
212 | # },
213 | "google/owlvit-base-patch32": {
214 | "model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"),
215 | "device": device
216 | },
217 | # "microsoft/DialoGPT-medium": {
218 | # "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"),
219 | # "device": device
220 | # },
221 | # "bert-base-uncased": {
222 | # "model": pipeline(task="fill-mask", model=f"{local_fold}/bert-base-uncased"),
223 | # "device": device
224 | # },
225 | # "deepset/roberta-base-squad2": {
226 | # "model": pipeline(task = "question-answering", model=f"{local_fold}/deepset/roberta-base-squad2"),
227 | # "device": device
228 | # },
229 | # "facebook/bart-large-cnn": {
230 | # "model": pipeline(task="summarization", model=f"{local_fold}/facebook/bart-large-cnn"),
231 | # "device": device
232 | # },
233 | # "google/tapas-base-finetuned-wtq": {
234 | # "model": pipeline(task="table-question-answering", model=f"{local_fold}/google/tapas-base-finetuned-wtq"),
235 | # "device": device
236 | # },
237 | # "distilbert-base-uncased-finetuned-sst-2-english": {
238 | # "model": pipeline(task="text-classification", model=f"{local_fold}/distilbert-base-uncased-finetuned-sst-2-english"),
239 | # "device": device
240 | # },
241 | # "gpt2": {
242 | # "model": pipeline(task="text-generation", model="gpt2"),
243 | # "device": device
244 | # },
245 | # "mrm8488/t5-base-finetuned-question-generation-ap": {
246 | # "model": pipeline(task="text2text-generation", model=f"{local_fold}/mrm8488/t5-base-finetuned-question-generation-ap"),
247 | # "device": device
248 | # },
249 | # "Jean-Baptiste/camembert-ner": {
250 | # "model": pipeline(task="token-classification", model=f"{local_fold}/Jean-Baptiste/camembert-ner", aggregation_strategy="simple"),
251 | # "device": device
252 | # },
253 | # "t5-base": {
254 | # "model": pipeline(task="translation", model=f"{local_fold}/t5-base"),
255 | # "device": device
256 | # },
257 | "impira/layoutlm-document-qa": {
258 | "model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"),
259 | "device": device
260 | },
261 | "ydshieh/vit-gpt2-coco-en": {
262 | "model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"),
263 | "device": device
264 | },
265 | "dandelin/vilt-b32-finetuned-vqa": {
266 | "model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"),
267 | "device": device
268 | }
269 | }
270 |
271 | if local_deployment in ["full", "standard", "minimal"]:
272 | controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
273 | controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained(
274 | f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
275 | )
276 |
277 | def mlsd_control_network():
278 | model = MobileV2_MLSD_Large()
279 | model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True)
280 | return MLSDdetector(model)
281 |
282 |
283 | hed_network = Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth")
284 |
285 | controlnet_sd_pipes = {
286 | "openpose-control": {
287 | "model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"))
288 | },
289 | "mlsd-control": {
290 | "model": mlsd_control_network()
291 | },
292 | "hed-control": {
293 | "model": HEDdetector(hed_network)
294 | },
295 | "scribble-control": {
296 | "model": HEDdetector(hed_network)
297 | },
298 | "midas-control": {
299 | "model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt")
300 | },
301 | "canny-control": {
302 | "model": CannyDetector()
303 | },
304 | "lllyasviel/sd-controlnet-canny":{
305 | "control": controlnet,
306 | "model": controlnetpipe,
307 | "device": device
308 | },
309 | "lllyasviel/sd-controlnet-depth":{
310 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16),
311 | "model": controlnetpipe,
312 | "device": device
313 | },
314 | "lllyasviel/sd-controlnet-hed":{
315 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16),
316 | "model": controlnetpipe,
317 | "device": device
318 | },
319 | "lllyasviel/sd-controlnet-mlsd":{
320 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16),
321 | "model": controlnetpipe,
322 | "device": device
323 | },
324 | "lllyasviel/sd-controlnet-openpose":{
325 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
326 | "model": controlnetpipe,
327 | "device": device
328 | },
329 | "lllyasviel/sd-controlnet-scribble":{
330 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16),
331 | "model": controlnetpipe,
332 | "device": device
333 | },
334 | "lllyasviel/sd-controlnet-seg":{
335 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16),
336 | "model": controlnetpipe,
337 | "device": device
338 | }
339 | }
340 | pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes}
341 | return pipes
342 |
343 | pipes = load_pipes(local_deployment)
344 |
345 | end = time.time()
346 | during = end - start
347 |
348 | print(f"[ ready ] {during}s")
349 |
350 | @app.route('/running', methods=['GET'])
351 | def running():
352 | return jsonify({"running": True})
353 |
354 | @app.route('/status/<path:model_id>', methods=['GET'])
355 | def status(model_id):
356 | disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"]
357 | if model_id in pipes.keys() and model_id not in disabled_models:
358 | print(f"[ check {model_id} ] success")
359 | return jsonify({"loaded": True})
360 | else:
361 | print(f"[ check {model_id} ] failed")
362 | return jsonify({"loaded": False})
363 |
364 | @app.route('/models/<path:model_id>', methods=['POST'])
365 | def models(model_id):
366 | while "using" in pipes[model_id] and pipes[model_id]["using"]:
367 | print(f"[ inference {model_id} ] waiting")
368 | time.sleep(0.1)
369 | pipes[model_id]["using"] = True
370 | print(f"[ inference {model_id} ] start")
371 |
372 | start = time.time()
373 |
374 | pipe = pipes[model_id]["model"]
375 |
376 | if "device" in pipes[model_id]:
377 | try:
378 | pipe.to(pipes[model_id]["device"])
379 | except:
380 | pipe.device = torch.device(pipes[model_id]["device"])
381 | pipe.model.to(pipes[model_id]["device"])
382 |
383 | result = None
384 | try:
385 | # text to video
386 | if model_id == "damo-vilab/text-to-video-ms-1.7b":
387 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
388 | # pipe.enable_model_cpu_offload()
389 | prompt = request.get_json()["text"]
390 | video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames
391 | video_path = export_to_video(video_frames)
392 | file_name = str(uuid.uuid4())[:4]
393 | os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4")
394 | result = {"path": f"/videos/{file_name}.mp4"}
395 |
396 | # controlnet
397 | if model_id.startswith("lllyasviel/sd-controlnet-"):
398 | pipe.controlnet.to('cpu')
399 | pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"])
400 | pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
401 | control_image = load_image(request.get_json()["img_url"])
402 | # generator = torch.manual_seed(66)
403 | out_image: Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0]
404 | file_name = str(uuid.uuid4())[:4]
405 | out_image.save(f"public/images/{file_name}.png")
406 | result = {"path": f"/images/{file_name}.png"}
407 |
408 | if model_id.endswith("-control"):
409 | image = load_image(request.get_json()["img_url"])
410 | if "scribble" in model_id:
411 | control = pipe(image, scribble = True)
412 | elif "canny" in model_id:
413 | control = pipe(image, low_threshold=100, high_threshold=200)
414 | else:
415 | control = pipe(image)
416 | file_name = str(uuid.uuid4())[:4]
417 | control.save(f"public/images/{file_name}.png")
418 | result = {"path": f"/images/{file_name}.png"}
419 |
420 | # image to image
421 | if model_id == "lambdalabs/sd-image-variations-diffusers":
422 | im = load_image(request.get_json()["img_url"])
423 | file_name = str(uuid.uuid4())[:4]
424 | with open(f"public/images/{file_name}.png", "wb") as f:
425 | f.write(request.data)
426 | tform = transforms.Compose([
427 | transforms.ToTensor(),
428 | transforms.Resize(
429 | (224, 224),
430 | interpolation=transforms.InterpolationMode.BICUBIC,
431 | antialias=False,
432 | ),
433 | transforms.Normalize(
434 | [0.48145466, 0.4578275, 0.40821073],
435 | [0.26862954, 0.26130258, 0.27577711]),
436 | ])
437 | inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0)
438 | out = pipe(inp, guidance_scale=3)
439 | out["images"][0].save(f"public/images/{file_name}.jpg")
440 | result = {"path": f"/images/{file_name}.jpg"}
441 |
442 | # image to text
443 | if model_id == "Salesforce/blip-image-captioning-large":
444 | raw_image = load_image(request.get_json()["img_url"]).convert('RGB')
445 | text = request.get_json()["text"]
446 | inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"])
447 | out = pipe.generate(**inputs)
448 | caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True)
449 | result = {"generated text": caption}
450 | if model_id == "ydshieh/vit-gpt2-coco-en":
451 | img_url = request.get_json()["img_url"]
452 | generated_text = pipe(img_url)[0]['generated_text']
453 | result = {"generated text": generated_text}
454 | if model_id == "nlpconnect/vit-gpt2-image-captioning":
455 | image = load_image(request.get_json()["img_url"]).convert("RGB")
456 | pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values
457 | pixel_values = pixel_values.to(pipes[model_id]["device"])
458 | generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1})
459 | generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
460 | result = {"generated text": generated_text}
461 | # image to text: OCR
462 | if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
463 | image = load_image(request.get_json()["img_url"]).convert("RGB")
464 | pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
465 | pixel_values = pixel_values.to(pipes[model_id]["device"])
466 | generated_ids = pipe.generate(pixel_values)
467 | generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0]
468 | result = {"generated text": generated_text}
469 |
470 | # text to image
471 | if model_id == "runwayml/stable-diffusion-v1-5":
472 | file_name = str(uuid.uuid4())[:4]
473 | text = request.get_json()["text"]
474 | out = pipe(prompt=text)
475 | out["images"][0].save(f"public/images/{file_name}.jpg")
476 | result = {"path": f"/images/{file_name}.jpg"}
477 |
478 | # object detection
479 | if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101":
480 | img_url = request.get_json()["img_url"]
481 | open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"]
482 | result = pipe(img_url, candidate_labels=open_types)
483 |
484 | # VQA
485 | if model_id == "dandelin/vilt-b32-finetuned-vqa":
486 | question = request.get_json()["text"]
487 | img_url = request.get_json()["img_url"]
488 | result = pipe(question=question, image=img_url)
489 |
490 | #DQA
491 | if model_id == "impira/layoutlm-document-qa":
492 | question = request.get_json()["text"]
493 | img_url = request.get_json()["img_url"]
494 | result = pipe(img_url, question)
495 |
496 | # depth-estimation
497 | if model_id == "Intel/dpt-large":
498 | output = pipe(request.get_json()["img_url"])
499 | image = output['depth']
500 | name = str(uuid.uuid4())[:4]
501 | image.save(f"public/images/{name}.jpg")
502 | result = {"path": f"/images/{name}.jpg"}
503 |
504 | if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large":
505 | image = load_image(request.get_json()["img_url"])
506 | inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt")
507 | with torch.no_grad():
508 | outputs = pipe(**inputs)
509 | predicted_depth = outputs.predicted_depth
510 | prediction = torch.nn.functional.interpolate(
511 | predicted_depth.unsqueeze(1),
512 | size=image.size[::-1],
513 | mode="bicubic",
514 | align_corners=False,
515 | )
516 | output = prediction.squeeze().cpu().numpy()
517 | formatted = (output * 255 / np.max(output)).astype("uint8")
518 | image = Image.fromarray(formatted)
519 | name = str(uuid.uuid4())[:4]
520 | image.save(f"public/images/{name}.jpg")
521 | result = {"path": f"/images/{name}.jpg"}
522 |
523 | # TTS
524 | if model_id == "espnet/kan-bayashi_ljspeech_vits":
525 | text = request.get_json()["text"]
526 | wav = pipe(text)["wav"]
527 | name = str(uuid.uuid4())[:4]
528 | sf.write(f"public/audios/{name}.wav", wav.cpu().numpy(), pipe.fs, "PCM_16")
529 | result = {"path": f"/audios/{name}.wav"}
530 |
531 | if model_id == "microsoft/speecht5_tts":
532 | text = request.get_json()["text"]
533 | inputs = pipes[model_id]["processor"](text=text, return_tensors="pt")
534 | embeddings_dataset = pipes[model_id]["embeddings_dataset"]
535 | speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"])
536 | pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
537 | speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
538 | name = str(uuid.uuid4())[:4]
539 | sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
540 | result = {"path": f"/audios/{name}.wav"}
541 |
542 | # ASR
543 | if model_id == "openai/whisper-base" or model_id == "microsoft/speecht5_asr":
544 | audio_url = request.get_json()["audio_url"]
545 | result = { "text": pipe(audio_url)["text"]}
546 |
547 | # audio to audio
548 | if model_id == "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k":
549 | audio_url = request.get_json()["audio_url"]
550 | wav, sr = torchaudio.load(audio_url)
551 | with torch.no_grad():
552 | result_wav = pipe(wav.to(pipes[model_id]["device"]))
553 | name = str(uuid.uuid4())[:4]
554 | sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr)
555 | result = {"path": f"/audios/{name}.wav"}
556 |
557 | if model_id == "microsoft/speecht5_vc":
558 | audio_url = request.get_json()["audio_url"]
559 | wav, sr = torchaudio.load(audio_url)
560 | inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt")
561 | embeddings_dataset = pipes[model_id]["embeddings_dataset"]
562 | speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
563 | pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
564 | speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
565 | name = str(uuid.uuid4())[:4]
566 | sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
567 | result = {"path": f"/audios/{name}.wav"}
568 |
569 | # segmentation
570 | if model_id == "facebook/detr-resnet-50-panoptic":
571 | result = []
572 | segments = pipe(request.get_json()["img_url"])
573 | image = load_image(request.get_json()["img_url"])
574 |
575 | colors = []
576 | for i in range(len(segments)):
577 | colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50))
578 |
579 | for i, segment in enumerate(segments):
580 | mask = segment["mask"]
581 | mask = mask.convert('L')
582 | layer = Image.new('RGBA', mask.size, colors[i])
583 | image.paste(layer, (0, 0), mask)
584 | name = str(uuid.uuid4())[:4]
585 | image.save(f"public/images/{name}.jpg")
586 | result = {"path": f"/images/{name}.jpg"}
587 |
588 | if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade":
589 | image = load_image(request.get_json()["img_url"])
590 | inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"])
591 | outputs = pipe(**inputs)
592 | result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
593 | predicted_panoptic_map = result["segmentation"].cpu().numpy()
594 | predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8))
595 | name = str(uuid.uuid4())[:4]
596 | predicted_panoptic_map.save(f"public/images/{name}.jpg")
597 | result = {"path": f"/images/{name}.jpg"}
598 |
599 | except Exception as e:
600 | print(e)
601 | traceback.print_exc()
602 | result = {"error": {"message": "Error when running the model inference."}}
603 |
604 | if "device" in pipes[model_id]:
605 | try:
606 | pipe.to("cpu")
607 | torch.cuda.empty_cache()
608 | except:
609 | pipe.device = torch.device("cpu")
610 | pipe.model.to("cpu")
611 | torch.cuda.empty_cache()
612 |
613 | pipes[model_id]["using"] = False
614 |
615 | if result is None:
616 | result = {"error": {"message": "model not found"}}
617 |
618 | end = time.time()
619 | during = end - start
620 | print(f"[ complete {model_id} ] {during}s")
621 | print(f"[ result {model_id} ] {result}")
622 |
623 | return jsonify(result)
624 |
625 |
626 | if __name__ == '__main__':
627 | # temp folders
628 | if not os.path.exists("public/audios"):
629 | os.makedirs("public/audios")
630 | if not os.path.exists("public/images"):
631 | os.makedirs("public/images")
632 | if not os.path.exists("public/videos"):
633 | os.makedirs("public/videos")
634 |
635 | waitress.serve(app, host="0.0.0.0", port=port)
--------------------------------------------------------------------------------
/server/awesome_chat.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import copy
3 | from io import BytesIO
4 | import io
5 | import os
6 | import random
7 | import time
8 | import traceback
9 | import uuid
10 | import requests
11 | import re
12 | import json
13 | import logging
14 | import argparse
15 | import yaml
16 | from PIL import Image, ImageDraw
17 | from diffusers.utils import load_image
18 | from pydub import AudioSegment
19 | import threading
20 | from queue import Queue
21 | import flask
22 | from flask import request, jsonify
23 | import waitress
24 | from flask_cors import CORS, cross_origin
25 | from get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length
26 | from huggingface_hub.inference_api import InferenceApi
27 | from huggingface_hub.inference_api import ALL_TASKS
28 |
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--config", type=str, default="configs/config.default.yaml")
31 | parser.add_argument("--mode", type=str, default="cli")
32 | args = parser.parse_args()
33 |
34 | if __name__ != "__main__":
35 | args.config = "configs/config.gradio.yaml"
36 | args.mode = "gradio"
37 |
38 | config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)
39 |
40 | os.makedirs("logs", exist_ok=True)
41 | os.makedirs("public/images", exist_ok=True)
42 | os.makedirs("public/audios", exist_ok=True)
43 | os.makedirs("public/videos", exist_ok=True)
44 |
45 |
46 | logger = logging.getLogger(__name__)
47 | logger.setLevel(logging.DEBUG)
48 |
49 | handler = logging.StreamHandler()
50 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
51 | handler.setFormatter(formatter)
52 | if not config["debug"]:
53 | handler.setLevel(logging.CRITICAL)
54 | logger.addHandler(handler)
55 |
56 | log_file = config["log_file"]
57 | if log_file:
58 | filehandler = logging.FileHandler(log_file)
59 | filehandler.setLevel(logging.DEBUG)
60 | filehandler.setFormatter(formatter)
61 | logger.addHandler(filehandler)
62 |
63 | LLM = config["model"]
64 | use_completion = config["use_completion"]
65 |
66 | # consistent: wrong msra model name
67 | LLM_encoding = LLM
68 | if config["dev"] and LLM == "gpt-3.5-turbo":
69 | LLM_encoding = "text-davinci-003"
70 | task_parsing_highlight_ids = get_token_ids_for_task_parsing(LLM_encoding)
71 | choose_model_highlight_ids = get_token_ids_for_choose_model(LLM_encoding)
72 |
73 | # ENDPOINT MODEL NAME
74 | # /v1/chat/completions gpt-4, gpt-4-0314, gpt-4-32k, gpt-4-32k-0314, gpt-3.5-turbo, gpt-3.5-turbo-0301
75 | # /v1/completions text-davinci-003, text-davinci-002, text-curie-001, text-babbage-001, text-ada-001, davinci, curie, babbage, ada
76 |
77 | if use_completion:
78 | api_name = "completions"
79 | else:
80 | api_name = "chat/completions"
81 |
82 | API_TYPE = None
83 | # priority: local > azure > openai
84 | if "dev" in config and config["dev"]:
85 | API_TYPE = "local"
86 | elif "azure" in config:
87 | API_TYPE = "azure"
88 | elif "openai" in config:
89 | API_TYPE = "openai"
90 | else:
91 | logger.warning(f"No endpoint specified in {args.config}. The endpoint will be set dynamically according to the client.")
92 |
93 | if args.mode in ["test", "cli"]:
94 | assert API_TYPE, "Only server mode supports dynamic endpoint."
95 |
96 | API_KEY = None
97 | API_ENDPOINT = None
98 | if API_TYPE == "local":
99 | API_ENDPOINT = f"{config['local']['endpoint']}/v1/{api_name}"
100 | elif API_TYPE == "azure":
101 | API_ENDPOINT = f"{config['azure']['base_url']}/openai/deployments/{config['azure']['deployment_name']}/{api_name}?api-version={config['azure']['api_version']}"
102 | API_KEY = config["azure"]["api_key"]
103 | elif API_TYPE == "openai":
104 | API_ENDPOINT = f"https://api.openai.com/v1/{api_name}"
105 | if config["openai"]["api_key"].startswith("sk-"): # Check for valid OpenAI key in config file
106 | API_KEY = config["openai"]["api_key"]
107 | elif "OPENAI_API_KEY" in os.environ and os.getenv("OPENAI_API_KEY").startswith("sk-"): # Check for environment variable OPENAI_API_KEY
108 | API_KEY = os.getenv("OPENAI_API_KEY")
109 | else:
110 | raise ValueError(f"Incorrect OpenAI key. Please check your {args.config} file.")
111 |
112 | PROXY = None
113 | if config["proxy"]:
114 | PROXY = {
115 | "https": config["proxy"],
116 | }
117 |
118 | inference_mode = config["inference_mode"]
119 |
120 | # check the local_inference_endpoint
121 | Model_Server = None
122 | if inference_mode!="huggingface":
123 | Model_Server = "http://" + config["local_inference_endpoint"]["host"] + ":" + str(config["local_inference_endpoint"]["port"])
124 | message = f"The server of local inference endpoints is not running, please start it first. (or using `inference_mode: huggingface` in {args.config} for a feature-limited experience)"
125 | try:
126 | r = requests.get(Model_Server + "/running")
127 | if r.status_code != 200:
128 | raise ValueError(message)
129 | except:
130 | raise ValueError(message)
131 |
132 |
133 | parse_task_demos_or_presteps = open(config["demos_or_presteps"]["parse_task"], "r").read()
134 | choose_model_demos_or_presteps = open(config["demos_or_presteps"]["choose_model"], "r").read()
135 | response_results_demos_or_presteps = open(config["demos_or_presteps"]["response_results"], "r").read()
136 |
137 | parse_task_prompt = config["prompt"]["parse_task"]
138 | choose_model_prompt = config["prompt"]["choose_model"]
139 | response_results_prompt = config["prompt"]["response_results"]
140 |
141 | parse_task_tprompt = config["tprompt"]["parse_task"]
142 | choose_model_tprompt = config["tprompt"]["choose_model"]
143 | response_results_tprompt = config["tprompt"]["response_results"]
144 |
145 | MODELS = [json.loads(line) for line in open("data/p0_models.jsonl", "r").readlines()]
146 | MODELS_MAP = {}
147 | for model in MODELS:
148 | tag = model["task"]
149 | if tag not in MODELS_MAP:
150 | MODELS_MAP[tag] = []
151 | MODELS_MAP[tag].append(model)
152 | METADATAS = {}
153 | for model in MODELS:
154 | METADATAS[model["id"]] = model
155 |
156 | HUGGINGFACE_HEADERS = {}
157 | if config["huggingface"]["token"] and config["huggingface"]["token"].startswith("hf_"): # Check for valid huggingface token in config file
158 | HUGGINGFACE_HEADERS = {
159 | "Authorization": f"Bearer {config['huggingface']['token']}",
160 | }
161 | elif "HUGGINGFACE_ACCESS_TOKEN" in os.environ and os.getenv("HUGGINGFACE_ACCESS_TOKEN").startswith("hf_"): # Check for environment variable HUGGINGFACE_ACCESS_TOKEN
162 | HUGGINGFACE_HEADERS = {
163 | "Authorization": f"Bearer {os.getenv('HUGGINGFACE_ACCESS_TOKEN')}",
164 | }
165 | else:
166 | raise ValueError(f"Incorrect HuggingFace token. Please check your {args.config} file.")
167 |
168 | def convert_chat_to_completion(data):
169 | messages = data.pop('messages', [])
170 | tprompt = ""
171 | if messages[0]['role'] == "system":
172 | tprompt = messages[0]['content']
173 | messages = messages[1:]
174 | final_prompt = ""
175 | for message in messages:
176 | if message['role'] == "user":
177 | final_prompt += ("<im_start>"+ "user" + "\n" + message['content'] + "<im_end>\n")
178 | elif message['role'] == "assistant":
179 | final_prompt += ("<im_start>"+ "assistant" + "\n" + message['content'] + "<im_end>\n")
180 | else:
181 | final_prompt += ("<im_start>"+ "system" + "\n" + message['content'] + "<im_end>\n")
182 | final_prompt = tprompt + final_prompt
183 | final_prompt = final_prompt + "<im_start>assistant"
184 | data["prompt"] = final_prompt
185 | data['stop'] = data.get('stop', ["<im_end>"])
186 | data['max_tokens'] = data.get('max_tokens', max(get_max_context_length(LLM) - count_tokens(LLM_encoding, final_prompt), 1))
187 | return data
188 |
189 | def send_request(data):
190 | api_key = data.pop("api_key")
191 | api_type = data.pop("api_type")
192 | api_endpoint = data.pop("api_endpoint")
193 | if use_completion:
194 | data = convert_chat_to_completion(data)
195 | if api_type == "openai":
196 | HEADER = {
197 | "Authorization": f"Bearer {api_key}"
198 | }
199 | elif api_type == "azure":
200 | HEADER = {
201 | "api-key": api_key,
202 | "Content-Type": "application/json"
203 | }
204 | else:
205 | HEADER = None
206 | response = requests.post(api_endpoint, json=data, headers=HEADER, proxies=PROXY)
207 | if "error" in response.json():
208 | return response.json()
209 | logger.debug(response.text.strip())
210 | if use_completion:
211 | return response.json()["choices"][0]["text"].strip()
212 | else:
213 | return response.json()["choices"][0]["message"]["content"].strip()
214 |
215 | def replace_slot(text, entries):
216 | for key, value in entries.items():
217 | if not isinstance(value, str):
218 | value = str(value)
219 | text = text.replace("{{" + key +"}}", value.replace('"', "'").replace('\n', ""))
220 | return text
221 |
222 | def find_json(s):
223 | s = s.replace("\'", "\"")
224 | start = s.find("{")
225 | end = s.rfind("}")
226 | res = s[start:end+1]
227 | res = res.replace("\n", "")
228 | return res
229 |
230 | def field_extract(s, field):
231 | try:
232 | field_rep = re.compile(f'{field}.*?:.*?"(.*?)"', re.IGNORECASE)
233 | extracted = field_rep.search(s).group(1).replace("\"", "\'")
234 | except:
235 |         field_rep = re.compile(f'{field}: *"(.*?)"', re.IGNORECASE)
236 | extracted = field_rep.search(s).group(1).replace("\"", "\'")
237 | return extracted
238 |
239 | def get_id_reason(choose_str):
240 | reason = field_extract(choose_str, "reason")
241 | id = field_extract(choose_str, "id")
242 | choose = {"id": id, "reason": reason}
243 | return id.strip(), reason.strip(), choose
244 |
245 | def record_case(success, **args):
246 | if success:
247 | f = open("logs/log_success.jsonl", "a")
248 | else:
249 | f = open("logs/log_fail.jsonl", "a")
250 | log = args
251 | f.write(json.dumps(log) + "\n")
252 | f.close()
253 |
254 | def image_to_bytes(img_url):
255 | img_byte = io.BytesIO()
256 | type = img_url.split(".")[-1]
257 | load_image(img_url).save(img_byte, format="png")
258 | img_data = img_byte.getvalue()
259 | return img_data
260 |
261 | def resource_has_dep(command):
262 | args = command["args"]
263 | for _, v in args.items():
264 |         if "<GENERATED>" in v:
265 | return True
266 | return False
267 |
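# Rebuild each task's "dep" list from <GENERATED>-<task id> placeholders found in its arguments;
# tasks that reference no generated resource get dep = [-1].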
268 | def fix_dep(tasks):
269 | for task in tasks:
270 | args = task["args"]
271 | task["dep"] = []
272 | for k, v in args.items():
273 |             if "<GENERATED>" in v:
274 | dep_task_id = int(v.split("-")[1])
275 | if dep_task_id not in task["dep"]:
276 | task["dep"].append(dep_task_id)
277 | if len(task["dep"]) == 0:
278 | task["dep"] = [-1]
279 | return tasks
280 |
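# If a single argument references several generated resources (e.g. "<GENERATED>-0,<GENERATED>-1"),
# split the task into one copy per resource so that every copy carries exactly one dependency.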
281 | def unfold(tasks):
282 | flag_unfold_task = False
283 | try:
284 | for task in tasks:
285 | for key, value in task["args"].items():
286 |             if "<GENERATED>" in value:
287 | generated_items = value.split(",")
288 | if len(generated_items) > 1:
289 | flag_unfold_task = True
290 | for item in generated_items:
291 | new_task = copy.deepcopy(task)
292 | dep_task_id = int(item.split("-")[1])
293 | new_task["dep"] = [dep_task_id]
294 | new_task["args"][key] = item
295 | tasks.append(new_task)
296 | tasks.remove(task)
297 | except Exception as e:
298 | print(e)
299 | traceback.print_exc()
300 | logger.debug("unfold task failed.")
301 |
302 | if flag_unfold_task:
303 | logger.debug(f"unfold tasks: {tasks}")
304 |
305 | return tasks
306 |
307 | def chitchat(messages, api_key, api_type, api_endpoint):
308 | data = {
309 | "model": LLM,
310 | "messages": messages,
311 | "api_key": api_key,
312 | "api_type": api_type,
313 | "api_endpoint": api_endpoint
314 | }
315 | return send_request(data)
316 |
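# Ask the LLM to parse the user request into a JSON task list, prepending the few-shot demos and
# dropping the oldest chat turns until the prompt leaves at least 800 tokens of headroom.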
317 | def parse_task(context, input, api_key, api_type, api_endpoint):
318 | demos_or_presteps = parse_task_demos_or_presteps
319 | messages = json.loads(demos_or_presteps)
320 | messages.insert(0, {"role": "system", "content": parse_task_tprompt})
321 |
322 | # cut chat logs
323 | start = 0
324 | while start <= len(context):
325 | history = context[start:]
326 | prompt = replace_slot(parse_task_prompt, {
327 | "input": input,
328 | "context": history
329 | })
330 | messages.append({"role": "user", "content": prompt})
331 |         history_text = "<im_end>\nuser<im_start>".join([m["content"] for m in messages])
332 | num = count_tokens(LLM_encoding, history_text)
333 | if get_max_context_length(LLM) - num > 800:
334 | break
335 | messages.pop()
336 | start += 2
337 |
338 | logger.debug(messages)
339 | data = {
340 | "model": LLM,
341 | "messages": messages,
342 | "temperature": 0,
343 | "logit_bias": {item: config["logit_bias"]["parse_task"] for item in task_parsing_highlight_ids},
344 | "api_key": api_key,
345 | "api_type": api_type,
346 | "api_endpoint": api_endpoint
347 | }
348 | return send_request(data)
349 |
350 | def choose_model(input, task, metas, api_key, api_type, api_endpoint):
351 | prompt = replace_slot(choose_model_prompt, {
352 | "input": input,
353 | "task": task,
354 | "metas": metas,
355 | })
356 | demos_or_presteps = replace_slot(choose_model_demos_or_presteps, {
357 | "input": input,
358 | "task": task,
359 | "metas": metas
360 | })
361 | messages = json.loads(demos_or_presteps)
362 | messages.insert(0, {"role": "system", "content": choose_model_tprompt})
363 | messages.append({"role": "user", "content": prompt})
364 | logger.debug(messages)
365 | data = {
366 | "model": LLM,
367 | "messages": messages,
368 | "temperature": 0,
369 | "logit_bias": {item: config["logit_bias"]["choose_model"] for item in choose_model_highlight_ids}, # 5
370 | "api_key": api_key,
371 | "api_type": api_type,
372 | "api_endpoint": api_endpoint
373 | }
374 | return send_request(data)
375 |
376 |
377 | def response_results(input, results, api_key, api_type, api_endpoint):
378 | results = [v for k, v in sorted(results.items(), key=lambda item: item[0])]
379 | prompt = replace_slot(response_results_prompt, {
380 | "input": input,
381 | })
382 | demos_or_presteps = replace_slot(response_results_demos_or_presteps, {
383 | "input": input,
384 | "processes": results
385 | })
386 | messages = json.loads(demos_or_presteps)
387 | messages.insert(0, {"role": "system", "content": response_results_tprompt})
388 | messages.append({"role": "user", "content": prompt})
389 | logger.debug(messages)
390 | data = {
391 | "model": LLM,
392 | "messages": messages,
393 | "temperature": 0,
394 | "api_key": api_key,
395 | "api_type": api_type,
396 | "api_endpoint": api_endpoint
397 | }
398 | return send_request(data)
399 |
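# Run a single task on the Hugging Face Inference API, dispatching on the task type (NLP, vision,
# audio); generated images and audio are written under public/ and returned as relative paths.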
400 | def huggingface_model_inference(model_id, data, task):
401 | task_url = f"https://api-inference.huggingface.co/models/{model_id}" # InferenceApi does not yet support some tasks
402 | inference = InferenceApi(repo_id=model_id, token=config["huggingface"]["token"])
403 |
404 | # NLP tasks
405 | if task == "question-answering":
406 | inputs = {"question": data["text"], "context": (data["context"] if "context" in data else "" )}
407 | result = inference(inputs)
408 | if task == "sentence-similarity":
409 | inputs = {"source_sentence": data["text1"], "target_sentence": data["text2"]}
410 | result = inference(inputs)
411 | if task in ["text-classification", "token-classification", "text2text-generation", "summarization", "translation", "conversational", "text-generation"]:
412 | inputs = data["text"]
413 | result = inference(inputs)
414 |
415 | # CV tasks
416 | if task == "visual-question-answering" or task == "document-question-answering":
417 | img_url = data["image"]
418 | text = data["text"]
419 | img_data = image_to_bytes(img_url)
420 | img_base64 = base64.b64encode(img_data).decode("utf-8")
421 | json_data = {}
422 | json_data["inputs"] = {}
423 | json_data["inputs"]["question"] = text
424 | json_data["inputs"]["image"] = img_base64
425 | result = requests.post(task_url, headers=HUGGINGFACE_HEADERS, json=json_data).json()
426 | # result = inference(inputs) # not support
427 |
428 | if task == "image-to-image":
429 | img_url = data["image"]
430 | img_data = image_to_bytes(img_url)
431 | # result = inference(data=img_data) # not support
432 | HUGGINGFACE_HEADERS["Content-Length"] = str(len(img_data))
433 | r = requests.post(task_url, headers=HUGGINGFACE_HEADERS, data=img_data)
434 | result = r.json()
435 | if "path" in result:
436 | result["generated image"] = result.pop("path")
437 |
438 | if task == "text-to-image":
439 | inputs = data["text"]
440 | img = inference(inputs)
441 | name = str(uuid.uuid4())[:4]
442 | img.save(f"public/images/{name}.png")
443 | result = {}
444 | result["generated image"] = f"/images/{name}.png"
445 |
446 | if task == "image-segmentation":
447 | img_url = data["image"]
448 | img_data = image_to_bytes(img_url)
449 | image = Image.open(BytesIO(img_data))
450 | predicted = inference(data=img_data)
451 | colors = []
452 | for i in range(len(predicted)):
453 | colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 155))
454 | for i, pred in enumerate(predicted):
455 | label = pred["label"]
456 | mask = pred.pop("mask").encode("utf-8")
457 | mask = base64.b64decode(mask)
458 | mask = Image.open(BytesIO(mask), mode='r')
459 | mask = mask.convert('L')
460 |
461 | layer = Image.new('RGBA', mask.size, colors[i])
462 | image.paste(layer, (0, 0), mask)
463 | name = str(uuid.uuid4())[:4]
464 | image.save(f"public/images/{name}.jpg")
465 | result = {}
466 | result["generated image"] = f"/images/{name}.jpg"
467 | result["predicted"] = predicted
468 |
469 | if task == "object-detection":
470 | img_url = data["image"]
471 | img_data = image_to_bytes(img_url)
472 | predicted = inference(data=img_data)
473 | image = Image.open(BytesIO(img_data))
474 | draw = ImageDraw.Draw(image)
475 | labels = list(item['label'] for item in predicted)
476 | color_map = {}
477 | for label in labels:
478 | if label not in color_map:
479 | color_map[label] = (random.randint(0, 255), random.randint(0, 100), random.randint(0, 255))
480 | for label in predicted:
481 | box = label["box"]
482 | draw.rectangle(((box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])), outline=color_map[label["label"]], width=2)
483 | draw.text((box["xmin"]+5, box["ymin"]-15), label["label"], fill=color_map[label["label"]])
484 | name = str(uuid.uuid4())[:4]
485 | image.save(f"public/images/{name}.jpg")
486 | result = {}
487 | result["generated image"] = f"/images/{name}.jpg"
488 | result["predicted"] = predicted
489 |
490 | if task in ["image-classification"]:
491 | img_url = data["image"]
492 | img_data = image_to_bytes(img_url)
493 | result = inference(data=img_data)
494 |
495 | if task == "image-to-text":
496 | img_url = data["image"]
497 | img_data = image_to_bytes(img_url)
498 | HUGGINGFACE_HEADERS["Content-Length"] = str(len(img_data))
499 | r = requests.post(task_url, headers=HUGGINGFACE_HEADERS, data=img_data, proxies=PROXY)
500 | result = {}
501 | if "generated_text" in r.json()[0]:
502 | result["generated text"] = r.json()[0].pop("generated_text")
503 |
504 | # AUDIO tasks
505 | if task == "text-to-speech":
506 | inputs = data["text"]
507 | response = inference(inputs, raw_response=True)
508 | # response = requests.post(task_url, headers=HUGGINGFACE_HEADERS, json={"inputs": text})
509 | name = str(uuid.uuid4())[:4]
510 | with open(f"public/audios/{name}.flac", "wb") as f:
511 | f.write(response.content)
512 | result = {"generated audio": f"/audios/{name}.flac"}
513 | if task in ["automatic-speech-recognition", "audio-to-audio", "audio-classification"]:
514 | audio_url = data["audio"]
515 | audio_data = requests.get(audio_url, timeout=10).content
516 | response = inference(data=audio_data, raw_response=True)
517 | result = response.json()
518 | if task == "audio-to-audio":
519 | content = None
520 | type = None
521 | for k, v in result[0].items():
522 | if k == "blob":
523 | content = base64.b64decode(v.encode("utf-8"))
524 | if k == "content-type":
525 | type = "audio/flac".split("/")[-1]
526 | audio = AudioSegment.from_file(BytesIO(content))
527 | name = str(uuid.uuid4())[:4]
528 | audio.export(f"public/audios/{name}.{type}", format=type)
529 | result = {"generated audio": f"/audios/{name}.{type}"}
530 | return result
531 |
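# Same dispatch as above, but against the locally deployed model server (Model_Server); most routes
# return a JSON payload whose "path" field points at the generated file.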
532 | def local_model_inference(model_id, data, task):
533 | task_url = f"{Model_Server}/models/{model_id}"
534 |
535 |     # controlnet
536 | if model_id.startswith("lllyasviel/sd-controlnet-"):
537 | img_url = data["image"]
538 | text = data["text"]
539 | response = requests.post(task_url, json={"img_url": img_url, "text": text})
540 | results = response.json()
541 | if "path" in results:
542 | results["generated image"] = results.pop("path")
543 | return results
544 | if model_id.endswith("-control"):
545 | img_url = data["image"]
546 | response = requests.post(task_url, json={"img_url": img_url})
547 | results = response.json()
548 | if "path" in results:
549 | results["generated image"] = results.pop("path")
550 | return results
551 |
552 | if task == "text-to-video":
553 | response = requests.post(task_url, json=data)
554 | results = response.json()
555 | if "path" in results:
556 | results["generated video"] = results.pop("path")
557 | return results
558 |
559 | # NLP tasks
560 | if task == "question-answering" or task == "sentence-similarity":
561 | response = requests.post(task_url, json=data)
562 | return response.json()
563 | if task in ["text-classification", "token-classification", "text2text-generation", "summarization", "translation", "conversational", "text-generation"]:
564 | response = requests.post(task_url, json=data)
565 | return response.json()
566 |
567 | # CV tasks
568 | if task == "depth-estimation":
569 | img_url = data["image"]
570 | response = requests.post(task_url, json={"img_url": img_url})
571 | results = response.json()
572 | if "path" in results:
573 | results["generated image"] = results.pop("path")
574 | return results
575 | if task == "image-segmentation":
576 | img_url = data["image"]
577 | response = requests.post(task_url, json={"img_url": img_url})
578 | results = response.json()
579 | results["generated image"] = results.pop("path")
580 | return results
581 | if task == "image-to-image":
582 | img_url = data["image"]
583 | response = requests.post(task_url, json={"img_url": img_url})
584 | results = response.json()
585 | if "path" in results:
586 | results["generated image"] = results.pop("path")
587 | return results
588 | if task == "text-to-image":
589 | response = requests.post(task_url, json=data)
590 | results = response.json()
591 | if "path" in results:
592 | results["generated image"] = results.pop("path")
593 | return results
594 | if task == "object-detection":
595 | img_url = data["image"]
596 | response = requests.post(task_url, json={"img_url": img_url})
597 | predicted = response.json()
598 | if "error" in predicted:
599 | return predicted
600 | image = load_image(img_url)
601 | draw = ImageDraw.Draw(image)
602 | labels = list(item['label'] for item in predicted)
603 | color_map = {}
604 | for label in labels:
605 | if label not in color_map:
606 | color_map[label] = (random.randint(0, 255), random.randint(0, 100), random.randint(0, 255))
607 | for label in predicted:
608 | box = label["box"]
609 | draw.rectangle(((box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])), outline=color_map[label["label"]], width=2)
610 | draw.text((box["xmin"]+5, box["ymin"]-15), label["label"], fill=color_map[label["label"]])
611 | name = str(uuid.uuid4())[:4]
612 | image.save(f"public/images/{name}.jpg")
613 | results = {}
614 | results["generated image"] = f"/images/{name}.jpg"
615 | results["predicted"] = predicted
616 | return results
617 | if task in ["image-classification", "image-to-text", "document-question-answering", "visual-question-answering"]:
618 | img_url = data["image"]
619 | text = None
620 | if "text" in data:
621 | text = data["text"]
622 | response = requests.post(task_url, json={"img_url": img_url, "text": text})
623 | results = response.json()
624 | return results
625 | # AUDIO tasks
626 | if task == "text-to-speech":
627 | response = requests.post(task_url, json=data)
628 | results = response.json()
629 | if "path" in results:
630 | results["generated audio"] = results.pop("path")
631 | return results
632 | if task in ["automatic-speech-recognition", "audio-to-audio", "audio-classification"]:
633 | audio_url = data["audio"]
634 | response = requests.post(task_url, json={"audio_url": audio_url})
635 | return response.json()
636 |
637 |
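# Decide where to run the model: when hosted_on is "unknown", probe the local server first, then the
# Hugging Face status endpoint. Note that if neither endpoint reports the model as loaded, hosted_on
# stays "unknown" and the final return fails because inference_result is never assigned.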
638 | def model_inference(model_id, data, hosted_on, task):
639 | if hosted_on == "unknown":
640 | localStatusUrl = f"{Model_Server}/status/{model_id}"
641 | r = requests.get(localStatusUrl)
642 | logger.debug("Local Server Status: " + str(r.json()))
643 | if r.status_code == 200 and "loaded" in r.json() and r.json()["loaded"]:
644 | hosted_on = "local"
645 | else:
646 | huggingfaceStatusUrl = f"https://api-inference.huggingface.co/status/{model_id}"
647 | r = requests.get(huggingfaceStatusUrl, headers=HUGGINGFACE_HEADERS, proxies=PROXY)
648 | logger.debug("Huggingface Status: " + str(r.json()))
649 | if r.status_code == 200 and "loaded" in r.json() and r.json()["loaded"]:
650 | hosted_on = "huggingface"
651 | try:
652 | if hosted_on == "local":
653 | inference_result = local_model_inference(model_id, data, task)
654 | elif hosted_on == "huggingface":
655 | inference_result = huggingface_model_inference(model_id, data, task)
656 | except Exception as e:
657 | print(e)
658 | traceback.print_exc()
659 | inference_result = {"error":{"message": str(e)}}
660 | return inference_result
661 |
662 |
663 | def get_model_status(model_id, url, headers, queue = None):
664 | endpoint_type = "huggingface" if "huggingface" in url else "local"
665 | if "huggingface" in url:
666 | r = requests.get(url, headers=headers, proxies=PROXY)
667 | else:
668 | r = requests.get(url)
669 | if r.status_code == 200 and "loaded" in r.json() and r.json()["loaded"]:
670 | if queue:
671 | queue.put((model_id, True, endpoint_type))
672 | return True
673 | else:
674 | if queue:
675 | queue.put((model_id, False, None))
676 | return False
677 |
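# Probe every candidate model in parallel (one thread per endpoint) and collect up to topk models
# that report "loaded", grouped by whether they are served locally or on Hugging Face.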
678 | def get_avaliable_models(candidates, topk=5):
679 | all_available_models = {"local": [], "huggingface": []}
680 | threads = []
681 | result_queue = Queue()
682 |
683 | for candidate in candidates:
684 | model_id = candidate["id"]
685 |
686 | if inference_mode != "local":
687 | huggingfaceStatusUrl = f"https://api-inference.huggingface.co/status/{model_id}"
688 | thread = threading.Thread(target=get_model_status, args=(model_id, huggingfaceStatusUrl, HUGGINGFACE_HEADERS, result_queue))
689 | threads.append(thread)
690 | thread.start()
691 |
692 | if inference_mode != "huggingface" and config["local_deployment"] != "minimal":
693 | localStatusUrl = f"{Model_Server}/status/{model_id}"
694 | thread = threading.Thread(target=get_model_status, args=(model_id, localStatusUrl, {}, result_queue))
695 | threads.append(thread)
696 | thread.start()
697 |
698 | result_count = len(threads)
699 | while result_count:
700 | model_id, status, endpoint_type = result_queue.get()
701 |         if status and model_id not in all_available_models["local"] + all_available_models["huggingface"]:
702 | all_available_models[endpoint_type].append(model_id)
703 | if len(all_available_models["local"] + all_available_models["huggingface"]) >= topk:
704 | break
705 | result_count -= 1
706 |
707 | for thread in threads:
708 | thread.join()
709 |
710 | return all_available_models
711 |
712 | def collect_result(command, choose, inference_result):
713 | result = {"task": command}
714 | result["inference result"] = inference_result
715 | result["choose model result"] = choose
716 | logger.debug(f"inference result: {inference_result}")
717 | return result
718 |
719 |
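# Execute one parsed task: resolve <GENERATED> references against earlier results, pick a model
# (ControlNet locally, ChatGPT for pure text tasks, otherwise let the LLM choose among the available
# candidates), run inference, and store the collected result in the shared results dict.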
720 | def run_task(input, command, results, api_key, api_type, api_endpoint):
721 | id = command["id"]
722 | args = command["args"]
723 | task = command["task"]
724 | deps = command["dep"]
725 | if deps[0] != -1:
726 | dep_tasks = [results[dep] for dep in deps]
727 | else:
728 | dep_tasks = []
729 |
730 | logger.debug(f"Run task: {id} - {task}")
731 | logger.debug("Deps: " + json.dumps(dep_tasks))
732 |
733 | if deps[0] != -1:
734 |         if "image" in args and "<GENERATED>-" in args["image"]:
735 |             resource_id = int(args["image"].split("-")[1])
736 |             if "generated image" in results[resource_id]["inference result"]:
737 |                 args["image"] = results[resource_id]["inference result"]["generated image"]
738 |         if "audio" in args and "<GENERATED>-" in args["audio"]:
739 |             resource_id = int(args["audio"].split("-")[1])
740 |             if "generated audio" in results[resource_id]["inference result"]:
741 |                 args["audio"] = results[resource_id]["inference result"]["generated audio"]
742 |         if "text" in args and "<GENERATED>-" in args["text"]:
743 |             resource_id = int(args["text"].split("-")[1])
744 |             if "generated text" in results[resource_id]["inference result"]:
745 |                 args["text"] = results[resource_id]["inference result"]["generated text"]
746 |
747 | text = image = audio = None
748 | for dep_task in dep_tasks:
749 | if "generated text" in dep_task["inference result"]:
750 | text = dep_task["inference result"]["generated text"]
751 | logger.debug("Detect the generated text of dependency task (from results):" + text)
752 | elif "text" in dep_task["task"]["args"]:
753 | text = dep_task["task"]["args"]["text"]
754 | logger.debug("Detect the text of dependency task (from args): " + text)
755 | if "generated image" in dep_task["inference result"]:
756 | image = dep_task["inference result"]["generated image"]
757 | logger.debug("Detect the generated image of dependency task (from results): " + image)
758 | elif "image" in dep_task["task"]["args"]:
759 | image = dep_task["task"]["args"]["image"]
760 | logger.debug("Detect the image of dependency task (from args): " + image)
761 | if "generated audio" in dep_task["inference result"]:
762 | audio = dep_task["inference result"]["generated audio"]
763 | logger.debug("Detect the generated audio of dependency task (from results): " + audio)
764 | elif "audio" in dep_task["task"]["args"]:
765 | audio = dep_task["task"]["args"]["audio"]
766 | logger.debug("Detect the audio of dependency task (from args): " + audio)
767 |
768 |     if "image" in args and "<GENERATED>" in args["image"]:
769 |         if image:
770 |             args["image"] = image
771 |     if "audio" in args and "<GENERATED>" in args["audio"]:
772 |         if audio:
773 |             args["audio"] = audio
774 |     if "text" in args and "<GENERATED>" in args["text"]:
775 |         if text:
776 |             args["text"] = text
777 |
778 | for resource in ["image", "audio"]:
779 | if resource in args and not args[resource].startswith("public/") and len(args[resource]) > 0 and not args[resource].startswith("http"):
780 | args[resource] = f"public/{args[resource]}"
781 |
782 | if "-text-to-image" in command['task'] and "text" not in args:
783 | logger.debug("control-text-to-image task, but text is empty, so we use control-generation instead.")
784 | control = task.split("-")[0]
785 |
786 | if control == "seg":
787 | task = "image-segmentation"
788 | command['task'] = task
789 | elif control == "depth":
790 | task = "depth-estimation"
791 | command['task'] = task
792 | else:
793 | task = f"{control}-control"
794 |
795 | command["args"] = args
796 | logger.debug(f"parsed task: {command}")
797 |
798 | if task.endswith("-text-to-image") or task.endswith("-control"):
799 | if inference_mode != "huggingface":
800 | if task.endswith("-text-to-image"):
801 | control = task.split("-")[0]
802 | best_model_id = f"lllyasviel/sd-controlnet-{control}"
803 | else:
804 | best_model_id = task
805 | hosted_on = "local"
806 | reason = "ControlNet is the best model for this task."
807 | choose = {"id": best_model_id, "reason": reason}
808 | logger.debug(f"chosen model: {choose}")
809 | else:
810 |             logger.warning(f"Task {command['task']} is not available. ControlNet needs to be deployed locally.")
811 |             record_case(success=False, **{"input": input, "task": command, "reason": f"Task {command['task']} is not available. ControlNet needs to be deployed locally.", "op":"message"})
812 |             inference_result = {"error": "Service related to ControlNet is not available."}
813 | results[id] = collect_result(command, "", inference_result)
814 | return False
815 | elif task in ["summarization", "translation", "conversational", "text-generation", "text2text-generation"]: # ChatGPT Can do
816 | best_model_id = "ChatGPT"
817 | reason = "ChatGPT performs well on some NLP tasks as well."
818 | choose = {"id": best_model_id, "reason": reason}
819 | messages = [{
820 | "role": "user",
821 | "content": f"[ {input} ] contains a task in JSON format {command}. Now you are a {command['task']} system, the arguments are {command['args']}. Just help me do {command['task']} and give me the result. The result must be in text form without any urls."
822 | }]
823 | response = chitchat(messages, api_key, api_type, api_endpoint)
824 | results[id] = collect_result(command, choose, {"response": response})
825 | return True
826 | else:
827 | if task not in MODELS_MAP:
828 | logger.warning(f"no available models on {task} task.")
829 | record_case(success=False, **{"input": input, "task": command, "reason": f"task not support: {command['task']}", "op":"message"})
830 | inference_result = {"error": f"{command['task']} not found in available tasks."}
831 | results[id] = collect_result(command, "", inference_result)
832 | return False
833 |
834 | candidates = MODELS_MAP[task][:10]
835 | all_avaliable_models = get_avaliable_models(candidates, config["num_candidate_models"])
836 | all_avaliable_model_ids = all_avaliable_models["local"] + all_avaliable_models["huggingface"]
837 | logger.debug(f"avaliable models on {command['task']}: {all_avaliable_models}")
838 |
839 | if len(all_avaliable_model_ids) == 0:
840 | logger.warning(f"no available models on {command['task']}")
841 | record_case(success=False, **{"input": input, "task": command, "reason": f"no available models: {command['task']}", "op":"message"})
842 | inference_result = {"error": f"no available models on {command['task']} task."}
843 | results[id] = collect_result(command, "", inference_result)
844 | return False
845 |
846 | if len(all_avaliable_model_ids) == 1:
847 | best_model_id = all_avaliable_model_ids[0]
848 | hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface"
849 | reason = "Only one model available."
850 | choose = {"id": best_model_id, "reason": reason}
851 | logger.debug(f"chosen model: {choose}")
852 | else:
853 | cand_models_info = [
854 | {
855 | "id": model["id"],
856 | "inference endpoint": all_avaliable_models.get(
857 | "local" if model["id"] in all_avaliable_models["local"] else "huggingface"
858 | ),
859 | "likes": model.get("likes"),
860 | "description": model.get("description", "")[:config["max_description_length"]],
861 | # "language": model.get("meta").get("language") if model.get("meta") else None,
862 | "tags": model.get("meta").get("tags") if model.get("meta") else None,
863 | }
864 | for model in candidates
865 | if model["id"] in all_avaliable_model_ids
866 | ]
867 |
868 | choose_str = choose_model(input, command, cand_models_info, api_key, api_type, api_endpoint)
869 | logger.debug(f"chosen model: {choose_str}")
870 | try:
871 | choose = json.loads(choose_str)
872 | reason = choose["reason"]
873 | best_model_id = choose["id"]
874 | hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface"
875 | except Exception as e:
876 | logger.warning(f"the response [ {choose_str} ] is not a valid JSON, try to find the model id and reason in the response.")
877 | choose_str = find_json(choose_str)
878 | best_model_id, reason, choose = get_id_reason(choose_str)
879 | hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface"
880 | inference_result = model_inference(best_model_id, args, hosted_on, command['task'])
881 |
882 | if "error" in inference_result:
883 | logger.warning(f"Inference error: {inference_result['error']}")
884 | record_case(success=False, **{"input": input, "task": command, "reason": f"inference error: {inference_result['error']}", "op":"message"})
885 | results[id] = collect_result(command, choose, inference_result)
886 | return False
887 |
888 | results[id] = collect_result(command, choose, inference_result)
889 | return True
890 |
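# Top-level pipeline: parse the request into tasks, launch every task whose dependencies are already
# satisfied on its own thread (the outer while loop retries tasks skipped while the list is being
# mutated), and finally ask the LLM to turn the collected results into a natural-language reply.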
891 | def chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning = False, return_results = False):
892 | start = time.time()
893 | context = messages[:-1]
894 | input = messages[-1]["content"]
895 | logger.info("*"*80)
896 | logger.info(f"input: {input}")
897 |
898 | task_str = parse_task(context, input, api_key, api_type, api_endpoint)
899 |
900 | if "error" in task_str:
901 | record_case(success=False, **{"input": input, "task": task_str, "reason": f"task parsing error: {task_str['error']['message']}", "op":"report message"})
902 | return {"message": task_str["error"]["message"]}
903 |
904 | task_str = task_str.strip()
905 | logger.info(task_str)
906 |
907 | try:
908 | tasks = json.loads(task_str)
909 | except Exception as e:
910 | logger.debug(e)
911 | response = chitchat(messages, api_key, api_type, api_endpoint)
912 | record_case(success=False, **{"input": input, "task": task_str, "reason": "task parsing fail", "op":"chitchat"})
913 | return {"message": response}
914 |
915 | if task_str == "[]": # using LLM response for empty task
916 | record_case(success=False, **{"input": input, "task": [], "reason": "task parsing fail: empty", "op": "chitchat"})
917 | response = chitchat(messages, api_key, api_type, api_endpoint)
918 | return {"message": response}
919 |
920 | if len(tasks) == 1 and tasks[0]["task"] in ["summarization", "translation", "conversational", "text-generation", "text2text-generation"]:
921 | record_case(success=True, **{"input": input, "task": tasks, "reason": "chitchat tasks", "op": "chitchat"})
922 | response = chitchat(messages, api_key, api_type, api_endpoint)
923 | return {"message": response}
924 |
925 | tasks = unfold(tasks)
926 | tasks = fix_dep(tasks)
927 | logger.debug(tasks)
928 |
929 | if return_planning:
930 | return tasks
931 |
932 | results = {}
933 | threads = []
934 | tasks = tasks[:]
935 | d = dict()
936 | retry = 0
937 | while True:
938 | num_thread = len(threads)
939 | for task in tasks:
940 | # logger.debug(f"d.keys(): {d.keys()}, dep: {dep}")
941 | for dep_id in task["dep"]:
942 | if dep_id >= task["id"]:
943 | task["dep"] = [-1]
944 | break
945 | dep = task["dep"]
946 | if dep[0] == -1 or len(list(set(dep).intersection(d.keys()))) == len(dep):
947 | tasks.remove(task)
948 | thread = threading.Thread(target=run_task, args=(input, task, d, api_key, api_type, api_endpoint))
949 | thread.start()
950 | threads.append(thread)
951 | if num_thread == len(threads):
952 | time.sleep(0.5)
953 | retry += 1
954 | if retry > 160:
955 |                 logger.debug("User has waited too long; breaking out of the scheduling loop.")
956 | break
957 | if len(tasks) == 0:
958 | break
959 | for thread in threads:
960 | thread.join()
961 |
962 | results = d.copy()
963 |
964 | logger.debug(results)
965 | if return_results:
966 | return results
967 |
968 | response = response_results(input, results, api_key, api_type, api_endpoint).strip()
969 |
970 | end = time.time()
971 | during = end - start
972 |
973 | answer = {"message": response}
974 | record_case(success=True, **{"input": input, "task": task_str, "results": results, "response": response, "during": during, "op":"response"})
975 | logger.info(f"response: {response}")
976 | return answer
977 |
978 | def test():
979 | # single round examples
980 | inputs = [
981 |         "Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras are in these pictures?",
982 | "Can you give me a picture of a small bird flying in the sky with trees and clouds. Generate a high definition image if possible.",
983 | "Please answer all the named entities in the sentence: Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby.",
984 |         "please dub for me: 'Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby.'",
985 | "Given an image: https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg, please answer the question: What is on top of the building?",
986 | "Please generate a canny image based on /examples/f.jpg"
987 | ]
988 |
989 | for input in inputs:
990 | messages = [{"role": "user", "content": input}]
991 | chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning = False, return_results = False)
992 |
993 | # multi rounds example
994 | messages = [
995 | {"role": "user", "content": "Please generate a canny image based on /examples/f.jpg"},
996 | {"role": "assistant", "content": """Sure. I understand your request. Based on the inference results of the models, I have generated a canny image for you. The workflow I used is as follows: First, I used the image-to-text model (nlpconnect/vit-gpt2-image-captioning) to convert the image /examples/f.jpg to text. The generated text is "a herd of giraffes and zebras grazing in a field". Second, I used the canny-control model (canny-control) to generate a canny image from the text. Unfortunately, the model failed to generate the canny image. Finally, I used the canny-text-to-image model (lllyasviel/sd-controlnet-canny) to generate a canny image from the text. The generated image is located at /images/f16d.png. I hope this answers your request. Is there anything else I can help you with?"""},
997 | {"role": "user", "content": """then based on the above canny image and a prompt "a photo of a zoo", generate a new image."""},
998 | ]
999 | chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning = False, return_results = False)
1000 |
1001 | def cli():
1002 | messages = []
1003 | print("Welcome to Jarvis! A collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors. Jarvis can plan tasks, schedule Hugging Face models, generate friendly responses based on your requests, and help you with many things. Please enter your request (`exit` to exit).")
1004 | while True:
1005 | message = input("[ User ]: ")
1006 | if message == "exit":
1007 | break
1008 | messages.append({"role": "user", "content": message})
1009 | answer = chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning=False, return_results=False)
1010 | print("[ Jarvis ]: ", answer["message"])
1011 | messages.append({"role": "assistant", "content": answer["message"]})
1012 |
1013 |
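# HTTP mode: /tasks returns only the planned task list, /results returns the raw execution results,
# and /hugginggpt returns the final natural-language response.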
1014 | def server():
1015 | http_listen = config["http_listen"]
1016 | host = http_listen["host"]
1017 | port = http_listen["port"]
1018 |
1019 | app = flask.Flask(__name__, static_folder="public", static_url_path="/")
1020 | app.config['DEBUG'] = False
1021 | CORS(app)
1022 |
1023 | @cross_origin()
1024 | @app.route('/tasks', methods=['POST'])
1025 | def tasks():
1026 | data = request.get_json()
1027 | messages = data["messages"]
1028 | api_key = data.get("api_key", API_KEY)
1029 | api_endpoint = data.get("api_endpoint", API_ENDPOINT)
1030 | api_type = data.get("api_type", API_TYPE)
1031 | if api_key is None or api_type is None or api_endpoint is None:
1032 | return jsonify({"error": "Please provide api_key, api_type and api_endpoint"})
1033 | response = chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning=True)
1034 | return jsonify(response)
1035 |
1036 | @cross_origin()
1037 | @app.route('/results', methods=['POST'])
1038 | def results():
1039 | data = request.get_json()
1040 | messages = data["messages"]
1041 | api_key = data.get("api_key", API_KEY)
1042 | api_endpoint = data.get("api_endpoint", API_ENDPOINT)
1043 | api_type = data.get("api_type", API_TYPE)
1044 | if api_key is None or api_type is None or api_endpoint is None:
1045 | return jsonify({"error": "Please provide api_key, api_type and api_endpoint"})
1046 | response = chat_huggingface(messages, api_key, api_type, api_endpoint, return_results=True)
1047 | return jsonify(response)
1048 |
1049 | @cross_origin()
1050 | @app.route('/hugginggpt', methods=['POST'])
1051 | def chat():
1052 | data = request.get_json()
1053 | messages = data["messages"]
1054 | api_key = data.get("api_key", API_KEY)
1055 | api_endpoint = data.get("api_endpoint", API_ENDPOINT)
1056 | api_type = data.get("api_type", API_TYPE)
1057 | if api_key is None or api_type is None or api_endpoint is None:
1058 | return jsonify({"error": "Please provide api_key, api_type and api_endpoint"})
1059 | response = chat_huggingface(messages, api_key, api_type, api_endpoint)
1060 | return jsonify(response)
1061 | print("server running...")
1062 | waitress.serve(app, host=host, port=port)
1063 |
1064 | if __name__ == "__main__":
1065 | if args.mode == "test":
1066 | test()
1067 | elif args.mode == "server":
1068 | server()
1069 | elif args.mode == "cli":
1070 | cli()
1071 |
--------------------------------------------------------------------------------