├── web ├── env.d.ts ├── public │ ├── favicon.ico │ ├── audio.svg │ └── video.svg ├── src │ ├── assets │ │ ├── favicon.ico │ │ ├── clip.svg │ │ ├── switch.svg │ │ ├── tailwind.css │ │ ├── audio.svg │ │ ├── setting.svg │ │ ├── chatgpt.svg │ │ ├── huggingface.svg │ │ └── logo.svg │ ├── App.vue │ ├── main.ts │ ├── router │ │ └── index.ts │ ├── config │ │ └── index.ts │ ├── types │ │ └── index.ts │ ├── api │ │ ├── hugginggpt.ts │ │ └── chatgpt.ts │ ├── components │ │ └── Loading.vue │ ├── prompt │ │ └── index.ts │ └── views │ │ └── home.vue ├── postcss.config.js ├── electron │ ├── .npmrc │ ├── package.json │ ├── electron-builder.yml │ ├── preload.js │ └── main.js ├── tailwind.config.js ├── tsconfig.config.json ├── tsconfig.json ├── index.html ├── vite.config.ts └── package.json ├── assets ├── overview.jpg ├── prompt_flow.jpg ├── screenshot_a.jpg └── screenshot_q.jpg ├── server ├── public │ └── examples │ │ ├── a.jpg │ │ ├── b.jpg │ │ ├── c.jpg │ │ ├── d.jpg │ │ ├── e.jpg │ │ ├── f.jpg │ │ └── g.jpg ├── demos │ ├── demo_choose_model.json │ ├── demo_response_results.json │ └── demo_parse_task.json ├── requirements.txt ├── models │ ├── download.sh │ └── download.ps1 ├── get_token_ids.py ├── configs │ ├── config.gradio.yaml │ ├── config.lite.yaml │ ├── config.azure.yaml │ └── config.default.yaml ├── run_gradio_demo.py ├── models_server.py └── awesome_chat.py ├── CODE_OF_CONDUCT.md ├── LICENSE ├── CITATION.cff ├── SUPPORT.md ├── CONTRIBUTING.md ├── SECURITY.md ├── .gitignore └── README.md /web/env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | -------------------------------------------------------------------------------- /assets/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/assets/overview.jpg -------------------------------------------------------------------------------- /assets/prompt_flow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/assets/prompt_flow.jpg -------------------------------------------------------------------------------- /assets/screenshot_a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/assets/screenshot_a.jpg -------------------------------------------------------------------------------- /assets/screenshot_q.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/assets/screenshot_q.jpg -------------------------------------------------------------------------------- /web/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/web/public/favicon.ico -------------------------------------------------------------------------------- /web/src/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/web/src/assets/favicon.ico -------------------------------------------------------------------------------- /server/public/examples/a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/server/public/examples/a.jpg 
-------------------------------------------------------------------------------- /server/public/examples/b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/server/public/examples/b.jpg -------------------------------------------------------------------------------- /server/public/examples/c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/server/public/examples/c.jpg -------------------------------------------------------------------------------- /server/public/examples/d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/server/public/examples/d.jpg -------------------------------------------------------------------------------- /server/public/examples/e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/server/public/examples/e.jpg -------------------------------------------------------------------------------- /server/public/examples/f.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/server/public/examples/f.jpg -------------------------------------------------------------------------------- /server/public/examples/g.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekryski/JARVIS/main/server/public/examples/g.jpg -------------------------------------------------------------------------------- /web/postcss.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | } 7 | -------------------------------------------------------------------------------- /web/src/App.vue: -------------------------------------------------------------------------------- 1 | 3 | 4 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /web/electron/.npmrc: -------------------------------------------------------------------------------- 1 | registry=https://registry.npmmirror.com 2 | electron_mirror=https://npmmirror.com/mirrors/electron/ 3 | chromedriver_cdnurl=https://npmmirror.com/mirrors/chromedriver 4 | -------------------------------------------------------------------------------- /server/demos/demo_choose_model.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "role": "user", 4 | "content": "{{input}}" 5 | }, 6 | { 7 | "role": "assistant", 8 | "content": "{{task}}" 9 | } 10 | ] -------------------------------------------------------------------------------- /web/src/main.ts: -------------------------------------------------------------------------------- 1 | import { createApp } from "vue"; 2 | import App from "./App.vue"; 3 | import router from "./router"; 4 | import "./assets/tailwind.css"; 5 | 6 | const app = createApp(App); 7 | 8 | app.use(router).mount("#app"); 9 | -------------------------------------------------------------------------------- /web/tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | module.exports = { 3 | content: ["./index.html", 
"./src/**/*.{vue,js,ts,jsx,tsx}"], 4 | theme: { 5 | extend: {}, 6 | }, 7 | plugins: [], 8 | } 9 | -------------------------------------------------------------------------------- /web/tsconfig.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@vue/tsconfig/tsconfig.node.json", 3 | "include": ["vite.config.*", "vitest.config.*", "cypress.config.*"], 4 | "compilerOptions": { 5 | "composite": true, 6 | "types": ["node"] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /web/electron/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "chatgpt", 3 | "version": "1.0.0", 4 | "main": "main.js", 5 | "scripts": { 6 | "dev": "electron .", 7 | "build": "electron-builder" 8 | }, 9 | "devDependencies": { 10 | "electron": "^23.1.0", 11 | "electron-builder": "^23.6.0" 12 | } 13 | } -------------------------------------------------------------------------------- /web/src/assets/clip.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /web/src/router/index.ts: -------------------------------------------------------------------------------- 1 | import { createRouter, createWebHashHistory } from "vue-router"; 2 | 3 | const router = createRouter({ 4 | history: createWebHashHistory(import.meta.env.BASE_URL), 5 | routes: [ 6 | { 7 | path: "/", 8 | name: "home", 9 | component: () => import("@/views/home.vue"), 10 | }, 11 | ], 12 | }); 13 | 14 | export default router; 15 | -------------------------------------------------------------------------------- /server/demos/demo_response_results.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "role": "user", 4 | "content": "{{input}}" 5 | }, 6 | { 7 | "role": "assistant", 8 | "content": "Before give you a response, I want to introduce my workflow for your request, which is shown in the following JSON data: {{processes}}. Do you have any demands regarding my response?" 
9 | } 10 | ] -------------------------------------------------------------------------------- /web/electron/electron-builder.yml: -------------------------------------------------------------------------------- 1 | appId: com.chatgpt.app 2 | productName: ChatGPT 3 | directories: 4 | output: ../electron-dist 5 | mac: 6 | category: public.app-category.productivity 7 | target: 8 | - target: dmg 9 | arch: 10 | - arm64 11 | - x64 12 | win: 13 | target: 14 | - target: nsis 15 | arch: 16 | - x64 17 | - ia32 18 | -------------------------------------------------------------------------------- /web/src/config/index.ts: -------------------------------------------------------------------------------- 1 | const HUGGINGGPT_BASE_URL = "http://localhost:8004" 2 | 3 | // use ChatGPT: double click on the setting icon 4 | const CHAT_GPT_URL = "https://api.openai.com" 5 | const CHAT_GPT_LLM = "gpt-3.5-turbo" // gpt-3.5-turbo, gpt-4 6 | // Dev: local endpoint 7 | // const CHAT_GPT_URL = "http://localhost:8006" 8 | 9 | 10 | export {HUGGINGGPT_BASE_URL, CHAT_GPT_URL, CHAT_GPT_LLM} -------------------------------------------------------------------------------- /web/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@vue/tsconfig/tsconfig.web.json", 3 | "include": ["env.d.ts", "src/**/*", "src/**/*.vue"], 4 | "compilerOptions": { 5 | "baseUrl": ".", 6 | "paths": { 7 | "@/*": ["./src/*"] 8 | }, 9 | "resolveJsonModule": true 10 | }, 11 | 12 | "references": [ 13 | { 14 | "path": "./tsconfig.config.json" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /web/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | HuggingGPT 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /web/src/assets/switch.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /web/src/types/index.ts: -------------------------------------------------------------------------------- 1 | export interface ChatMessage { 2 | role: "user" | "assistant" | "system"; 3 | type: "text" | "image" | "audio" | "video" | "code"; 4 | first: boolean; 5 | content: string; 6 | } 7 | 8 | export interface CleanChatMessage { 9 | role: "user" | "assistant" | "system"; 10 | content: string; 11 | } 12 | 13 | export interface Collection { 14 | chatgpt: { 15 | [key: string]: ChatMessage[]; 16 | }; 17 | hugginggpt: { 18 | [key: string]: ChatMessage[]; 19 | }; 20 | } 21 | -------------------------------------------------------------------------------- /web/vite.config.ts: -------------------------------------------------------------------------------- 1 | import { fileURLToPath, URL } from "node:url"; 2 | import path from "path"; 3 | import { defineConfig } from "vite"; 4 | import vue from "@vitejs/plugin-vue"; 5 | 6 | // https://vitejs.dev/config/ 7 | export default defineConfig({ 8 | server: { 9 | host: "0.0.0.0", 10 | port: 9999, 11 | open: true, 12 | cors: true, 13 | }, 14 | plugins: [vue()], 15 | base: "./", 16 | resolve: { 17 | alias: { 18 | "@": path.resolve(__dirname, "./src"), 19 | }, 20 | }, 21 | }); 22 | -------------------------------------------------------------------------------- /web/src/assets/tailwind.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | @layer components { 6 | .btn { 7 | @apply px-4 py-2 text-sm font-medium tracking-wide text-white capitalize transition-colors duration-300 transform bg-blue-700 rounded-md hover:bg-blue-600 focus:outline-none focus:bg-blue-600 whitespace-nowrap disabled:bg-blue-300; 8 | } 9 | .input { 10 | @apply px-4 py-2 text-gray-700 bg-white border rounded-md mr-2 sm:mr-4 focus:border-blue-400 focus:outline-none focus:ring focus:ring-blue-300 focus:ring-opacity-40 flex-grow; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /web/electron/preload.js: -------------------------------------------------------------------------------- 1 | /** 2 | * The preload script runs before the page is loaded. It has access to web APIs 3 | * as well as Electron's renderer process modules and some 4 | * polyfilled Node.js functions.
5 | * 6 | * https://www.electronjs.org/docs/latest/tutorial/sandbox 7 | */ 8 | window.addEventListener('DOMContentLoaded', () => { 9 | const replaceText = (selector, text) => { 10 | const element = document.getElementById(selector) 11 | if (element) element.innerText = text 12 | } 13 | 14 | for (const type of ['chrome', 'node', 'electron']) { 15 | replaceText(`${type}-version`, process.versions[type]) 16 | } 17 | }) 18 | -------------------------------------------------------------------------------- /web/public/audio.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /server/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/diffusers.git@8c530fc2f6a76a2aefb6b285dce6df1675092ac6#egg=diffusers 2 | git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda#egg=transformers 3 | git+https://github.com/patrickvonplaten/controlnet_aux@78efc716868a7f5669c288233d65b471f542ce40#egg=controlnet_aux 4 | tiktoken==0.3.3 5 | pydub==0.25.1 6 | espnet==202301 7 | espnet_model_zoo==0.1.7 8 | flask==2.2.3 9 | flask_cors==3.0.10 10 | waitress==2.1.2 11 | datasets==2.11.0 12 | asteroid==0.6.0 13 | speechbrain==0.5.14 14 | timm==0.6.13 15 | typeguard==2.13.3 16 | accelerate==0.18.0 17 | pytesseract==0.3.10 18 | gradio==3.24.1 -------------------------------------------------------------------------------- /web/src/api/hugginggpt.ts: -------------------------------------------------------------------------------- 1 | import type { CleanChatMessage } from "@/types"; 2 | import axios, { AxiosError } from "axios"; 3 | import { HUGGINGGPT_BASE_URL } from "@/config"; 4 | 5 | const model = "gpt-3.5-turbo"; 6 | 7 | axios.defaults.headers.post["Content-Type"] = "application/json"; 8 | 9 | export async function hugginggpt(messageList: CleanChatMessage[]) { 10 | var endpoint = `${HUGGINGGPT_BASE_URL}/hugginggpt` 11 | try { 12 | const response = await axios({ 13 | url: endpoint, 14 | method: "post", 15 | data: { 16 | model, 17 | messages: messageList.slice(1) 18 | }, 19 | timeout: 180000, // 180 seconds 20 | }); 21 | return { 22 | status: "success", 23 | data: response.data.message, 24 | }; 25 | } catch (error: any) { 26 | return { 27 | status: "error", 28 | message: error.message 29 | }; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /web/src/api/chatgpt.ts: -------------------------------------------------------------------------------- 1 | import type { CleanChatMessage } from "@/types"; 2 | import axios, { AxiosError } from "axios"; 3 | import { CHAT_GPT_URL, CHAT_GPT_LLM } from "@/config"; 4 | 5 | axios.defaults.headers.post["Content-Type"] = "application/json"; 6 | 7 | export async function chatgpt(messageList: CleanChatMessage[], apiKey: string) { 8 | var endpoint = `${CHAT_GPT_URL}/v1/chat/completions` 9 | 10 | try { 11 | const completion = await axios({ 12 | url: endpoint, 13 | method: "post", 14 | headers: { 15 | Authorization: `Bearer ${apiKey}`, 16 | }, 17 | data: { 18 | model: CHAT_GPT_LLM, 19 | messages: messageList 20 | }, 21 | timeout: 60000, // 60 seconds 22 | }); 23 | return { 24 | status: "success", 25 | data: completion.data.choices[0].message.content, 26 | }; 27 | } catch (error: any) { 28 | return { 29 | status: "error", 30 | message: error.message 31 | }; 32 | } 33 | } 34 | 
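35 | 36 | // Usage sketch: one way a caller could wire the chatgpt() helper above into a view. 37 | // The exampleAsk name and the "You are a helpful assistant." system prompt are 38 | // illustrative assumptions, not part of the original module. 39 | export async function exampleAsk(question: string, apiKey: string) { 40 | const messages: CleanChatMessage[] = [ 41 | { role: "system", content: "You are a helpful assistant." }, 42 | { role: "user", content: question }, 43 | ]; 44 | const result = await chatgpt(messages, apiKey); 45 | // chatgpt() never throws; failures are reported through the status field. 46 | if ("data" in result) { 47 | return result.data; // assistant reply text 48 | } 49 | throw new Error(result.message); // surface the axios error message 50 | }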
-------------------------------------------------------------------------------- /web/src/assets/audio.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vue3-ts-vite-router-tailwindcss", 3 | "version": "0.0.0", 4 | "main": "index.html", 5 | "scripts": { 6 | "dev": "vite", 7 | "build": "run-p type-check build-only", 8 | "preview": "vite preview --port 4173", 9 | "build-only": "vite build", 10 | "type-check": "vue-tsc --noEmit", 11 | "e:dev": "yarn build && cp -r electron/. dist && cd dist && yarn && yarn dev", 12 | "e:build": "yarn build && cp -r electron/. dist && cd dist && yarn && yarn build" 13 | }, 14 | "dependencies": { 15 | "axios": "^1.3.4", 16 | "vue": "^3.2.38", 17 | "vue-router": "^4.1.5" 18 | }, 19 | "devDependencies": { 20 | "@types/node": "^16.11.56", 21 | "@vitejs/plugin-vue": "^3.0.3", 22 | "@vue/tsconfig": "^0.1.3", 23 | "autoprefixer": "^10.4.12", 24 | "npm-run-all": "^4.1.5", 25 | "postcss": "^8.4.18", 26 | "tailwindcss": "^3.2.1", 27 | "typescript": "~4.7.4", 28 | "vite": "^3.2.5", 29 | "vue-tsc": "^0.40.7" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you find this work useful in your method, you can cite the paper as below." 
3 | authors: 4 | - family-names: Shen 5 | given-names: Yongliang 6 | - family-names: Song 7 | given-names: Kaitao 8 | - family-names: Tan 9 | given-names: Xu 10 | - family-names: Li 11 | given-names: Dongsheng 12 | - family-names: Lu 13 | given-names: Weiming 14 | - family-names: Zhuang 15 | given-names: Yueting 16 | title: "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace" 17 | version: 2.0.4 18 | license: MIT 19 | status: preprint 20 | date-released: 2023-03-30 21 | url: https://github.com/microsoft/JARVIS 22 | preferred-citation: 23 | type: article 24 | authors: 25 | - family-names: Shen 26 | given-names: Yongliang 27 | - family-names: Song 28 | given-names: Kaitao 29 | - family-names: Tan 30 | given-names: Xu 31 | - family-names: Li 32 | given-names: Dongsheng 33 | - family-names: Lu 34 | given-names: Weiming 35 | - family-names: Zhuang 36 | given-names: Yueting 37 | journal: "arXiv preprint arXiv:2303.17580" 38 | title: "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace" 39 | year: 2023 40 | url: https://arxiv.org/abs/2303.17580 -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /web/src/assets/setting.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /web/electron/main.js: -------------------------------------------------------------------------------- 1 | // Modules to control application life and create native browser window 2 | const {app, BrowserWindow} = require('electron') 3 | const path = require('path') 4 | 5 | function createWindow () { 6 | // Create the browser window. 7 | const mainWindow = new BrowserWindow({ 8 | width: 800, 9 | height: 600, 10 | webPreferences: { 11 | preload: path.join(__dirname, 'preload.js') 12 | } 13 | }) 14 | 15 | // and load the index.html of the app. 16 | mainWindow.loadFile('index.html') 17 | 18 | // Open the DevTools. 
19 | // mainWindow.webContents.openDevTools() 20 | } 21 | 22 | // This method will be called when Electron has finished 23 | // initialization and is ready to create browser windows. 24 | // Some APIs can only be used after this event occurs. 25 | app.whenReady().then(() => { 26 | createWindow() 27 | 28 | app.on('activate', function () { 29 | // On macOS it's common to re-create a window in the app when the 30 | // dock icon is clicked and there are no other windows open. 31 | if (BrowserWindow.getAllWindows().length === 0) createWindow() 32 | }) 33 | }) 34 | 35 | // Quit when all windows are closed, except on macOS. There, it's common 36 | // for applications and their menu bar to stay active until the user quits 37 | // explicitly with Cmd + Q. 38 | app.on('window-all-closed', function () { 39 | if (process.platform !== 'darwin') app.quit() 40 | }) 41 | 42 | // In this file you can include the rest of your app's specific main process 43 | // code. You can also put them in separate files and require them here. 44 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | To contribute to this GitHub project, you can follow these steps: 3 | 4 | 1. Fork the repository you want to contribute to by clicking the "Fork" button on the project page. 5 | 6 | 2. Clone the repository to your local machine and enter the newly created repo using the following commands: 7 | 8 | ``` 9 | git clone https://github.com/YOUR-GITHUB-USERNAME/JARVIS 10 | cd JARVIS 11 | ``` 12 | 3. Create a new branch for your changes using the following command: 13 | 14 | ``` 15 | git checkout -b "branch-name" 16 | ``` 17 | 4. Make your changes to the code or documentation. 18 | 19 | 5. Add the changes to the staging area using the following command: 20 | ``` 21 | git add . 22 | ``` 23 | 24 | 6. Commit the changes with a meaningful commit message using the following command: 25 | ``` 26 | git commit -m "your commit message" 27 | ``` 28 | 7. Push the changes to your forked repository using the following command: 29 | ``` 30 | git push origin branch-name 31 | ``` 32 | 8. Go to the GitHub website and navigate to your forked repository. 33 | 34 | 9. Click the "New pull request" button. 35 | 36 | 10. Select the branch you just pushed to and the branch you want to merge into on the original repository. 37 | 38 | 11. Add a description of your changes and click the "Create pull request" button. 39 | 40 | 12. Wait for the project maintainer to review your changes and provide feedback. 41 | 42 | 13. Make any necessary changes based on feedback and repeat steps 5-12 until your changes are accepted and merged into the main project. 43 | 44 | 14. 
Once your changes are merged, you can update your forked repository and local copy of the repository with the following commands (if you have not already added the original repository as the `upstream` remote, first run `git remote add upstream https://github.com/ORIGINAL-OWNER/JARVIS`): 45 | 46 | ``` 47 | git fetch upstream 48 | git checkout main 49 | git merge upstream/main 50 | ``` 51 | Finally, delete the branch you created with the following command: 52 | ``` 53 | git branch -d branch-name 54 | ``` 55 | That's it, you made it 🐣⭐⭐ 56 | -------------------------------------------------------------------------------- /server/models/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set models and datasets to download 4 | models=( 5 | "nlpconnect/vit-gpt2-image-captioning" 6 | "lllyasviel/ControlNet" 7 | "lllyasviel/sd-controlnet-canny" 8 | "lllyasviel/sd-controlnet-depth" 9 | "lllyasviel/sd-controlnet-hed" 10 | "lllyasviel/sd-controlnet-mlsd" 11 | "lllyasviel/sd-controlnet-openpose" 12 | "lllyasviel/sd-controlnet-scribble" 13 | "lllyasviel/sd-controlnet-seg" 14 | "runwayml/stable-diffusion-v1-5" 15 | "damo-vilab/text-to-video-ms-1.7b" 16 | "microsoft/speecht5_asr" 17 | "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k" 18 | "espnet/kan-bayashi_ljspeech_vits" 19 | "facebook/detr-resnet-101" 20 | "microsoft/speecht5_hifigan" 21 | "microsoft/speecht5_vc" 22 | "openai/whisper-base" 23 | "Intel/dpt-large" 24 | "facebook/detr-resnet-50-panoptic" 25 | "facebook/detr-resnet-50" 26 | "google/owlvit-base-patch32" 27 | "impira/layoutlm-document-qa" 28 | "ydshieh/vit-gpt2-coco-en" 29 | "dandelin/vilt-b32-finetuned-vqa" 30 | "lambdalabs/sd-image-variations-diffusers" 31 | "facebook/maskformer-swin-base-coco" 32 | "Intel/dpt-hybrid-midas" 33 | ) 34 | datasets=("Matthijs/cmu-arctic-xvectors") 35 | 36 | # Set the current directory 37 | CURRENT_DIR=$(pwd) 38 | 39 | # Download models 40 | for model in "${models[@]}"; do 41 | echo "----- Downloading from https://huggingface.co/${model} -----" 42 | if [ -d "${model}" ]; then 43 | (cd "${model}" && git pull && git lfs pull) 44 | else 45 | git clone --recurse-submodules "https://huggingface.co/${model}" "${model}" 46 | fi 47 | done 48 | 49 | # Download datasets 50 | for dataset in "${datasets[@]}"; do 51 | echo "----- Downloading from https://huggingface.co/datasets/${dataset} -----" 52 | if [ -d "${dataset}" ]; then 53 | (cd "${dataset}" && git pull && git lfs pull) 54 | else 55 | git clone --recurse-submodules "https://huggingface.co/datasets/${dataset}" "${dataset}" 56 | fi 57 | done -------------------------------------------------------------------------------- /server/models/download.ps1: -------------------------------------------------------------------------------- 1 | $models = @( 2 | "nlpconnect/vit-gpt2-image-captioning", 3 | "lllyasviel/ControlNet", 4 | "lllyasviel/sd-controlnet-canny", 5 | "lllyasviel/sd-controlnet-depth", 6 | "lllyasviel/sd-controlnet-hed", 7 | "lllyasviel/sd-controlnet-mlsd", 8 | "lllyasviel/sd-controlnet-openpose", 9 | "lllyasviel/sd-controlnet-scribble", 10 | "lllyasviel/sd-controlnet-seg", 11 | "runwayml/stable-diffusion-v1-5", 12 | "damo-vilab/text-to-video-ms-1.7b", 13 | "microsoft/speecht5_asr", 14 | "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k", 15 | "espnet/kan-bayashi_ljspeech_vits", 16 | "facebook/detr-resnet-101", 17 | "microsoft/speecht5_hifigan", 18 | "microsoft/speecht5_vc", 19 | "openai/whisper-base", 20 | "Intel/dpt-large", 21 | "facebook/detr-resnet-50-panoptic", 22 | "facebook/detr-resnet-50", 23 | "google/owlvit-base-patch32", 24 | "impira/layoutlm-document-qa", 25 | "ydshieh/vit-gpt2-coco-en", 26 | 
"dandelin/vilt-b32-finetuned-vqa", 27 | "lambdalabs/sd-image-variations-diffusers", 28 | "facebook/maskformer-swin-base-coco", 29 | "Intel/dpt-hybrid-midas" 30 | ) 31 | 32 | $CURRENT_DIR = Get-Location 33 | 34 | foreach ($model in $models) { 35 | Write-Host "----- Downloading from https://huggingface.co/$model -----" 36 | if (Test-Path $model) { 37 | Set-Location $model 38 | git pull 39 | git lfs pull 40 | Set-Location $CURRENT_DIR 41 | } else { 42 | git clone "https://huggingface.co/$model" $model 43 | } 44 | } 45 | 46 | $datasets = @( 47 | "Matthijs/cmu-arctic-xvectors" 48 | ) 49 | 50 | foreach ($dataset in $datasets) { 51 | Write-Host "----- Downloading from https://huggingface.co/datasets/$dataset -----" 52 | if (Test-Path $dataset) { 53 | Set-Location $dataset 54 | git pull 55 | git lfs pull 56 | Set-Location $CURRENT_DIR 57 | } else { 58 | git clone "https://huggingface.co/datasets/$dataset" $dataset 59 | } 60 | } -------------------------------------------------------------------------------- /web/public/video.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/src/components/Loading.vue: -------------------------------------------------------------------------------- 1 | 16 | 17 | 114 | -------------------------------------------------------------------------------- /server/get_token_ids.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | 3 | encodings = { 4 | "gpt-4": tiktoken.get_encoding("cl100k_base"), 5 | "gpt-4-32k": tiktoken.get_encoding("cl100k_base"), 6 | "gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"), 7 | "gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"), 8 | "text-davinci-003": tiktoken.get_encoding("p50k_base"), 9 | "text-davinci-002": tiktoken.get_encoding("p50k_base"), 10 | "text-davinci-001": tiktoken.get_encoding("r50k_base"), 11 | "text-curie-001": tiktoken.get_encoding("r50k_base"), 12 | "text-babbage-001": tiktoken.get_encoding("r50k_base"), 13 | "text-ada-001": tiktoken.get_encoding("r50k_base"), 14 | "davinci": tiktoken.get_encoding("r50k_base"), 15 | "curie": tiktoken.get_encoding("r50k_base"), 16 | "babbage": tiktoken.get_encoding("r50k_base"), 17 | "ada": tiktoken.get_encoding("r50k_base"), 18 | } 19 | 20 | max_length = { 21 | "gpt-4": 8192, 22 | "gpt-4-32k": 32768, 23 | "gpt-3.5-turbo": 4096, 24 | "gpt-3.5-turbo-0301": 4096, 25 | "text-davinci-003": 4096, 26 | "text-davinci-002": 4096, 27 | "text-davinci-001": 2049, 28 | "text-curie-001": 2049, 29 | "text-babbage-001": 2049, 30 | "text-ada-001": 2049, 31 | "davinci": 2049, 32 | "curie": 2049, 33 | "babbage": 2049, 34 | "ada": 2049 35 | } 36 | 37 | def count_tokens(model_name, text): 38 | return len(encodings[model_name].encode(text)) 39 | 40 | def get_max_context_length(model_name): 41 | return max_length[model_name] 42 | 43 | def get_token_ids_for_task_parsing(model_name): 44 | text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", 
"hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "-"}''' 45 | res = encodings[model_name].encode(text) 46 | res = list(set(res)) 47 | return res 48 | 49 | def get_token_ids_for_choose_model(model_name): 50 | text = '''{"id": "reason"}''' 51 | res = encodings[model_name].encode(text) 52 | res = list(set(res)) 53 | return res -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ALL 2 | *.dev.yaml 3 | 4 | # for server 5 | server/models/* 6 | !server/models/download.sh 7 | !server/models/download.ps1 8 | server/logs/ 9 | server/models_dev 10 | server/public/* 11 | !server/public/examples/ 12 | server/public/examples/* 13 | !server/public/examples/a.jpg 14 | !server/public/examples/b.jpg 15 | !server/public/examples/c.jpg 16 | !server/public/examples/d.jpg 17 | !server/public/examples/e.jpg 18 | !server/public/examples/f.jpg 19 | !server/public/examples/g.jpg 20 | 21 | # docker 22 | Dockerfile 23 | docker-compose.yml 24 | 25 | # for gradio 26 | # server/run_gradio.py 27 | 28 | # for web 29 | web/node_modules 30 | web/package-lock.json 31 | web/dist 32 | web/electron-dist 33 | web/yarn.lock 34 | 35 | # Byte-compiled / optimized / DLL files 36 | __pycache__/ 37 | *.py[cod] 38 | *$py.class 39 | 40 | # C extensions 41 | *.so 42 | 43 | # Distribution / packaging 44 | .Python 45 | build/ 46 | develop-eggs/ 47 | dist/ 48 | downloads/ 49 | eggs/ 50 | .eggs/ 51 | lib/ 52 | lib64/ 53 | parts/ 54 | sdist/ 55 | var/ 56 | wheels/ 57 | pip-wheel-metadata/ 58 | share/python-wheels/ 59 | *.egg-info/ 60 | .installed.cfg 61 | *.egg 62 | MANIFEST 63 | 64 | # PyInstaller 65 | # Usually these files are written by a python script from a template 66 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 67 | *.manifest 68 | *.spec 69 | 70 | # Installer logs 71 | pip-log.txt 72 | pip-delete-this-directory.txt 73 | 74 | # Unit test / coverage reports 75 | htmlcov/ 76 | .tox/ 77 | .nox/ 78 | .coverage 79 | .coverage.* 80 | .cache 81 | nosetests.xml 82 | coverage.xml 83 | *.cover 84 | *.py,cover 85 | .hypothesis/ 86 | .pytest_cache/ 87 | 88 | # Translations 89 | *.mo 90 | *.pot 91 | 92 | # Django stuff: 93 | *.log 94 | local_settings.py 95 | db.sqlite3 96 | db.sqlite3-journal 97 | 98 | # Flask stuff: 99 | instance/ 100 | .webassets-cache 101 | 102 | # Scrapy stuff: 103 | .scrapy 104 | 105 | # Sphinx documentation 106 | docs/_build/ 107 | 108 | # PyBuilder 109 | target/ 110 | 111 | # Jupyter Notebook 112 | .ipynb_checkpoints 113 | 114 | # IPython 115 | profile_default/ 116 | ipython_config.py 117 | 118 | # pyenv 119 | .python-version 120 | 121 | # pipenv 122 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 123 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 124 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 125 | # install all needed dependencies. 126 | #Pipfile.lock 127 | 128 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 129 | __pypackages__/ 130 | 131 | # Celery stuff 132 | celerybeat-schedule 133 | celerybeat.pid 134 | 135 | # SageMath parsed files 136 | *.sage.py 137 | 138 | # Environments 139 | .env 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | -------------------------------------------------------------------------------- /web/src/prompt/index.ts: -------------------------------------------------------------------------------- 1 | import type {Collection, ChatMessage } from "@/types"; 2 | 3 | const ChatGPTTerminalMessage:ChatMessage[] = [ 4 | { 5 | role: "assistant", 6 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ", 7 | type: "text", 8 | first: true 9 | }, 10 | { 11 | role: "user", 12 | content: "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. When I need to tell you something in English, I will do so by putting text inside curly brackets {like this}.", 13 | type: "text", 14 | first: true 15 | }, 16 | { 17 | role: "assistant", 18 | content: "Yes, I will do it for you. Please type the command and I will reply with the terminal output.", 19 | type: "text", 20 | first: true 21 | } 22 | ] 23 | 24 | const ChatGPTPolishMessage:ChatMessage[] = [ 25 | { 26 | role: "assistant", 27 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ", 28 | type: "text", 29 | first: true 30 | }, 31 | { 32 | role: "user", 33 | content: "You are a well-trained AI writing assistant with expertise in writing academic papers for computer conferences. By giving you a draft paragraph, I hope you can help me polish my writing with your knowledge. The language should be concise and consistent with the style of an academic paper.", 34 | type: "text", 35 | first: true 36 | }, 37 | { 38 | role: "assistant", 39 | content: "No problem, I will think carefully and polish the paper for you.", 40 | type: "text", 41 | first: true 42 | }, 43 | ] 44 | 45 | const ChatGPTTranslationMessage:ChatMessage[] = [ 46 | { 47 | role: "assistant", 48 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ", 49 | type: "text", 50 | first: true 51 | }, 52 | { 53 | role: "user", 54 | content: "I want you to act as an English translator, spelling corrector and improver. I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in English. I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level English words and sentences. Keep the meaning same, but make them more literary. 
I want you to only reply the correction, the improvements and nothing else, do not write explanations.", 55 | type: "text", 56 | first: true 57 | }, 58 | { 59 | role: "assistant", 60 | content: "Sure, I will act as an English translator and improver.", 61 | type: "text", 62 | first: true 63 | }, 64 | ] 65 | 66 | 67 | const defaultChatGPTMessage:ChatMessage[] = [ 68 | { 69 | role: "assistant", 70 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ", 71 | type: "text", 72 | first: true 73 | } 74 | ] 75 | 76 | const defaultHuggingGPTMessage:ChatMessage[] = [ 77 | { 78 | role: "assistant", 79 | content: "Hi there, I am HuggingGPT empowered by Huggingface family! Yes, I can provide thousands of models for dozens of tasks. For more fun and creativity, I have invited Diffusers family to join our team. Feel free to experience it!", 80 | type: "text", 81 | first: true 82 | } 83 | ] 84 | 85 | const promptCollection: Collection = { 86 | "chatgpt": { 87 | "terminal": ChatGPTTerminalMessage, 88 | "polish": ChatGPTPolishMessage, 89 | "translation": ChatGPTTranslationMessage, 90 | "default": defaultChatGPTMessage, 91 | }, 92 | "hugginggpt": { 93 | "default": defaultHuggingGPTMessage 94 | } 95 | } 96 | 97 | 98 | export default promptCollection -------------------------------------------------------------------------------- /server/configs/config.gradio.yaml: -------------------------------------------------------------------------------- 1 | huggingface: 2 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens 3 | dev: false 4 | debug: true 5 | log_file: logs/debug.log 6 | model: text-davinci-003 # currently only support text-davinci-003, we will support more open-source LLMs in the future 7 | use_completion: true 8 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid 9 | local_deployment: full # minimal, standard or full, prefer full 10 | device: cuda:0 # cuda:id or cpu 11 | num_candidate_models: 5 12 | max_description_length: 100 13 | proxy: # optional: your proxy server "http://ip:port" 14 | local_inference_endpoint: 15 | host: localhost 16 | port: 8005 17 | logit_bias: 18 | parse_task: 0.1 19 | choose_model: 5 20 | tprompt: 21 | parse_task: >- 22 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. 
The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON []. 23 | choose_model: >- 24 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability. 25 | response_results: >- 26 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results. 27 | demos_or_presteps: 28 | parse_task: demos/demo_parse_task.json 29 | choose_model: demos/demo_choose_model.json 30 | response_results: demos/demo_response_results.json 31 | prompt: 32 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks. 33 | choose_model: >- 34 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}. 35 | response_results: >- 36 | Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it. } -------------------------------------------------------------------------------- /server/demos/demo_parse_task.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "role": "user", 4 | "content": "Give you some pictures e1.jpg, e2.png, e3.jpg, help me count the number of sheep?" 
5 | }, 6 | { 7 | "role": "assistant", 8 | "content": "[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"-1\", \"text\": \"How many sheep in the picture\"}} }}, {\"task\": \"image-to-text\", \"id\": 3, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"object-detection\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"visual-question-answering\", \"id\": 5, \"dep\": [4], \"args\": {\"image\": \"-4\", \"text\": \"How many sheep in the picture\"}} }}, {\"task\": \"image-to-text\", \"id\": 6, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"object-detection\", \"id\": 7, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 8, \"dep\": [7], \"args\": {\"image\": \"-7\", \"text\": \"How many sheep in the picture\"}}]" 9 | }, 10 | 11 | { 12 | "role":"user", 13 | "content":"Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one." 14 | }, 15 | { 16 | "role":"assistant", 17 | "content":"[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"-1\", \"text\": \"how many objects in the picture?\" }}, {\"task\": \"text-to-image\", \"id\": 3, \"dep\": [0], \"args\": {\"text\": \"\" }}, {\"task\": \"image-to-image\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"text-to-video\", \"id\": 5, \"dep\": [0], \"args\": {\"text\": \"\" }}]" 18 | }, 19 | 20 | { 21 | "role":"user", 22 | "content":"given a document /images/e.jpeg, answer me what is the student amount? 
And describe the image with your voice" 23 | }, 24 | { 25 | "role":"assistant", 26 | "content":"{\"task\": \"document-question-answering\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"visual-question-answering\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"image-to-text\", \"id\": 2, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpg\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"-2\" }}]" 27 | }, 28 | 29 | { 30 | "role": "user", 31 | "content": "Given an image /example.jpg, first generate a hed image, then based on the hed image generate a new image where a girl is reading a book" 32 | }, 33 | { 34 | "role": "assistant", 35 | "content": "[{\"task\": \"openpose-control\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"openpose-text-to-image\", \"id\": 1, \"dep\": [0], \"args\": {\"text\": \"a girl is reading a book\", \"image\": \"-0\" }}]" 36 | }, 37 | 38 | { 39 | "role": "user", 40 | "content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it" 41 | }, 42 | { 43 | "role": "assistant", 44 | "content": "[{\"task\": \"text-to-video\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-speech\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-image\", \"id\": 2, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}]" 45 | }, 46 | 47 | 48 | { 49 | "role": "user", 50 | "content": "please show me a joke and an image of cat" 51 | }, 52 | { 53 | "role": "assistant", 54 | "content": "[{\"task\": \"conversational\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"please show me a joke of cat\" }}, {\"task\": \"text-to-image\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a photo of cat\" }}]" 55 | } 56 | ] 57 | -------------------------------------------------------------------------------- /server/configs/config.lite.yaml: -------------------------------------------------------------------------------- 1 | openai: 2 | api_key: REPLACE_WITH_YOUR_OPENAI_API_KEY_HERE 3 | huggingface: 4 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens 5 | dev: false 6 | debug: false 7 | log_file: logs/debug.log 8 | model: text-davinci-003 # currently only support text-davinci-003, gpt-4, we will support more open-source LLMs in the future 9 | use_completion: true 10 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid 11 | local_deployment: minimal # minimal, standard or full, prefer full 12 | num_candidate_models: 5 13 | max_description_length: 100 14 | proxy: # optional: your proxy server "http://ip:port" 15 | http_listen: 16 | host: 0.0.0.0 # if you use web as the client, please set `http://{LAN_IP_of_the_server}:{port}/` to `BASE_URL` of `web/src/config/index.ts`. 17 | port: 8004 18 | logit_bias: 19 | parse_task: 0.1 20 | choose_model: 5 21 | tprompt: 22 | parse_task: >- 23 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. 
The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON []. 24 | choose_model: >- 25 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability. 26 | response_results: >- 27 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results. 28 | demos_or_presteps: 29 | parse_task: demos/demo_parse_task.json 30 | choose_model: demos/demo_choose_model.json 31 | response_results: demos/demo_response_results.json 32 | prompt: 33 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks. 34 | choose_model: >- 35 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}. 36 | response_results: >- 37 | Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it. 
} -------------------------------------------------------------------------------- /server/configs/config.azure.yaml: -------------------------------------------------------------------------------- 1 | azure: 2 | api_key: REPLACE_WITH_YOUR_AZURE_API_KEY_HERE 3 | base_url: REPLACE_WITH_YOUR_ENDPOINT_HERE 4 | deployment_name: REPLACE_WITH_YOUR_DEPLOYMENT_NAME_HERE 5 | api_version: "2022-12-01" 6 | huggingface: 7 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens 8 | dev: false 9 | debug: false 10 | log_file: logs/debug.log 11 | model: text-davinci-003 # currently only support text-davinci-003, gpt-4, we will support more open-source LLMs in the future 12 | use_completion: true 13 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid 14 | local_deployment: full # minimal, standard or full, prefer full 15 | device: cuda:0 # cuda:id or cpu 16 | num_candidate_models: 5 17 | max_description_length: 100 18 | proxy: # optional: your proxy server "http://ip:port" 19 | http_listen: 20 | host: 0.0.0.0 # if you use web as the client, please set `http://{LAN_IP_of_the_server}:{port}/` to `BASE_URL` of `web/src/config/index.ts`. 21 | port: 8004 22 | local_inference_endpoint: 23 | host: localhost 24 | port: 8005 25 | logit_bias: 26 | parse_task: 0.1 27 | choose_model: 5 28 | tprompt: 29 | parse_task: >- 30 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON []. 31 | choose_model: >- 32 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. 
Also, prefer models with local inference endpoints for speed and stability. 33 | response_results: >- 34 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results. 35 | demos_or_presteps: 36 | parse_task: demos/demo_parse_task.json 37 | choose_model: demos/demo_choose_model.json 38 | response_results: demos/demo_response_results.json 39 | prompt: 40 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks. 41 | choose_model: >- 42 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}. 43 | response_results: >- 44 | Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it. } -------------------------------------------------------------------------------- /server/configs/config.default.yaml: -------------------------------------------------------------------------------- 1 | openai: 2 | api_key: REPLACE_WITH_YOUR_OPENAI_API_KEY_HERE 3 | # azure: 4 | # api_key: REPLACE_WITH_YOUR_AZURE_API_KEY_HERE 5 | # base_url: REPLACE_WITH_YOUR_ENDPOINT_HERE 6 | # deployment_name: REPLACE_WITH_YOUR_DEPLOYMENT_NAME_HERE 7 | # api_version: "2022-12-01" 8 | huggingface: 9 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens 10 | dev: false 11 | debug: false 12 | log_file: logs/debug.log 13 | model: text-davinci-003 # currently only support text-davinci-003, gpt-4, we will support more open-source LLMs in the future 14 | use_completion: true 15 | inference_mode: hybrid # local, huggingface or hybrid, prefer hybrid 16 | local_deployment: full # minimal, standard or full, prefer full 17 | device: cuda:0 # cuda:id or cpu 18 | num_candidate_models: 5 19 | max_description_length: 100 20 | proxy: # optional: your proxy server "http://ip:port" 21 | http_listen: 22 | host: 0.0.0.0 # if you use web as the client, please set `http://{LAN_IP_of_the_server}:{port}/` to `BASE_URL` of `web/src/config/index.ts`. 23 | port: 8004 24 | local_inference_endpoint: 25 | host: localhost 26 | port: 8005 27 | logit_bias: 28 | parse_task: 0.1 29 | choose_model: 5 30 | tprompt: 31 | parse_task: >- 32 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. 
The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON []. 33 | choose_model: >- 34 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability. 35 | response_results: >- 36 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results. 37 | demos_or_presteps: 38 | parse_task: demos/demo_parse_task.json 39 | choose_model: demos/demo_choose_model.json 40 | response_results: demos/demo_response_results.json 41 | prompt: 42 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks. 43 | choose_model: >- 44 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}. 45 | response_results: >- 46 | Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it. 
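The `prompt` entries above are templates: before each stage's request goes to the LLM, the `{{context}}`, `{{input}}`, `{{metas}}` and `{{task}}` slots are filled with the current conversation and candidate models. A minimal sketch of that substitution, assuming plain string replacement (the helper name `replace_slots` is illustrative; the real logic in `awesome_chat.py` may differ in detail):

```python
import json

def replace_slots(template: str, slots: dict) -> str:
    # Replace each {{key}} placeholder with its value; non-string values are JSON-encoded.
    for key, value in slots.items():
        template = template.replace("{{" + key + "}}", value if isinstance(value, str) else json.dumps(value))
    return template

parse_task_prompt = (
    "The chat log [ {{context}} ] may contain the resources I mentioned. "
    "Now I input { {{input}} }. Pay attention to the input and output types "
    "of tasks and the dependencies between tasks."
)

filled = replace_slots(parse_task_prompt, {
    "context": [],  # earlier chat turns, if any
    "input": "show me a joke and an image of cat",
})
print(filled)
```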
} -------------------------------------------------------------------------------- /server/run_gradio_demo.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import gradio as gr 3 | import re 4 | from diffusers.utils import load_image 5 | import requests 6 | from awesome_chat import chat_huggingface 7 | 8 | all_messages = [] 9 | OPENAI_KEY = "" 10 | 11 | def add_message(content, role): 12 | message = {"role":role, "content":content} 13 | all_messages.append(message) 14 | 15 | def extract_medias(message): 16 | image_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(jpg|jpeg|tiff|gif|png)") 17 | image_urls = [] 18 | for match in image_pattern.finditer(message): 19 | if match.group(0) not in image_urls: 20 | image_urls.append(match.group(0)) 21 | 22 | audio_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(flac|wav)") 23 | audio_urls = [] 24 | for match in audio_pattern.finditer(message): 25 | if match.group(0) not in audio_urls: 26 | audio_urls.append(match.group(0)) 27 | 28 | video_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(mp4)") 29 | video_urls = [] 30 | for match in video_pattern.finditer(message): 31 | if match.group(0) not in video_urls: 32 | video_urls.append(match.group(0)) 33 | 34 | return image_urls, audio_urls, video_urls 35 | 36 | def set_openai_key(openai_key): 37 | global OPENAI_KEY 38 | OPENAI_KEY = openai_key 39 | return OPENAI_KEY 40 | 41 | def add_text(messages, message): 42 | if len(OPENAI_KEY) == 0 or not OPENAI_KEY.startswith("sk-"): 43 | return messages, "Please set your OpenAI API key first." 44 | add_message(message, "user") 45 | messages = messages + [(message, None)] 46 | image_urls, audio_urls, video_urls = extract_medias(message) 47 | 48 | for image_url in image_urls: 49 | if not image_url.startswith("http"): 50 | image_url = "public/" + image_url 51 | image = load_image(image_url) 52 | name = f"public/images/{str(uuid.uuid4())[:4]}.jpg" 53 | image.save(name) 54 | messages = messages + [((f"{name}",), None)] 55 | for audio_url in audio_urls: 56 | if not audio_url.startswith("http"): 57 | audio_url = "public/" + audio_url 58 | ext = audio_url.split(".")[-1] 59 | name = f"public/audios/{str(uuid.uuid4()[:4])}.{ext}" 60 | response = requests.get(audio_url) 61 | with open(name, "wb") as f: 62 | f.write(response.content) 63 | messages = messages + [((f"{name}",), None)] 64 | for video_url in video_urls: 65 | if not video_url.startswith("http"): 66 | video_url = "public/" + video_url 67 | ext = video_url.split(".")[-1] 68 | name = f"public/audios/{str(uuid.uuid4()[:4])}.{ext}" 69 | response = requests.get(video_url) 70 | with open(name, "wb") as f: 71 | f.write(response.content) 72 | messages = messages + [((f"{name}",), None)] 73 | return messages, "" 74 | 75 | def bot(messages): 76 | if len(OPENAI_KEY) == 0 or not OPENAI_KEY.startswith("sk-"): 77 | return messages 78 | message = chat_huggingface(all_messages, OPENAI_KEY, "openai", "https://api.openai.com/v1/completions")["message"] 79 | image_urls, audio_urls, video_urls = extract_medias(message) 80 | add_message(message, "assistant") 81 | messages[-1][1] = message 82 | for image_url in image_urls: 83 | if not image_url.startswith("http"): 84 | image_url = image_url.replace("public/", "") 85 | messages = messages + [((None, (f"public/{image_url}",)))] 86 | for audio_url in audio_urls: 87 | if not audio_url.startswith("http"): 88 | audio_url = audio_url.replace("public/", "") 89 | messages = messages + [((None, (f"public/{audio_url}",)))] 90 
| for video_url in video_urls: 91 | if not video_url.startswith("http"): 92 | video_url = video_url.replace("public/", "") 93 | messages = messages + [((None, (f"public/{video_url}",)))] 94 | return messages 95 | 96 | with gr.Blocks() as demo: 97 | gr.Markdown("

HuggingGPT (Dev)

") 98 | with gr.Row(): 99 | openai_api_key = gr.Textbox( 100 | show_label=False, 101 | placeholder="Set your OpenAI API key here and press Enter", 102 | lines=1, 103 | type="password", 104 | ) 105 | 106 | chatbot = gr.Chatbot([], elem_id="chatbot").style(height=500) 107 | 108 | with gr.Row(): 109 | txt = gr.Textbox( 110 | show_label=False, 111 | placeholder="Enter text and press enter. The url of the multimedia resource must contain the extension name.", 112 | ).style(container=False) 113 | 114 | txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then( 115 | bot, chatbot, chatbot 116 | ) 117 | openai_api_key.submit(set_openai_key, [openai_api_key], [openai_api_key]) 118 | 119 | gr.Examples( 120 | examples=["Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras in these picture?", 121 | "Please generate a canny image based on /examples/f.jpg", 122 | "show me a joke and an image of cat", 123 | "what is in the /examples/a.jpg", 124 | "generate a video and audio about a dog is running on the grass", 125 | "based on the /examples/a.jpg, please generate a video and audio", 126 | "based on pose of /examples/d.jpg and content of /examples/e.jpg, please show me a new image", 127 | ], 128 | inputs=txt 129 | ) 130 | 131 | demo.launch() -------------------------------------------------------------------------------- /web/src/assets/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/src/assets/huggingface.svg: -------------------------------------------------------------------------------- 1 | 2 | 9 | 13 | 17 | 21 | 25 | 29 | 38 | 42 | 43 | 44 | 48 | 49 | 53 | 57 | 61 | 65 | 69 | 73 | 74 | -------------------------------------------------------------------------------- /web/src/assets/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JARVIS 2 | 3 | **This project is under construction and we will have all the code ready soon.** 4 | 5 | 6 | Open in Spaces 7 | 8 | 9 | 10 | ## Updates 11 | + [2023.04.16] Jarvis now supports the OpenAI service on the Azure platform and the GPT-4 model. 12 | + [2023.04.06] We added the Gradio demo and built the web API for `/tasks` and `/results` in `server` mode. 13 | + The Gradio demo is now hosted on Hugging Face Space. (Build with `inference_mode=hybrid` and `local_deployment=standard`) 14 | + The Web API `/tasks` and `/results` access intermediate results for `Stage #1`: task planning and `Stage #1-3`: model selection with execution results. See here. 15 | + [2023.04.03] We added the CLI mode and provided parameters for configuring the scale of local endpoints. 16 | + You can enjoy a lightweight experience with Jarvis without deploying the models locally. See here. 17 | + Just run `python awesome_chat.py --config configs/config.lite.yaml` to experience it. 18 | + [2023.04.01] We updated a version of code for building. 19 | 20 | ## Overview 21 | 22 | Language serves as an interface for LLMs to connect numerous AI models for solving complicated AI tasks! 
23 | 24 | See our paper: [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace](http://arxiv.org/abs/2303.17580), Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu and Yueting Zhuang (the first two authors contributed equally) 25 | 26 |
<p align="center"><img src="./assets/overview.jpg"></p>
27 | 28 | We introduce a collaborative system that consists of **an LLM as the controller** and **numerous expert models as collaborative executors** (from HuggingFace Hub). The workflow of our system consists of four stages: 29 | + **Task Planning**: Using ChatGPT to analyze the requests of users to understand their intention, and disassemble them into possible solvable tasks. 30 | + **Model Selection**: To solve the planned tasks, ChatGPT selects expert models hosted on Hugging Face based on their descriptions. 31 | + **Task Execution**: Invokes and executes each selected model, and return the results to ChatGPT. 32 | + **Response Generation**: Finally, using ChatGPT to integrate the prediction of all models, and generate responses. 33 | 34 | ## System Requirements 35 | 36 | ### Default (Recommended) 37 | 38 | For `configs/config.default.yaml`: 39 | 40 | + Ubuntu 16.04 LTS 41 | + VRAM >= 24GB 42 | + RAM > 12GB (minimal), 16GB (standard), 80GB (full) 43 | + Disk > 284GB 44 | + 42GB for `damo-vilab/text-to-video-ms-1.7b` 45 | + 126GB for `ControlNet` 46 | + 66GB for `stable-diffusion-v1-5` 47 | + 50GB for others 48 | 49 | ### Minimum (Lite) 50 | 51 | For `configs/config.lite.yaml`: 52 | 53 | + Ubuntu 16.04 LTS 54 | + Nothing else 55 | 56 | The configuration `configs/config.lite.yaml` does not require any expert models to be downloaded and deployed locally. However, it means that Jarvis is restricted to models running stably on HuggingFace Inference Endpoints. 57 | 58 | ## Quick Start 59 | 60 | First replace `openai.key` and `huggingface.token` in `server/configs/config.default.yaml` with **your personal OpenAI Key** and **your Hugging Face Token**, or put them in the environment variables `OPENAI_API_KEY` and `HUGGINGFACE_ACCESS_TOKEN` respectfully. Then run the following commands: 61 | 62 | 63 | 64 | ### For Server: 65 | 66 | ```bash 67 | # setup env 68 | cd server 69 | conda create -n jarvis python=3.8 70 | conda activate jarvis 71 | conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia 72 | pip install -r requirements.txt 73 | 74 | # download models. Make sure that `git-lfs` is installed. 75 | cd models 76 | bash download.sh # required when `inference_mode` is `local` or `hybrid`. 77 | 78 | # run server 79 | cd .. 80 | python models_server.py --config configs/config.default.yaml # required when `inference_mode` is `local` or `hybrid` 81 | python awesome_chat.py --config configs/config.default.yaml --mode server # for text-davinci-003 82 | ``` 83 | 84 | Now you can access Jarvis' services by the Web API. 85 | 86 | + `/hugginggpt` --method `POST`, access the full service. 87 | + `/tasks` --method `POST`, access intermediate results for Stage #1. 88 | + `/results` --method `POST`, access intermediate results for Stage #1-3. 89 | 90 | For example: 91 | 92 | ```bash 93 | # request 94 | curl --location 'http://localhost:8004/tasks' \ 95 | --header 'Content-Type: application/json' \ 96 | --data '{ 97 | "messages": [ 98 | { 99 | "role": "user", 100 | "content": "based on pose of /examples/d.jpg and content of /examples/e.jpg, please show me a new image" 101 | } 102 | ] 103 | }' 104 | 105 | # response 106 | [{"args":{"image":"/examples/d.jpg"},"dep":[-1],"id":0,"task":"openpose-control"},{"args":{"image":"/examples/e.jpg"},"dep":[-1],"id":1,"task":"image-to-text"},{"args":{"image":"-0","text":"-1"},"dep":[1,0],"id":2,"task":"openpose-text-to-image"}] 107 | ``` 108 | 109 | 110 | ### For Web: 111 | 112 | We provide a user-friendly web page. 
After starting `awesome_chat.py` in a server mode, you can run the commands to communicate with Jarvis in your browser: 113 | 114 | - you need to install `nodejs` and `npm` first. 115 | - [ IMPORTANT ] if you are running the web client on another machine, you need set `http://{LAN_IP_of_the_server}:{port}/` to `HUGGINGGPT_BASE_URL` of `web/src/config/index.ts`. 116 | - if you want to use the video generation feature, you need to compile `ffmpeg` manually with H.264. 117 | - you can switch to ChatGPT by `double click` on the setting icon! 118 | 119 | ```bash 120 | cd web 121 | npm install 122 | npm run dev 123 | ``` 124 | 125 | ```bash 126 | # Optional: Install ffmpeg 127 | # This command need be executed without errors. 128 | LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i input.mp4 -vcodec libx264 output.mp4 129 | ``` 130 | 131 | 132 | 133 | ### For Gradio 134 | 135 | The Gradio demo is now hosted on Hugging Face Space. You can also run the following commands to start the demo locally: 136 | 137 | ```bash 138 | python models_server.py --config configs/config.gradio.yaml 139 | python run_gradio_demo.py --config configs/config.gradio.yaml 140 | 141 | # or run the HF Space as a Docker image (Build with `inference_mode=hibrid` and `local_deployment=standard`) 142 | docker run -it -p 7860:7860 --platform=linux/amd64 registry.hf.space/microsoft-hugginggpt:latest python app.py 143 | ``` 144 | 145 | ### For CLI: 146 | 147 | You can also run Jarvis more easily in CLI mode: 148 | 149 | ```bash 150 | cd server 151 | python awesome_chat.py --config configs/config.default.yaml --mode cli 152 | ``` 153 | 154 | Examples of CLI mode: 155 | 156 | ``` 157 | Welcome to Jarvis! A collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors. Jarvis can plan tasks, schedule Hugging Face models, generate friendly responses based on your requests, and help you with many things. Please enter your request (`exit` to exit). 158 | [ User ]: Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras in these picture? 159 | [ Jarvis ]: Based on your request, I have used several models to process the images you provided. For the image A: /examples/a.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "a cat sitting on a window sill looking out". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/a3dc.jpg', which contains two objects with labels 'potted plant' and 'cat'. For the image B: /examples/b.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "two zebras standing in a field of dry grass". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/f5df.jpg', which contains three objects with labels 'zebra'. For the image C: /examples/c.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "a zebra standing in a field of tall grass". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/69c7.jpg', which contains one object with label 'zebra'. Finally, I used the canny-control model canny-control to generate the image '/images/7e18.png' based on the image /examples/f.jpg. Therefore, there are four zebras in these pictures. Is there anything else I can help you with? 
160 | [ User ]: Please answer all the named entities in the sentence: Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby. 161 | [ Jarvis ]: Sure, I understand your request. Based on the inference results of the models, I will answer your request step by step. Firstly, I used the token-classification model dslim/bert-base-NER to recognize the named entities in the sentence. The inference result is that the named entities are Iron Man, Stan Lee, Larry Lieber, Don Heck and Jack Kirby. Then, I used the question-answering model deepset/roberta-base-squad2 to answer your request. The inference result is that there is no output since the context cannot be empty. Therefore, I cannot make it. I hope this answer your request. Is there anything else I can help you with? 162 | [ User ]: Please generate a canny image based on /examples/f.jpg 163 | [ Jarvis ]: Sure. I have generated a canny image based on /examples/f.jpg. To do this, I first used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text description of the image, which is "a herd of giraffes and zebras grazing in a field". Then I used the canny-control model to generate the canny image of the original image, which is stored in the path "/images/1ea2.png". Finally, I used the canny-text-to-image model lllyasviel/sd-controlnet-canny to generate the canny image based on the text description and the canny image, which is stored in the path "/images/ba66.png". Do you need anything else? 164 | ``` 165 | 166 | 167 | ## Configuration 168 | 169 | The server-side configuration file is `server/configs/config.default.yaml`, and some parameters are presented as follows: 170 | 171 | + `model`: LLM, currently supports `text-davinci-003`. We are working on integrating more open-source LLMs. 172 | + `inference_mode`: mode of inference endpoints 173 | + `local`: only use the local inference endpoints 174 | + `huggingface`: only use the Hugging Face Inference Endpoints **(free of local inference endpoints)** 175 | + `hybrid`: both of `local` and `huggingface` 176 | + `local_deployment`: scale of locally deployed models, works under `local` or `hybrid` inference mode: 177 | + `minimal` (RAM>12GB, ControlNet only) 178 | + `standard` (RAM>16GB, ControlNet + Standard Pipelines) 179 | + `full` (RAM>42GB, All registered models) 180 | 181 | On a personal laptop, we recommend the configuration of `inference_mode: hybrid `and `local_deployment: minimal`. But the available models under this setting may be limited due to the instability of remote Hugging Face Inference Endpoints. 182 | 183 | ## Screenshots 184 | 185 |
<p align="center"><img src="./assets/screenshot_q.jpg"><img src="./assets/screenshot_a.jpg"></p>
186 | 187 | ## Citation 188 | If you find this work useful in your method, you can cite the paper as below: 189 | 190 | @article{shen2023hugginggpt, 191 | title = {HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace}, 192 | author = {Shen, Yongliang and Song, Kaitao and Tan, Xu and Li, Dongsheng and Lu, Weiming and Zhuang, Yueting}, 193 | journal = {arXiv preprint arXiv:2303.17580}, 194 | year = {2023} 195 | } 196 | 197 | ## Acknowledgement 198 | 199 | - [ChatGPT](https://platform.openai.com/) 200 | - [Hugging Face](https://huggingface.co/) 201 | - [ControlNet](https://github.com/lllyasviel/ControlNet) 202 | - [ChatGPT-vue](https://github.com/lianginx/chatgpt-vue) 203 | -------------------------------------------------------------------------------- /web/src/views/home.vue: -------------------------------------------------------------------------------- 1 | 307 | 308 | 409 | 410 | 433 | -------------------------------------------------------------------------------- /server/models_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import random 4 | import uuid 5 | import numpy as np 6 | from transformers import pipeline 7 | from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler 8 | from diffusers.utils import load_image 9 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler 10 | from diffusers.utils import export_to_video 11 | from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech 12 | from transformers import BlipProcessor, BlipForConditionalGeneration 13 | from transformers import TrOCRProcessor, VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer 14 | from datasets import load_dataset 15 | from PIL import Image 16 | import flask 17 | from flask import request, jsonify 18 | import waitress 19 | from flask_cors import CORS 20 | import io 21 | from torchvision import transforms 22 | import torch 23 | import torchaudio 24 | from speechbrain.pretrained import WaveformEnhancement 25 | import joblib 26 | from huggingface_hub import hf_hub_url, cached_download 27 | from transformers import AutoImageProcessor, TimesformerForVideoClassification 28 | from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, AutoFeatureExtractor 29 | from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector 30 | from controlnet_aux.open_pose.body import Body 31 | from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large 32 | from controlnet_aux.hed import Network 33 | from transformers import DPTForDepthEstimation, DPTFeatureExtractor 34 | import warnings 35 | import time 36 | from espnet2.bin.tts_inference import Text2Speech 37 | import soundfile as sf 38 | from asteroid.models import BaseModel 39 | import traceback 40 | import os 41 | import yaml 42 | 43 | warnings.filterwarnings("ignore") 44 | 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--config", type=str, default="configs/config.default.yaml") 47 | args = parser.parse_args() 48 | 49 | logger = logging.getLogger(__name__) 50 | logger.setLevel(logging.INFO) 51 | handler = logging.StreamHandler() 52 | handler.setLevel(logging.INFO) 53 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 54 | handler.setFormatter(formatter) 55 | logger.addHandler(handler) 56 | 57 | config = 
yaml.load(open(args.config, "r"), Loader=yaml.FullLoader) 58 | 59 | # host = config["local_inference_endpoint"]["host"] 60 | port = config["local_inference_endpoint"]["port"] 61 | 62 | local_deployment = config["local_deployment"] 63 | device = config.get("device", "cuda:0") 64 | 65 | PROXY = None 66 | if config["proxy"]: 67 | PROXY = { 68 | "https": config["proxy"], 69 | } 70 | 71 | app = flask.Flask(__name__) 72 | CORS(app) 73 | 74 | start = time.time() 75 | 76 | local_fold = "models" 77 | # if args.config.endswith(".dev"): 78 | # local_fold = "models_dev" 79 | 80 | 81 | def load_pipes(local_deployment): 82 | other_pipes = {} 83 | standard_pipes = {} 84 | controlnet_sd_pipes = {} 85 | if local_deployment in ["full"]: 86 | other_pipes = { 87 | "nlpconnect/vit-gpt2-image-captioning":{ 88 | "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"), 89 | "feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"), 90 | "tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"), 91 | "device": device 92 | }, 93 | # "Salesforce/blip-image-captioning-large": { 94 | # "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"), 95 | # "processor": BlipProcessor.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"), 96 | # "device": device 97 | # }, 98 | "damo-vilab/text-to-video-ms-1.7b": { 99 | "model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"), 100 | "device": device 101 | }, 102 | # "facebook/maskformer-swin-large-ade": { 103 | # "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"), 104 | # "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"), 105 | # "device": device 106 | # }, 107 | # "microsoft/trocr-base-printed": { 108 | # "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"), 109 | # "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"), 110 | # "device": device 111 | # }, 112 | # "microsoft/trocr-base-handwritten": { 113 | # "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"), 114 | # "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"), 115 | # "device": device 116 | # }, 117 | "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": { 118 | "model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"), 119 | "device": device 120 | }, 121 | "espnet/kan-bayashi_ljspeech_vits": { 122 | "model": Text2Speech.from_pretrained(f"espnet/kan-bayashi_ljspeech_vits"), 123 | "device": device 124 | }, 125 | "lambdalabs/sd-image-variations-diffusers": { 126 | "model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), #torch_dtype=torch.float16 127 | "device": device 128 | }, 129 | # "CompVis/stable-diffusion-v1-4": { 130 | # "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"), 131 | # "device": device 132 | # }, 133 | # "stabilityai/stable-diffusion-2-1": { 134 | # "model": DiffusionPipeline.from_pretrained(f"{local_fold}/stabilityai/stable-diffusion-2-1"), 135 | # "device": device 136 | # }, 137 | "runwayml/stable-diffusion-v1-5": { 138 | "model": 
DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"), 139 | "device": device 140 | }, 141 | # "microsoft/speecht5_tts":{ 142 | # "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"), 143 | # "model": SpeechT5ForTextToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"), 144 | # "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"), 145 | # "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"), 146 | # "device": device 147 | # }, 148 | # "speechbrain/mtl-mimic-voicebank": { 149 | # "model": WaveformEnhancement.from_hparams(source="speechbrain/mtl-mimic-voicebank", savedir="models/mtl-mimic-voicebank"), 150 | # "device": device 151 | # }, 152 | "microsoft/speecht5_vc":{ 153 | "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"), 154 | "model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"), 155 | "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"), 156 | "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"), 157 | "device": device 158 | }, 159 | # "julien-c/wine-quality": { 160 | # "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib"))) 161 | # }, 162 | # "facebook/timesformer-base-finetuned-k400": { 163 | # "processor": AutoImageProcessor.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"), 164 | # "model": TimesformerForVideoClassification.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"), 165 | # "device": device 166 | # }, 167 | "facebook/maskformer-swin-base-coco": { 168 | "feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"), 169 | "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"), 170 | "device": device 171 | }, 172 | "Intel/dpt-hybrid-midas": { 173 | "model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True), 174 | "feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"), 175 | "device": device 176 | } 177 | } 178 | 179 | if local_deployment in ["full", "standard"]: 180 | standard_pipes = { 181 | # "superb/wav2vec2-base-superb-ks": { 182 | # "model": pipeline(task="audio-classification", model=f"{local_fold}/superb/wav2vec2-base-superb-ks"), 183 | # "device": device 184 | # }, 185 | "openai/whisper-base": { 186 | "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"), 187 | "device": device 188 | }, 189 | "microsoft/speecht5_asr": { 190 | "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"), 191 | "device": device 192 | }, 193 | "Intel/dpt-large": { 194 | "model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"), 195 | "device": device 196 | }, 197 | # "microsoft/beit-base-patch16-224-pt22k-ft22k": { 198 | # "model": pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"), 199 | # "device": device 200 | # }, 201 | "facebook/detr-resnet-50-panoptic": { 202 | "model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"), 203 | "device": device 204 | }, 205 | "facebook/detr-resnet-101": { 206 
| "model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"), 207 | "device": device 208 | }, 209 | # "openai/clip-vit-large-patch14": { 210 | # "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"), 211 | # "device": device 212 | # }, 213 | "google/owlvit-base-patch32": { 214 | "model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"), 215 | "device": device 216 | }, 217 | # "microsoft/DialoGPT-medium": { 218 | # "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"), 219 | # "device": device 220 | # }, 221 | # "bert-base-uncased": { 222 | # "model": pipeline(task="fill-mask", model=f"{local_fold}/bert-base-uncased"), 223 | # "device": device 224 | # }, 225 | # "deepset/roberta-base-squad2": { 226 | # "model": pipeline(task = "question-answering", model=f"{local_fold}/deepset/roberta-base-squad2"), 227 | # "device": device 228 | # }, 229 | # "facebook/bart-large-cnn": { 230 | # "model": pipeline(task="summarization", model=f"{local_fold}/facebook/bart-large-cnn"), 231 | # "device": device 232 | # }, 233 | # "google/tapas-base-finetuned-wtq": { 234 | # "model": pipeline(task="table-question-answering", model=f"{local_fold}/google/tapas-base-finetuned-wtq"), 235 | # "device": device 236 | # }, 237 | # "distilbert-base-uncased-finetuned-sst-2-english": { 238 | # "model": pipeline(task="text-classification", model=f"{local_fold}/distilbert-base-uncased-finetuned-sst-2-english"), 239 | # "device": device 240 | # }, 241 | # "gpt2": { 242 | # "model": pipeline(task="text-generation", model="gpt2"), 243 | # "device": device 244 | # }, 245 | # "mrm8488/t5-base-finetuned-question-generation-ap": { 246 | # "model": pipeline(task="text2text-generation", model=f"{local_fold}/mrm8488/t5-base-finetuned-question-generation-ap"), 247 | # "device": device 248 | # }, 249 | # "Jean-Baptiste/camembert-ner": { 250 | # "model": pipeline(task="token-classification", model=f"{local_fold}/Jean-Baptiste/camembert-ner", aggregation_strategy="simple"), 251 | # "device": device 252 | # }, 253 | # "t5-base": { 254 | # "model": pipeline(task="translation", model=f"{local_fold}/t5-base"), 255 | # "device": device 256 | # }, 257 | "impira/layoutlm-document-qa": { 258 | "model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"), 259 | "device": device 260 | }, 261 | "ydshieh/vit-gpt2-coco-en": { 262 | "model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"), 263 | "device": device 264 | }, 265 | "dandelin/vilt-b32-finetuned-vqa": { 266 | "model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"), 267 | "device": device 268 | } 269 | } 270 | 271 | if local_deployment in ["full", "standard", "minimal"]: 272 | controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) 273 | controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained( 274 | f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 275 | ) 276 | 277 | def mlsd_control_network(): 278 | model = MobileV2_MLSD_Large() 279 | model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True) 280 | return MLSDdetector(model) 281 | 282 | 283 | hed_network = 
Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth") 284 | 285 | controlnet_sd_pipes = { 286 | "openpose-control": { 287 | "model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth")) 288 | }, 289 | "mlsd-control": { 290 | "model": mlsd_control_network() 291 | }, 292 | "hed-control": { 293 | "model": HEDdetector(hed_network) 294 | }, 295 | "scribble-control": { 296 | "model": HEDdetector(hed_network) 297 | }, 298 | "midas-control": { 299 | "model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt") 300 | }, 301 | "canny-control": { 302 | "model": CannyDetector() 303 | }, 304 | "lllyasviel/sd-controlnet-canny":{ 305 | "control": controlnet, 306 | "model": controlnetpipe, 307 | "device": device 308 | }, 309 | "lllyasviel/sd-controlnet-depth":{ 310 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16), 311 | "model": controlnetpipe, 312 | "device": device 313 | }, 314 | "lllyasviel/sd-controlnet-hed":{ 315 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16), 316 | "model": controlnetpipe, 317 | "device": device 318 | }, 319 | "lllyasviel/sd-controlnet-mlsd":{ 320 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16), 321 | "model": controlnetpipe, 322 | "device": device 323 | }, 324 | "lllyasviel/sd-controlnet-openpose":{ 325 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16), 326 | "model": controlnetpipe, 327 | "device": device 328 | }, 329 | "lllyasviel/sd-controlnet-scribble":{ 330 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16), 331 | "model": controlnetpipe, 332 | "device": device 333 | }, 334 | "lllyasviel/sd-controlnet-seg":{ 335 | "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16), 336 | "model": controlnetpipe, 337 | "device": device 338 | } 339 | } 340 | pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes} 341 | return pipes 342 | 343 | pipes = load_pipes(local_deployment) 344 | 345 | end = time.time() 346 | during = end - start 347 | 348 | print(f"[ ready ] {during}s") 349 | 350 | @app.route('/running', methods=['GET']) 351 | def running(): 352 | return jsonify({"running": True}) 353 | 354 | @app.route('/status/', methods=['GET']) 355 | def status(model_id): 356 | disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"] 357 | if model_id in pipes.keys() and model_id not in disabled_models: 358 | print(f"[ check {model_id} ] success") 359 | return jsonify({"loaded": True}) 360 | else: 361 | print(f"[ check {model_id} ] failed") 362 | return jsonify({"loaded": False}) 363 | 364 | @app.route('/models/', methods=['POST']) 365 | def models(model_id): 366 | while "using" in pipes[model_id] and pipes[model_id]["using"]: 367 | print(f"[ inference {model_id} ] waiting") 368 | time.sleep(0.1) 369 | pipes[model_id]["using"] = True 370 | print(f"[ inference {model_id} ] start") 371 | 372 | start = time.time() 373 | 374 | pipe = pipes[model_id]["model"] 375 | 376 | if "device" in pipes[model_id]: 377 | try: 378 | pipe.to(pipes[model_id]["device"]) 379 | except: 380 | pipe.device = torch.device(pipes[model_id]["device"]) 
381 | pipe.model.to(pipes[model_id]["device"]) 382 | 383 | result = None 384 | try: 385 | # text to video 386 | if model_id == "damo-vilab/text-to-video-ms-1.7b": 387 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 388 | # pipe.enable_model_cpu_offload() 389 | prompt = request.get_json()["text"] 390 | video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames 391 | video_path = export_to_video(video_frames) 392 | file_name = str(uuid.uuid4())[:4] 393 | os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4") 394 | result = {"path": f"/videos/{file_name}.mp4"} 395 | 396 | # controlnet 397 | if model_id.startswith("lllyasviel/sd-controlnet-"): 398 | pipe.controlnet.to('cpu') 399 | pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"]) 400 | pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) 401 | control_image = load_image(request.get_json()["img_url"]) 402 | # generator = torch.manual_seed(66) 403 | out_image: Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0] 404 | file_name = str(uuid.uuid4())[:4] 405 | out_image.save(f"public/images/{file_name}.png") 406 | result = {"path": f"/images/{file_name}.png"} 407 | 408 | if model_id.endswith("-control"): 409 | image = load_image(request.get_json()["img_url"]) 410 | if "scribble" in model_id: 411 | control = pipe(image, scribble = True) 412 | elif "canny" in model_id: 413 | control = pipe(image, low_threshold=100, high_threshold=200) 414 | else: 415 | control = pipe(image) 416 | file_name = str(uuid.uuid4())[:4] 417 | control.save(f"public/images/{file_name}.png") 418 | result = {"path": f"/images/{file_name}.png"} 419 | 420 | # image to image 421 | if model_id == "lambdalabs/sd-image-variations-diffusers": 422 | im = load_image(request.get_json()["img_url"]) 423 | file_name = str(uuid.uuid4())[:4] 424 | with open(f"public/images/{file_name}.png", "wb") as f: 425 | f.write(request.data) 426 | tform = transforms.Compose([ 427 | transforms.ToTensor(), 428 | transforms.Resize( 429 | (224, 224), 430 | interpolation=transforms.InterpolationMode.BICUBIC, 431 | antialias=False, 432 | ), 433 | transforms.Normalize( 434 | [0.48145466, 0.4578275, 0.40821073], 435 | [0.26862954, 0.26130258, 0.27577711]), 436 | ]) 437 | inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0) 438 | out = pipe(inp, guidance_scale=3) 439 | out["images"][0].save(f"public/images/{file_name}.jpg") 440 | result = {"path": f"/images/{file_name}.jpg"} 441 | 442 | # image to text 443 | if model_id == "Salesforce/blip-image-captioning-large": 444 | raw_image = load_image(request.get_json()["img_url"]).convert('RGB') 445 | text = request.get_json()["text"] 446 | inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"]) 447 | out = pipe.generate(**inputs) 448 | caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True) 449 | result = {"generated text": caption} 450 | if model_id == "ydshieh/vit-gpt2-coco-en": 451 | img_url = request.get_json()["img_url"] 452 | generated_text = pipe(img_url)[0]['generated_text'] 453 | result = {"generated text": generated_text} 454 | if model_id == "nlpconnect/vit-gpt2-image-captioning": 455 | image = load_image(request.get_json()["img_url"]).convert("RGB") 456 | pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values 457 | pixel_values = 
pixel_values.to(pipes[model_id]["device"]) 458 | generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1}) 459 | generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0] 460 | result = {"generated text": generated_text} 461 | # image to text: OCR 462 | if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten": 463 | image = load_image(request.get_json()["img_url"]).convert("RGB") 464 | pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values 465 | pixel_values = pixel_values.to(pipes[model_id]["device"]) 466 | generated_ids = pipe.generate(pixel_values) 467 | generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0] 468 | result = {"generated text": generated_text} 469 | 470 | # text to image 471 | if model_id == "runwayml/stable-diffusion-v1-5": 472 | file_name = str(uuid.uuid4())[:4] 473 | text = request.get_json()["text"] 474 | out = pipe(prompt=text) 475 | out["images"][0].save(f"public/images/{file_name}.jpg") 476 | result = {"path": f"/images/{file_name}.jpg"} 477 | 478 | # object detection 479 | if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101": 480 | img_url = request.get_json()["img_url"] 481 | open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"] 482 | result = pipe(img_url, candidate_labels=open_types) 483 | 484 | # VQA 485 | if model_id == "dandelin/vilt-b32-finetuned-vqa": 486 | question = request.get_json()["text"] 487 | img_url = request.get_json()["img_url"] 488 | result = pipe(question=question, image=img_url) 489 | 490 | #DQA 491 | if model_id == "impira/layoutlm-document-qa": 492 | question = request.get_json()["text"] 493 | img_url = request.get_json()["img_url"] 494 | result = pipe(img_url, question) 495 | 496 | # depth-estimation 497 | if model_id == "Intel/dpt-large": 498 | output = pipe(request.get_json()["img_url"]) 499 | image = output['depth'] 500 | name = str(uuid.uuid4())[:4] 501 | image.save(f"public/images/{name}.jpg") 502 | result = {"path": f"/images/{name}.jpg"} 503 | 504 | if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large": 505 | image = load_image(request.get_json()["img_url"]) 506 | inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt") 507 | with torch.no_grad(): 508 | outputs = pipe(**inputs) 509 | predicted_depth = outputs.predicted_depth 510 | prediction = torch.nn.functional.interpolate( 511 | predicted_depth.unsqueeze(1), 512 | size=image.size[::-1], 513 | mode="bicubic", 514 | align_corners=False, 515 | ) 516 | output = prediction.squeeze().cpu().numpy() 517 | formatted = (output * 255 / np.max(output)).astype("uint8") 518 | image = 
Image.fromarray(formatted) 519 | name = str(uuid.uuid4())[:4] 520 | image.save(f"public/images/{name}.jpg") 521 | result = {"path": f"/images/{name}.jpg"} 522 | 523 | # TTS 524 | if model_id == "espnet/kan-bayashi_ljspeech_vits": 525 | text = request.get_json()["text"] 526 | wav = pipe(text)["wav"] 527 | name = str(uuid.uuid4())[:4] 528 | sf.write(f"public/audios/{name}.wav", wav.cpu().numpy(), pipe.fs, "PCM_16") 529 | result = {"path": f"/audios/{name}.wav"} 530 | 531 | if model_id == "microsoft/speecht5_tts": 532 | text = request.get_json()["text"] 533 | inputs = pipes[model_id]["processor"](text=text, return_tensors="pt") 534 | embeddings_dataset = pipes[model_id]["embeddings_dataset"] 535 | speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"]) 536 | pipes[model_id]["vocoder"].to(pipes[model_id]["device"]) 537 | speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"]) 538 | name = str(uuid.uuid4())[:4] 539 | sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000) 540 | result = {"path": f"/audios/{name}.wav"} 541 | 542 | # ASR 543 | if model_id == "openai/whisper-base" or model_id == "microsoft/speecht5_asr": 544 | audio_url = request.get_json()["audio_url"] 545 | result = { "text": pipe(audio_url)["text"]} 546 | 547 | # audio to audio 548 | if model_id == "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": 549 | audio_url = request.get_json()["audio_url"] 550 | wav, sr = torchaudio.load(audio_url) 551 | with torch.no_grad(): 552 | result_wav = pipe(wav.to(pipes[model_id]["device"])) 553 | name = str(uuid.uuid4())[:4] 554 | sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr) 555 | result = {"path": f"/audios/{name}.wav"} 556 | 557 | if model_id == "microsoft/speecht5_vc": 558 | audio_url = request.get_json()["audio_url"] 559 | wav, sr = torchaudio.load(audio_url) 560 | inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt") 561 | embeddings_dataset = pipes[model_id]["embeddings_dataset"] 562 | speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) 563 | pipes[model_id]["vocoder"].to(pipes[model_id]["device"]) 564 | speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"]) 565 | name = str(uuid.uuid4())[:4] 566 | sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000) 567 | result = {"path": f"/audios/{name}.wav"} 568 | 569 | # segmentation 570 | if model_id == "facebook/detr-resnet-50-panoptic": 571 | result = [] 572 | segments = pipe(request.get_json()["img_url"]) 573 | image = load_image(request.get_json()["img_url"]) 574 | 575 | colors = [] 576 | for i in range(len(segments)): 577 | colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50)) 578 | 579 | for segment in segments: 580 | mask = segment["mask"] 581 | mask = mask.convert('L') 582 | layer = Image.new('RGBA', mask.size, colors[i]) 583 | image.paste(layer, (0, 0), mask) 584 | name = str(uuid.uuid4())[:4] 585 | image.save(f"public/images/{name}.jpg") 586 | result = {"path": f"/images/{name}.jpg"} 587 | 588 | if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade": 589 | image = load_image(request.get_json()["img_url"]) 590 | inputs = pipes[model_id]["feature_extractor"](images=image, 
return_tensors="pt").to(pipes[model_id]["device"]) 591 | outputs = pipe(**inputs) 592 | result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0] 593 | predicted_panoptic_map = result["segmentation"].cpu().numpy() 594 | predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8)) 595 | name = str(uuid.uuid4())[:4] 596 | predicted_panoptic_map.save(f"public/images/{name}.jpg") 597 | result = {"path": f"/images/{name}.jpg"} 598 | 599 | except Exception as e: 600 | print(e) 601 | traceback.print_exc() 602 | result = {"error": {"message": "Error when running the model inference."}} 603 | 604 | if "device" in pipes[model_id]: 605 | try: 606 | pipe.to("cpu") 607 | torch.cuda.empty_cache() 608 | except: 609 | pipe.device = torch.device("cpu") 610 | pipe.model.to("cpu") 611 | torch.cuda.empty_cache() 612 | 613 | pipes[model_id]["using"] = False 614 | 615 | if result is None: 616 | result = {"error": {"message": "model not found"}} 617 | 618 | end = time.time() 619 | during = end - start 620 | print(f"[ complete {model_id} ] {during}s") 621 | print(f"[ result {model_id} ] {result}") 622 | 623 | return jsonify(result) 624 | 625 | 626 | if __name__ == '__main__': 627 | # temp folders 628 | if not os.path.exists("public/audios"): 629 | os.makedirs("public/audios") 630 | if not os.path.exists("public/images"): 631 | os.makedirs("public/images") 632 | if not os.path.exists("public/videos"): 633 | os.makedirs("public/videos") 634 | 635 | waitress.serve(app, host="0.0.0.0", port=port) -------------------------------------------------------------------------------- /server/awesome_chat.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import copy 3 | from io import BytesIO 4 | import io 5 | import os 6 | import random 7 | import time 8 | import traceback 9 | import uuid 10 | import requests 11 | import re 12 | import json 13 | import logging 14 | import argparse 15 | import yaml 16 | from PIL import Image, ImageDraw 17 | from diffusers.utils import load_image 18 | from pydub import AudioSegment 19 | import threading 20 | from queue import Queue 21 | import flask 22 | from flask import request, jsonify 23 | import waitress 24 | from flask_cors import CORS, cross_origin 25 | from get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length 26 | from huggingface_hub.inference_api import InferenceApi 27 | from huggingface_hub.inference_api import ALL_TASKS 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--config", type=str, default="configs/config.default.yaml") 31 | parser.add_argument("--mode", type=str, default="cli") 32 | args = parser.parse_args() 33 | 34 | if __name__ != "__main__": 35 | args.config = "configs/config.gradio.yaml" 36 | args.mode = "gradio" 37 | 38 | config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader) 39 | 40 | os.makedirs("logs", exist_ok=True) 41 | os.makedirs("public/images", exist_ok=True) 42 | os.makedirs("public/audios", exist_ok=True) 43 | os.makedirs("public/videos", exist_ok=True) 44 | 45 | 46 | logger = logging.getLogger(__name__) 47 | logger.setLevel(logging.DEBUG) 48 | 49 | handler = logging.StreamHandler() 50 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 51 | handler.setFormatter(formatter) 52 | if not config["debug"]: 53 | handler.setLevel(logging.CRITICAL) 54 | logger.addHandler(handler) 55 | 
56 | log_file = config["log_file"] 57 | if log_file: 58 | filehandler = logging.FileHandler(log_file) 59 | filehandler.setLevel(logging.DEBUG) 60 | filehandler.setFormatter(formatter) 61 | logger.addHandler(filehandler) 62 | 63 | LLM = config["model"] 64 | use_completion = config["use_completion"] 65 | 66 | # consistent: wrong msra model name 67 | LLM_encoding = LLM 68 | if config["dev"] and LLM == "gpt-3.5-turbo": 69 | LLM_encoding = "text-davinci-003" 70 | task_parsing_highlight_ids = get_token_ids_for_task_parsing(LLM_encoding) 71 | choose_model_highlight_ids = get_token_ids_for_choose_model(LLM_encoding) 72 | 73 | # ENDPOINT MODEL NAME 74 | # /v1/chat/completions gpt-4, gpt-4-0314, gpt-4-32k, gpt-4-32k-0314, gpt-3.5-turbo, gpt-3.5-turbo-0301 75 | # /v1/completions text-davinci-003, text-davinci-002, text-curie-001, text-babbage-001, text-ada-001, davinci, curie, babbage, ada 76 | 77 | if use_completion: 78 | api_name = "completions" 79 | else: 80 | api_name = "chat/completions" 81 | 82 | API_TYPE = None 83 | # priority: local > azure > openai 84 | if "dev" in config and config["dev"]: 85 | API_TYPE = "local" 86 | elif "azure" in config: 87 | API_TYPE = "azure" 88 | elif "openai" in config: 89 | API_TYPE = "openai" 90 | else: 91 | logger.warning(f"No endpoint specified in {args.config}. The endpoint will be set dynamically according to the client.") 92 | 93 | if args.mode in ["test", "cli"]: 94 | assert API_TYPE, "Only server mode supports dynamic endpoint." 95 | 96 | API_KEY = None 97 | API_ENDPOINT = None 98 | if API_TYPE == "local": 99 | API_ENDPOINT = f"{config['local']['endpoint']}/v1/{api_name}" 100 | elif API_TYPE == "azure": 101 | API_ENDPOINT = f"{config['azure']['base_url']}/openai/deployments/{config['azure']['deployment_name']}/{api_name}?api-version={config['azure']['api_version']}" 102 | API_KEY = config["azure"]["api_key"] 103 | elif API_TYPE == "openai": 104 | API_ENDPOINT = f"https://api.openai.com/v1/{api_name}" 105 | if config["openai"]["api_key"].startswith("sk-"): # Check for valid OpenAI key in config file 106 | API_KEY = config["openai"]["api_key"] 107 | elif "OPENAI_API_KEY" in os.environ and os.getenv("OPENAI_API_KEY").startswith("sk-"): # Check for environment variable OPENAI_API_KEY 108 | API_KEY = os.getenv("OPENAI_API_KEY") 109 | else: 110 | raise ValueError(f"Incorrect OpenAI key. Please check your {args.config} file.") 111 | 112 | PROXY = None 113 | if config["proxy"]: 114 | PROXY = { 115 | "https": config["proxy"], 116 | } 117 | 118 | inference_mode = config["inference_mode"] 119 | 120 | # check the local_inference_endpoint 121 | Model_Server = None 122 | if inference_mode!="huggingface": 123 | Model_Server = "http://" + config["local_inference_endpoint"]["host"] + ":" + str(config["local_inference_endpoint"]["port"]) 124 | message = f"The server of local inference endpoints is not running, please start it first. 
(or using `inference_mode: huggingface` in {args.config} for a feature-limited experience)" 125 | try: 126 | r = requests.get(Model_Server + "/running") 127 | if r.status_code != 200: 128 | raise ValueError(message) 129 | except: 130 | raise ValueError(message) 131 | 132 | 133 | parse_task_demos_or_presteps = open(config["demos_or_presteps"]["parse_task"], "r").read() 134 | choose_model_demos_or_presteps = open(config["demos_or_presteps"]["choose_model"], "r").read() 135 | response_results_demos_or_presteps = open(config["demos_or_presteps"]["response_results"], "r").read() 136 | 137 | parse_task_prompt = config["prompt"]["parse_task"] 138 | choose_model_prompt = config["prompt"]["choose_model"] 139 | response_results_prompt = config["prompt"]["response_results"] 140 | 141 | parse_task_tprompt = config["tprompt"]["parse_task"] 142 | choose_model_tprompt = config["tprompt"]["choose_model"] 143 | response_results_tprompt = config["tprompt"]["response_results"] 144 | 145 | MODELS = [json.loads(line) for line in open("data/p0_models.jsonl", "r").readlines()] 146 | MODELS_MAP = {} 147 | for model in MODELS: 148 | tag = model["task"] 149 | if tag not in MODELS_MAP: 150 | MODELS_MAP[tag] = [] 151 | MODELS_MAP[tag].append(model) 152 | METADATAS = {} 153 | for model in MODELS: 154 | METADATAS[model["id"]] = model 155 | 156 | HUGGINGFACE_HEADERS = {} 157 | if config["huggingface"]["token"] and config["huggingface"]["token"].startswith("hf_"): # Check for valid huggingface token in config file 158 | HUGGINGFACE_HEADERS = { 159 | "Authorization": f"Bearer {config['huggingface']['token']}", 160 | } 161 | elif "HUGGINGFACE_ACCESS_TOKEN" in os.environ and os.getenv("HUGGINGFACE_ACCESS_TOKEN").startswith("hf_"): # Check for environment variable HUGGINGFACE_ACCESS_TOKEN 162 | HUGGINGFACE_HEADERS = { 163 | "Authorization": f"Bearer {os.getenv('HUGGINGFACE_ACCESS_TOKEN')}", 164 | } 165 | else: 166 | raise ValueError(f"Incorrect HuggingFace token. 
Please check your {args.config} file.") 167 | 168 | def convert_chat_to_completion(data): 169 |     messages = data.pop('messages', []) 170 |     tprompt = "" 171 |     if messages[0]['role'] == "system": 172 |         tprompt = messages[0]['content'] 173 |         messages = messages[1:] 174 |     final_prompt = "" 175 |     for message in messages: 176 |         if message['role'] == "user": 177 |             final_prompt += ("<im_start>"+ "user" + "\n" + message['content'] + "<im_end>\n") 178 |         elif message['role'] == "assistant": 179 |             final_prompt += ("<im_start>"+ "assistant" + "\n" + message['content'] + "<im_end>\n") 180 |         else: 181 |             final_prompt += ("<im_start>"+ "system" + "\n" + message['content'] + "<im_end>\n") 182 |     final_prompt = tprompt + final_prompt 183 |     final_prompt = final_prompt + "<im_start>assistant" 184 |     data["prompt"] = final_prompt 185 |     data['stop'] = data.get('stop', ["<im_end>"]) 186 |     data['max_tokens'] = data.get('max_tokens', max(get_max_context_length(LLM) - count_tokens(LLM_encoding, final_prompt), 1)) 187 |     return data 188 | 189 | def send_request(data): 190 |     api_key = data.pop("api_key") 191 |     api_type = data.pop("api_type") 192 |     api_endpoint = data.pop("api_endpoint") 193 |     if use_completion: 194 |         data = convert_chat_to_completion(data) 195 |     if api_type == "openai": 196 |         HEADER = { 197 |             "Authorization": f"Bearer {api_key}" 198 |         } 199 |     elif api_type == "azure": 200 |         HEADER = { 201 |             "api-key": api_key, 202 |             "Content-Type": "application/json" 203 |         } 204 |     else: 205 |         HEADER = None 206 |     response = requests.post(api_endpoint, json=data, headers=HEADER, proxies=PROXY) 207 |     if "error" in response.json(): 208 |         return response.json() 209 |     logger.debug(response.text.strip()) 210 |     if use_completion: 211 |         return response.json()["choices"][0]["text"].strip() 212 |     else: 213 |         return response.json()["choices"][0]["message"]["content"].strip() 214 | 215 | def replace_slot(text, entries): 216 |     for key, value in entries.items(): 217 |         if not isinstance(value, str): 218 |             value = str(value) 219 |         text = text.replace("{{" + key +"}}", value.replace('"', "'").replace('\n', "")) 220 |     return text 221 | 222 | def find_json(s): 223 |     s = s.replace("\'", "\"") 224 |     start = s.find("{") 225 |     end = s.rfind("}") 226 |     res = s[start:end+1] 227 |     res = res.replace("\n", "") 228 |     return res 229 | 230 | def field_extract(s, field): 231 |     try: 232 |         field_rep = re.compile(f'{field}.*?:.*?"(.*?)"', re.IGNORECASE) 233 |         extracted = field_rep.search(s).group(1).replace("\"", "\'") 234 |     except: 235 |         field_rep = re.compile(f'{field}:\ *"(.*?)"', re.IGNORECASE) 236 |         extracted = field_rep.search(s).group(1).replace("\"", "\'") 237 |     return extracted 238 | 239 | def get_id_reason(choose_str): 240 |     reason = field_extract(choose_str, "reason") 241 |     id = field_extract(choose_str, "id") 242 |     choose = {"id": id, "reason": reason} 243 |     return id.strip(), reason.strip(), choose 244 | 245 | def record_case(success, **args): 246 |     if success: 247 |         f = open("logs/log_success.jsonl", "a") 248 |     else: 249 |         f = open("logs/log_fail.jsonl", "a") 250 |     log = args 251 |     f.write(json.dumps(log) + "\n") 252 |     f.close() 253 | 254 | def image_to_bytes(img_url): 255 |     img_byte = io.BytesIO() 256 |     type = img_url.split(".")[-1] 257 |     load_image(img_url).save(img_byte, format="png") 258 |     img_data = img_byte.getvalue() 259 |     return img_data 260 | 261 | def resource_has_dep(command): 262 |     args = command["args"] 263 |     for _, v in args.items(): 264 |         if "<GENERATED>" in v: 265 |             return True 266 |     return False 267 | 268 | def fix_dep(tasks): 269 |     for task in tasks: 270 |         args = task["args"] 271 |         task["dep"] = [] 272 | 
for k, v in args.items(): 273 |             if "<GENERATED>" in v: 274 |                 dep_task_id = int(v.split("-")[1]) 275 |                 if dep_task_id not in task["dep"]: 276 |                     task["dep"].append(dep_task_id) 277 |         if len(task["dep"]) == 0: 278 |             task["dep"] = [-1] 279 |     return tasks 280 | 281 | def unfold(tasks): 282 |     flag_unfold_task = False 283 |     try: 284 |         for task in tasks: 285 |             for key, value in task["args"].items(): 286 |                 if "<GENERATED>" in value: 287 |                     generated_items = value.split(",") 288 |                     if len(generated_items) > 1: 289 |                         flag_unfold_task = True 290 |                         for item in generated_items: 291 |                             new_task = copy.deepcopy(task) 292 |                             dep_task_id = int(item.split("-")[1]) 293 |                             new_task["dep"] = [dep_task_id] 294 |                             new_task["args"][key] = item 295 |                             tasks.append(new_task) 296 |                         tasks.remove(task) 297 |     except Exception as e: 298 |         print(e) 299 |         traceback.print_exc() 300 |         logger.debug("unfold task failed.") 301 | 302 |     if flag_unfold_task: 303 |         logger.debug(f"unfold tasks: {tasks}") 304 | 305 |     return tasks 306 | 307 | def chitchat(messages, api_key, api_type, api_endpoint): 308 |     data = { 309 |         "model": LLM, 310 |         "messages": messages, 311 |         "api_key": api_key, 312 |         "api_type": api_type, 313 |         "api_endpoint": api_endpoint 314 |     } 315 |     return send_request(data) 316 | 317 | def parse_task(context, input, api_key, api_type, api_endpoint): 318 |     demos_or_presteps = parse_task_demos_or_presteps 319 |     messages = json.loads(demos_or_presteps) 320 |     messages.insert(0, {"role": "system", "content": parse_task_tprompt}) 321 | 322 |     # cut chat logs 323 |     start = 0 324 |     while start <= len(context): 325 |         history = context[start:] 326 |         prompt = replace_slot(parse_task_prompt, { 327 |             "input": input, 328 |             "context": history 329 |         }) 330 |         messages.append({"role": "user", "content": prompt}) 331 |         history_text = "\nuser".join([m["content"] for m in messages]) 332 |         num = count_tokens(LLM_encoding, history_text) 333 |         if get_max_context_length(LLM) - num > 800: 334 |             break 335 |         messages.pop() 336 |         start += 2 337 | 338 |     logger.debug(messages) 339 |     data = { 340 |         "model": LLM, 341 |         "messages": messages, 342 |         "temperature": 0, 343 |         "logit_bias": {item: config["logit_bias"]["parse_task"] for item in task_parsing_highlight_ids}, 344 |         "api_key": api_key, 345 |         "api_type": api_type, 346 |         "api_endpoint": api_endpoint 347 |     } 348 |     return send_request(data) 349 | 350 | def choose_model(input, task, metas, api_key, api_type, api_endpoint): 351 |     prompt = replace_slot(choose_model_prompt, { 352 |         "input": input, 353 |         "task": task, 354 |         "metas": metas, 355 |     }) 356 |     demos_or_presteps = replace_slot(choose_model_demos_or_presteps, { 357 |         "input": input, 358 |         "task": task, 359 |         "metas": metas 360 |     }) 361 |     messages = json.loads(demos_or_presteps) 362 |     messages.insert(0, {"role": "system", "content": choose_model_tprompt}) 363 |     messages.append({"role": "user", "content": prompt}) 364 |     logger.debug(messages) 365 |     data = { 366 |         "model": LLM, 367 |         "messages": messages, 368 |         "temperature": 0, 369 |         "logit_bias": {item: config["logit_bias"]["choose_model"] for item in choose_model_highlight_ids}, # 5 370 |         "api_key": api_key, 371 |         "api_type": api_type, 372 |         "api_endpoint": api_endpoint 373 |     } 374 |     return send_request(data) 375 | 376 | 377 | def response_results(input, results, api_key, api_type, api_endpoint): 378 |     results = [v for k, v in sorted(results.items(), key=lambda item: item[0])] 379 |     prompt = replace_slot(response_results_prompt, { 380 |         "input": input, 381 |     }) 382 |     demos_or_presteps = 
replace_slot(response_results_demos_or_presteps, { 383 | "input": input, 384 | "processes": results 385 | }) 386 | messages = json.loads(demos_or_presteps) 387 | messages.insert(0, {"role": "system", "content": response_results_tprompt}) 388 | messages.append({"role": "user", "content": prompt}) 389 | logger.debug(messages) 390 | data = { 391 | "model": LLM, 392 | "messages": messages, 393 | "temperature": 0, 394 | "api_key": api_key, 395 | "api_type": api_type, 396 | "api_endpoint": api_endpoint 397 | } 398 | return send_request(data) 399 | 400 | def huggingface_model_inference(model_id, data, task): 401 | task_url = f"https://api-inference.huggingface.co/models/{model_id}" # InferenceApi does not yet support some tasks 402 | inference = InferenceApi(repo_id=model_id, token=config["huggingface"]["token"]) 403 | 404 | # NLP tasks 405 | if task == "question-answering": 406 | inputs = {"question": data["text"], "context": (data["context"] if "context" in data else "" )} 407 | result = inference(inputs) 408 | if task == "sentence-similarity": 409 | inputs = {"source_sentence": data["text1"], "target_sentence": data["text2"]} 410 | result = inference(inputs) 411 | if task in ["text-classification", "token-classification", "text2text-generation", "summarization", "translation", "conversational", "text-generation"]: 412 | inputs = data["text"] 413 | result = inference(inputs) 414 | 415 | # CV tasks 416 | if task == "visual-question-answering" or task == "document-question-answering": 417 | img_url = data["image"] 418 | text = data["text"] 419 | img_data = image_to_bytes(img_url) 420 | img_base64 = base64.b64encode(img_data).decode("utf-8") 421 | json_data = {} 422 | json_data["inputs"] = {} 423 | json_data["inputs"]["question"] = text 424 | json_data["inputs"]["image"] = img_base64 425 | result = requests.post(task_url, headers=HUGGINGFACE_HEADERS, json=json_data).json() 426 | # result = inference(inputs) # not support 427 | 428 | if task == "image-to-image": 429 | img_url = data["image"] 430 | img_data = image_to_bytes(img_url) 431 | # result = inference(data=img_data) # not support 432 | HUGGINGFACE_HEADERS["Content-Length"] = str(len(img_data)) 433 | r = requests.post(task_url, headers=HUGGINGFACE_HEADERS, data=img_data) 434 | result = r.json() 435 | if "path" in result: 436 | result["generated image"] = result.pop("path") 437 | 438 | if task == "text-to-image": 439 | inputs = data["text"] 440 | img = inference(inputs) 441 | name = str(uuid.uuid4())[:4] 442 | img.save(f"public/images/{name}.png") 443 | result = {} 444 | result["generated image"] = f"/images/{name}.png" 445 | 446 | if task == "image-segmentation": 447 | img_url = data["image"] 448 | img_data = image_to_bytes(img_url) 449 | image = Image.open(BytesIO(img_data)) 450 | predicted = inference(data=img_data) 451 | colors = [] 452 | for i in range(len(predicted)): 453 | colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 155)) 454 | for i, pred in enumerate(predicted): 455 | label = pred["label"] 456 | mask = pred.pop("mask").encode("utf-8") 457 | mask = base64.b64decode(mask) 458 | mask = Image.open(BytesIO(mask), mode='r') 459 | mask = mask.convert('L') 460 | 461 | layer = Image.new('RGBA', mask.size, colors[i]) 462 | image.paste(layer, (0, 0), mask) 463 | name = str(uuid.uuid4())[:4] 464 | image.save(f"public/images/{name}.jpg") 465 | result = {} 466 | result["generated image"] = f"/images/{name}.jpg" 467 | result["predicted"] = predicted 468 | 469 | if task == "object-detection": 470 | 
img_url = data["image"] 471 | img_data = image_to_bytes(img_url) 472 | predicted = inference(data=img_data) 473 | image = Image.open(BytesIO(img_data)) 474 | draw = ImageDraw.Draw(image) 475 | labels = list(item['label'] for item in predicted) 476 | color_map = {} 477 | for label in labels: 478 | if label not in color_map: 479 | color_map[label] = (random.randint(0, 255), random.randint(0, 100), random.randint(0, 255)) 480 | for label in predicted: 481 | box = label["box"] 482 | draw.rectangle(((box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])), outline=color_map[label["label"]], width=2) 483 | draw.text((box["xmin"]+5, box["ymin"]-15), label["label"], fill=color_map[label["label"]]) 484 | name = str(uuid.uuid4())[:4] 485 | image.save(f"public/images/{name}.jpg") 486 | result = {} 487 | result["generated image"] = f"/images/{name}.jpg" 488 | result["predicted"] = predicted 489 | 490 | if task in ["image-classification"]: 491 | img_url = data["image"] 492 | img_data = image_to_bytes(img_url) 493 | result = inference(data=img_data) 494 | 495 | if task == "image-to-text": 496 | img_url = data["image"] 497 | img_data = image_to_bytes(img_url) 498 | HUGGINGFACE_HEADERS["Content-Length"] = str(len(img_data)) 499 | r = requests.post(task_url, headers=HUGGINGFACE_HEADERS, data=img_data, proxies=PROXY) 500 | result = {} 501 | if "generated_text" in r.json()[0]: 502 | result["generated text"] = r.json()[0].pop("generated_text") 503 | 504 | # AUDIO tasks 505 | if task == "text-to-speech": 506 | inputs = data["text"] 507 | response = inference(inputs, raw_response=True) 508 | # response = requests.post(task_url, headers=HUGGINGFACE_HEADERS, json={"inputs": text}) 509 | name = str(uuid.uuid4())[:4] 510 | with open(f"public/audios/{name}.flac", "wb") as f: 511 | f.write(response.content) 512 | result = {"generated audio": f"/audios/{name}.flac"} 513 | if task in ["automatic-speech-recognition", "audio-to-audio", "audio-classification"]: 514 | audio_url = data["audio"] 515 | audio_data = requests.get(audio_url, timeout=10).content 516 | response = inference(data=audio_data, raw_response=True) 517 | result = response.json() 518 | if task == "audio-to-audio": 519 | content = None 520 | type = None 521 | for k, v in result[0].items(): 522 | if k == "blob": 523 | content = base64.b64decode(v.encode("utf-8")) 524 | if k == "content-type": 525 | type = "audio/flac".split("/")[-1] 526 | audio = AudioSegment.from_file(BytesIO(content)) 527 | name = str(uuid.uuid4())[:4] 528 | audio.export(f"public/audios/{name}.{type}", format=type) 529 | result = {"generated audio": f"/audios/{name}.{type}"} 530 | return result 531 | 532 | def local_model_inference(model_id, data, task): 533 | task_url = f"{Model_Server}/models/{model_id}" 534 | 535 | # contronlet 536 | if model_id.startswith("lllyasviel/sd-controlnet-"): 537 | img_url = data["image"] 538 | text = data["text"] 539 | response = requests.post(task_url, json={"img_url": img_url, "text": text}) 540 | results = response.json() 541 | if "path" in results: 542 | results["generated image"] = results.pop("path") 543 | return results 544 | if model_id.endswith("-control"): 545 | img_url = data["image"] 546 | response = requests.post(task_url, json={"img_url": img_url}) 547 | results = response.json() 548 | if "path" in results: 549 | results["generated image"] = results.pop("path") 550 | return results 551 | 552 | if task == "text-to-video": 553 | response = requests.post(task_url, json=data) 554 | results = response.json() 555 | if "path" in results: 556 | 
results["generated video"] = results.pop("path") 557 | return results 558 | 559 | # NLP tasks 560 | if task == "question-answering" or task == "sentence-similarity": 561 | response = requests.post(task_url, json=data) 562 | return response.json() 563 | if task in ["text-classification", "token-classification", "text2text-generation", "summarization", "translation", "conversational", "text-generation"]: 564 | response = requests.post(task_url, json=data) 565 | return response.json() 566 | 567 | # CV tasks 568 | if task == "depth-estimation": 569 | img_url = data["image"] 570 | response = requests.post(task_url, json={"img_url": img_url}) 571 | results = response.json() 572 | if "path" in results: 573 | results["generated image"] = results.pop("path") 574 | return results 575 | if task == "image-segmentation": 576 | img_url = data["image"] 577 | response = requests.post(task_url, json={"img_url": img_url}) 578 | results = response.json() 579 | results["generated image"] = results.pop("path") 580 | return results 581 | if task == "image-to-image": 582 | img_url = data["image"] 583 | response = requests.post(task_url, json={"img_url": img_url}) 584 | results = response.json() 585 | if "path" in results: 586 | results["generated image"] = results.pop("path") 587 | return results 588 | if task == "text-to-image": 589 | response = requests.post(task_url, json=data) 590 | results = response.json() 591 | if "path" in results: 592 | results["generated image"] = results.pop("path") 593 | return results 594 | if task == "object-detection": 595 | img_url = data["image"] 596 | response = requests.post(task_url, json={"img_url": img_url}) 597 | predicted = response.json() 598 | if "error" in predicted: 599 | return predicted 600 | image = load_image(img_url) 601 | draw = ImageDraw.Draw(image) 602 | labels = list(item['label'] for item in predicted) 603 | color_map = {} 604 | for label in labels: 605 | if label not in color_map: 606 | color_map[label] = (random.randint(0, 255), random.randint(0, 100), random.randint(0, 255)) 607 | for label in predicted: 608 | box = label["box"] 609 | draw.rectangle(((box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])), outline=color_map[label["label"]], width=2) 610 | draw.text((box["xmin"]+5, box["ymin"]-15), label["label"], fill=color_map[label["label"]]) 611 | name = str(uuid.uuid4())[:4] 612 | image.save(f"public/images/{name}.jpg") 613 | results = {} 614 | results["generated image"] = f"/images/{name}.jpg" 615 | results["predicted"] = predicted 616 | return results 617 | if task in ["image-classification", "image-to-text", "document-question-answering", "visual-question-answering"]: 618 | img_url = data["image"] 619 | text = None 620 | if "text" in data: 621 | text = data["text"] 622 | response = requests.post(task_url, json={"img_url": img_url, "text": text}) 623 | results = response.json() 624 | return results 625 | # AUDIO tasks 626 | if task == "text-to-speech": 627 | response = requests.post(task_url, json=data) 628 | results = response.json() 629 | if "path" in results: 630 | results["generated audio"] = results.pop("path") 631 | return results 632 | if task in ["automatic-speech-recognition", "audio-to-audio", "audio-classification"]: 633 | audio_url = data["audio"] 634 | response = requests.post(task_url, json={"audio_url": audio_url}) 635 | return response.json() 636 | 637 | 638 | def model_inference(model_id, data, hosted_on, task): 639 | if hosted_on == "unknown": 640 | localStatusUrl = f"{Model_Server}/status/{model_id}" 641 | r = 
requests.get(localStatusUrl) 642 |         logger.debug("Local Server Status: " + str(r.json())) 643 |         if r.status_code == 200 and "loaded" in r.json() and r.json()["loaded"]: 644 |             hosted_on = "local" 645 |         else: 646 |             huggingfaceStatusUrl = f"https://api-inference.huggingface.co/status/{model_id}" 647 |             r = requests.get(huggingfaceStatusUrl, headers=HUGGINGFACE_HEADERS, proxies=PROXY) 648 |             logger.debug("Huggingface Status: " + str(r.json())) 649 |             if r.status_code == 200 and "loaded" in r.json() and r.json()["loaded"]: 650 |                 hosted_on = "huggingface" 651 |     try: 652 |         if hosted_on == "local": 653 |             inference_result = local_model_inference(model_id, data, task) 654 |         elif hosted_on == "huggingface": 655 |             inference_result = huggingface_model_inference(model_id, data, task) 656 |     except Exception as e: 657 |         print(e) 658 |         traceback.print_exc() 659 |         inference_result = {"error":{"message": str(e)}} 660 |     return inference_result 661 | 662 | 663 | def get_model_status(model_id, url, headers, queue = None): 664 |     endpoint_type = "huggingface" if "huggingface" in url else "local" 665 |     if "huggingface" in url: 666 |         r = requests.get(url, headers=headers, proxies=PROXY) 667 |     else: 668 |         r = requests.get(url) 669 |     if r.status_code == 200 and "loaded" in r.json() and r.json()["loaded"]: 670 |         if queue: 671 |             queue.put((model_id, True, endpoint_type)) 672 |         return True 673 |     else: 674 |         if queue: 675 |             queue.put((model_id, False, None)) 676 |         return False 677 | 678 | def get_avaliable_models(candidates, topk=5): 679 |     all_available_models = {"local": [], "huggingface": []} 680 |     threads = [] 681 |     result_queue = Queue() 682 | 683 |     for candidate in candidates: 684 |         model_id = candidate["id"] 685 | 686 |         if inference_mode != "local": 687 |             huggingfaceStatusUrl = f"https://api-inference.huggingface.co/status/{model_id}" 688 |             thread = threading.Thread(target=get_model_status, args=(model_id, huggingfaceStatusUrl, HUGGINGFACE_HEADERS, result_queue)) 689 |             threads.append(thread) 690 |             thread.start() 691 | 692 |         if inference_mode != "huggingface" and config["local_deployment"] != "minimal": 693 |             localStatusUrl = f"{Model_Server}/status/{model_id}" 694 |             thread = threading.Thread(target=get_model_status, args=(model_id, localStatusUrl, {}, result_queue)) 695 |             threads.append(thread) 696 |             thread.start() 697 | 698 |     result_count = len(threads) 699 |     while result_count: 700 |         model_id, status, endpoint_type = result_queue.get() 701 |         if status and model_id not in all_available_models: 702 |             all_available_models[endpoint_type].append(model_id) 703 |         if len(all_available_models["local"] + all_available_models["huggingface"]) >= topk: 704 |             break 705 |         result_count -= 1 706 | 707 |     for thread in threads: 708 |         thread.join() 709 | 710 |     return all_available_models 711 | 712 | def collect_result(command, choose, inference_result): 713 |     result = {"task": command} 714 |     result["inference result"] = inference_result 715 |     result["choose model result"] = choose 716 |     logger.debug(f"inference result: {inference_result}") 717 |     return result 718 | 719 | 720 | def run_task(input, command, results, api_key, api_type, api_endpoint): 721 |     id = command["id"] 722 |     args = command["args"] 723 |     task = command["task"] 724 |     deps = command["dep"] 725 |     if deps[0] != -1: 726 |         dep_tasks = [results[dep] for dep in deps] 727 |     else: 728 |         dep_tasks = [] 729 | 730 |     logger.debug(f"Run task: {id} - {task}") 731 |     logger.debug("Deps: " + json.dumps(dep_tasks)) 732 | 733 |     if deps[0] != -1: 734 |         if "image" in args and "<GENERATED>-" in 
args["image"]: 735 |             resource_id = int(args["image"].split("-")[1]) 736 |             if "generated image" in results[resource_id]["inference result"]: 737 |                 args["image"] = results[resource_id]["inference result"]["generated image"] 738 |         if "audio" in args and "<GENERATED>-" in args["audio"]: 739 |             resource_id = int(args["audio"].split("-")[1]) 740 |             if "generated audio" in results[resource_id]["inference result"]: 741 |                 args["audio"] = results[resource_id]["inference result"]["generated audio"] 742 |         if "text" in args and "<GENERATED>-" in args["text"]: 743 |             resource_id = int(args["text"].split("-")[1]) 744 |             if "generated text" in results[resource_id]["inference result"]: 745 |                 args["text"] = results[resource_id]["inference result"]["generated text"] 746 | 747 |     text = image = audio = None 748 |     for dep_task in dep_tasks: 749 |         if "generated text" in dep_task["inference result"]: 750 |             text = dep_task["inference result"]["generated text"] 751 |             logger.debug("Detect the generated text of dependency task (from results): " + text) 752 |         elif "text" in dep_task["task"]["args"]: 753 |             text = dep_task["task"]["args"]["text"] 754 |             logger.debug("Detect the text of dependency task (from args): " + text) 755 |         if "generated image" in dep_task["inference result"]: 756 |             image = dep_task["inference result"]["generated image"] 757 |             logger.debug("Detect the generated image of dependency task (from results): " + image) 758 |         elif "image" in dep_task["task"]["args"]: 759 |             image = dep_task["task"]["args"]["image"] 760 |             logger.debug("Detect the image of dependency task (from args): " + image) 761 |         if "generated audio" in dep_task["inference result"]: 762 |             audio = dep_task["inference result"]["generated audio"] 763 |             logger.debug("Detect the generated audio of dependency task (from results): " + audio) 764 |         elif "audio" in dep_task["task"]["args"]: 765 |             audio = dep_task["task"]["args"]["audio"] 766 |             logger.debug("Detect the audio of dependency task (from args): " + audio) 767 | 768 |     if "image" in args and "<GENERATED>" in args["image"]: 769 |         if image: 770 |             args["image"] = image 771 |     if "audio" in args and "<GENERATED>" in args["audio"]: 772 |         if audio: 773 |             args["audio"] = audio 774 |     if "text" in args and "<GENERATED>" in args["text"]: 775 |         if text: 776 |             args["text"] = text 777 | 778 |     for resource in ["image", "audio"]: 779 |         if resource in args and not args[resource].startswith("public/") and len(args[resource]) > 0 and not args[resource].startswith("http"): 780 |             args[resource] = f"public/{args[resource]}" 781 | 782 |     if "-text-to-image" in command['task'] and "text" not in args: 783 |         logger.debug("control-text-to-image task, but text is empty, so we use control-generation instead.") 784 |         control = task.split("-")[0] 785 | 786 |         if control == "seg": 787 |             task = "image-segmentation" 788 |             command['task'] = task 789 |         elif control == "depth": 790 |             task = "depth-estimation" 791 |             command['task'] = task 792 |         else: 793 |             task = f"{control}-control" 794 | 795 |     command["args"] = args 796 |     logger.debug(f"parsed task: {command}") 797 | 798 |     if task.endswith("-text-to-image") or task.endswith("-control"): 799 |         if inference_mode != "huggingface": 800 |             if task.endswith("-text-to-image"): 801 |                 control = task.split("-")[0] 802 |                 best_model_id = f"lllyasviel/sd-controlnet-{control}" 803 |             else: 804 |                 best_model_id = task 805 |             hosted_on = "local" 806 |             reason = "ControlNet is the best model for this task."
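# Record the deterministic ControlNet choice in the same {"id", "reason"} shape that choose_model()'s JSON output is parsed into, so collect_result() and the logs stay uniform.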
807 | choose = {"id": best_model_id, "reason": reason} 808 | logger.debug(f"chosen model: {choose}") 809 | else: 810 | logger.warning(f"Task {command['task']} is not available. ControlNet need to be deployed locally.") 811 | record_case(success=False, **{"input": input, "task": command, "reason": f"Task {command['task']} is not available. ControlNet need to be deployed locally.", "op":"message"}) 812 | inference_result = {"error": f"service related to ControlNet is not available."} 813 | results[id] = collect_result(command, "", inference_result) 814 | return False 815 | elif task in ["summarization", "translation", "conversational", "text-generation", "text2text-generation"]: # ChatGPT Can do 816 | best_model_id = "ChatGPT" 817 | reason = "ChatGPT performs well on some NLP tasks as well." 818 | choose = {"id": best_model_id, "reason": reason} 819 | messages = [{ 820 | "role": "user", 821 | "content": f"[ {input} ] contains a task in JSON format {command}. Now you are a {command['task']} system, the arguments are {command['args']}. Just help me do {command['task']} and give me the result. The result must be in text form without any urls." 822 | }] 823 | response = chitchat(messages, api_key, api_type, api_endpoint) 824 | results[id] = collect_result(command, choose, {"response": response}) 825 | return True 826 | else: 827 | if task not in MODELS_MAP: 828 | logger.warning(f"no available models on {task} task.") 829 | record_case(success=False, **{"input": input, "task": command, "reason": f"task not support: {command['task']}", "op":"message"}) 830 | inference_result = {"error": f"{command['task']} not found in available tasks."} 831 | results[id] = collect_result(command, "", inference_result) 832 | return False 833 | 834 | candidates = MODELS_MAP[task][:10] 835 | all_avaliable_models = get_avaliable_models(candidates, config["num_candidate_models"]) 836 | all_avaliable_model_ids = all_avaliable_models["local"] + all_avaliable_models["huggingface"] 837 | logger.debug(f"avaliable models on {command['task']}: {all_avaliable_models}") 838 | 839 | if len(all_avaliable_model_ids) == 0: 840 | logger.warning(f"no available models on {command['task']}") 841 | record_case(success=False, **{"input": input, "task": command, "reason": f"no available models: {command['task']}", "op":"message"}) 842 | inference_result = {"error": f"no available models on {command['task']} task."} 843 | results[id] = collect_result(command, "", inference_result) 844 | return False 845 | 846 | if len(all_avaliable_model_ids) == 1: 847 | best_model_id = all_avaliable_model_ids[0] 848 | hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface" 849 | reason = "Only one model available." 
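# Exactly one candidate model is available, so there is nothing for the LLM to rank; record the trivial choice directly.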
850 | choose = {"id": best_model_id, "reason": reason} 851 | logger.debug(f"chosen model: {choose}") 852 | else: 853 | cand_models_info = [ 854 | { 855 | "id": model["id"], 856 | "inference endpoint": all_avaliable_models.get( 857 | "local" if model["id"] in all_avaliable_models["local"] else "huggingface" 858 | ), 859 | "likes": model.get("likes"), 860 | "description": model.get("description", "")[:config["max_description_length"]], 861 | # "language": model.get("meta").get("language") if model.get("meta") else None, 862 | "tags": model.get("meta").get("tags") if model.get("meta") else None, 863 | } 864 | for model in candidates 865 | if model["id"] in all_avaliable_model_ids 866 | ] 867 | 868 | choose_str = choose_model(input, command, cand_models_info, api_key, api_type, api_endpoint) 869 | logger.debug(f"chosen model: {choose_str}") 870 | try: 871 | choose = json.loads(choose_str) 872 | reason = choose["reason"] 873 | best_model_id = choose["id"] 874 | hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface" 875 | except Exception as e: 876 | logger.warning(f"the response [ {choose_str} ] is not a valid JSON, try to find the model id and reason in the response.") 877 | choose_str = find_json(choose_str) 878 | best_model_id, reason, choose = get_id_reason(choose_str) 879 | hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface" 880 | inference_result = model_inference(best_model_id, args, hosted_on, command['task']) 881 | 882 | if "error" in inference_result: 883 | logger.warning(f"Inference error: {inference_result['error']}") 884 | record_case(success=False, **{"input": input, "task": command, "reason": f"inference error: {inference_result['error']}", "op":"message"}) 885 | results[id] = collect_result(command, choose, inference_result) 886 | return False 887 | 888 | results[id] = collect_result(command, choose, inference_result) 889 | return True 890 | 891 | def chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning = False, return_results = False): 892 | start = time.time() 893 | context = messages[:-1] 894 | input = messages[-1]["content"] 895 | logger.info("*"*80) 896 | logger.info(f"input: {input}") 897 | 898 | task_str = parse_task(context, input, api_key, api_type, api_endpoint) 899 | 900 | if "error" in task_str: 901 | record_case(success=False, **{"input": input, "task": task_str, "reason": f"task parsing error: {task_str['error']['message']}", "op":"report message"}) 902 | return {"message": task_str["error"]["message"]} 903 | 904 | task_str = task_str.strip() 905 | logger.info(task_str) 906 | 907 | try: 908 | tasks = json.loads(task_str) 909 | except Exception as e: 910 | logger.debug(e) 911 | response = chitchat(messages, api_key, api_type, api_endpoint) 912 | record_case(success=False, **{"input": input, "task": task_str, "reason": "task parsing fail", "op":"chitchat"}) 913 | return {"message": response} 914 | 915 | if task_str == "[]": # using LLM response for empty task 916 | record_case(success=False, **{"input": input, "task": [], "reason": "task parsing fail: empty", "op": "chitchat"}) 917 | response = chitchat(messages, api_key, api_type, api_endpoint) 918 | return {"message": response} 919 | 920 | if len(tasks) == 1 and tasks[0]["task"] in ["summarization", "translation", "conversational", "text-generation", "text2text-generation"]: 921 | record_case(success=True, **{"input": input, "task": tasks, "reason": "chitchat tasks", "op": "chitchat"}) 922 | response = chitchat(messages, 
api_key, api_type, api_endpoint) 923 | return {"message": response} 924 | 925 | tasks = unfold(tasks) 926 | tasks = fix_dep(tasks) 927 | logger.debug(tasks) 928 | 929 | if return_planning: 930 | return tasks 931 | 932 | results = {} 933 | threads = [] 934 | tasks = tasks[:] 935 | d = dict() 936 | retry = 0 937 | while True: 938 | num_thread = len(threads) 939 | for task in tasks: 940 | # logger.debug(f"d.keys(): {d.keys()}, dep: {dep}") 941 | for dep_id in task["dep"]: 942 | if dep_id >= task["id"]: 943 | task["dep"] = [-1] 944 | break 945 | dep = task["dep"] 946 | if dep[0] == -1 or len(list(set(dep).intersection(d.keys()))) == len(dep): 947 | tasks.remove(task) 948 | thread = threading.Thread(target=run_task, args=(input, task, d, api_key, api_type, api_endpoint)) 949 | thread.start() 950 | threads.append(thread) 951 | if num_thread == len(threads): 952 | time.sleep(0.5) 953 | retry += 1 954 | if retry > 160: 955 | logger.debug("User has waited too long, Loop break.") 956 | break 957 | if len(tasks) == 0: 958 | break 959 | for thread in threads: 960 | thread.join() 961 | 962 | results = d.copy() 963 | 964 | logger.debug(results) 965 | if return_results: 966 | return results 967 | 968 | response = response_results(input, results, api_key, api_type, api_endpoint).strip() 969 | 970 | end = time.time() 971 | during = end - start 972 | 973 | answer = {"message": response} 974 | record_case(success=True, **{"input": input, "task": task_str, "results": results, "response": response, "during": during, "op":"response"}) 975 | logger.info(f"response: {response}") 976 | return answer 977 | 978 | def test(): 979 | # single round examples 980 | inputs = [ 981 | "Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras in these picture?" 982 | "Can you give me a picture of a small bird flying in the sky with trees and clouds. Generate a high definition image if possible.", 983 | "Please answer all the named entities in the sentence: Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby.", 984 | "please dub for me: 'Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby.'" 985 | "Given an image: https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg, please answer the question: What is on top of the building?", 986 | "Please generate a canny image based on /examples/f.jpg" 987 | ] 988 | 989 | for input in inputs: 990 | messages = [{"role": "user", "content": input}] 991 | chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning = False, return_results = False) 992 | 993 | # multi rounds example 994 | messages = [ 995 | {"role": "user", "content": "Please generate a canny image based on /examples/f.jpg"}, 996 | {"role": "assistant", "content": """Sure. I understand your request. Based on the inference results of the models, I have generated a canny image for you. The workflow I used is as follows: First, I used the image-to-text model (nlpconnect/vit-gpt2-image-captioning) to convert the image /examples/f.jpg to text. The generated text is "a herd of giraffes and zebras grazing in a field". 
Second, I used the canny-control model (canny-control) to generate a canny image from the text. Unfortunately, the model failed to generate the canny image. Finally, I used the canny-text-to-image model (lllyasviel/sd-controlnet-canny) to generate a canny image from the text. The generated image is located at /images/f16d.png. I hope this answers your request. Is there anything else I can help you with?"""}, 997 | {"role": "user", "content": """then based on the above canny image and a prompt "a photo of a zoo", generate a new image."""}, 998 | ] 999 | chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning = False, return_results = False) 1000 | 1001 | def cli(): 1002 | messages = [] 1003 | print("Welcome to Jarvis! A collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors. Jarvis can plan tasks, schedule Hugging Face models, generate friendly responses based on your requests, and help you with many things. Please enter your request (`exit` to exit).") 1004 | while True: 1005 | message = input("[ User ]: ") 1006 | if message == "exit": 1007 | break 1008 | messages.append({"role": "user", "content": message}) 1009 | answer = chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning=False, return_results=False) 1010 | print("[ Jarvis ]: ", answer["message"]) 1011 | messages.append({"role": "assistant", "content": answer["message"]}) 1012 | 1013 | 1014 | def server(): 1015 | http_listen = config["http_listen"] 1016 | host = http_listen["host"] 1017 | port = http_listen["port"] 1018 | 1019 | app = flask.Flask(__name__, static_folder="public", static_url_path="/") 1020 | app.config['DEBUG'] = False 1021 | CORS(app) 1022 | 1023 | @cross_origin() 1024 | @app.route('/tasks', methods=['POST']) 1025 | def tasks(): 1026 | data = request.get_json() 1027 | messages = data["messages"] 1028 | api_key = data.get("api_key", API_KEY) 1029 | api_endpoint = data.get("api_endpoint", API_ENDPOINT) 1030 | api_type = data.get("api_type", API_TYPE) 1031 | if api_key is None or api_type is None or api_endpoint is None: 1032 | return jsonify({"error": "Please provide api_key, api_type and api_endpoint"}) 1033 | response = chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning=True) 1034 | return jsonify(response) 1035 | 1036 | @cross_origin() 1037 | @app.route('/results', methods=['POST']) 1038 | def results(): 1039 | data = request.get_json() 1040 | messages = data["messages"] 1041 | api_key = data.get("api_key", API_KEY) 1042 | api_endpoint = data.get("api_endpoint", API_ENDPOINT) 1043 | api_type = data.get("api_type", API_TYPE) 1044 | if api_key is None or api_type is None or api_endpoint is None: 1045 | return jsonify({"error": "Please provide api_key, api_type and api_endpoint"}) 1046 | response = chat_huggingface(messages, api_key, api_type, api_endpoint, return_results=True) 1047 | return jsonify(response) 1048 | 1049 | @cross_origin() 1050 | @app.route('/hugginggpt', methods=['POST']) 1051 | def chat(): 1052 | data = request.get_json() 1053 | messages = data["messages"] 1054 | api_key = data.get("api_key", API_KEY) 1055 | api_endpoint = data.get("api_endpoint", API_ENDPOINT) 1056 | api_type = data.get("api_type", API_TYPE) 1057 | if api_key is None or api_type is None or api_endpoint is None: 1058 | return jsonify({"error": "Please provide api_key, api_type and api_endpoint"}) 1059 | response = chat_huggingface(messages, api_key, api_type, api_endpoint) 1060 | return jsonify(response) 1061 
| print("server running...") 1062 | waitress.serve(app, host=host, port=port) 1063 | 1064 | if __name__ == "__main__": 1065 | if args.mode == "test": 1066 | test() 1067 | elif args.mode == "server": 1068 | server() 1069 | elif args.mode == "cli": 1070 | cli() 1071 | --------------------------------------------------------------------------------
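Usage sketch (illustrative, not part of the repository): one way a client could call the /hugginggpt endpoint defined in server() above, assuming the server was started with --mode server, the config sets use_completion to false, and http_listen points at localhost:8004; the API key below is a placeholder supplied by the caller.

import requests

# Placeholder values: the host/port should match config["http_listen"], and the key is the caller's own.
payload = {
    "messages": [{"role": "user", "content": "Please generate a canny image based on /examples/f.jpg"}],
    "api_key": "sk-...",
    "api_type": "openai",
    "api_endpoint": "https://api.openai.com/v1/chat/completions",
}
r = requests.post("http://localhost:8004/hugginggpt", json=payload, timeout=600)
print(r.json()["message"])  # chat_huggingface() wraps its reply as {"message": ...}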