├── .gitignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── easytool ├── README.md ├── assets │ ├── front.png │ └── logo.png ├── data_funcqa │ ├── funchub │ │ └── math.py │ └── tool_instruction │ │ ├── functions_data.json │ │ └── tool_dic.jsonl ├── data_process.py ├── data_restbench │ └── tool_instruction │ │ └── tmdb_tool.json ├── data_toolbench │ └── tool_instruction │ │ ├── API_description_embeddings.zip │ │ └── toolbench_tool_instruction.json ├── easytool │ ├── __init__.py │ ├── funcQA.py │ ├── restbench.py │ ├── toolbench.py │ ├── toolbench_retrieve.py │ └── util.py ├── main.py └── requirements.txt ├── hugginggpt ├── .dockerignore ├── Dockerfile.jetson ├── README.md ├── assets │ ├── intro.png │ ├── logo.png │ ├── overview.jpg │ ├── prompt_flow.jpg │ ├── screenshot_a.jpg │ └── screenshot_q.jpg ├── server │ ├── awesome_chat.py │ ├── configs │ │ ├── config.azure.yaml │ │ ├── config.default.yaml │ │ ├── config.gradio.yaml │ │ └── config.lite.yaml │ ├── data │ │ └── p0_models.jsonl │ ├── demos │ │ ├── demo_choose_model.json │ │ ├── demo_parse_task.json │ │ └── demo_response_results.json │ ├── get_token_ids.py │ ├── models │ │ ├── download.ps1 │ │ └── download.sh │ ├── models_server.py │ ├── public │ │ └── examples │ │ │ ├── a.jpg │ │ │ ├── b.jpg │ │ │ ├── c.jpg │ │ │ ├── d.jpg │ │ │ ├── e.jpg │ │ │ ├── f.jpg │ │ │ └── g.jpg │ ├── requirements.txt │ └── run_gradio_demo.py └── web │ ├── electron │ ├── .npmrc │ ├── electron-builder.yml │ ├── main.js │ ├── package.json │ └── preload.js │ ├── env.d.ts │ ├── index.html │ ├── package.json │ ├── postcss.config.js │ ├── public │ ├── audio.svg │ ├── favicon.ico │ └── video.svg │ ├── src │ ├── App.vue │ ├── api │ │ ├── chatgpt.ts │ │ └── hugginggpt.ts │ ├── assets │ │ ├── audio.svg │ │ ├── chatgpt.svg │ │ ├── clip.svg │ │ ├── favicon.ico │ │ ├── huggingface.svg │ │ ├── logo.svg │ │ ├── setting.svg │ │ ├── switch.svg │ │ └── tailwind.css │ ├── components │ │ └── Loading.vue │ ├── config │ │ └── index.ts │ ├── main.ts │ ├── prompt │ │ └── index.ts │ ├── router │ │ └── index.ts │ ├── types │ │ └── index.ts │ └── views │ │ └── home.vue │ ├── tailwind.config.js │ ├── tsconfig.config.json │ ├── tsconfig.json │ └── vite.config.ts └── taskbench ├── README.md ├── assets ├── backinstruct.png ├── eval.png ├── logo.png └── logo2.png ├── batch_evaluate.sh ├── data_dailylifeapis ├── alignment_ids.json ├── data.json ├── graph_desc.json ├── tool_desc.json └── user_requests.json ├── data_engine.py ├── data_huggingface ├── alignment_ids.json ├── data.json ├── graph_desc.json ├── tool_desc.json └── user_requests.json ├── data_multimedia ├── alignment_ids.json ├── data.json ├── graph_desc.json ├── tool_desc.json └── user_requests.json ├── evaluate.py ├── format_data.py ├── generate_graph.py ├── graph_sampler.py ├── inference.py ├── requirements.txt └── visualize_graph.py /.gitignore: -------------------------------------------------------------------------------- 1 | # ALL 2 | *.dev.yaml 3 | 4 | legacy/ 5 | 6 | # for server 7 | server/models/* 8 | !server/models/download.sh 9 | !server/models/download.ps1 10 | server/logs/ 11 | server/models_dev 12 | server/public/* 13 | !server/public/examples/ 14 | server/public/examples/* 15 | !server/public/examples/a.jpg 16 | !server/public/examples/b.jpg 17 | !server/public/examples/c.jpg 18 | !server/public/examples/d.jpg 19 | !server/public/examples/e.jpg 20 | !server/public/examples/f.jpg 21 | !server/public/examples/g.jpg 22 | 23 | # docker 24 | 
Dockerfile 25 | docker-compose.yml 26 | 27 | # for gradio 28 | # server/run_gradio.py 29 | 30 | # for web 31 | web/node_modules 32 | web/package-lock.json 33 | web/dist 34 | web/electron-dist 35 | web/yarn.lock 36 | 37 | # Byte-compiled / optimized / DLL files 38 | __pycache__/ 39 | *.py[cod] 40 | *$py.class 41 | 42 | # C extensions 43 | *.so 44 | 45 | # Distribution / packaging 46 | .Python 47 | build/ 48 | develop-eggs/ 49 | dist/ 50 | downloads/ 51 | eggs/ 52 | .eggs/ 53 | lib/ 54 | lib64/ 55 | parts/ 56 | sdist/ 57 | var/ 58 | wheels/ 59 | pip-wheel-metadata/ 60 | share/python-wheels/ 61 | *.egg-info/ 62 | .installed.cfg 63 | *.egg 64 | MANIFEST 65 | 66 | # PyInstaller 67 | # Usually these files are written by a python script from a template 68 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 69 | *.manifest 70 | *.spec 71 | 72 | # Installer logs 73 | pip-log.txt 74 | pip-delete-this-directory.txt 75 | 76 | # Unit test / coverage reports 77 | htmlcov/ 78 | .tox/ 79 | .nox/ 80 | .coverage 81 | .coverage.* 82 | .cache 83 | nosetests.xml 84 | coverage.xml 85 | *.cover 86 | *.py,cover 87 | .hypothesis/ 88 | .pytest_cache/ 89 | 90 | # Translations 91 | *.mo 92 | *.pot 93 | 94 | # Django stuff: 95 | *.log 96 | local_settings.py 97 | db.sqlite3 98 | db.sqlite3-journal 99 | 100 | # Flask stuff: 101 | instance/ 102 | .webassets-cache 103 | 104 | # Scrapy stuff: 105 | .scrapy 106 | 107 | # Sphinx documentation 108 | docs/_build/ 109 | 110 | # PyBuilder 111 | target/ 112 | 113 | # Jupyter Notebook 114 | .ipynb_checkpoints 115 | 116 | # IPython 117 | profile_default/ 118 | ipython_config.py 119 | 120 | # pyenv 121 | .python-version 122 | 123 | # pipenv 124 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 125 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 126 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 127 | # install all needed dependencies. 128 | #Pipfile.lock 129 | 130 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 131 | __pypackages__/ 132 | 133 | # Celery stuff 134 | celerybeat-schedule 135 | celerybeat.pid 136 | 137 | # SageMath parsed files 138 | *.sage.py 139 | 140 | # Environments 141 | .env 142 | .venv 143 | env/ 144 | venv/ 145 | ENV/ 146 | env.bak/ 147 | venv.bak/ 148 | 149 | # Spyder project settings 150 | .spyderproject 151 | .spyproject 152 | 153 | # Rope project settings 154 | .ropeproject 155 | 156 | # mkdocs documentation 157 | /site 158 | 159 | # mypy 160 | .mypy_cache/ 161 | .dmypy.json 162 | dmypy.json 163 | 164 | # Pyre type checker 165 | .pyre/ 166 | 167 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you find this work useful in your method, you can cite the paper as below." 
3 | authors: 4 | - family-names: Shen 5 | given-names: Yongliang 6 | - family-names: Song 7 | given-names: Kaitao 8 | - family-names: Tan 9 | given-names: Xu 10 | - family-names: Li 11 | given-names: Dongsheng 12 | - family-names: Lu 13 | given-names: Weiming 14 | - family-names: Zhuang 15 | given-names: Yueting 16 | title: "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace" 17 | version: 2.0.4 18 | license: MIT 19 | status: preprint 20 | date-released: 2023-03-30 21 | url: https://github.com/microsoft/JARVIS 22 | preferred-citation: 23 | type: article 24 | authors: 25 | - family-names: Shen 26 | given-names: Yongliang 27 | - family-names: Song 28 | given-names: Kaitao 29 | - family-names: Tan 30 | given-names: Xu 31 | - family-names: Li 32 | given-names: Dongsheng 33 | - family-names: Lu 34 | given-names: Weiming 35 | - family-names: Zhuang 36 | given-names: Yueting 37 | journal: "arXiv preprint arXiv:2303.17580" 38 | title: "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace" 39 | year: 2023 40 | url: https://arxiv.org/abs/2303.17580 -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | To contribute to this GitHub project, you can follow these steps: 3 | 4 | 1. Fork the repository you want to contribute to by clicking the "Fork" button on the project page. 5 | 6 | 2. Clone the repository to your local machine and enter the newly created repo using the following commands: 7 | 8 | ``` 9 | git clone https://github.com/YOUR-GITHUB-USERNAME/JARVIS 10 | cd JARVIS 11 | ``` 12 | 3. Create a new branch for your changes using the following command: 13 | 14 | ``` 15 | git checkout -b "branch-name" 16 | ``` 17 | 4. Make your changes to the code or documentation. 18 | 19 | 5. Add the changes to the staging area using the following command: 20 | ``` 21 | git add . 22 | ``` 23 | 24 | 6. Commit the changes with a meaningful commit message using the following command: 25 | ``` 26 | git commit -m "your commit message" 27 | ``` 28 | 7. Push the changes to your forked repository using the following command: 29 | ``` 30 | git push origin branch-name 31 | ``` 32 | 8. Go to the GitHub website and navigate to your forked repository. 33 | 34 | 9. Click the "New pull request" button. 35 | 36 | 10. Select the branch you just pushed to and the branch you want to merge into on the original repository. 37 | 38 | 11. Add a description of your changes and click the "Create pull request" button. 39 | 40 | 12. Wait for the project maintainer to review your changes and provide feedback. 41 | 42 | 13. Make any necessary changes based on feedback and repeat steps 5-12 until your changes are accepted and merged into the main project. 43 | 44 | 14. 
Once your changes are merged, you can update your forked repository and local copy of the repository with the following commands: 45 | 46 | ``` 47 | git fetch upstream 48 | git checkout main 49 | git merge upstream/main 50 | ``` 51 | Finally, delete the branch you created with the following command: 52 | ``` 53 | git branch -d branch-name 54 | ``` 55 | That's it you made it 🐣⭐⭐ 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JARVIS 2 | 3 | 4 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2303.17580) 5 | [![Open in Spaces](https://img.shields.io/badge/%F0%9F%A4%97-Open%20in%20Spaces-blue)](https://huggingface.co/spaces/microsoft/HuggingGPT) 6 | 7 | The mission of JARVIS is to explore artificial general intelligence (AGI) and deliver cutting-edge research to the whole community. 8 | 9 | ## What's New 10 | 11 | + [2024.01.15] We release Easytool for easier tool usage. 12 | + The code and datasets are available at [EasyTool](/easytool). 13 | + The paper is available at [EasyTool: Enhancing LLM-based Agents with Concise Tool Instruction](https://arxiv.org/abs/2401.06201). 14 | + [2023.11.30] We release TaskBench for evaluating task automation capability of LLMs. 15 | + The code and datasets are available at [TaskBench](/taskbench). 16 | + The paper is available at [TaskBench: Benchmarking Large Language Models for Task Automation](https://arxiv.org/abs/2311.18760). 17 | + [2023.07.28] We are now in the process of planning evaluation and project rebuilding. We will release a new version of Jarvis in the near future. 18 | + [2023.07.24] We released a light langchain version of Jarvis. See here. 19 | + [2023.04.16] Jarvis now supports the OpenAI service on the Azure platform and the GPT-4 model. 20 | + [2023.04.06] We added the Gradio demo and built the web API for `/tasks` and `/results` in `server` mode. 21 | + The Gradio demo is now hosted on Hugging Face Space. 
(Build with `inference_mode=hybrid` and `local_deployment=standard`) 22 | + The Web API `/tasks` and `/results` access intermediate results for `Stage #1`: task planning and `Stage #1-3`: model selection with execution results. See here. 23 | + [2023.04.03] We added the CLI mode and provided parameters for configuring the scale of local endpoints. 24 | + You can enjoy a lightweight experience with Jarvis without deploying the models locally. See here. 25 | + Just run `python awesome_chat.py --config configs/config.lite.yaml` to experience it. 26 | + [2023.04.01] We updated a version of code for building. 27 | 28 | 29 | ### Overview 30 | 31 | Language serves as an interface for LLMs to connect numerous AI models for solving complicated AI tasks! 32 | 33 |

 34 | *(figure)* 35 |

36 | 37 | 38 | See our paper: [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace](http://arxiv.org/abs/2303.17580), Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu and Yueting Zhuang (the first two authors contribute equally) 39 | 40 | We introduce a collaborative system that consists of **an LLM as the controller** and **numerous expert models as collaborative executors** (from HuggingFace Hub). The workflow of our system consists of four stages: 41 | + **Task Planning**: Using ChatGPT to analyze the requests of users to understand their intention, and disassemble them into possible solvable tasks. 42 | + **Model Selection**: To solve the planned tasks, ChatGPT selects expert models hosted on Hugging Face based on their descriptions. 43 | + **Task Execution**: Invokes and executes each selected model, and return the results to ChatGPT. 44 | + **Response Generation**: Finally, using ChatGPT to integrate the prediction of all models, and generate responses. 45 | 46 |
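To make the hand-off between these stages concrete, the snippet below is a minimal Python sketch of the planned-task structure produced in Stage #1: the `task`/`id`/`dep`/`args` fields follow the example `/tasks` response shown later in this README, while the `ready_tasks` helper is purely illustrative and not part of the codebase.

```python
# Minimal sketch of the planned-task records emitted by Stage #1 (task planning).
# The task/id/dep/args layout mirrors the example `/tasks` response below;
# `ready_tasks` is an illustrative helper, not part of this repository.
planned_tasks = [
    {"task": "openpose-control", "id": 0, "dep": [-1], "args": {"image": "/examples/d.jpg"}},
    {"task": "image-to-text", "id": 1, "dep": [-1], "args": {"image": "/examples/e.jpg"}},
    # "-0" and "-1" refer to the outputs of the tasks with id 0 and id 1.
    {"task": "openpose-text-to-image", "id": 2, "dep": [1, 0], "args": {"image": "-0", "text": "-1"}},
]

def ready_tasks(tasks, finished_ids):
    """Return tasks whose dependencies are all finished (a dep of -1 means no dependency)."""
    return [t for t in tasks if all(d == -1 or d in finished_ids for d in t["dep"])]

print([t["task"] for t in ready_tasks(planned_tasks, set())])
# -> ['openpose-control', 'image-to-text']  (task 2 waits for their results)
```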

47 | 48 | ### System Requirements 49 | 50 | #### Default (Recommended) 51 | 52 | For `configs/config.default.yaml`: 53 | 54 | + Ubuntu 16.04 LTS 55 | + VRAM >= 24GB 56 | + RAM > 12GB (minimal), 16GB (standard), 80GB (full) 57 | + Disk > 284GB 58 | + 42GB for `damo-vilab/text-to-video-ms-1.7b` 59 | + 126GB for `ControlNet` 60 | + 66GB for `stable-diffusion-v1-5` 61 | + 50GB for others 62 | 63 | #### Minimum (Lite) 64 | 65 | For `configs/config.lite.yaml`: 66 | 67 | + Ubuntu 16.04 LTS 68 | + Nothing else 69 | 70 | The configuration `configs/config.lite.yaml` does not require any expert models to be downloaded and deployed locally. However, it means that Jarvis is restricted to models running stably on HuggingFace Inference Endpoints. 71 | 72 | ### Quick Start 73 | 74 | First replace `openai.key` and `huggingface.token` in `server/configs/config.default.yaml` with **your personal OpenAI Key** and **your Hugging Face Token**, or put them in the environment variables `OPENAI_API_KEY` and `HUGGINGFACE_ACCESS_TOKEN` respectively. Then run the following commands: 75 | 76 | 77 | 78 | #### For Server: 79 | 80 | ```bash 81 | # setup env 82 | cd server 83 | conda create -n jarvis python=3.8 84 | conda activate jarvis 85 | conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia 86 | pip install -r requirements.txt 87 | 88 | # download models. Make sure that `git-lfs` is installed. 89 | cd models 90 | bash download.sh # required when `inference_mode` is `local` or `hybrid`. 91 | 92 | # run server 93 | cd .. 94 | python models_server.py --config configs/config.default.yaml # required when `inference_mode` is `local` or `hybrid` 95 | python awesome_chat.py --config configs/config.default.yaml --mode server # for text-davinci-003 96 | ``` 97 | 98 | Now you can access Jarvis' services by the Web API. 99 | 100 | + `/hugginggpt` --method `POST`, access the full service. 101 | + `/tasks` --method `POST`, access intermediate results for Stage #1. 102 | + `/results` --method `POST`, access intermediate results for Stage #1-3. 103 | 104 | For example: 105 | 106 | ```bash 107 | # request 108 | curl --location 'http://localhost:8004/tasks' \ 109 | --header 'Content-Type: application/json' \ 110 | --data '{ 111 | "messages": [ 112 | { 113 | "role": "user", 114 | "content": "based on pose of /examples/d.jpg and content of /examples/e.jpg, please show me a new image" 115 | } 116 | ] 117 | }' 118 | 119 | # response 120 | [{"args":{"image":"/examples/d.jpg"},"dep":[-1],"id":0,"task":"openpose-control"},{"args":{"image":"/examples/e.jpg"},"dep":[-1],"id":1,"task":"image-to-text"},{"args":{"image":"-0","text":"-1"},"dep":[1,0],"id":2,"task":"openpose-text-to-image"}] 121 | ``` 122 | 123 | 124 | #### For Web: 125 | 126 | We provide a user-friendly web page. After starting `awesome_chat.py` in a server mode, you can run the commands to communicate with Jarvis in your browser: 127 | 128 | - you need to install `nodejs` and `npm` first. 129 | - [ IMPORTANT ] if you are running the web client on another machine, you need set `http://{LAN_IP_of_the_server}:{port}/` to `HUGGINGGPT_BASE_URL` of `web/src/config/index.ts`. 130 | - if you want to use the video generation feature, you need to compile `ffmpeg` manually with H.264. 131 | - you can switch to ChatGPT by `double click` on the setting icon! 132 | 133 | ```bash 134 | cd web 135 | npm install 136 | npm run dev 137 | ``` 138 | 139 | ```bash 140 | # Optional: Install ffmpeg 141 | # This command need be executed without errors. 
142 | LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i input.mp4 -vcodec libx264 output.mp4 143 | ``` 144 | 145 | 146 | 147 | #### For Gradio 148 | 149 | The Gradio demo is now hosted on Hugging Face Space. You can also run the following commands to start the demo locally: 150 | 151 | ```bash 152 | python models_server.py --config configs/config.gradio.yaml 153 | python run_gradio_demo.py --config configs/config.gradio.yaml 154 | 155 | # or run the HF Space as a Docker image (Build with `inference_mode=hibrid` and `local_deployment=standard`) 156 | docker run -it -p 7860:7860 --platform=linux/amd64 registry.hf.space/microsoft-hugginggpt:latest python app.py 157 | ``` 158 | 159 | #### For CLI: 160 | 161 | You can also run Jarvis more easily in CLI mode: 162 | 163 | ```bash 164 | cd server 165 | python awesome_chat.py --config configs/config.default.yaml --mode cli 166 | ``` 167 | 168 | Examples of CLI mode: 169 | 170 | ``` 171 | Welcome to Jarvis! A collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors. Jarvis can plan tasks, schedule Hugging Face models, generate friendly responses based on your requests, and help you with many things. Please enter your request (`exit` to exit). 172 | [ User ]: Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras in these picture? 173 | [ Jarvis ]: Based on your request, I have used several models to process the images you provided. For the image A: /examples/a.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "a cat sitting on a window sill looking out". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/a3dc.jpg', which contains two objects with labels 'potted plant' and 'cat'. For the image B: /examples/b.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "two zebras standing in a field of dry grass". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/f5df.jpg', which contains three objects with labels 'zebra'. For the image C: /examples/c.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "a zebra standing in a field of tall grass". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/69c7.jpg', which contains one object with label 'zebra'. Finally, I used the canny-control model canny-control to generate the image '/images/7e18.png' based on the image /examples/f.jpg. Therefore, there are four zebras in these pictures. Is there anything else I can help you with? 174 | [ User ]: Please answer all the named entities in the sentence: Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby. 175 | [ Jarvis ]: Sure, I understand your request. Based on the inference results of the models, I will answer your request step by step. Firstly, I used the token-classification model dslim/bert-base-NER to recognize the named entities in the sentence. The inference result is that the named entities are Iron Man, Stan Lee, Larry Lieber, Don Heck and Jack Kirby. Then, I used the question-answering model deepset/roberta-base-squad2 to answer your request. 
The inference result is that there is no output since the context cannot be empty. Therefore, I cannot make it. I hope this answer your request. Is there anything else I can help you with? 176 | [ User ]: Please generate a canny image based on /examples/f.jpg 177 | [ Jarvis ]: Sure. I have generated a canny image based on /examples/f.jpg. To do this, I first used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text description of the image, which is "a herd of giraffes and zebras grazing in a field". Then I used the canny-control model to generate the canny image of the original image, which is stored in the path "/images/1ea2.png". Finally, I used the canny-text-to-image model lllyasviel/sd-controlnet-canny to generate the canny image based on the text description and the canny image, which is stored in the path "/images/ba66.png". Do you need anything else? 178 | ``` 179 | 180 | 181 | ### Configuration 182 | 183 | The server-side configuration file is `server/configs/config.default.yaml`, and some parameters are presented as follows: 184 | 185 | + `model`: LLM, currently supports `text-davinci-003`. We are working on integrating more open-source LLMs. 186 | + `inference_mode`: mode of inference endpoints 187 | + `local`: only use the local inference endpoints 188 | + `huggingface`: only use the Hugging Face Inference Endpoints **(free of local inference endpoints)** 189 | + `hybrid`: both of `local` and `huggingface` 190 | + `local_deployment`: scale of locally deployed models, works under `local` or `hybrid` inference mode: 191 | + `minimal` (RAM>12GB, ControlNet only) 192 | + `standard` (RAM>16GB, ControlNet + Standard Pipelines) 193 | + `full` (RAM>42GB, All registered models) 194 | 195 | On a personal laptop, we recommend the configuration of `inference_mode: hybrid `and `local_deployment: minimal`. But the available models under this setting may be limited due to the instability of remote Hugging Face Inference Endpoints. 196 | 197 | ### NVIDIA Jetson Embedded Device Support 198 | A [Dockerfile](./Dockerfile.jetson) is included that provides experimental support for [NVIDIA Jetson embedded devices](https://developer.nvidia.com/embedded-computing). This image provides accelerated ffmpeg, pytorch, torchaudio, and torchvision dependencies. To build the docker image, [ensure that the default docker runtime is set to 'nvidia'](https://github.com/NVIDIA/nvidia-docker/wiki/Advanced-topics#default-runtime). A pre-built image is provided at https://hub.docker.com/r/toolboc/nv-jarvis. 199 | 200 | ```bash 201 | #Build the docker image 202 | docker build --pull --rm -f "Dockerfile.jetson" -t toolboc/nv-jarvis:r35.2.1 203 | ``` 204 | 205 | Due to to memory requirements, JARVIS is required to run on Jetson AGX Orin family devices (64G on-board RAM device preferred) with config options set to: 206 | * `inference_mode: local` 207 | * `local_deployment: standard` 208 | 209 | Models and configs are recommended to be provided through a volume mount from the host to the container as shown in the `docker run` step below. It is possible to uncomment the `# Download local models` section of the [Dockerfile](./Dockerfile.jetson) to build a container with models included. 
210 | 211 | #### Start the model server, awesomechat, and web app on Jetson Orin AGX 212 | 213 | ```bash 214 | # run the container which will automatically start the model server 215 | docker run --name jarvis --net=host --gpus all -v ~/jarvis/configs:/app/server/configs -v ~/src/JARVIS/server/models:/app/server/models toolboc/nv-jarvis:r35.2.1 216 | 217 | # (wait for model server to complete initialization) 218 | 219 | # start awesome_chat.py 220 | docker exec jarvis python3 awesome_chat.py --config configs/config.default.yaml --mode server 221 | 222 | # start the web application (application will be accessible at http://localhost:9999) 223 | docker exec jarvis npm run dev --prefix=/app/web 224 | ``` 225 | 226 | ### Screenshots 227 | 228 |
*(screenshots: see `assets/screenshot_q.jpg` and `assets/screenshot_a.jpg`)*
229 | 230 | 231 | 232 | 233 | ## Citation 234 | If you find this work useful in your method, you can cite the paper as below: 235 | 236 | @inproceedings{shen2023hugginggpt, 237 | author = {Shen, Yongliang and Song, Kaitao and Tan, Xu and Li, Dongsheng and Lu, Weiming and Zhuang, Yueting}, 238 | booktitle = {Advances in Neural Information Processing Systems}, 239 | title = {HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace}, 240 | year = {2023} 241 | } 242 | 243 | > 244 | @article{shen2023taskbench, 245 | title = {TaskBench: Benchmarking Large Language Models for Task Automation}, 246 | author = {Shen, Yongliang and Song, Kaitao and Tan, Xu and Zhang, Wenqi and Ren, Kan and Yuan, Siyu and Lu, Weiming and Li, Dongsheng and Zhuang, Yueting}, 247 | journal = {arXiv preprint arXiv:2311.18760}, 248 | year = {2023} 249 | } 250 | 251 | > 252 | @article{yuan2024easytool, 253 | title = {EASYTOOL: Enhancing LLM-based Agents with Concise Tool Instruction}, 254 | author = {Siyu Yuan and Kaitao Song and Jiangjie Chen and Xu Tan and Yongliang Shen and Ren Kan and Dongsheng Li and Deqing Yang}, 255 | journal = {arXiv preprint arXiv:2401.06201}, 256 | year = {2024} 257 | } 258 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /easytool/README.md: -------------------------------------------------------------------------------- 1 |

2 | # EasyTool 3 | 4 | ### Enhancing LLM-based Agents with Concise Tool Instruction 5 | 6 | 7 | 8 |
9 | 10 | ## What's New 11 | + [2024.01.15] We release Easytool for easier tool usage. 12 | + The code and datasets are available at [easytool](#). 13 | + The paper is available at [EASYTOOL: Enhancing LLM-based Agents with Concise Tool Instruction](https://arxiv.org/abs/2401.06201). 14 | 15 | ## Overview 16 | 17 | LLM-based agents usually employ tool documentation to grasp the selection and usage of tools from different sources, but these documentations could be inconsistent in formats, redundant with excessive length, and lacking demonstrations for instructions. 18 | 19 | EasyTool is an easy but effective method to create clear, structured, and unified instructions from tool documentations for improving LLM-based agents in using tools. 20 | 21 |
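As a concrete illustration of such a unified instruction, each tool in `data_funcqa/tool_instruction/functions_data.json` is stored as a record with a standardized name, a one-line description, and a usage example. The Python sketch below reproduces the `add_` record from that file; the `load_tool_instructions` helper is only illustrative and not part of the EasyTool code.

```python
import json

# Unified tool-instruction record, as stored in
# data_funcqa/tool_instruction/functions_data.json (the "add_" entry).
add_instruction = {
    "ID": 0,
    "standardized_name": "add_",
    "API_description": "'add_' returns the sum of all the arguments passed to it, "
                       "normalized to 2 decimal places.",
    "Usage": {
        "required_parameters": [{"name": "input", "type": "List"}],
        "Example": {
            "Scenario": "if you want to add 2 to 1.",
            "Parameters": {"input": [2, 1]},
        },
    },
}

def load_tool_instructions(path):
    """Illustrative helper: load the instruction file into a dict keyed by integer tool ID."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return {int(k): v for k, v in data.items()}
```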

 22 | *(figure)* 23 |

24 | 25 | ## Experiment 26 | 27 | ### Prerequisites 28 | 29 | - Prepare requirements: `pip install -r requirements.txt` 30 | - Data Construction: `python3 data_process.py` 31 | 32 | Before running any of the commands, ensure that you have set the necessary API keys. Replace `""` with your actual keys. 33 | ```bash 34 | export OPENAI_API_KEY="your_openai_api_key_here" 35 | export RAPIDAPI_KEY="your_rapidapi_key_here" 36 | ``` 37 | ### ToolBench 38 | You need first get the tool execution code (./data/toolenv/tools.) from the following link: [Google Drive](https://drive.google.com/drive/folders/1yBUQ732mPu-KclJnuQELEhtKakdXFc3J) or [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/c9e50625743b40bfbe10/) and then save them to ./toolenv/tools 39 | To inference with LLMs, run the following commands: 40 | ```bash 41 | unzip data_toolbench/tool_instruction/API_description_embeddings.zip -d data_toolbench/tool_instruction/ 42 | 43 | export OPENAI_API_KEY="" 44 | export RAPIDAPI_KEY="" 45 | 46 | python3 main.py \ 47 | --model_name gpt-3.5-turbo \ 48 | --task toolbench \ 49 | --data_type G2 \ 50 | --tool_root_dir ./toolenv/tools 51 | 52 | python3 main.py \ 53 | --model_name gpt-3.5-turbo \ 54 | --task toolbench \ 55 | --data_type G3 \ 56 | --tool_root_dir ./toolenv/tools 57 | 58 | python3 main.py \ 59 | --model_name gpt-3.5-turbo \ 60 | --task toolbench_retrieve \ 61 | --data_type G2 \ 62 | --tool_root_dir ./toolenv/tools 63 | 64 | python3 main.py \ 65 | --model_name gpt-3.5-turbo \ 66 | --task toolbench_retrieve \ 67 | --data_type G3 \ 68 | --tool_root_dir ./toolenv/tools 69 | ``` 70 | 71 | ### FuncQA 72 | 73 | To inference with LLMs, run the following commands: 74 | ```bash 75 | export OPENAI_API_KEY="" 76 | 77 | python3 main.py \ 78 | --model_name gpt-3.5-turbo \ 79 | --task funcqa \ 80 | --data_type funcqa_mh 81 | 82 | python3 main.py \ 83 | --model_name gpt-3.5-turbo \ 84 | --task funcqa \ 85 | --data_type funcqa_oh 86 | ``` 87 | 88 | ### RestBench 89 | 90 | To inference with LLMs, run the following commands: 91 | ```bash 92 | export OPENAI_API_KEY="" 93 | 94 | python3 main.py \ 95 | --model_name gpt-3.5-turbo \ 96 | --task restbench 97 | ``` 98 | 99 | ## Citation 100 | 101 | If you find this work useful in your method, you can cite the paper as below: 102 | 103 | @article{yuan2024easytool, 104 | title = {EASYTOOL: Enhancing LLM-based Agents with Concise Tool Instruction}, 105 | author = {Siyu Yuan and Kaitao Song and Jiangjie Chen and Xu Tan and Yongliang Shen and Ren Kan and Dongsheng Li and Deqing Yang}, 106 | journal = {arXiv preprint arXiv:2401.06201}, 107 | year = {2024} 108 | } 109 | 110 | ## Acknowledgement 111 | 112 | - [ChatGPT](https://platform.openai.com/) 113 | - [Hugging Face](https://huggingface.co/) 114 | - [ToolBench](https://github.com/OpenBMB/ToolBench) 115 | - [RestBench](https://github.com/Yifan-Song793/RestGPT) 116 | - [FuncQA](https://github.com/Ber666/ToolkenGPT) 117 | -------------------------------------------------------------------------------- /easytool/assets/front.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/easytool/assets/front.png -------------------------------------------------------------------------------- /easytool/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/easytool/assets/logo.png 
-------------------------------------------------------------------------------- /easytool/data_funcqa/funchub/math.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | # this function is used to round the result to 2 decimal places 4 | # e.g. 52.3523 -> 52.35, 52.0011 -> 52, 0.00000233 -> 0.0000023 5 | def custom_round(x, decimal_places=2): 6 | str_x = f"{x:.10f}" 7 | before_decimal = str_x.split('.')[0] 8 | after_decimal = str_x.split('.')[1] 9 | leading_zeros = len(after_decimal) - len(after_decimal.lstrip('0')) 10 | 11 | if leading_zeros >= 1 and before_decimal == "0": 12 | return round(x, leading_zeros + 2) 13 | else: 14 | return round(x, decimal_places) 15 | 16 | # this function converts a number in scientific notation to decimal notation 17 | def scito_decimal(sci_str): 18 | def split_exponent(number_str): 19 | parts = number_str.split("e") 20 | coefficient = parts[0] 21 | exponent = int(parts[1]) if len(parts) == 2 else 0 22 | return coefficient, exponent 23 | 24 | def multiplyby_10(number_str, exponent): 25 | if exponent == 0: 26 | return number_str 27 | 28 | if exponent > 0: 29 | index = number_str.index(".") if "." in number_str else len(number_str) 30 | number_str = number_str.replace(".", "") 31 | new_index = index + exponent 32 | number_str += "0" * (new_index - len(number_str)) 33 | if new_index < len(number_str): 34 | number_str = number_str[:new_index] + "." + number_str[new_index:] 35 | return number_str 36 | 37 | if exponent < 0: 38 | index = number_str.index(".") if "." in number_str else len(number_str) 39 | number_str = number_str.replace(".", "") 40 | new_index = index + exponent 41 | number_str = "0" * (-new_index) + number_str 42 | number_str = "0." + number_str 43 | return number_str 44 | 45 | coefficient, exponent = split_exponent(sci_str) 46 | decimal_str = multiplyby_10(coefficient, exponent) 47 | 48 | # remove trailing zeros 49 | if "." in decimal_str: 50 | decimal_str = decimal_str.rstrip("0") 51 | 52 | return decimal_str 53 | 54 | # normalize the result to 2 decimal places and remove trailing zeros 55 | def normalize(res, round_to=2): 56 | # we round the result to 2 decimal places 57 | res = custom_round(res, round_to) 58 | res = str(res) 59 | if "." in res: 60 | while res[-1] == "0": 61 | res = res[:-1] 62 | res = res.strip(".") 63 | 64 | # scientific notation 65 | if "e" in res: 66 | res = scito_decimal(res) 67 | 68 | return res 69 | 70 | # 1. add 71 | def add_(args): 72 | 73 | return normalize(sum(args)) 74 | 75 | # 2. subtract 76 | def subtract_(args): 77 | 78 | res = args[0] 79 | for arg in args[1:]: 80 | res -= arg 81 | return normalize(res) 82 | 83 | # 3. multiply 84 | def multiply_(args): 85 | 86 | res = args[0] 87 | for arg in args[1:]: 88 | res *= arg 89 | return normalize(res) 90 | 91 | # 4. divide 92 | def divide_(args): 93 | 94 | res = args[0] 95 | for arg in args[1:]: 96 | res /= arg 97 | return normalize(res) 98 | 99 | # 5. power 100 | def power_(args): 101 | 102 | res = args[0] 103 | for arg in args[1:]: 104 | res **= arg 105 | return normalize(res) 106 | 107 | # 6. square root 108 | def sqrt_(args): 109 | res = args[0] 110 | return normalize(math.sqrt(res)) 111 | 112 | # 7. 
10th log 113 | def log_(args): 114 | # if only one argument is passed, it is 10th log 115 | if len(args) == 1: 116 | res = args[0] 117 | return normalize(math.log10(res)) 118 | # if two arguments are passed, it is log with base as the second argument 119 | elif len(args) == 2: 120 | res = args[0] 121 | base = args[1] 122 | return normalize(math.log(res, base)) 123 | else: 124 | raise Exception("Invalid number of arguments passed to log function") 125 | 126 | # 8. natural log 127 | def ln_(args): 128 | res = args[0] 129 | return normalize(math.log(res)) 130 | 131 | 132 | # 9. choose 133 | def choose_(args): 134 | n = args[0] 135 | r = args[1] 136 | return normalize(math.comb(n, r)) 137 | 138 | # 10. permutation 139 | def permutate_(args): 140 | n = args[0] 141 | r = args[1] 142 | return normalize(math.perm(n, r)) 143 | 144 | # 11. greatest common divisor 145 | def gcd_(args): 146 | res = args[0] 147 | for arg in args[1:]: 148 | res = math.gcd(res, arg) 149 | return normalize(res) 150 | 151 | # 12. least common multiple 152 | def lcm_(args): 153 | res = args[0] 154 | for arg in args[1:]: 155 | res = res * arg // math.gcd(res, arg) 156 | return normalize(res) 157 | 158 | # 13. remainder 159 | def remainder_(args): 160 | dividend = args[0] 161 | divisor = args[1] 162 | return normalize(dividend % divisor) -------------------------------------------------------------------------------- /easytool/data_funcqa/tool_instruction/functions_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "0":{ 3 | "ID":0, 4 | "standardized_name":"add_", 5 | "API_description":"'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places.", 6 | "Usage":{ 7 | "required_parameters":[ 8 | { 9 | "name":"input", 10 | "type":"List" 11 | } 12 | ], 13 | "Example":{ 14 | "Scenario":"if you want to add 2 to 1.", 15 | "Parameters":{ 16 | "input":[2,1] 17 | } 18 | } 19 | } 20 | }, 21 | "1": { 22 | "ID": 1, 23 | "standardized_name": "subtract_", 24 | "API_description": "'subtract_' returns the difference of the arguments passed to it, starting with the first argument and subtracting all subsequent arguments, normalized to 2 decimal places.", 25 | "Usage": { 26 | "required_parameters": [ 27 | { 28 | "name": "input", 29 | "type": "List" 30 | } 31 | ], 32 | "Example": { 33 | "Scenario": "if you want to subtract 2 from 1.", 34 | "Parameters": { 35 | "input": [1,2] 36 | } 37 | } 38 | } 39 | }, 40 | "2": { 41 | "ID": 2, 42 | "standardized_name": "multiply_", 43 | "API_description": "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places.", 44 | "Usage": { 45 | "required_parameters": [ 46 | { 47 | "name": "input", 48 | "type": "List" 49 | } 50 | ], 51 | "Example": { 52 | "Scenario": "if you want to calculate 2*1.", 53 | "Parameters": { 54 | "input": [2,1] 55 | } 56 | } 57 | } 58 | }, 59 | "3": { 60 | "ID": 3, 61 | "standardized_name": "divide_", 62 | "API_description": "'divide_' returns the quotient of the first argument divided by all the subsequent arguments, normalized to 2 decimal places.", 63 | "Usage": { 64 | "required_parameters": [ 65 | { 66 | "name": "input", 67 | "type": "List" 68 | } 69 | ], 70 | "Example": { 71 | "Scenario": "if you want to calculate 4/2.", 72 | "Parameters": { 73 | "input": [4,2] 74 | } 75 | } 76 | } 77 | }, 78 | "4": { 79 | "ID": 4, 80 | "standardized_name": "power_", 81 | "API_description": "'power_' returns the result of raising the first argument to the power of all the subsequent 
arguments, normalized to 2 decimal places.", 82 | "Usage": { 83 | "required_parameters": [ 84 | { 85 | "name": "input", 86 | "type": "List" 87 | } 88 | ], 89 | "Example": { 90 | "Scenario": "if you want to calculate 2^3.", 91 | "Parameters": { 92 | "input": [2,3] 93 | } 94 | } 95 | } 96 | }, 97 | "5": { 98 | "ID": 5, 99 | "standardized_name": "sqrt_", 100 | "API_description": "'sqrt_' returns the square root of the first argument, normalized to 2 decimal places.", 101 | "Usage": { 102 | "required_parameters": [ 103 | { 104 | "name": "input", 105 | "type": "List" 106 | } 107 | ], 108 | "Example": { 109 | "Scenario": "if you want to get the square root of 9.", 110 | "Parameters": { 111 | "input": [9] 112 | } 113 | } 114 | } 115 | }, 116 | "6": { 117 | "ID": 6, 118 | "standardized_name": "log_", 119 | "API_description": "'log_' returns the base-10 logarithm of the first argument if one argument is provided, or the logarithm with base as the second argument if two arguments are provided, normalized to 2 decimal places.", 120 | "Usage": { 121 | "required_parameters": [ 122 | { 123 | "name": "input", 124 | "type": "List" 125 | } 126 | ], 127 | "Example": { 128 | "Scenario": "if you want to get the logarithm of 100 with base 10.", 129 | "Parameters": { 130 | "input": [100,10] 131 | } 132 | } 133 | } 134 | }, 135 | "7": { 136 | "ID": 7, 137 | "standardized_name": "ln_", 138 | "API_description": "'ln_' returns the natural logarithm of the first argument, normalized to 2 decimal places.", 139 | "Usage": { 140 | "required_parameters": [ 141 | { 142 | "name": "input", 143 | "type": "List" 144 | } 145 | ], 146 | "Example": { 147 | "Scenario": "if you want to get the natural logarithm of 2.718.", 148 | "Parameters": { 149 | "input": [2.718281828459045] 150 | } 151 | } 152 | } 153 | }, 154 | "8": { 155 | "ID": 8, 156 | "standardized_name": "lcm_", 157 | "API_description": "'lcm_' returns the least common multiple of all the arguments passed to it, normalized to 2 decimal places.", 158 | "Usage": { 159 | "required_parameters": [ 160 | { 161 | "name": "input", 162 | "type": "List" 163 | } 164 | ], 165 | "Example": { 166 | "Scenario": "if you want to find the LCM of 12 and 18.", 167 | "Parameters": { 168 | "input": [12,18] 169 | } 170 | } 171 | } 172 | }, 173 | "9": { 174 | "ID": 9, 175 | "standardized_name": "gcd_", 176 | "API_description": "'gcd_' returns the greatest common divisor of all the arguments passed to it, normalized to 2 decimal places.", 177 | "Usage": { 178 | "required_parameters": [ 179 | { 180 | "name": "input", 181 | "type": "List" 182 | } 183 | ], 184 | "Example": { 185 | "Scenario": "if you want to find the GCD of 54 and 24.", 186 | "Parameters": { 187 | "input": [54,24] 188 | } 189 | } 190 | } 191 | }, 192 | "10": { 193 | "ID": 10, 194 | "standardized_name": "remainder_", 195 | "API_description": "'remainder_' returns the remainder of the division of the first argument by the second argument, normalized to 2 decimal places.", 196 | "Usage": { 197 | "required_parameters": [ 198 | { 199 | "name": "input", 200 | "type": "List" 201 | } 202 | ], 203 | "Example": { 204 | "Scenario": "if you want to find the remainder of 10 divided by 3.", 205 | "Parameters": { 206 | "input": [10,3] 207 | } 208 | } 209 | } 210 | }, 211 | "11": { 212 | "ID": 11, 213 | "standardized_name": "choose_", 214 | "API_description": "'choose_' returns the number of ways to choose 'r' items from 'n' options without regard to order, normalized to 2 decimal places.", 215 | "Usage": { 216 | "required_parameters": [ 217 | { 
218 | "name": "input", 219 | "type": "List" 220 | } 221 | ], 222 | "Example": { 223 | "Scenario": "if you want to choose 2 items out of 5.", 224 | "Parameters": { 225 | "input": [5,2] 226 | } 227 | } 228 | } 229 | }, 230 | "12": { 231 | "ID": 12, 232 | "standardized_name": "permutate_", 233 | "API_description": "'permutate_' returns the number of ways to arrange 'r' items out of 'n' options, normalized to 2 decimal places.", 234 | "Usage": { 235 | "required_parameters": [ 236 | { 237 | "name": "input", 238 | "type": "List" 239 | } 240 | ], 241 | "Example": { 242 | "Scenario": "if you want to find the number of arrangements of 3 items out of 5.", 243 | "Parameters": { 244 | "input": [5,3] 245 | } 246 | } 247 | } 248 | } 249 | } -------------------------------------------------------------------------------- /easytool/data_funcqa/tool_instruction/tool_dic.jsonl: -------------------------------------------------------------------------------- 1 | {"ID": 0, "description": "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places."} 2 | {"ID": 1, "description": "'subtract_' returns the difference of the arguments passed to it, starting with the first argument and subtracting all subsequent arguments, normalized to 2 decimal places."} 3 | {"ID": 2, "description": "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places."} 4 | {"ID": 3, "description": "'divide_' returns the quotient of the first argument divided by all the subsequent arguments, normalized to 2 decimal places."} 5 | {"ID": 4, "description": "'power_' returns the result of raising the first argument to the power of all the subsequent arguments, normalized to 2 decimal places."} 6 | {"ID": 5, "description": "'sqrt_' returns the square root of the first argument, normalized to 2 decimal places."} 7 | {"ID": 6, "description": "'log_' returns the base-10 logarithm of the first argument if one argument is provided, or the logarithm with base as the second argument if two arguments are provided, normalized to 2 decimal places."} 8 | {"ID": 7, "description": "'ln_' returns the natural logarithm of the first argument, normalized to 2 decimal places."} 9 | {"ID": 8, "description": "'lcm_' returns the least common multiple of all the arguments passed to it, normalized to 2 decimal places."} 10 | {"ID": 9, "description": "'gcd_' returns the greatest common divisor of all the arguments passed to it, normalized to 2 decimal places."} 11 | {"ID": 10, "description": "'remainder_' returns the remainder of the division of the first argument by the second argument, normalized to 2 decimal places."} 12 | {"ID": 11, "description": "'choose_' returns the number of ways to choose 'r' items from 'n' options without regard to order, normalized to 2 decimal places."} 13 | {"ID": 12, "description": "'permutate_' returns the number of ways to arrange 'r' items out of 'n' options, normalized to 2 decimal places."} 14 | -------------------------------------------------------------------------------- /easytool/data_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gdown 3 | import shutil 4 | import json 5 | from zipfile import ZipFile 6 | 7 | urls = { 8 | "funcqa": "https://drive.google.com/uc?id=13Sj7uIsyqWXoTh1ejWUviTzeQSES2Omd", 9 | "restbench": "https://raw.githubusercontent.com/Yifan-Song793/RestGPT/main/datasets/tmdb.json", 10 | "toolbench": "https://drive.google.com/uc?id=1XFjDxVZdUY7TXYF2yvzx3pJlS2fy78jk", 11 | } 
12 | 13 | 14 | def read_jsonline(address): 15 | not_mark = [] 16 | with open(address, 'r', encoding="utf-8") as f: 17 | for jsonstr in f.readlines(): 18 | jsonstr = json.loads(jsonstr) 19 | not_mark.append(jsonstr) 20 | return not_mark 21 | 22 | 23 | def read_json(address): 24 | with open(address, 'r', encoding='utf-8') as json_file: 25 | json_data = json.load(json_file) 26 | return json_data 27 | 28 | 29 | def toolbench_process(data_file, dataset): 30 | ls = read_json(data_file) 31 | all_data = read_json(f"{dataset}/tool_instruction/toolbench_tool_instruction.json") 32 | all_dic = {} 33 | for ID in all_data.keys(): 34 | all_dic[all_data[ID]["tool_name"]] = all_data[ID] 35 | 36 | not_in = [] 37 | for data in ls: 38 | Tool_dic = [] 39 | data_dic = {} 40 | already = [] 41 | for tool in data['api_list']: 42 | if tool['tool_name'] in all_dic: 43 | if all_dic[tool['tool_name']]["ID"] not in already: 44 | already.append(all_dic[tool['tool_name']]["ID"]) 45 | Tool_dic.append({"ID": all_dic[tool['tool_name']]["ID"], 46 | "Description": all_dic[tool['tool_name']]["tool_description"], }) 47 | data["Tool_dic"] = Tool_dic 48 | 49 | json_str = json.dumps(ls, indent=4) 50 | with open(data_file, 'w', encoding='utf-8') as json_file: 51 | json.dump(ls, json_file, ensure_ascii=False, indent=4) 52 | 53 | 54 | def main(): 55 | curr_dir = os.path.dirname(__file__) 56 | 57 | for dataset in [ 58 | "funcqa", 59 | "restbench", 60 | "toolbench" 61 | ]: 62 | dataset_path = os.path.join(curr_dir, "data_{}".format(dataset), "test_data") 63 | 64 | if not os.path.exists(dataset_path): 65 | os.makedirs(dataset_path) 66 | 67 | if dataset == "funcqa": 68 | print("Processing FuncQA dataset ...\n") 69 | # Processing FuncQA dataset ... 70 | temp_file = os.path.join(dataset_path, "data_toolkengpt_0918.zip") 71 | gdown.download(urls[dataset], temp_file, quiet=False) 72 | zf = ZipFile(temp_file, 'r') 73 | zf.extract("data/funcqa/funcqa_oh.json", ".") 74 | zf.extract("data/funcqa/funcqa_mh.json", ".") 75 | os.rename("data/funcqa/funcqa_oh.json", "{}/funcqa_oh.json".format(dataset_path)) 76 | os.rename("data/funcqa/funcqa_mh.json", "{}/funcqa_mh.json".format(dataset_path)) 77 | os.remove(temp_file) 78 | shutil.rmtree("data") 79 | 80 | print("FuncQA dataset Done!\n") 81 | 82 | if dataset == "restbench": 83 | print("Processing RestBench dataset ... \n") 84 | # Processing RestBench Dataset 85 | os.system("wget -P {} -c {}".format(dataset_path, urls[dataset])) 86 | 87 | print("RestBench dataset Done!\n") 88 | 89 | if dataset == "toolbench": 90 | print("Processing ToolBench dataset ... 
\n") 91 | # Processing ToolBench Dataset 92 | temp_file = os.path.join(dataset_path, "data.zip") 93 | gdown.download(urls[dataset], temp_file, quiet=False) 94 | zf = ZipFile(temp_file, 'r') 95 | zf.extract("data/test_instruction/G2_category.json", ".") 96 | zf.extract("data/test_instruction/G3_instruction.json", ".") 97 | 98 | os.rename("data/test_instruction/G2_category.json", "{}/G2_category.json".format(dataset_path)) 99 | os.rename("data/test_instruction/G3_instruction.json", "{}/G3_instruction.json".format(dataset_path)) 100 | toolbench_process("{}/G2_category.json".format(dataset_path), "data_{}".format(dataset)) 101 | toolbench_process("{}/G3_instruction.json".format(dataset_path), "data_{}".format(dataset)) 102 | os.remove(temp_file) 103 | shutil.rmtree("data") 104 | 105 | print("Toolbench dataset Done!\n") 106 | 107 | 108 | if __name__ == '__main__': 109 | main() 110 | -------------------------------------------------------------------------------- /easytool/data_toolbench/tool_instruction/API_description_embeddings.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/easytool/data_toolbench/tool_instruction/API_description_embeddings.zip -------------------------------------------------------------------------------- /easytool/easytool/__init__.py: -------------------------------------------------------------------------------- 1 | # __init__.py -------------------------------------------------------------------------------- /easytool/easytool/restbench.py: -------------------------------------------------------------------------------- 1 | # — coding: utf-8 – 2 | import openai 3 | import json 4 | import logging 5 | import sys 6 | import argparse 7 | from langchain.chat_models import ChatOpenAI 8 | from langchain.prompts import ( 9 | ChatPromptTemplate, 10 | MessagesPlaceholder, 11 | SystemMessagePromptTemplate, 12 | HumanMessagePromptTemplate 13 | ) 14 | from langchain import LLMChain 15 | import numpy as np 16 | import requests 17 | import os 18 | import subprocess 19 | import re 20 | import importlib.util 21 | from sklearn.metrics.pairwise import cosine_similarity 22 | import pickle 23 | from util import * 24 | 25 | from tqdm import tqdm 26 | 27 | openai.api_key = os.environ["OPENAI_API_KEY"] 28 | 29 | 30 | def get_last_processed_index(progress_file): 31 | """Retrieve the last processed index from the progress file.""" 32 | if os.path.exists(progress_file): 33 | with open(progress_file, 'r', encoding='utf-8') as f: 34 | last_index = f.read().strip() 35 | return int(last_index) if last_index else 0 36 | else: 37 | return 0 38 | 39 | 40 | def update_progress(progress_file, index): 41 | """Update the last processed index in the progress file.""" 42 | with open(progress_file, 'w', encoding='utf-8') as f: 43 | f.write(str(index)) 44 | 45 | 46 | def task_decompose(question, Tool_dic, model_name): 47 | chat = ChatOpenAI(model_name=model_name) 48 | template = "You are a helpful assistant." 49 | system_message_prompt = SystemMessagePromptTemplate.from_template(template) 50 | human_message_prompt = HumanMessagePromptTemplate.from_template( 51 | "We have spotify database and the following tools:\n" 52 | "{Tool_dic}" 53 | "You need to decompose a complex user's question into some simple subtasks and let the model execute it step by step with these tools.\n" 54 | "Please note that: \n" 55 | "1. 
you should break down tasks into appropriate subtasks to use the tools mentioned above.\n" 56 | "2. You should not only list the subtask, but also list the ID of the tool used to solve this subtask.\n" 57 | "3. If you think you do not need to use the tool to solve the subtask, just leave it as {{\"ID\": -1}}\n" 58 | "4. You must consider the logical connections, order and constraints among the tools to achieve a correct tool path." 59 | "5. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two examples output look like:\n" 60 | "'''\n" 61 | "Question: Pause the player" 62 | "Example 1: [{{\"Task\":\"Get information about the user’s current playback state\", \"ID\":15}}, {{\"Task\":\"Pause playback on the user's account\", \"ID\":19}}]\n" 63 | "'''\n" 64 | "This is the user's question: {question}\n" 65 | "Output:" 66 | ) 67 | chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt]) 68 | chain = LLMChain(llm=chat, prompt=chat_prompt) 69 | ind = 0 70 | while True: 71 | try: 72 | result = chain.run(question=question, Tool_dic=Tool_dic) 73 | result = eval(result.split('\n\n')[0]) 74 | break 75 | except Exception as e: 76 | print(f"task decompose fails: {e}") 77 | if ind > 10: 78 | return -1 79 | ind += 1 80 | continue 81 | return result 82 | 83 | 84 | def task_execution( 85 | Tool_dic, dic_tool, test_data, progress_file, 86 | start_index, total_files, retrieval_num, ind, model_name): 87 | with tqdm(total=total_files, desc="Processing files", initial=start_index) as pbar: 88 | for i, data in enumerate(test_data[start_index:], start=start_index): 89 | question = data["query"] 90 | print(question) 91 | task_path = task_decompose(question, Tool_dic, model_name) 92 | tool_choice_ls = [] 93 | for task in task_path: 94 | if isinstance(task["ID"], list): 95 | for ele in task["ID"]: 96 | tool_choice_ls.append(dic_tool[ele]['tool_usage']) 97 | elif int(task["ID"]) in dic_tool.keys(): 98 | tool_choice_ls.append(dic_tool[task["ID"]]['tool_usage']) 99 | ind = ind + 1 100 | with open(f"restbench_{model_name}_Easytool.jsonl", 'a+', encoding='utf-8') as f: 101 | line = json.dumps({ 102 | "ID": ind, 103 | "question": question, 104 | "task_path": task_path, 105 | "tool_choice_ls": tool_choice_ls 106 | }, ensure_ascii=False) 107 | f.write(line + '\n') 108 | print(tool_choice_ls) 109 | update_progress(progress_file, i + 1) 110 | pbar.update(1) 111 | -------------------------------------------------------------------------------- /easytool/easytool/util.py: -------------------------------------------------------------------------------- 1 | # — coding: utf-8 – 2 | import json 3 | import re 4 | import os 5 | 6 | 7 | def read_jsonline(address): 8 | not_mark = [] 9 | with open(address, 'r', encoding="utf-8") as f: 10 | for jsonstr in f.readlines(): 11 | jsonstr = json.loads(jsonstr) 12 | not_mark.append(jsonstr) 13 | return not_mark 14 | 15 | 16 | def save_json(ls, address): 17 | json_str = json.dumps(ls, indent=4) 18 | with open(address, 'w', encoding='utf-8') as json_file: 19 | json.dump(ls, json_file, ensure_ascii=False, indent=4) 20 | 21 | 22 | def read_json(address): 23 | with open(address, 'r', encoding='utf-8') as json_file: 24 | json_data = json.load(json_file) 25 | return json_data 26 | 27 | 28 | def remove_key(item, key_to_remove): 29 | if isinstance(item, dict): 30 | if key_to_remove in item: 31 | del item[key_to_remove] 32 | for key, value in list(item.items()): # 使用list包裹,防止字典大小改变时引发错误 33 | item[key] = remove_key(value, key_to_remove) 34 | 
elif isinstance(item, list): 35 | for index, value in enumerate(item): 36 | item[index] = remove_key(value, key_to_remove) 37 | return item 38 | 39 | 40 | def data_clean(dic, key): 41 | dic = remove_key(dic, key) 42 | return dic 43 | 44 | 45 | def lowercase_parameter_keys(input_dict): 46 | if "parameters" in input_dict and isinstance(input_dict["parameters"], dict): 47 | # Convert all keys in the "parameters" dictionary to uppercase 48 | input_dict["parameters"] = {change_name(k.lower()): v for k, v in input_dict["parameters"].items()} 49 | return input_dict 50 | 51 | 52 | def build_index(base_path): 53 | index = {} 54 | for root, dirs, files in os.walk(base_path): 55 | for dir_name in dirs: 56 | if dir_name not in index: 57 | index[dir_name] = [] 58 | index[dir_name].append(root) 59 | return index 60 | 61 | 62 | def change_name(name): 63 | change_list = ["from", "class", "return", "false", "true", "id", "and", "", "ID"] 64 | if name in change_list: 65 | name = "is_" + name.lower() 66 | return name 67 | 68 | 69 | def standardize(string): 70 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]") 71 | string = res.sub("_", string) 72 | string = re.sub(r"(_)\1+", "_", string).lower() 73 | while True: 74 | if len(string) == 0: 75 | return string 76 | if string[0] == "_": 77 | string = string[1:] 78 | else: 79 | break 80 | while True: 81 | if len(string) == 0: 82 | return string 83 | if string[-1] == "_": 84 | string = string[:-1] 85 | else: 86 | break 87 | if string[0].isdigit(): 88 | string = "get_" + string 89 | return string 90 | 91 | 92 | def get_last_processed_index(progress_file): 93 | """Retrieve the last processed index from the progress file.""" 94 | if os.path.exists(progress_file): 95 | with open(progress_file, 'r', encoding='utf-8') as f: 96 | last_index = f.read().strip() 97 | return int(last_index) if last_index else 0 98 | else: 99 | return 0 100 | 101 | 102 | def update_progress(progress_file, index): 103 | """Update the last processed index in the progress file.""" 104 | with open(progress_file, 'w', encoding='utf-8') as f: 105 | f.write(str(index)) 106 | 107 | 108 | if __name__ == '__main__': 109 | print("util.py") 110 | -------------------------------------------------------------------------------- /easytool/main.py: -------------------------------------------------------------------------------- 1 | # — coding: utf-8 – 2 | import openai 3 | import json 4 | import argparse 5 | import os 6 | from tqdm import tqdm 7 | from easytool import funcQA, restbench, toolbench_retrieve, toolbench 8 | from easytool.util import * 9 | openai.api_key = os.environ["OPENAI_API_KEY"] 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--model_name', type=str, default='gpt-3.5-turbo') 14 | parser.add_argument('--task', type=str, default='funcqa_mh', help='funcqa, toolbench_retrieve, toolbench, restbench') 15 | parser.add_argument('--data_type', type=str, default='G3', help='G2 or G3 or funcqa_mh or funcqa_oh') 16 | parser.add_argument('--tool_root_dir', type=str, default='.toolenv/tools/') 17 | parser.add_argument('--retrieval_num', type=int, default=5) 18 | 19 | args = parser.parse_args() 20 | 21 | if args.task == 'funcqa': 22 | dataset = read_json('data_funcqa/tool_instruction/functions_data.json') 23 | Tool_dic = read_jsonline('data_funcqa/tool_instruction/tool_dic.jsonl') 24 | test_data = read_json(f"data_funcqa/test_data/{args.data_type}.json") 25 | progress_file = f"FuncQA_{args.data_type}_{args.model_name}_Easytool.txt" 26 | 27 | 28 | 
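# Each task branch pairs its test set with a plain-text progress file: get_last_processed_index()
# reads it to pick the start index, and the task_execution* functions call update_progress() as they go,
# so an interrupted run resumes from the last recorded example instead of starting over.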
elif 'toolbench' in args.task: 29 | base_path = args.tool_root_dir 30 | index = build_index(base_path) 31 | dataset = read_json('data_toolbench/tool_instruction/toolbench_tool_instruction.json') 32 | if args.data_type == 'G2': 33 | test_data = read_json(f'''data_toolbench/test_data/{args.data_type}_category.json''') 34 | elif args.data_type == 'G3': 35 | test_data = read_json(f'''data_toolbench/test_data/{args.data_type}_instruction.json''') 36 | progress_file = f'''{args.data_type}_{args.model_name}_Easytool.txt''' 37 | 38 | 39 | elif args.task == 'restbench': 40 | Tool_dic = read_json('data_restbench/tool_instruction/tmdb_tool.json') 41 | dic_tool = {} 42 | for data in Tool_dic: 43 | dic_tool[data['ID']] = data 44 | test_data = read_json('data_restbench/test_data/tmdb.json') 45 | progress_file = f"restbench_{args.model_name}_Easytool.txt" 46 | 47 | else: 48 | print("Wrong task name") 49 | exit() 50 | 51 | start_index = get_last_processed_index(progress_file) 52 | total_files = len(test_data) 53 | retrieval_num = args.retrieval_num 54 | ind = start_index 55 | model_name = args.model_name 56 | 57 | print("-------Start Execution-------") 58 | if args.data_type == 'funcqa_mh': 59 | funcQA.task_execution_mh(args.data_type, start_index, total_files, 60 | retrieval_num, ind, model_name, dataset, 61 | Tool_dic, test_data, progress_file) 62 | elif args.data_type == 'funcqa_oh': 63 | funcQA.task_execution_oh(args.data_type, start_index, total_files, 64 | retrieval_num, ind, model_name, dataset, 65 | Tool_dic, test_data, progress_file) 66 | 67 | 68 | elif args.task == 'toolbench_retrieve': 69 | toolbench_retrieve.task_execution(args.data_type, 70 | base_path, index, dataset, test_data, progress_file, 71 | start_index, total_files, retrieval_num, ind, model_name) 72 | 73 | 74 | 75 | elif args.task == 'toolbench': 76 | toolbench.task_execution(args.data_type, 77 | base_path, index, dataset, test_data, progress_file, 78 | start_index, total_files, retrieval_num, ind, model_name) 79 | 80 | 81 | 82 | elif args.task == 'restbench': 83 | restbench.task_execution( 84 | Tool_dic, dic_tool, test_data, progress_file, 85 | start_index, total_files, retrieval_num, ind, model_name) 86 | 87 | 88 | else: 89 | print("Wrong task name") 90 | exit() 91 | -------------------------------------------------------------------------------- /easytool/requirements.txt: -------------------------------------------------------------------------------- 1 | openai==0.27.8 2 | langchain==0.0.260 3 | gdown==4.6.0 4 | tqdm 5 | argparse 6 | numpy 7 | requests 8 | pickle-mixin 9 | scikit-learn 10 | -------------------------------------------------------------------------------- /hugginggpt/.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | server/models/* 3 | !server/models/download.sh 4 | !server/models/download.ps1 -------------------------------------------------------------------------------- /hugginggpt/Dockerfile.jetson: -------------------------------------------------------------------------------- 1 | # NVIDIA Jetson embedded device support with GPU accelerated local model execution for https://github.com/microsoft/JARVIS 2 | 3 | # Base image for ffmpeg build env: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/l4t-jetpack/tags 4 | FROM nvcr.io/nvidia/l4t-jetpack:r35.2.1 AS build 5 | 6 | RUN apt update && apt install -y --no-install-recommends \ 7 | build-essential git libass-dev libx264-dev cmake && \ 8 | rm -rf /var/lib/apt/lists/* 9 | 10 | # Build ffmpeg 
dependency libraries 11 | RUN git clone https://github.com/jocover/jetson-ffmpeg.git && \ 12 | cd jetson-ffmpeg && \ 13 | sed -i 's=Libs: -L${libdir} -lnvmpi=Libs: -L${libdir} -lnvmpi -L/usr/lib/aarch64-linux-gnu/tegra -lnvbufsurface=g' nvmpi.pc.in && \ 14 | mkdir build && \ 15 | cd build && \ 16 | cmake .. && \ 17 | make -j$(nproc) && \ 18 | sudo make install && \ 19 | sudo ldconfig && \ 20 | git clone git://source.ffmpeg.org/ffmpeg.git -b release/4.2 --depth=1 && \ 21 | cd ffmpeg && \ 22 | wget https://github.com/jocover/jetson-ffmpeg/raw/master/ffmpeg_nvmpi.patch && \ 23 | git apply ffmpeg_nvmpi.patch && \ 24 | ./configure --enable-nvmpi --enable-libass --enable-libx264 --enable-gpl && \ 25 | make -j$(nproc) 26 | 27 | # Base image: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/l4t-pytorch/tags 28 | # For running JARVIS application layer 29 | from nvcr.io/nvidia/l4t-pytorch:r35.2.1-pth2.0-py3 30 | 31 | ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 32 | COPY --from=build /usr/local/lib/libnvmpi.a /usr/local/lib 33 | COPY --from=build /usr/local/lib/libnvmpi.so.1.0.0 /usr/local/lib 34 | COPY --from=build jetson-ffmpeg/build/ffmpeg/ffmpeg /usr/local/bin 35 | COPY --from=build jetson-ffmpeg/build/ffmpeg/ffprobe /usr/local/bin 36 | RUN ln /usr/local/lib/libnvmpi.so.1.0.0 /usr/local/lib/libnvmpi.so 37 | ENV MAKEFLAGS="-j$(nproc)" 38 | 39 | COPY ./server/requirements.txt . 40 | 41 | # Install model server dependencies 42 | RUN apt update && apt remove -y \ 43 | opencv-dev opencv-libs opencv-licenses opencv-main opencv-python opencv-scripts python3-numpy && \ 44 | rm -rf /var/lib/apt/lists/* 45 | 46 | RUN python3 -m pip install importlib-metadata==4.13.0 && \ 47 | python3 -m pip install -r requirements.txt && \ 48 | rm -rf requirements.txt 49 | 50 | # Update torch deps via reinstall 51 | RUN python3 -m pip install torch==2.0.0a0+ec3941ad.nv23.2 torchaudio==0.13.1+b90d798 torchvision==0.14.1a0+5e8e2f1 52 | 53 | # Downgrade opencv-python to v4.5 54 | RUN python3 -m pip install opencv-python==4.5.5.64 55 | 56 | # Install nvidia-opencv-dev 57 | RUN apt update && apt install -y --no-install-recommends \ 58 | nvidia-opencv-dev && \ 59 | rm -rf /var/lib/apt/lists/* 60 | 61 | # Fix loading of scikit dep at runtime 62 | ENV LD_PRELOAD='/usr/local/lib/python3.8/dist-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0' 63 | 64 | # Install nodejs npm from nodesource 65 | ENV NVM_DIR /root/.nvm 66 | ENV NODE_VERSION v18.16.0 67 | RUN wget -q -O - https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | bash && \ 68 | . "$NVM_DIR/nvm.sh" && \ 69 | nvm install $NODE_VERSION && \ 70 | nvm alias default $NODE_VERSION && \ 71 | nvm use default 72 | ENV NODE_PATH $NVM_DIR/versions/node/$NODE_VERSION/lib/node_modules 73 | ENV PATH $NVM_DIR/versions/node/$NODE_VERSION/bin:$PATH 74 | 75 | WORKDIR /app 76 | 77 | # Copy source files 78 | COPY . . 
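# The ffmpeg/ffprobe binaries copied in from the build stage above were configured with
# --enable-nvmpi and --enable-libx264, so `ffmpeg -hide_banner -encoders` inside a built
# container should list an nvmpi hardware H.264 encoder alongside libx264 if the build succeeded.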
79 | 80 | # Install web server dependencies 81 | RUN apt update && apt install -y --no-install-recommends \ 82 | xdg-utils && \ 83 | rm -rf /var/lib/apt/lists/* && \ 84 | cd web && \ 85 | npm install 86 | 87 | # Download local models 88 | # RUN apt update && apt install -y --no-install-recommends \ 89 | # git-lfs && \ 90 | # rm -rf /var/lib/apt/lists/* && \ 91 | # cd server/models && \ 92 | # bash download.sh 93 | 94 | # Expose the model server ports 95 | EXPOSE 8004 96 | EXPOSE 8005 97 | # Expose the web server port 98 | EXPOSE 9999 99 | 100 | WORKDIR /app/server 101 | 102 | # Start the model and web server 103 | CMD python3 models_server.py --config configs/config.default.yaml; -------------------------------------------------------------------------------- /hugginggpt/README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 |

4 | 5 |
6 |

HuggingGPT

7 |
8 | 9 | License: Apache 2.0 10 | 11 | 12 | License: Apache 2.0 13 | 14 | 15 | License: Apache 2.0 16 | 17 |
18 |

Solving AI Tasks with ChatGPT and its Friends in HuggingFace

19 |

20 | 21 |

22 | image 23 |

24 | 25 | 26 | 29 | 30 | 31 | ## Updates 32 | + [2023.07.28] We are now in the process of planning evaluation and project rebuilding. We will release a new version of Jarvis in the near future. 33 | + [2023.07.24] We released a light langchain version of Jarvis. See here. 34 | + [2023.04.16] Jarvis now supports the OpenAI service on the Azure platform and the GPT-4 model. 35 | + [2023.04.06] We added the Gradio demo and built the web API for `/tasks` and `/results` in `server` mode. 36 | + The Gradio demo is now hosted on Hugging Face Space. (Build with `inference_mode=hybrid` and `local_deployment=standard`) 37 | + The Web API `/tasks` and `/results` access intermediate results for `Stage #1`: task planning and `Stage #1-3`: model selection with execution results. See here. 38 | + [2023.04.03] We added the CLI mode and provided parameters for configuring the scale of local endpoints. 39 | + You can enjoy a lightweight experience with Jarvis without deploying the models locally. See here. 40 | + Just run `python awesome_chat.py --config configs/config.lite.yaml` to experience it. 41 | + [2023.04.01] We updated a version of code for building. 42 | 43 | ## Overview 44 | 45 | Language serves as an interface for LLMs to connect numerous AI models for solving complicated AI tasks! 46 | 47 | See our paper: [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace](http://arxiv.org/abs/2303.17580), Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu and Yueting Zhuang (the first two authors contribute equally) 48 | 49 |

50 | 51 | We introduce a collaborative system that consists of **an LLM as the controller** and **numerous expert models as collaborative executors** (from HuggingFace Hub). The workflow of our system consists of four stages: 52 | + **Task Planning**: Using ChatGPT to analyze the requests of users to understand their intention, and disassemble them into possible solvable tasks. 53 | + **Model Selection**: To solve the planned tasks, ChatGPT selects expert models hosted on Hugging Face based on their descriptions. 54 | + **Task Execution**: Invokes and executes each selected model, and return the results to ChatGPT. 55 | + **Response Generation**: Finally, using ChatGPT to integrate the prediction of all models, and generate responses. 56 | 57 | ## System Requirements 58 | 59 | ### Default (Recommended) 60 | 61 | For `configs/config.default.yaml`: 62 | 63 | + Ubuntu 16.04 LTS 64 | + VRAM >= 24GB 65 | + RAM > 12GB (minimal), 16GB (standard), 80GB (full) 66 | + Disk > 284GB 67 | + 42GB for `damo-vilab/text-to-video-ms-1.7b` 68 | + 126GB for `ControlNet` 69 | + 66GB for `stable-diffusion-v1-5` 70 | + 50GB for others 71 | 72 | ### Minimum (Lite) 73 | 74 | For `configs/config.lite.yaml`: 75 | 76 | + Ubuntu 16.04 LTS 77 | + Nothing else 78 | 79 | The configuration `configs/config.lite.yaml` does not require any expert models to be downloaded and deployed locally. However, it means that Jarvis is restricted to models running stably on HuggingFace Inference Endpoints. 80 | 81 | ## Quick Start 82 | 83 | First replace `openai.key` and `huggingface.token` in `server/configs/config.default.yaml` with **your personal OpenAI Key** and **your Hugging Face Token**, or put them in the environment variables `OPENAI_API_KEY` and `HUGGINGFACE_ACCESS_TOKEN` respectively. Then run the following commands: 84 | 85 | 86 | 87 | ### For Server: 88 | 89 | ```bash 90 | # setup env 91 | cd server 92 | conda create -n jarvis python=3.8 93 | conda activate jarvis 94 | conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia 95 | pip install -r requirements.txt 96 | 97 | # download models. Make sure that `git-lfs` is installed. 98 | cd models 99 | bash download.sh # required when `inference_mode` is `local` or `hybrid`. 100 | 101 | # run server 102 | cd .. 103 | python models_server.py --config configs/config.default.yaml # required when `inference_mode` is `local` or `hybrid` 104 | python awesome_chat.py --config configs/config.default.yaml --mode server # for text-davinci-003 105 | ``` 106 | 107 | Now you can access Jarvis' services by the Web API. 108 | 109 | + `/hugginggpt` --method `POST`, access the full service. 110 | + `/tasks` --method `POST`, access intermediate results for Stage #1. 111 | + `/results` --method `POST`, access intermediate results for Stage #1-3. 112 | 113 | For example: 114 | 115 | ```bash 116 | # request 117 | curl --location 'http://localhost:8004/tasks' \ 118 | --header 'Content-Type: application/json' \ 119 | --data '{ 120 | "messages": [ 121 | { 122 | "role": "user", 123 | "content": "based on pose of /examples/d.jpg and content of /examples/e.jpg, please show me a new image" 124 | } 125 | ] 126 | }' 127 | 128 | # response 129 | [{"args":{"image":"/examples/d.jpg"},"dep":[-1],"id":0,"task":"openpose-control"},{"args":{"image":"/examples/e.jpg"},"dep":[-1],"id":1,"task":"image-to-text"},{"args":{"image":"-0","text":"-1"},"dep":[1,0],"id":2,"task":"openpose-text-to-image"}] 130 | ``` 131 | 132 | 133 | ### For Web: 134 | 135 | We provide a user-friendly web page. 
After starting `awesome_chat.py` in a server mode, you can run the commands to communicate with Jarvis in your browser: 136 | 137 | - you need to install `nodejs` and `npm` first. 138 | - [ IMPORTANT ] if you are running the web client on another machine, you need set `http://{LAN_IP_of_the_server}:{port}/` to `HUGGINGGPT_BASE_URL` of `web/src/config/index.ts`. 139 | - if you want to use the video generation feature, you need to compile `ffmpeg` manually with H.264. 140 | - you can switch to ChatGPT by `double click` on the setting icon! 141 | 142 | ```bash 143 | cd web 144 | npm install 145 | npm run dev 146 | ``` 147 | 148 | ```bash 149 | # Optional: Install ffmpeg 150 | # This command need be executed without errors. 151 | LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i input.mp4 -vcodec libx264 output.mp4 152 | ``` 153 | 154 | 155 | 156 | ### For Gradio 157 | 158 | The Gradio demo is now hosted on Hugging Face Space. You can also run the following commands to start the demo locally: 159 | 160 | ```bash 161 | python models_server.py --config configs/config.gradio.yaml 162 | python run_gradio_demo.py --config configs/config.gradio.yaml 163 | 164 | # or run the HF Space as a Docker image (Build with `inference_mode=hibrid` and `local_deployment=standard`) 165 | docker run -it -p 7860:7860 --platform=linux/amd64 registry.hf.space/microsoft-hugginggpt:latest python app.py 166 | ``` 167 | 168 | ### For CLI: 169 | 170 | You can also run Jarvis more easily in CLI mode: 171 | 172 | ```bash 173 | cd server 174 | python awesome_chat.py --config configs/config.default.yaml --mode cli 175 | ``` 176 | 177 | Examples of CLI mode: 178 | 179 | ``` 180 | Welcome to Jarvis! A collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors. Jarvis can plan tasks, schedule Hugging Face models, generate friendly responses based on your requests, and help you with many things. Please enter your request (`exit` to exit). 181 | [ User ]: Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras in these picture? 182 | [ Jarvis ]: Based on your request, I have used several models to process the images you provided. For the image A: /examples/a.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "a cat sitting on a window sill looking out". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/a3dc.jpg', which contains two objects with labels 'potted plant' and 'cat'. For the image B: /examples/b.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "two zebras standing in a field of dry grass". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/f5df.jpg', which contains three objects with labels 'zebra'. For the image C: /examples/c.jpg, I used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text "a zebra standing in a field of tall grass". Then I used the object-detection model facebook/detr-resnet-50 to generate the image with predicted box '/images/69c7.jpg', which contains one object with label 'zebra'. Finally, I used the canny-control model canny-control to generate the image '/images/7e18.png' based on the image /examples/f.jpg. Therefore, there are four zebras in these pictures. Is there anything else I can help you with? 
183 | [ User ]: Please answer all the named entities in the sentence: Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby. 184 | [ Jarvis ]: Sure, I understand your request. Based on the inference results of the models, I will answer your request step by step. Firstly, I used the token-classification model dslim/bert-base-NER to recognize the named entities in the sentence. The inference result is that the named entities are Iron Man, Stan Lee, Larry Lieber, Don Heck and Jack Kirby. Then, I used the question-answering model deepset/roberta-base-squad2 to answer your request. The inference result is that there is no output since the context cannot be empty. Therefore, I cannot make it. I hope this answer your request. Is there anything else I can help you with? 185 | [ User ]: Please generate a canny image based on /examples/f.jpg 186 | [ Jarvis ]: Sure. I have generated a canny image based on /examples/f.jpg. To do this, I first used the image-to-text model nlpconnect/vit-gpt2-image-captioning to generate the text description of the image, which is "a herd of giraffes and zebras grazing in a field". Then I used the canny-control model to generate the canny image of the original image, which is stored in the path "/images/1ea2.png". Finally, I used the canny-text-to-image model lllyasviel/sd-controlnet-canny to generate the canny image based on the text description and the canny image, which is stored in the path "/images/ba66.png". Do you need anything else? 187 | ``` 188 | 189 | 190 | ## Configuration 191 | 192 | The server-side configuration file is `server/configs/config.default.yaml`, and some parameters are presented as follows: 193 | 194 | + `model`: LLM, currently supports `text-davinci-003`. We are working on integrating more open-source LLMs. 195 | + `inference_mode`: mode of inference endpoints 196 | + `local`: only use the local inference endpoints 197 | + `huggingface`: only use the Hugging Face Inference Endpoints **(free of local inference endpoints)** 198 | + `hybrid`: both of `local` and `huggingface` 199 | + `local_deployment`: scale of locally deployed models, works under `local` or `hybrid` inference mode: 200 | + `minimal` (RAM>12GB, ControlNet only) 201 | + `standard` (RAM>16GB, ControlNet + Standard Pipelines) 202 | + `full` (RAM>42GB, All registered models) 203 | 204 | On a personal laptop, we recommend the configuration of `inference_mode: hybrid `and `local_deployment: minimal`. But the available models under this setting may be limited due to the instability of remote Hugging Face Inference Endpoints. 205 | 206 | ## NVIDIA Jetson Embedded Device Support 207 | A [Dockerfile](./Dockerfile.jetson) is included that provides experimental support for [NVIDIA Jetson embedded devices](https://developer.nvidia.com/embedded-computing). This image provides accelerated ffmpeg, pytorch, torchaudio, and torchvision dependencies. To build the docker image, [ensure that the default docker runtime is set to 'nvidia'](https://github.com/NVIDIA/nvidia-docker/wiki/Advanced-topics#default-runtime). A pre-built image is provided at https://hub.docker.com/r/toolboc/nv-jarvis. 
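If the default runtime is not yet set, a common way to configure it (assuming `nvidia-container-runtime` is already installed; the file and path below are the usual defaults, adjust for your setup) is:

```bash
# Make 'nvidia' the default Docker runtime, then restart the Docker daemon
sudo tee /etc/docker/daemon.json > /dev/null <<'EOF'
{
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    },
    "default-runtime": "nvidia"
}
EOF
sudo systemctl restart docker
```

Also note that `docker build` expects a build context, so the build command below is typically run from the `hugginggpt` directory with a trailing `.`.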
208 | 209 | ```bash 210 | #Build the docker image 211 | docker build --pull --rm -f "Dockerfile.jetson" -t toolboc/nv-jarvis:r35.2.1 212 | ``` 213 | 214 | Due to to memory requirements, JARVIS is required to run on Jetson AGX Orin family devices (64G on-board RAM device preferred) with config options set to: 215 | * `inference_mode: local` 216 | * `local_deployment: standard` 217 | 218 | Models and configs are recommended to be provided through a volume mount from the host to the container as shown in the `docker run` step below. It is possible to uncomment the `# Download local models` section of the [Dockerfile](./Dockerfile.jetson) to build a container with models included. 219 | 220 | ### Start the model server, awesomechat, and web app on Jetson Orin AGX 221 | 222 | ```bash 223 | # run the container which will automatically start the model server 224 | docker run --name jarvis --net=host --gpus all -v ~/jarvis/configs:/app/server/configs -v ~/src/JARVIS/server/models:/app/server/models toolboc/nv-jarvis:r35.2.1 225 | 226 | # (wait for model server to complete initialization) 227 | 228 | # start awesome_chat.py 229 | docker exec jarvis python3 awesome_chat.py --config configs/config.default.yaml --mode server 230 | 231 | #start the web application (application will be acessible at http://localhost:9999) 232 | docker exec jarvis npm run dev --prefix=/app/web 233 | ``` 234 | 235 | ## Screenshots 236 | 237 |

238 | 239 | ## Citation 240 | If you find this work useful in your method, you can cite the paper as below: 241 | 242 | @inproceedings{shen2023hugginggpt, 243 | author = {Shen, Yongliang and Song, Kaitao and Tan, Xu and Li, Dongsheng and Lu, Weiming and Zhuang, Yueting}, 244 | booktitle = {Advances in Neural Information Processing Systems}, 245 | title = {HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace}, 246 | year = {2023} 247 | } 248 | 249 | ## Acknowledgement 250 | 251 | - [ChatGPT](https://platform.openai.com/) 252 | - [Hugging Face](https://huggingface.co/) 253 | - [ControlNet](https://github.com/lllyasviel/ControlNet) 254 | - [ChatGPT-vue](https://github.com/lianginx/chatgpt-vue) 255 | -------------------------------------------------------------------------------- /hugginggpt/assets/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/assets/intro.png -------------------------------------------------------------------------------- /hugginggpt/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/assets/logo.png -------------------------------------------------------------------------------- /hugginggpt/assets/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/assets/overview.jpg -------------------------------------------------------------------------------- /hugginggpt/assets/prompt_flow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/assets/prompt_flow.jpg -------------------------------------------------------------------------------- /hugginggpt/assets/screenshot_a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/assets/screenshot_a.jpg -------------------------------------------------------------------------------- /hugginggpt/assets/screenshot_q.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/assets/screenshot_q.jpg -------------------------------------------------------------------------------- /hugginggpt/server/configs/config.azure.yaml: -------------------------------------------------------------------------------- 1 | azure: 2 | api_key: REPLACE_WITH_YOUR_AZURE_API_KEY_HERE 3 | base_url: REPLACE_WITH_YOUR_ENDPOINT_HERE 4 | deployment_name: REPLACE_WITH_YOUR_DEPLOYMENT_NAME_HERE 5 | api_version: "2022-12-01" 6 | huggingface: 7 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens 8 | dev: false 9 | debug: false 10 | log_file: logs/debug.log 11 | model: text-davinci-003 # currently only support text-davinci-003, gpt-4, we will support more open-source LLMs in the future 12 | use_completion: true 13 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid 14 | local_deployment: full # minimal, standard or full, prefer full 
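# note: with `use_completion: true` and `model: text-davinci-003` above, the Azure `deployment_name`
# is expected to point at a completions-capable deployment of that model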
15 | device: cuda:0 # cuda:id or cpu 16 | num_candidate_models: 5 17 | max_description_length: 100 18 | proxy: # optional: your proxy server "http://ip:port" 19 | http_listen: 20 | host: 0.0.0.0 # if you use web as the client, please set `http://{LAN_IP_of_the_server}:{port}/` to `BASE_URL` of `web/src/config/index.ts`. 21 | port: 8004 22 | local_inference_endpoint: 23 | host: localhost 24 | port: 8005 25 | logit_bias: 26 | parse_task: 0.1 27 | choose_model: 5 28 | tprompt: 29 | parse_task: >- 30 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON []. 31 | choose_model: >- 32 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability. 33 | response_results: >- 34 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results. 35 | demos_or_presteps: 36 | parse_task: demos/demo_parse_task.json 37 | choose_model: demos/demo_choose_model.json 38 | response_results: demos/demo_response_results.json 39 | prompt: 40 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks. 41 | choose_model: >- 42 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}. 43 | response_results: >- 44 | Yes. 
Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it. } -------------------------------------------------------------------------------- /hugginggpt/server/configs/config.default.yaml: -------------------------------------------------------------------------------- 1 | openai: 2 | api_key: REPLACE_WITH_YOUR_OPENAI_API_KEY_HERE 3 | # azure: 4 | # api_key: REPLACE_WITH_YOUR_AZURE_API_KEY_HERE 5 | # base_url: REPLACE_WITH_YOUR_ENDPOINT_HERE 6 | # deployment_name: REPLACE_WITH_YOUR_DEPLOYMENT_NAME_HERE 7 | # api_version: "2022-12-01" 8 | huggingface: 9 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens 10 | dev: false 11 | debug: false 12 | log_file: logs/debug.log 13 | model: text-davinci-003 # currently only support text-davinci-003, gpt-4, we will support more open-source LLMs in the future 14 | use_completion: true 15 | inference_mode: hybrid # local, huggingface or hybrid, prefer hybrid 16 | local_deployment: full # minimal, standard or full, prefer full 17 | device: cuda:0 # cuda:id or cpu 18 | num_candidate_models: 5 19 | max_description_length: 100 20 | proxy: # optional: your proxy server "http://ip:port" 21 | http_listen: 22 | host: 0.0.0.0 # if you use web as the client, please set `http://{LAN_IP_of_the_server}:{port}/` to `BASE_URL` of `web/src/config/index.ts`. 23 | port: 8004 24 | local_inference_endpoint: 25 | host: localhost 26 | port: 8005 27 | logit_bias: 28 | parse_task: 0.1 29 | choose_model: 5 30 | tprompt: 31 | parse_task: >- 32 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. 
Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON []. 33 | choose_model: >- 34 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability. 35 | response_results: >- 36 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results. 37 | demos_or_presteps: 38 | parse_task: demos/demo_parse_task.json 39 | choose_model: demos/demo_choose_model.json 40 | response_results: demos/demo_response_results.json 41 | prompt: 42 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks. 43 | choose_model: >- 44 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}. 45 | response_results: >- 46 | Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it. } -------------------------------------------------------------------------------- /hugginggpt/server/configs/config.gradio.yaml: -------------------------------------------------------------------------------- 1 | huggingface: 2 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens 3 | dev: false 4 | debug: true 5 | log_file: logs/debug.log 6 | model: text-davinci-003 # currently only support text-davinci-003, we will support more open-source LLMs in the future 7 | use_completion: true 8 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid 9 | local_deployment: full # minimal, standard or full, prefer full 10 | device: cuda:0 # cuda:id or cpu 11 | num_candidate_models: 5 12 | max_description_length: 100 13 | proxy: # optional: your proxy server "http://ip:port" 14 | local_inference_endpoint: 15 | host: localhost 16 | port: 8005 17 | logit_bias: 18 | parse_task: 0.1 19 | choose_model: 5 20 | tprompt: 21 | parse_task: >- 22 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. 
The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON []. 23 | choose_model: >- 24 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability. 25 | response_results: >- 26 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results. 27 | demos_or_presteps: 28 | parse_task: demos/demo_parse_task.json 29 | choose_model: demos/demo_choose_model.json 30 | response_results: demos/demo_response_results.json 31 | prompt: 32 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks. 33 | choose_model: >- 34 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}. 35 | response_results: >- 36 | Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it. 
} -------------------------------------------------------------------------------- /hugginggpt/server/configs/config.lite.yaml: -------------------------------------------------------------------------------- 1 | openai: 2 | api_key: REPLACE_WITH_YOUR_OPENAI_API_KEY_HERE 3 | huggingface: 4 | token: REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN_HERE # required: huggingface token @ https://huggingface.co/settings/tokens 5 | dev: false 6 | debug: false 7 | log_file: logs/debug.log 8 | model: text-davinci-003 # currently only support text-davinci-003, gpt-4, we will support more open-source LLMs in the future 9 | use_completion: true 10 | inference_mode: huggingface # local, huggingface or hybrid, prefer hybrid 11 | local_deployment: minimal # minimal, standard or full, prefer full 12 | num_candidate_models: 5 13 | max_description_length: 100 14 | proxy: # optional: your proxy server "http://ip:port" 15 | http_listen: 16 | host: 0.0.0.0 # if you use web as the client, please set `http://{LAN_IP_of_the_server}:{port}/` to `BASE_URL` of `web/src/config/index.ts`. 17 | port: 8004 18 | logit_bias: 19 | parse_task: 0.1 20 | choose_model: 5 21 | tprompt: 22 | parse_task: >- 23 | #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or -dep_id, "image": image_url or -dep_id, "audio": audio_url or -dep_id}}]. The special tag "-dep_id" refer to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON []. 24 | choose_model: >- 25 | #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability. 26 | response_results: >- 27 | #4 Response Generation Stage: With the task execution logs, the AI assistant needs to describe the process and inference results. 
28 | demos_or_presteps: 29 | parse_task: demos/demo_parse_task.json 30 | choose_model: demos/demo_choose_model.json 31 | response_results: demos/demo_response_results.json 32 | prompt: 33 | parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks. 34 | choose_model: >- 35 | Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}. 36 | response_results: >- 37 | Yes. Please first think carefully and directly answer my request based on the inference results. Some of the inferences may not always turn out to be correct and require you to make careful consideration in making decisions. Then please detail your workflow including the used models and inference results for my request in your friendly tone. Please filter out information that is not relevant to my request. Tell me the complete path or urls of files in inference results. If there is nothing in the results, please tell me you can't make it. } -------------------------------------------------------------------------------- /hugginggpt/server/demos/demo_choose_model.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "role": "user", 4 | "content": "{{input}}" 5 | }, 6 | { 7 | "role": "assistant", 8 | "content": "{{task}}" 9 | } 10 | ] -------------------------------------------------------------------------------- /hugginggpt/server/demos/demo_parse_task.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "role": "user", 4 | "content": "Give you some pictures e1.jpg, e2.png, e3.jpg, help me count the number of sheep?" 5 | }, 6 | { 7 | "role": "assistant", 8 | "content": "[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"-1\", \"text\": \"How many sheep in the picture\"}} }}, {\"task\": \"image-to-text\", \"id\": 3, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"object-detection\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"visual-question-answering\", \"id\": 5, \"dep\": [4], \"args\": {\"image\": \"-4\", \"text\": \"How many sheep in the picture\"}} }}, {\"task\": \"image-to-text\", \"id\": 6, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"object-detection\", \"id\": 7, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 8, \"dep\": [7], \"args\": {\"image\": \"-7\", \"text\": \"How many sheep in the picture\"}}]" 9 | }, 10 | 11 | { 12 | "role":"user", 13 | "content":"Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one." 
14 | }, 15 | { 16 | "role":"assistant", 17 | "content":"[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"-1\", \"text\": \"how many objects in the picture?\" }}, {\"task\": \"text-to-image\", \"id\": 3, \"dep\": [0], \"args\": {\"text\": \"\" }}, {\"task\": \"image-to-image\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"text-to-video\", \"id\": 5, \"dep\": [0], \"args\": {\"text\": \"\" }}]" 18 | }, 19 | 20 | { 21 | "role":"user", 22 | "content":"given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice" 23 | }, 24 | { 25 | "role":"assistant", 26 | "content":"{\"task\": \"document-question-answering\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"visual-question-answering\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"image-to-text\", \"id\": 2, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpg\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"-2\" }}]" 27 | }, 28 | 29 | { 30 | "role": "user", 31 | "content": "Given an image /example.jpg, first generate a hed image, then based on the hed image generate a new image where a girl is reading a book" 32 | }, 33 | { 34 | "role": "assistant", 35 | "content": "[{\"task\": \"openpose-control\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"openpose-text-to-image\", \"id\": 1, \"dep\": [0], \"args\": {\"text\": \"a girl is reading a book\", \"image\": \"-0\" }}]" 36 | }, 37 | 38 | { 39 | "role": "user", 40 | "content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it" 41 | }, 42 | { 43 | "role": "assistant", 44 | "content": "[{\"task\": \"text-to-video\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-speech\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-image\", \"id\": 2, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}]" 45 | }, 46 | 47 | 48 | { 49 | "role": "user", 50 | "content": "please show me a joke and an image of cat" 51 | }, 52 | { 53 | "role": "assistant", 54 | "content": "[{\"task\": \"conversational\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"please show me a joke of cat\" }}, {\"task\": \"text-to-image\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a photo of cat\" }}]" 55 | } 56 | ] 57 | -------------------------------------------------------------------------------- /hugginggpt/server/demos/demo_response_results.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "role": "user", 4 | "content": "{{input}}" 5 | }, 6 | { 7 | "role": "assistant", 8 | "content": "Before give you a response, I want to introduce my workflow for your request, which is shown in the following JSON data: {{processes}}. Do you have any demands regarding my response?" 
9 | } 10 | ] -------------------------------------------------------------------------------- /hugginggpt/server/get_token_ids.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | 3 | encodings = { 4 | "gpt-4": tiktoken.get_encoding("cl100k_base"), 5 | "gpt-4-32k": tiktoken.get_encoding("cl100k_base"), 6 | "gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"), 7 | "gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"), 8 | "text-davinci-003": tiktoken.get_encoding("p50k_base"), 9 | "text-davinci-002": tiktoken.get_encoding("p50k_base"), 10 | "text-davinci-001": tiktoken.get_encoding("r50k_base"), 11 | "text-curie-001": tiktoken.get_encoding("r50k_base"), 12 | "text-babbage-001": tiktoken.get_encoding("r50k_base"), 13 | "text-ada-001": tiktoken.get_encoding("r50k_base"), 14 | "davinci": tiktoken.get_encoding("r50k_base"), 15 | "curie": tiktoken.get_encoding("r50k_base"), 16 | "babbage": tiktoken.get_encoding("r50k_base"), 17 | "ada": tiktoken.get_encoding("r50k_base"), 18 | } 19 | 20 | max_length = { 21 | "gpt-4": 8192, 22 | "gpt-4-32k": 32768, 23 | "gpt-3.5-turbo": 4096, 24 | "gpt-3.5-turbo-0301": 4096, 25 | "text-davinci-003": 4096, 26 | "text-davinci-002": 4096, 27 | "text-davinci-001": 2049, 28 | "text-curie-001": 2049, 29 | "text-babbage-001": 2049, 30 | "text-ada-001": 2049, 31 | "davinci": 2049, 32 | "curie": 2049, 33 | "babbage": 2049, 34 | "ada": 2049 35 | } 36 | 37 | def count_tokens(model_name, text): 38 | return len(encodings[model_name].encode(text)) 39 | 40 | def get_max_context_length(model_name): 41 | return max_length[model_name] 42 | 43 | def get_token_ids_for_task_parsing(model_name): 44 | text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "-"}''' 45 | res = encodings[model_name].encode(text) 46 | res = list(set(res)) 47 | return res 48 | 49 | def get_token_ids_for_choose_model(model_name): 50 | text = '''{"id": "reason"}''' 51 | res = encodings[model_name].encode(text) 52 | res = list(set(res)) 53 | return res -------------------------------------------------------------------------------- /hugginggpt/server/models/download.ps1: -------------------------------------------------------------------------------- 1 | $models = @( 2 | "nlpconnect/vit-gpt2-image-captioning", 3 | "lllyasviel/ControlNet", 4 | "lllyasviel/sd-controlnet-canny", 5 | "lllyasviel/sd-controlnet-depth", 6 | "lllyasviel/sd-controlnet-hed", 7 | "lllyasviel/sd-controlnet-mlsd", 8 | "lllyasviel/sd-controlnet-openpose", 9 | "lllyasviel/sd-controlnet-scribble", 10 | "lllyasviel/sd-controlnet-seg", 11 | "runwayml/stable-diffusion-v1-5", 12 | "damo-vilab/text-to-video-ms-1.7b", 13 | "microsoft/speecht5_asr", 14 | "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k", 15 | "espnet/kan-bayashi_ljspeech_vits", 16 | 
"facebook/detr-resnet-101", 17 | "microsoft/speecht5_hifigan", 18 | "microsoft/speecht5_vc", 19 | "openai/whisper-base", 20 | "Intel/dpt-large", 21 | "facebook/detr-resnet-50-panoptic", 22 | "facebook/detr-resnet-50", 23 | "google/owlvit-base-patch32", 24 | "impira/layoutlm-document-qa", 25 | "ydshieh/vit-gpt2-coco-en", 26 | "dandelin/vilt-b32-finetuned-vqa", 27 | "lambdalabs/sd-image-variations-diffusers", 28 | "facebook/maskformer-swin-base-coco", 29 | "Intel/dpt-hybrid-midas" 30 | ) 31 | 32 | $CURRENT_DIR = Get-Location 33 | 34 | foreach ($model in $models) { 35 | Write-Host "----- Downloading from https://huggingface.co/$model -----" 36 | if (Test-Path $model) { 37 | Set-Location $model 38 | git pull 39 | git lfs pull 40 | Set-Location $CURRENT_DIR 41 | } else { 42 | git clone "https://huggingface.co/$model" $model 43 | } 44 | } 45 | 46 | $datasets = @( 47 | "Matthijs/cmu-arctic-xvectors" 48 | ) 49 | 50 | foreach ($dataset in $datasets) { 51 | Write-Host "----- Downloading from https://huggingface.co/datasets/$dataset -----" 52 | if (Test-Path $dataset) { 53 | Set-Location $dataset 54 | git pull 55 | git lfs pull 56 | Set-Location $CURRENT_DIR 57 | } else { 58 | git clone "https://huggingface.co/datasets/$dataset" $dataset 59 | } 60 | } -------------------------------------------------------------------------------- /hugginggpt/server/models/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set models and datasets to download 4 | models=( 5 | "nlpconnect/vit-gpt2-image-captioning" 6 | "lllyasviel/ControlNet" 7 | "lllyasviel/sd-controlnet-canny" 8 | "lllyasviel/sd-controlnet-depth" 9 | "lllyasviel/sd-controlnet-hed" 10 | "lllyasviel/sd-controlnet-mlsd" 11 | "lllyasviel/sd-controlnet-openpose" 12 | "lllyasviel/sd-controlnet-scribble" 13 | "lllyasviel/sd-controlnet-seg" 14 | "runwayml/stable-diffusion-v1-5" 15 | "damo-vilab/text-to-video-ms-1.7b" 16 | "microsoft/speecht5_asr" 17 | "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k" 18 | "espnet/kan-bayashi_ljspeech_vits" 19 | "facebook/detr-resnet-101" 20 | "microsoft/speecht5_hifigan" 21 | "microsoft/speecht5_vc" 22 | "openai/whisper-base" 23 | "Intel/dpt-large" 24 | "facebook/detr-resnet-50-panoptic" 25 | "facebook/detr-resnet-50" 26 | "google/owlvit-base-patch32" 27 | "impira/layoutlm-document-qa" 28 | "ydshieh/vit-gpt2-coco-en" 29 | "dandelin/vilt-b32-finetuned-vqa" 30 | "lambdalabs/sd-image-variations-diffusers" 31 | "facebook/maskformer-swin-base-coco" 32 | "Intel/dpt-hybrid-midas" 33 | ) 34 | datasets=("Matthijs/cmu-arctic-xvectors") 35 | 36 | # Set the current directory 37 | CURRENT_DIR=$(pwd) 38 | 39 | # Download models 40 | for model in "${models[@]}"; do 41 | echo "----- Downloading from https://huggingface.co/${model} -----" 42 | if [ -d "${model}" ]; then 43 | (cd "${model}" && git pull && git lfs pull) 44 | else 45 | git clone --recurse-submodules "https://huggingface.co/${model}" "${model}" 46 | fi 47 | done 48 | 49 | # Download datasets 50 | for dataset in "${datasets[@]}"; do 51 | echo "----- Downloading from https://huggingface.co/datasets/${dataset} -----" 52 | if [ -d "${dataset}" ]; then 53 | (cd "${dataset}" && git pull && git lfs pull) 54 | else 55 | git clone --recurse-submodules "https://huggingface.co/datasets/${dataset}" "${dataset}" 56 | fi 57 | done -------------------------------------------------------------------------------- /hugginggpt/server/public/examples/a.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/server/public/examples/a.jpg -------------------------------------------------------------------------------- /hugginggpt/server/public/examples/b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/server/public/examples/b.jpg -------------------------------------------------------------------------------- /hugginggpt/server/public/examples/c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/server/public/examples/c.jpg -------------------------------------------------------------------------------- /hugginggpt/server/public/examples/d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/server/public/examples/d.jpg -------------------------------------------------------------------------------- /hugginggpt/server/public/examples/e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/server/public/examples/e.jpg -------------------------------------------------------------------------------- /hugginggpt/server/public/examples/f.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/server/public/examples/f.jpg -------------------------------------------------------------------------------- /hugginggpt/server/public/examples/g.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/server/public/examples/g.jpg -------------------------------------------------------------------------------- /hugginggpt/server/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/diffusers.git@8c530fc2f6a76a2aefb6b285dce6df1675092ac6#egg=diffusers 2 | git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda#egg=transformers 3 | git+https://github.com/patrickvonplaten/controlnet_aux@78efc716868a7f5669c288233d65b471f542ce40#egg=controlnet_aux 4 | tiktoken==0.3.3 5 | pydub==0.25.1 6 | espnet==202301 7 | espnet_model_zoo==0.1.7 8 | flask==2.2.3 9 | flask_cors==3.0.10 10 | waitress==2.1.2 11 | datasets==2.11.0 12 | asteroid==0.6.0 13 | speechbrain==0.5.14 14 | timm==0.6.13 15 | typeguard==2.13.3 16 | accelerate==0.18.0 17 | pytesseract==0.3.10 18 | gradio==3.24.1 -------------------------------------------------------------------------------- /hugginggpt/server/run_gradio_demo.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import gradio as gr 3 | import re 4 | from diffusers.utils import load_image 5 | import requests 6 | from awesome_chat import chat_huggingface 7 | 8 | all_messages = [] 9 | OPENAI_KEY = "" 10 | 11 | def add_message(content, role): 12 | message = {"role":role, "content":content} 13 | all_messages.append(message) 14 | 15 | 
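# extract_medias() below scans a chat message for media references and returns the
# tuple (image_urls, audio_urls, video_urls). Its patterns accept absolute http(s)
# URLs as well as site-relative paths; for example, on the bundled example prompt
# "what is in the /examples/a.jpg" it yields (["/examples/a.jpg"], [], []).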
def extract_medias(message): 16 | image_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(jpg|jpeg|tiff|gif|png)") 17 | image_urls = [] 18 | for match in image_pattern.finditer(message): 19 | if match.group(0) not in image_urls: 20 | image_urls.append(match.group(0)) 21 | 22 | audio_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(flac|wav)") 23 | audio_urls = [] 24 | for match in audio_pattern.finditer(message): 25 | if match.group(0) not in audio_urls: 26 | audio_urls.append(match.group(0)) 27 | 28 | video_pattern = re.compile(r"(http(s?):|\/)?([\.\/_\w:-])*?\.(mp4)") 29 | video_urls = [] 30 | for match in video_pattern.finditer(message): 31 | if match.group(0) not in video_urls: 32 | video_urls.append(match.group(0)) 33 | 34 | return image_urls, audio_urls, video_urls 35 | 36 | def set_openai_key(openai_key): 37 | global OPENAI_KEY 38 | OPENAI_KEY = openai_key 39 | return OPENAI_KEY 40 | 41 | def add_text(messages, message): 42 | if len(OPENAI_KEY) == 0 or not OPENAI_KEY.startswith("sk-"): 43 | return messages, "Please set your OpenAI API key first." 44 | add_message(message, "user") 45 | messages = messages + [(message, None)] 46 | image_urls, audio_urls, video_urls = extract_medias(message) 47 | 48 | for image_url in image_urls: 49 | if not image_url.startswith("http"): 50 | image_url = "public/" + image_url 51 | image = load_image(image_url) 52 | name = f"public/images/{str(uuid.uuid4())[:4]}.jpg" 53 | image.save(name) 54 | messages = messages + [((f"{name}",), None)] 55 | for audio_url in audio_urls: 56 | if not audio_url.startswith("http"): 57 | audio_url = "public/" + audio_url 58 | ext = audio_url.split(".")[-1] 59 | name = f"public/audios/{str(uuid.uuid4())[:4]}.{ext}" 60 | response = requests.get(audio_url) 61 | with open(name, "wb") as f: 62 | f.write(response.content) 63 | messages = messages + [((f"{name}",), None)] 64 | for video_url in video_urls: 65 | if not video_url.startswith("http"): 66 | video_url = "public/" + video_url 67 | ext = video_url.split(".")[-1] 68 | name = f"public/audios/{str(uuid.uuid4())[:4]}.{ext}" 69 | response = requests.get(video_url) 70 | with open(name, "wb") as f: 71 | f.write(response.content) 72 | messages = messages + [((f"{name}",), None)] 73 | return messages, "" 74 | 75 | def bot(messages): 76 | if len(OPENAI_KEY) == 0 or not OPENAI_KEY.startswith("sk-"): 77 | return messages 78 | message = chat_huggingface(all_messages, OPENAI_KEY, "openai", "https://api.openai.com/v1/completions")["message"] 79 | image_urls, audio_urls, video_urls = extract_medias(message) 80 | add_message(message, "assistant") 81 | messages[-1][1] = message 82 | for image_url in image_urls: 83 | if not image_url.startswith("http"): 84 | image_url = image_url.replace("public/", "") 85 | messages = messages + [((None, (f"public/{image_url}",)))] 86 | for audio_url in audio_urls: 87 | if not audio_url.startswith("http"): 88 | audio_url = audio_url.replace("public/", "") 89 | messages = messages + [((None, (f"public/{audio_url}",)))] 90 | for video_url in video_urls: 91 | if not video_url.startswith("http"): 92 | video_url = video_url.replace("public/", "") 93 | messages = messages + [((None, (f"public/{video_url}",)))] 94 | return messages 95 | 96 | with gr.Blocks() as demo: 97 | gr.Markdown("

HuggingGPT (Dev)

") 98 | with gr.Row(): 99 | openai_api_key = gr.Textbox( 100 | show_label=False, 101 | placeholder="Set your OpenAI API key here and press Enter", 102 | lines=1, 103 | type="password", 104 | ) 105 | 106 | chatbot = gr.Chatbot([], elem_id="chatbot").style(height=500) 107 | 108 | with gr.Row(): 109 | txt = gr.Textbox( 110 | show_label=False, 111 | placeholder="Enter text and press enter. The url of the multimedia resource must contain the extension name.", 112 | ).style(container=False) 113 | 114 | txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then( 115 | bot, chatbot, chatbot 116 | ) 117 | openai_api_key.submit(set_openai_key, [openai_api_key], [openai_api_key]) 118 | 119 | gr.Examples( 120 | examples=["Given a collection of image A: /examples/a.jpg, B: /examples/b.jpg, C: /examples/c.jpg, please tell me how many zebras in these picture?", 121 | "Please generate a canny image based on /examples/f.jpg", 122 | "show me a joke and an image of cat", 123 | "what is in the /examples/a.jpg", 124 | "generate a video and audio about a dog is running on the grass", 125 | "based on the /examples/a.jpg, please generate a video and audio", 126 | "based on pose of /examples/d.jpg and content of /examples/e.jpg, please show me a new image", 127 | ], 128 | inputs=txt 129 | ) 130 | 131 | demo.launch() -------------------------------------------------------------------------------- /hugginggpt/web/electron/.npmrc: -------------------------------------------------------------------------------- 1 | registry=https://registry.npmmirror.com 2 | electron_mirror=https://npmmirror.com/mirrors/electron/ 3 | chromedriver_cdnurl=https://npmmirror.com/mirrors/chromedriver 4 | -------------------------------------------------------------------------------- /hugginggpt/web/electron/electron-builder.yml: -------------------------------------------------------------------------------- 1 | appId: com.chatgpt.app 2 | productName: ChatGPT 3 | directories: 4 | output: ../electron-dist 5 | mac: 6 | category: public.app-category.productivity 7 | target: 8 | - target: dmg 9 | arch: 10 | - arm64 11 | - x64 12 | win: 13 | target: 14 | - target: nsis 15 | arch: 16 | - x64 17 | - ia32 18 | -------------------------------------------------------------------------------- /hugginggpt/web/electron/main.js: -------------------------------------------------------------------------------- 1 | // Modules to control application life and create native browser window 2 | const {app, BrowserWindow} = require('electron') 3 | const path = require('path') 4 | 5 | function createWindow () { 6 | // Create the browser window. 7 | const mainWindow = new BrowserWindow({ 8 | width: 800, 9 | height: 600, 10 | webPreferences: { 11 | preload: path.join(__dirname, 'preload.js') 12 | } 13 | }) 14 | 15 | // and load the index.html of the app. 16 | mainWindow.loadFile('index.html') 17 | 18 | // Open the DevTools. 19 | // mainWindow.webContents.openDevTools() 20 | } 21 | 22 | // This method will be called when Electron has finished 23 | // initialization and is ready to create browser windows. 24 | // Some APIs can only be used after this event occurs. 25 | app.whenReady().then(() => { 26 | createWindow() 27 | 28 | app.on('activate', function () { 29 | // On macOS it's common to re-create a window in the app when the 30 | // dock icon is clicked and there are no other windows open. 31 | if (BrowserWindow.getAllWindows().length === 0) createWindow() 32 | }) 33 | }) 34 | 35 | // Quit when all windows are closed, except on macOS. 
There, it's common 36 | // for applications and their menu bar to stay active until the user quits 37 | // explicitly with Cmd + Q. 38 | app.on('window-all-closed', function () { 39 | if (process.platform !== 'darwin') app.quit() 40 | }) 41 | 42 | // In this file you can include the rest of your app's specific main process 43 | // code. You can also put them in separate files and require them here. 44 | -------------------------------------------------------------------------------- /hugginggpt/web/electron/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "chatgpt", 3 | "version": "1.0.0", 4 | "main": "main.js", 5 | "scripts": { 6 | "dev": "electron .", 7 | "build": "electron-builder" 8 | }, 9 | "devDependencies": { 10 | "electron": "^23.1.0", 11 | "electron-builder": "^23.6.0" 12 | } 13 | } -------------------------------------------------------------------------------- /hugginggpt/web/electron/preload.js: -------------------------------------------------------------------------------- 1 | /** 2 | * The preload script runs before. It has access to web APIs 3 | * as well as Electron's renderer process modules and some 4 | * polyfilled Node.js functions. 5 | * 6 | * https://www.electronjs.org/docs/latest/tutorial/sandbox 7 | */ 8 | window.addEventListener('DOMContentLoaded', () => { 9 | const replaceText = (selector, text) => { 10 | const element = document.getElementById(selector) 11 | if (element) element.innerText = text 12 | } 13 | 14 | for (const type of ['chrome', 'node', 'electron']) { 15 | replaceText(`${type}-version`, process.versions[type]) 16 | } 17 | }) 18 | -------------------------------------------------------------------------------- /hugginggpt/web/env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | -------------------------------------------------------------------------------- /hugginggpt/web/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | HuggingGPT 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /hugginggpt/web/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vue3-ts-vite-router-tailwindcss", 3 | "version": "0.0.0", 4 | "main": "index.html", 5 | "scripts": { 6 | "dev": "vite", 7 | "build": "run-p type-check build-only", 8 | "preview": "vite preview --port 4173", 9 | "build-only": "vite build", 10 | "type-check": "vue-tsc --noEmit", 11 | "e:dev": "yarn build && cp -r electron/. dist && cd dist && yarn && yarn dev", 12 | "e:build": "yarn build && cp -r electron/. dist && cd dist && yarn && yarn build" 13 | }, 14 | "dependencies": { 15 | "axios": "^1.3.4", 16 | "vue": "^3.2.38", 17 | "vue-router": "^4.1.5" 18 | }, 19 | "devDependencies": { 20 | "@types/node": "^16.11.56", 21 | "@vitejs/plugin-vue": "^3.0.3", 22 | "@vue/tsconfig": "^0.1.3", 23 | "autoprefixer": "^10.4.12", 24 | "npm-run-all": "^4.1.5", 25 | "postcss": "^8.4.18", 26 | "tailwindcss": "^3.2.1", 27 | "typescript": "~4.7.4", 28 | "vite": "^3.2.5", 29 | "vue-tsc": "^0.40.7" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hugginggpt/web/postcss.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | } 7 | -------------------------------------------------------------------------------- /hugginggpt/web/public/audio.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hugginggpt/web/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/web/public/favicon.ico -------------------------------------------------------------------------------- /hugginggpt/web/public/video.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hugginggpt/web/src/App.vue: -------------------------------------------------------------------------------- 1 | 3 | 4 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /hugginggpt/web/src/api/chatgpt.ts: -------------------------------------------------------------------------------- 1 | import type { CleanChatMessage } from "@/types"; 2 | import axios, { AxiosError } from "axios"; 3 | import { CHAT_GPT_URL, CHAT_GPT_LLM } from "@/config"; 4 | 5 | axios.defaults.headers.post["Content-Type"] = "application/json"; 6 | 7 | export async function chatgpt(messageList: CleanChatMessage[], apiKey: string) { 8 | var endpoint = `${CHAT_GPT_URL}/v1/chat/completions` 9 | 10 | try { 11 | const completion = await axios({ 12 | url: endpoint, 13 | method: "post", 14 | headers: { 15 | Authorization: `Bearer ${apiKey}`, 16 | }, 17 | data: { 18 | model: CHAT_GPT_LLM, 19 | messages: messageList 20 | }, 21 | timeout: 60000, // 180 seconds 22 | }); 23 | return { 24 | status: "success", 25 | data: completion.data.choices[0].message.content, 26 | }; 27 | } catch (error: any) { 28 | return { 29 | status: "error", 30 | message: error.message 31 | }; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- 
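For quick sanity checks outside the Vue client, the chat-completions request issued by chatgpt.ts above can be reproduced in a few lines of Python. The sketch below is illustrative rather than part of the repository: the helper name call_chat_completions and the OPENAI_API_KEY environment variable are assumptions, and the requests package is assumed to be installed.

import os
import requests

def call_chat_completions(message_list, base_url="https://api.openai.com", model="gpt-3.5-turbo"):
    # Mirror the POST made in chatgpt.ts: Bearer auth and a JSON body carrying model + messages.
    response = requests.post(
        f"{base_url}/v1/chat/completions",
        headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
        json={"model": model, "messages": message_list},
        timeout=60,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

# Hypothetical usage:
# print(call_chat_completions([{"role": "user", "content": "Say hello"}]))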
/hugginggpt/web/src/api/hugginggpt.ts: -------------------------------------------------------------------------------- 1 | import type { CleanChatMessage } from "@/types"; 2 | import axios, { AxiosError } from "axios"; 3 | import { HUGGINGGPT_BASE_URL } from "@/config"; 4 | 5 | const model = "gpt-3.5-turbo"; 6 | 7 | axios.defaults.headers.post["Content-Type"] = "application/json"; 8 | 9 | export async function hugginggpt(messageList: CleanChatMessage[]) { 10 | var endpoint = `${HUGGINGGPT_BASE_URL}/hugginggpt` 11 | try { 12 | const response = await axios({ 13 | url: endpoint, 14 | method: "post", 15 | data: { 16 | model, 17 | messages: messageList.slice(1) 18 | }, 19 | timeout: 180000, // 180 seconds 20 | }); 21 | return { 22 | status: "success", 23 | data: response.data.message, 24 | }; 25 | } catch (error: any) { 26 | return { 27 | status: "error", 28 | message: error.message 29 | }; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/audio.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/clip.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/hugginggpt/web/src/assets/favicon.ico -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/huggingface.svg: -------------------------------------------------------------------------------- 1 | 2 | 9 | 13 | 17 | 21 | 25 | 29 | 38 | 42 | 43 | 44 | 48 | 49 | 53 | 57 | 61 | 65 | 69 | 73 | 74 | -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/setting.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/switch.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hugginggpt/web/src/assets/tailwind.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | @layer components { 6 | .btn { 7 | @apply px-4 py-2 text-sm font-medium tracking-wide text-white capitalize transition-colors duration-300 transform bg-blue-700 rounded-md hover:bg-blue-600 focus:outline-none focus:bg-blue-600 whitespace-nowrap disabled:bg-blue-300; 8 | } 9 | .input { 10 | @apply px-4 py-2 text-gray-700 bg-white border rounded-md mr-2 sm:mr-4 focus:border-blue-400 focus:outline-none focus:ring focus:ring-blue-300 focus:ring-opacity-40 
flex-grow; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /hugginggpt/web/src/components/Loading.vue: -------------------------------------------------------------------------------- 1 | 16 | 17 | 114 | -------------------------------------------------------------------------------- /hugginggpt/web/src/config/index.ts: -------------------------------------------------------------------------------- 1 | const HUGGINGGPT_BASE_URL = "http://localhost:8004" 2 | 3 | // use ChatGPT: double click on the setting icon 4 | const CHAT_GPT_URL = "https://api.openai.com" 5 | const CHAT_GPT_LLM = "gpt-3.5-turbo" // gpt-3.5-turbo, gpt-4 6 | // Dev: local endpoint 7 | // const CHAT_GPT_URL = "http://localhost:8006" 8 | 9 | 10 | export {HUGGINGGPT_BASE_URL, CHAT_GPT_URL, CHAT_GPT_LLM} -------------------------------------------------------------------------------- /hugginggpt/web/src/main.ts: -------------------------------------------------------------------------------- 1 | import { createApp } from "vue"; 2 | import App from "./App.vue"; 3 | import router from "./router"; 4 | import "./assets/tailwind.css"; 5 | 6 | const app = createApp(App); 7 | 8 | app.use(router).mount("#app"); 9 | -------------------------------------------------------------------------------- /hugginggpt/web/src/prompt/index.ts: -------------------------------------------------------------------------------- 1 | import type {Collection, ChatMessage } from "@/types"; 2 | 3 | const ChatGPTTerminalMessage:ChatMessage[] = [ 4 | { 5 | role: "assistant", 6 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ", 7 | type: "text", 8 | first: true 9 | }, 10 | { 11 | role: "user", 12 | content: "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. When I need to tell you something in English, I will do so by putting text inside curly brackets {like this}.", 13 | type: "text", 14 | first: true 15 | }, 16 | { 17 | role: "assistant", 18 | content: "Yes, I will do it for you. Please type the command and I will reply with the terminal output.", 19 | type: "text", 20 | first: true 21 | } 22 | ] 23 | 24 | const ChatGPTPolishMessage:ChatMessage[] = [ 25 | { 26 | role: "assistant", 27 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ", 28 | type: "text", 29 | first: true 30 | }, 31 | { 32 | role: "user", 33 | content: "You are a well-trained AI writing assistant with expertise in writing academic papers for computer conferences. By giving you a draft paragraph, I hope you can help me polish my writing with your knowledge. The language should be concise and consistent with the style of an academic paper.", 34 | type: "text", 35 | first: true 36 | }, 37 | { 38 | role: "assistant", 39 | content: "No problem, I will think carefully and polish the paper for you.", 40 | type: "text", 41 | first: true 42 | }, 43 | ] 44 | 45 | const ChatGPTTranslationMessage:ChatMessage[] = [ 46 | { 47 | role: "assistant", 48 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ", 49 | type: "text", 50 | first: true 51 | }, 52 | { 53 | role: "user", 54 | content: "I want you to act as an English translator, spelling corrector and improver. 
I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in English. I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level English words and sentences. Keep the meaning same, but make them more literary. I want you to only reply the correction, the improvements and nothing else, do not write explanations.", 55 | type: "text", 56 | first: true 57 | }, 58 | { 59 | role: "assistant", 60 | content: "Sure, I will act as an English translator and improver.", 61 | type: "text", 62 | first: true 63 | }, 64 | ] 65 | 66 | 67 | const defaultChatGPTMessage:ChatMessage[] = [ 68 | { 69 | role: "assistant", 70 | content: "Hi there! I am OpenAI ChatGPT, an AI assistant for you. How can I help you? ", 71 | type: "text", 72 | first: true 73 | } 74 | ] 75 | 76 | const defaultHuggingGPTMessage:ChatMessage[] = [ 77 | { 78 | role: "assistant", 79 | content: "Hi there, I am HuggingGPT empowered by Huggingface family! Yes, I can provide thousands of models for dozens of tasks. For more fun and creativity, I have invited Diffusers family to join our team. Feel free to experience it!", 80 | type: "text", 81 | first: true 82 | } 83 | ] 84 | 85 | const promptCollection: Collection = { 86 | "chatgpt": { 87 | "terminal": ChatGPTTerminalMessage, 88 | "polish": ChatGPTPolishMessage, 89 | "translation": ChatGPTTranslationMessage, 90 | "default": defaultChatGPTMessage, 91 | }, 92 | "hugginggpt": { 93 | "default": defaultHuggingGPTMessage 94 | } 95 | } 96 | 97 | 98 | export default promptCollection -------------------------------------------------------------------------------- /hugginggpt/web/src/router/index.ts: -------------------------------------------------------------------------------- 1 | import { createRouter, createWebHashHistory } from "vue-router"; 2 | 3 | const router = createRouter({ 4 | history: createWebHashHistory(import.meta.env.BASE_URL), 5 | routes: [ 6 | { 7 | path: "/", 8 | name: "home", 9 | component: () => import("@/views/home.vue"), 10 | }, 11 | ], 12 | }); 13 | 14 | export default router; 15 | -------------------------------------------------------------------------------- /hugginggpt/web/src/types/index.ts: -------------------------------------------------------------------------------- 1 | export interface ChatMessage { 2 | role: "user" | "assistant" | "system"; 3 | type: "text" | "image" | "audio" | "video" | "code"; 4 | first: boolean; 5 | content: string; 6 | } 7 | 8 | export interface CleanChatMessage { 9 | role: "user" | "assistant" | "system"; 10 | content: string; 11 | } 12 | 13 | export interface Collection { 14 | chatgpt: { 15 | [key: string]: ChatMessage[]; 16 | }; 17 | hugginggpt: { 18 | [key: string]: ChatMessage[]; 19 | }; 20 | } 21 | -------------------------------------------------------------------------------- /hugginggpt/web/tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | module.exports = { 3 | content: ["./index.html", "./src/**/*.{vue,js,ts,jsx,tsx}"], 4 | theme: { 5 | extend: {}, 6 | }, 7 | plugins: [], 8 | } 9 | -------------------------------------------------------------------------------- /hugginggpt/web/tsconfig.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@vue/tsconfig/tsconfig.node.json", 3 | "include": ["vite.config.*", "vitest.config.*", 
"cypress.config.*"], 4 | "compilerOptions": { 5 | "composite": true, 6 | "types": ["node"] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /hugginggpt/web/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@vue/tsconfig/tsconfig.web.json", 3 | "include": ["env.d.ts", "src/**/*", "src/**/*.vue"], 4 | "compilerOptions": { 5 | "baseUrl": ".", 6 | "paths": { 7 | "@/*": ["./src/*"] 8 | }, 9 | "resolveJsonModule": true 10 | }, 11 | 12 | "references": [ 13 | { 14 | "path": "./tsconfig.config.json" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /hugginggpt/web/vite.config.ts: -------------------------------------------------------------------------------- 1 | import { fileURLToPath, URL } from "node:url"; 2 | import path from "path"; 3 | import { defineConfig } from "vite"; 4 | import vue from "@vitejs/plugin-vue"; 5 | 6 | // https://vitejs.dev/config/ 7 | export default defineConfig({ 8 | server: { 9 | host: "0.0.0.0", 10 | port: 9999, 11 | open: true, 12 | cors: true, 13 | }, 14 | plugins: [vue()], 15 | base: "./", 16 | resolve: { 17 | alias: { 18 | "@": path.resolve(__dirname, "./src"), 19 | }, 20 | }, 21 | }); 22 | -------------------------------------------------------------------------------- /taskbench/assets/backinstruct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/taskbench/assets/backinstruct.png -------------------------------------------------------------------------------- /taskbench/assets/eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/taskbench/assets/eval.png -------------------------------------------------------------------------------- /taskbench/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/taskbench/assets/logo.png -------------------------------------------------------------------------------- /taskbench/assets/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/JARVIS/c62e0faac76c4a2907cabe2cfe4bbe5f2e613400/taskbench/assets/logo2.png -------------------------------------------------------------------------------- /taskbench/batch_evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | set -e 4 | 5 | export CUDA_VISIBLE_DEVICES=0 6 | 7 | data_dir=$1 8 | prediction_dir=$2 9 | 10 | if [[ $data_dir == *"dailylifeapis"* ]]; then 11 | dependency_type="temporal" 12 | else 13 | dependency_type="resource" 14 | fi 15 | 16 | for file in $data_dir/$prediction_dir/*.json 17 | do 18 | llm=$(basename $file .json) 19 | # replace prediction_dir's "predictions" with "metrics" 20 | metrics=$(echo $prediction_dir | sed 's/predictions/metrics/g') 21 | if [ -f $data_dir/$metrics/${llm}_splits_all_tools_all_metric_all.json ] && [ -s $data_dir/$metrics/${llm}_splits_all_tools_all_metric_all.json ]; 22 | then 23 | continue 24 | fi 25 | echo $llm 26 | python evaluate.py --data_dir $data_dir --prediction_dir $prediction_dir --llm $llm --splits all --n_tools all --mode add 
--dependency_type $dependency_type -m all 27 | done -------------------------------------------------------------------------------- /taskbench/data_huggingface/tool_desc.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": [ 3 | { 4 | "id": "Token Classification", 5 | "desc": "Token classification is a natural language understanding task in which a label is assigned to some tokens in a text. Some popular token classification subtasks are Named Entity Recognition (NER) and Part-of-Speech (PoS) tagging. NER models could be trained to identify specific entities in a text, such as dates, individuals and places; and PoS tagging would identify, for example, which words in a text are verbs, nouns, and punctuation marks.", 6 | "input-type": [ 7 | "text" 8 | ], 9 | "output-type": [ 10 | "text" 11 | ] 12 | }, 13 | { 14 | "id": "Translation", 15 | "desc": "Translation is the task of converting text from one language to another.", 16 | "input-type": [ 17 | "text" 18 | ], 19 | "output-type": [ 20 | "text" 21 | ] 22 | }, 23 | { 24 | "id": "Summarization", 25 | "desc": "Summarization is the task of producing a shorter version of a document while preserving its important information. Some models can extract text from the original input, while other models can generate entirely new text.", 26 | "input-type": [ 27 | "text" 28 | ], 29 | "output-type": [ 30 | "text" 31 | ] 32 | }, 33 | { 34 | "id": "Question Answering", 35 | "desc": "Question Answering models can retrieve the answer to a question from a given text, which is useful for searching for an answer in a document.", 36 | "input-type": [ 37 | "text", 38 | "text" 39 | ], 40 | "output-type": [ 41 | "text" 42 | ] 43 | }, 44 | { 45 | "id": "Conversational", 46 | "desc": "Conversational response modelling is the task of generating conversational text that is relevant, coherent and knowledgable given a prompt. These models have applications in chatbots, and as a part of voice assistants", 47 | "input-type": [ 48 | "text" 49 | ], 50 | "output-type": [ 51 | "text" 52 | ] 53 | }, 54 | { 55 | "id": "Text Generation", 56 | "desc": "Generating text is the task of producing new text. These models can, for example, fill in incomplete text or paraphrase.", 57 | "input-type": [ 58 | "text" 59 | ], 60 | "output-type": [ 61 | "text" 62 | ] 63 | }, 64 | { 65 | "id": "Sentence Similarity", 66 | "desc": "Sentence Similarity is the task of determining how similar two texts are. This task is particularly useful for information retrieval and clustering/grouping.", 67 | "input-type": [ 68 | "text", 69 | "text" 70 | ], 71 | "output-type": [] 72 | }, 73 | { 74 | "id": "Tabular Classification", 75 | "desc": "Tabular classification is the task of classifying a table (in Image format).", 76 | "input-type": [ 77 | "image" 78 | ], 79 | "output-type": [ 80 | "text" 81 | ] 82 | }, 83 | { 84 | "id": "Object Detection", 85 | "desc": "Object Detection models allow users to identify objects of certain defined classes. Object detection models receive an image as input and output the images with bounding boxes and labels on detected objects.", 86 | "input-type": [ 87 | "image" 88 | ], 89 | "output-type": [ 90 | "text" 91 | ] 92 | }, 93 | { 94 | "id": "Image Classification", 95 | "desc": "Image classification is the task of assigning a label or class to an entire image. Images are expected to have only one class for each image. 
Image classification models take an image as input and return a prediction about which class the image belongs to.", 96 | "input-type": [ 97 | "image" 98 | ], 99 | "output-type": [ 100 | "text" 101 | ] 102 | }, 103 | { 104 | "id": "Image-to-Image", 105 | "desc": "Image-to-image is the task of transforming a source image to match the characteristics of a target image or a target image domain. Any image manipulation and enhancement is possible with image to image models.", 106 | "input-type": [ 107 | "image" 108 | ], 109 | "output-type": [ 110 | "image" 111 | ] 112 | }, 113 | { 114 | "id": "Image-to-Text", 115 | "desc": "Image to text models output a text from a given image. Image captioning or optical character recognition can be considered as the most common applications of image to text.", 116 | "input-type": [ 117 | "image" 118 | ], 119 | "output-type": [ 120 | "text" 121 | ] 122 | }, 123 | { 124 | "id": "Text-to-Image", 125 | "desc": "Generates images from input text. These models can be used to generate images based on text prompts.", 126 | "input-type": [ 127 | "text" 128 | ], 129 | "output-type": [ 130 | "image" 131 | ] 132 | }, 133 | { 134 | "id": "Text-to-Video", 135 | "desc": "Generates videos from input text. These models can be used to generate videos based on text prompts.", 136 | "input-type": [ 137 | "text" 138 | ], 139 | "output-type": [ 140 | "video" 141 | ] 142 | }, 143 | { 144 | "id": "Visual Question Answering", 145 | "desc": "Visual Question Answering is the task of answering questions based on an image.", 146 | "input-type": [ 147 | "image", 148 | "text" 149 | ], 150 | "output-type": [ 151 | "text" 152 | ] 153 | }, 154 | { 155 | "id": "Document Question Answering", 156 | "desc": "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.", 157 | "input-type": [ 158 | "image", 159 | "text" 160 | ], 161 | "output-type": [ 162 | "text" 163 | ] 164 | }, 165 | { 166 | "id": "Image Segmentation", 167 | "desc": "Image Segmentation divides an image into segments where each pixel in the image is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation.", 168 | "input-type": [ 169 | "image" 170 | ], 171 | "output-type": [ 172 | "image" 173 | ] 174 | }, 175 | { 176 | "id": "Depth Estimation", 177 | "desc": "Depth estimation is the task of predicting depth of the objects present in an image.", 178 | "input-type": [ 179 | "image" 180 | ], 181 | "output-type": [ 182 | "image" 183 | ] 184 | }, 185 | { 186 | "id": "Text-to-Speech", 187 | "desc": "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.", 188 | "input-type": [ 189 | "text" 190 | ], 191 | "output-type": [ 192 | "audio" 193 | ] 194 | }, 195 | { 196 | "id": "Automatic Speech Recognition", 197 | "desc": "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. 
It has many applications, such as voice user interfaces.", 198 | "input-type": [ 199 | "audio" 200 | ], 201 | "output-type": [ 202 | "text" 203 | ] 204 | }, 205 | { 206 | "id": "Audio-to-Audio", 207 | "desc": "Audio-to-Audio is a family of tasks in which the input is an audio and the output is one or multiple generated audios. Some example tasks are speech enhancement and source separation.", 208 | "input-type": [ 209 | "audio" 210 | ], 211 | "output-type": [ 212 | "audio" 213 | ] 214 | }, 215 | { 216 | "id": "Audio Classification", 217 | "desc": "Audio classification is the task of assigning a label or class to a given audio. It can be used for recognizing which command a user is giving or the emotion of a statement, as well as identifying a speaker.", 218 | "input-type": [ 219 | "audio" 220 | ], 221 | "output-type": [ 222 | "text" 223 | ] 224 | }, 225 | { 226 | "id": "Image Editing", 227 | "desc": "Image editing is the task of modifying an image to match a given text description. It can be used to modify the attributes of an image, such as the color of an object or the background.", 228 | "input-type": [ 229 | "text", 230 | "image" 231 | ], 232 | "output-type": [ 233 | "image" 234 | ] 235 | } 236 | ] 237 | } -------------------------------------------------------------------------------- /taskbench/data_multimedia/tool_desc.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": [ 3 | { 4 | "id": "Image Downloader", 5 | "desc": "Downloads an image from a given URL.", 6 | "input-type": [ 7 | "url" 8 | ], 9 | "output-type": [ 10 | "image" 11 | ] 12 | }, 13 | { 14 | "id": "Video Downloader", 15 | "desc": "Downloads a video from a given URL.", 16 | "input-type": [ 17 | "url" 18 | ], 19 | "output-type": [ 20 | "video" 21 | ] 22 | }, 23 | { 24 | "id": "Audio Downloader", 25 | "desc": "Downloads an audio file from a given URL.", 26 | "input-type": [ 27 | "url" 28 | ], 29 | "output-type": [ 30 | "audio" 31 | ] 32 | }, 33 | { 34 | "id": "Text Downloader", 35 | "desc": "Downloads the text content from a given URL.", 36 | "input-type": [ 37 | "url" 38 | ], 39 | "output-type": [ 40 | "text" 41 | ] 42 | }, 43 | { 44 | "id": "Text Search", 45 | "desc": "Searches for a specific text or keyword on the internet.", 46 | "input-type": [ 47 | "text" 48 | ], 49 | "output-type": [ 50 | "text" 51 | ] 52 | }, 53 | { 54 | "id": "Image Search", 55 | "desc": "Searches for images on the internet based on a given query.", 56 | "input-type": [ 57 | "text" 58 | ], 59 | "output-type": [ 60 | "Image" 61 | ] 62 | }, 63 | { 64 | "id": "Image Search (by Image)", 65 | "desc": "Performs a similar image search using an input image.", 66 | "input-type": [ 67 | "image" 68 | ], 69 | "output-type": [ 70 | "image" 71 | ] 72 | }, 73 | { 74 | "id": "URL Extractor", 75 | "desc": "Extracts URL from text", 76 | "input-type": [ 77 | "text" 78 | ], 79 | "output-type": [ 80 | "url" 81 | ] 82 | }, 83 | { 84 | "id": "Video Search", 85 | "desc": "Searches for videos on the internet based on a given query.", 86 | "input-type": [ 87 | "text" 88 | ], 89 | "output-type": [ 90 | "video" 91 | ] 92 | }, 93 | { 94 | "id": "Text-to-Image", 95 | "desc": "Generates an image based on a given text description.", 96 | "input-type": [ 97 | "text" 98 | ], 99 | "output-type": [ 100 | "image" 101 | ] 102 | }, 103 | { 104 | "id": "Text-to-Video", 105 | "desc": "Generates a video based on a given text description.", 106 | "input-type": [ 107 | "text" 108 | ], 109 | "output-type": [ 110 | "video" 111 | ] 112 | }, 
113 | { 114 | "id": "Text-to-Audio", 115 | "desc": "Generates an audio file based on a given text description.", 116 | "input-type": [ 117 | "text" 118 | ], 119 | "output-type": [ 120 | "audio" 121 | ] 122 | }, 123 | { 124 | "id": "Image-to-Text", 125 | "desc": "Extracts text from an input image using Optical Character Recognition (OCR).", 126 | "input-type": [ 127 | "image" 128 | ], 129 | "output-type": [ 130 | "text" 131 | ] 132 | }, 133 | { 134 | "id": "Audio-to-Text", 135 | "desc": "Transcribes speech from an audio file into text.", 136 | "input-type": [ 137 | "audio" 138 | ], 139 | "output-type": [ 140 | "text" 141 | ] 142 | }, 143 | { 144 | "id": "Video-to-Text", 145 | "desc": "Transcribes speech from a video file into text.", 146 | "input-type": [ 147 | "video" 148 | ], 149 | "output-type": [ 150 | "text" 151 | ] 152 | }, 153 | { 154 | "id": "Audio Noise Reduction", 155 | "desc": "Reduces background noise or unwanted sounds from a given audio file.", 156 | "input-type": [ 157 | "audio" 158 | ], 159 | "output-type": [ 160 | "audio" 161 | ] 162 | }, 163 | { 164 | "id": "Audio Effects", 165 | "desc": "Applies various audio effects to a given audio file according to human instruction, such as reverb, chorus, or equalization.", 166 | "input-type": [ 167 | "audio", 168 | "text" 169 | ], 170 | "output-type": [ 171 | "audio" 172 | ] 173 | }, 174 | { 175 | "id": "Audio Splicer", 176 | "desc": "Combines two audio files into a single output file.", 177 | "input-type": [ 178 | "audio", 179 | "audio" 180 | ], 181 | "output-type": [ 182 | "audio" 183 | ] 184 | }, 185 | { 186 | "id": "Voice Changer", 187 | "desc": "Modifies the characteristics of a recorded voice according to human instruction, such as tone, pitch, or gender.", 188 | "input-type": [ 189 | "audio", 190 | "text" 191 | ], 192 | "output-type": [ 193 | "audio" 194 | ] 195 | }, 196 | { 197 | "id": "Text Summarizer", 198 | "desc": "Summarizes a given text into a shorter version while retaining the main points.", 199 | "input-type": [ 200 | "text" 201 | ], 202 | "output-type": [ 203 | "text" 204 | ] 205 | }, 206 | { 207 | "id": "Text Translator", 208 | "desc": "Translates a given text from one language to english.", 209 | "input-type": [ 210 | "text" 211 | ], 212 | "output-type": [ 213 | "text" 214 | ] 215 | }, 216 | { 217 | "id": "Text Sentiment Analysis", 218 | "desc": "Analyzes the sentiment of a given text, identifying if it is positive, negative, or neutral.", 219 | "input-type": [ 220 | "text" 221 | ], 222 | "output-type": [ 223 | "text" 224 | ] 225 | }, 226 | { 227 | "id": "Text Grammar Checker", 228 | "desc": "Checks a given text for grammatical errors and suggests corrections.", 229 | "input-type": [ 230 | "text" 231 | ], 232 | "output-type": [ 233 | "text" 234 | ] 235 | }, 236 | { 237 | "id": "Text Simplifier", 238 | "desc": "Rewrites a given text in a simpler and more understandable manner.", 239 | "input-type": [ 240 | "text" 241 | ], 242 | "output-type": [ 243 | "text" 244 | ] 245 | }, 246 | { 247 | "id": "Text Expander", 248 | "desc": "Expands a given short text into a more detailed and descriptive version.", 249 | "input-type": [ 250 | "text" 251 | ], 252 | "output-type": [ 253 | "text" 254 | ] 255 | }, 256 | { 257 | "id": "Keyword Extractor", 258 | "desc": "Extracts the most important keywords and phrases from a given text.", 259 | "input-type": [ 260 | "text" 261 | ], 262 | "output-type": [ 263 | "text" 264 | ] 265 | }, 266 | { 267 | "id": "Text Paraphraser", 268 | "desc": "Rewrites a given text using different words 
while maintaining its original meaning.", 269 | "input-type": [ 270 | "text" 271 | ], 272 | "output-type": [ 273 | "text" 274 | ] 275 | }, 276 | { 277 | "id": "Article Spinner", 278 | "desc": "Rewrites a given article using synonyms and syntax changes to create a new, unique version.", 279 | "input-type": [ 280 | "text" 281 | ], 282 | "output-type": [ 283 | "text" 284 | ] 285 | }, 286 | { 287 | "id": "Topic Generator", 288 | "desc": "Generates a list of relevant topics or ideas based on a given input.", 289 | "input-type": [ 290 | "text" 291 | ], 292 | "output-type": [ 293 | "text" 294 | ] 295 | }, 296 | { 297 | "id": "Audio-to-Image", 298 | "desc": "Generates an image that visually represents a given audio, such as a waveform or spectrogram.", 299 | "input-type": [ 300 | "audio" 301 | ], 302 | "output-type": [ 303 | "image" 304 | ] 305 | }, 306 | { 307 | "id": "Image-to-Video", 308 | "desc": "Creates a slideshow video using two input images.", 309 | "input-type": [ 310 | "image", 311 | "image" 312 | ], 313 | "output-type": [ 314 | "video" 315 | ] 316 | }, 317 | { 318 | "id": "Video-to-Audio", 319 | "desc": "Extracts the audio track from a given video file.", 320 | "input-type": [ 321 | "video" 322 | ], 323 | "output-type": [ 324 | "audio" 325 | ] 326 | }, 327 | { 328 | "id": "Video-to-Image", 329 | "desc": "Extracts a still image from a given video.", 330 | "input-type": [ 331 | "video" 332 | ], 333 | "output-type": [ 334 | "image" 335 | ] 336 | }, 337 | { 338 | "id": "Image Stitcher", 339 | "desc": "Stitches together two input images to create a panorama or collage.", 340 | "input-type": [ 341 | "image", 342 | "image" 343 | ], 344 | "output-type": [ 345 | "image" 346 | ] 347 | }, 348 | { 349 | "id": "Image Colorizer", 350 | "desc": "Adds color to a black and white input image using deep learning techniques.", 351 | "input-type": [ 352 | "image" 353 | ], 354 | "output-type": [ 355 | "image" 356 | ] 357 | }, 358 | { 359 | "id": "Image Style Transfer", 360 | "desc": "Applies the visual style of one image (style) to the content of another image (content) using neural style transfer techniques.", 361 | "input-type": [ 362 | "image", "image" 363 | ], 364 | "output-type": [ 365 | "image" 366 | ] 367 | }, 368 | { 369 | "id": "Video Stabilizer", 370 | "desc": "Stabilizes a shaky input video to produce a smoother output video.", 371 | "input-type": [ 372 | "video" 373 | ], 374 | "output-type": [ 375 | "video" 376 | ] 377 | }, 378 | { 379 | "id": "Video Speed Changer", 380 | "desc": "Adjusts the playback speed of a given video according to human instruction, either speeding it up or slowing it down.", 381 | "input-type": [ 382 | "video", "text" 383 | ], 384 | "output-type": [ 385 | "video" 386 | ] 387 | }, 388 | { 389 | "id": "Video Synchronization", 390 | "desc": "Synchronizes the timing of an existing voiceover or audio file with the visuals of a given video.", 391 | "input-type": [ 392 | "video", 393 | "audio" 394 | ], 395 | "output-type": [ 396 | "video" 397 | ] 398 | }, 399 | { 400 | "id": "Video Voiceover", 401 | "desc": "Adds a voiceover to a given video using a provided script or text.", 402 | "input-type": [ 403 | "video", 404 | "text" 405 | ], 406 | "output-type": [ 407 | "video" 408 | ] 409 | } 410 | ] 411 | } -------------------------------------------------------------------------------- /taskbench/format_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import click 3 | import traceback 4 | 5 | def formulate_sample(data, 
dependency_type): 6 | try: 7 | user_request = data["result"]["user_request"] 8 | invoking_graph = data["result"]["invoking_graph"] 9 | task_steps = data["result"]["task_steps"] 10 | nodes = invoking_graph["nodes"] 11 | links = invoking_graph["links"] 12 | user_request = data["result"]["user_request"] 13 | if "check_by_teacher" in data["result"]: 14 | check_by_teacher = data["result"]["check_by_teacher"] 15 | else: 16 | check_by_teacher = invoking_graph["check_by_teacher"] 17 | for node in nodes: 18 | node["task"] = node["id"] 19 | node.pop("id") 20 | if dependency_type == "resource": 21 | node["task"] = node["task"].replace("_", " ") 22 | node["arguments"] = node["input"] 23 | node.pop("input") 24 | 25 | for node in nodes: 26 | assert isinstance(node, dict) 27 | assert "task" in node 28 | assert "arguments" in node 29 | if isinstance(node["arguments"], str) and node["arguments"].startswith("" 38 | break 39 | for link in links: 40 | assert isinstance(link, dict) 41 | assert "source" in link 42 | assert "target" in link 43 | if dependency_type == "resource": 44 | link["source"] = link["source"].replace("_", " ") 45 | link["target"] = link["target"].replace("_", " ") 46 | assert isinstance(task_steps, list) 47 | assert isinstance(nodes, list) 48 | assert len(nodes) == len(task_steps) 49 | assert isinstance(user_request, str) 50 | assert isinstance(check_by_teacher, str) 51 | except Exception as e: 52 | print(e) 53 | traceback.print_exc() 54 | return None, None, None, None, None 55 | return user_request, task_steps, links, nodes, check_by_teacher 56 | 57 | @click.command() 58 | @click.option('--data_dir', default='data_huggingface', help='Path to the data directory') 59 | @click.option('--dependency_type', default='resource') 60 | def formulate(data_dir, dependency_type): 61 | rf = open(f"{data_dir}/data_raw.json", "r") 62 | wf_format = open(f"{data_dir}/data.json", "w") 63 | wf_error = open(f"{data_dir}/data_error.json", "w") 64 | wf_ur = open(f"{data_dir}/user_requests.json", "w") 65 | 66 | all = 0 67 | format = 0 68 | for line in rf: 69 | all += 1 70 | data = json.loads(line) 71 | method = data["method"] 72 | n_tools = data["number_of_tools"] 73 | seed = data["seed"] 74 | _id = data["id"] 75 | sampled_nodes = data["sampled_nodes"] 76 | sampled_links = data["sampled_links"] 77 | for sampled_node in sampled_nodes: 78 | sampled_node["task"] = sampled_node["id"] 79 | sampled_node.pop("id") 80 | if dependency_type == "resource": 81 | sampled_node["task"] = sampled_node["task"].replace("_", " ") 82 | if "input" in sampled_node: 83 | sampled_node["input-type"] = sampled_node["input"] 84 | sampled_node.pop("input") 85 | sampled_node["output-type"] = sampled_node["output"] 86 | sampled_node.pop("output") 87 | else: 88 | sampled_node["arguments"] = sampled_node["parameters"] 89 | sampled_node.pop("parameters") 90 | user_request, task_steps, links, nodes, check_by_teacher = formulate_sample(data, dependency_type) 91 | if user_request is None: 92 | wf_error.write(line) 93 | continue 94 | format += 1 95 | result = { 96 | "id": _id, 97 | "seed": seed, 98 | "type": method, 99 | "n_tools": n_tools, 100 | "sampled_nodes": sampled_nodes, 101 | "sampled_links": sampled_links, 102 | "user_request": user_request, 103 | "task_steps": task_steps, 104 | "task_nodes": nodes, 105 | "task_links": links, 106 | "check_by_teacher": check_by_teacher, 107 | } 108 | wf_format.write(json.dumps(result)+"\n") 109 | ur_result = { 110 | "id": _id, 111 | "user_request": user_request, 112 | } 113 | 
wf_ur.write(json.dumps(ur_result)+"\n") 114 | wf_format.close() 115 | wf_error.close() 116 | wf_ur.close() 117 | rf.close() 118 | print(f"Format {format} out of {all}") 119 | 120 | if __name__ == "__main__": 121 | formulate() -------------------------------------------------------------------------------- /taskbench/generate_graph.py: -------------------------------------------------------------------------------- 1 | import json 2 | import click 3 | 4 | def generate_graph_resource(tool_file): 5 | with open(tool_file) as f: 6 | data = json.load(f) 7 | data = data["nodes"] 8 | assert "input-type" in data[0] and "output-type" in data[0], "Input and output types are not defined" 9 | nodes = [] 10 | for i in range(len(data)): 11 | nodes.append({"id": data[i]["id"], "desc": data[i]["desc"], "input-type": data[i]["input-type"], "output-type": data[i]["output-type"]}) 12 | links = [] 13 | for i in range(len(nodes)): 14 | for j in range(len(nodes)): 15 | if i != j: 16 | if len(set(nodes[i]["output-type"]).intersection(set(nodes[j]["input-type"]))) > 0: 17 | links.append({"source": nodes[i]["id"], "target": nodes[j]["id"], "type": list(set(nodes[i]["output-type"]).intersection(set(nodes[j]["input-type"])))[0]}) 18 | graph = {"nodes": nodes, "links": links} 19 | with open(tool_file.replace("tools", "graph"), 'w') as f: 20 | json.dump(graph, f, indent=2) 21 | 22 | def generate_graph_temporal(tool_file): 23 | with open(tool_file) as f: 24 | data = json.load(f) 25 | nodes = [] 26 | data = data["nodes"] 27 | if "parameters" not in data[0] and "input-type" not in data[0]: 28 | for i in range(len(data)): 29 | nodes.append({"id": data[i]["id"], "desc": data[i]["desc"]}) 30 | elif "input-type" not in data[0]: 31 | for i in range(len(data)): 32 | nodes.append({"id": data[i]["id"], "desc": data[i]["desc"], "parameters": data[i]["parameters"]}) 33 | else: 34 | for i in range(len(data)): 35 | nodes.append({"id": data[i]["id"], "desc": data[i]["desc"], "parameters": data[i]["parameters"], "input-type": data[i]["input-type"], "output-type": data[i]["output-type"]}) 36 | links = [] 37 | for i in range(len(nodes)): 38 | for j in range(len(nodes)): 39 | if i != j: 40 | links.append({"source": nodes[i]["id"], "target": nodes[j]["id"], "type": "complete"}) 41 | graph = {"nodes": nodes, "links": links} 42 | with open(tool_file.replace("tools", "graph"), 'w') as f: 43 | json.dump(graph, f, indent=2) 44 | 45 | @click.command() 46 | @click.option('--data_dir') 47 | @click.option('--tool_desc', default=None, help='Path to the tool description file') 48 | @click.option('--dependency_type', default='resource', help='Type of graph to generate') 49 | def generate_graph(tool_desc, data_dir, dependency_type): 50 | if tool_desc: 51 | tool_file = tool_desc 52 | else: 53 | tool_file = f"{data_dir}/graph_desc.json" 54 | if dependency_type == "temporal": 55 | generate_graph_temporal(tool_file) 56 | elif dependency_type == "resource": 57 | generate_graph_resource(tool_file) 58 | else: 59 | print("Type not supported") 60 | 61 | if __name__ == "__main__": 62 | generate_graph() -------------------------------------------------------------------------------- /taskbench/graph_sampler.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import random 3 | import json 4 | import matplotlib.pyplot as plt 5 | import click 6 | 7 | random.seed(0) 8 | class GraphSampler: 9 | def __init__(self, graph: nx.Graph = None, file_name = None): 10 | if file_name: 11 | with open(file_name, "r") 
as f: 12 | data = json.load(f) 13 | 14 | # Represent your graph in NetworkX 15 | graph = nx.DiGraph() 16 | 17 | # Add nodes to the graph 18 | if "input-type" in data["nodes"][0]: 19 | for node in data["nodes"]: 20 | graph.add_node(node["id"], desc=node["desc"], input_type=node["input-type"], output_type=node["output-type"]) 21 | else: 22 | for node in data["nodes"]: 23 | graph.add_node(node["id"], desc=node["desc"], parameters=node["parameters"]) 24 | 25 | # Add edges to the graph 26 | for link in data["links"]: 27 | graph.add_edge(link["source"], link["target"], type=link["type"]) 28 | 29 | self.graph = graph 30 | 31 | def sample_subgraph_by_weight(self, number_weights, method_weights): 32 | method = random.choices(list(method_weights.keys()), weights=list(method_weights.values()))[0] 33 | if method == "single": 34 | tool_number = 1 35 | else: 36 | tool_number = random.choices(list(number_weights.keys()), weights=list(number_weights.values()))[0] 37 | return self.sample_subgraph(tool_number, sample_method=method) 38 | 39 | def sample_subgraph(self, num_nodes=3, sample_method="chain"): 40 | seed_node = random.choice(list(self.graph.nodes)) 41 | if sample_method == "single": 42 | sub_G = nx.DiGraph() 43 | sub_G.add_node(seed_node) 44 | return sub_G 45 | elif sample_method == "chain": 46 | return self.sample_subgraph_chain(seed_node, num_nodes) 47 | elif sample_method == "dag": 48 | return self.sample_subgraph_dag(seed_node, num_nodes) 49 | else: 50 | raise ValueError("Invalid sample method") 51 | 52 | def sample_subgraph_chain(self, seed_node, num_nodes): 53 | # Create a list to store the sub-graph nodes 54 | sub_graph_nodes = [seed_node] 55 | head_node = seed_node 56 | tail_node = seed_node 57 | edges = [] 58 | 59 | # Keep adding nodes until we reach the desired number 60 | while len(sub_graph_nodes) < num_nodes: 61 | # Get the neighbors of the last node in the sub-graph 62 | head_node_neighbors = list(self.graph.predecessors(head_node)) 63 | tail_node_neighbors = list(self.graph.successors(tail_node)) 64 | neighbors = head_node_neighbors + tail_node_neighbors 65 | 66 | # If the node has neighbors, randomly select one and add it to the sub-graph 67 | if len(neighbors) > 0: 68 | neighbor = random.choice(neighbors) 69 | if neighbor not in sub_graph_nodes: 70 | if neighbor in head_node_neighbors: 71 | sub_graph_nodes.insert(0, neighbor) 72 | edges.insert(0, (neighbor, head_node)) 73 | head_node = neighbor 74 | else: 75 | sub_graph_nodes.append(neighbor) 76 | edges.append((tail_node, neighbor)) 77 | tail_node = neighbor 78 | else: 79 | break 80 | 81 | # Create the sub-graph 82 | sub_G = nx.DiGraph() 83 | sub_G.add_nodes_from(sub_graph_nodes) 84 | sub_G.add_edges_from(edges) 85 | 86 | return sub_G 87 | 88 | def sample_subgraph_dag(self, seed_node, num_nodes): 89 | # Create a list to store the sub-graph nodes 90 | sub_graph_nodes = [seed_node] 91 | edges = [] 92 | 93 | # Keep adding nodes until we reach the desired number 94 | while len(sub_graph_nodes) < num_nodes: 95 | # Randomly select a node from the current sub-graph 96 | node = random.choice(sub_graph_nodes) 97 | # prec_neighbors = list(self.graph.predecessors(node)) 98 | succ_neighbors = list(self.graph.successors(node)) 99 | 100 | if "input_type" in self.graph.nodes[node]: 101 | # filter exisiting income edge type 102 | prec_neighbors = [] 103 | input_type = list(self.graph.nodes[node]["input_type"]) 104 | all_in_edges = list(self.graph.in_edges(node, data=True)) 105 | for edge in edges: 106 | for ref_edge in all_in_edges: 107 | if 
edge[0] == ref_edge[0] and edge[1] == ref_edge[1]: 108 | input_type.remove(ref_edge[2]["type"]) 109 | for edge in all_in_edges: 110 | if edge[2]["type"] in input_type: 111 | prec_neighbors.append(edge[0]) 112 | else: 113 | prec_neighbors = list(self.graph.predecessors(node)) 114 | 115 | neighbors = prec_neighbors + succ_neighbors 116 | 117 | # If the node has neighbors, randomly select one and add it to the sub-graph 118 | if neighbors: 119 | neighbor = random.choice(neighbors) 120 | if neighbor not in sub_graph_nodes: 121 | if neighbor in prec_neighbors: 122 | edges.append((neighbor, node)) 123 | else: 124 | edges.append((node, neighbor)) 125 | sub_graph_nodes.append(neighbor) 126 | # If the node has no neighbors, select a new node from the original graph 127 | else: 128 | node = random.choice(list(self.graph.nodes)) 129 | if node not in sub_graph_nodes: 130 | sub_graph_nodes.append(node) 131 | 132 | # Create the sub-graph 133 | sub_G = nx.DiGraph() 134 | sub_G.add_nodes_from(sub_graph_nodes) 135 | sub_G.add_edges_from(edges) 136 | 137 | return sub_G 138 | 139 | def sample_subgraph_random_walk(self, seed_node, num_nodes): 140 | # Create a list to store the sub-graph nodes 141 | sub_graph_nodes = [seed_node] 142 | edges = [] 143 | 144 | # Keep adding nodes until we reach the desired number 145 | while len(sub_graph_nodes) < num_nodes: 146 | # Randomly select a node from the current sub-graph 147 | node = random.choice(sub_graph_nodes) 148 | neighbors = list(self.graph.successors(node)) 149 | 150 | # If the node has neighbors, randomly select one and add it to the sub-graph 151 | if neighbors: 152 | neighbor = random.choice(neighbors) 153 | if neighbor not in sub_graph_nodes: 154 | edges.append((node, neighbor)) 155 | sub_graph_nodes.append(neighbor) 156 | # If the node has no neighbors, select a new node from the original graph 157 | else: 158 | node = random.choice(list(self.graph.nodes)) 159 | if node not in sub_graph_nodes: 160 | sub_graph_nodes.append(node) 161 | 162 | # Create the sub-graph 163 | sub_G = nx.DiGraph() 164 | sub_G.add_nodes_from(sub_graph_nodes) 165 | sub_G.add_edges_from(edges) 166 | 167 | return sub_G 168 | 169 | def sample_subgraph_random_walk_with_restart(self, seed_node, num_nodes, restart_prob=0.15): 170 | # Create a list to store the sub-graph nodes 171 | sub_graph_nodes = [seed_node] 172 | edges = [] 173 | 174 | # Keep adding nodes until we reach the desired number 175 | while len(sub_graph_nodes) < num_nodes: 176 | # Randomly select a node from the current sub-graph 177 | node = random.choice(sub_graph_nodes) 178 | neighbors = list(self.graph.successors(node)) 179 | 180 | # If the node has neighbors, randomly select one and add it to the sub-graph 181 | if neighbors: 182 | neighbor = random.choice(neighbors) 183 | if neighbor not in sub_graph_nodes: 184 | edges.append((node, neighbor)) 185 | sub_graph_nodes.append(neighbor) 186 | # If the node has no neighbors, select a new node from the original graph 187 | else: 188 | node = random.choice(list(self.graph.nodes)) 189 | if node not in sub_graph_nodes: 190 | sub_graph_nodes.append(node) 191 | 192 | # Randomly restart the walk 193 | if random.random() < restart_prob: 194 | node = random.choice(list(self.graph.nodes)) 195 | if node not in sub_graph_nodes: 196 | sub_graph_nodes.append(node) 197 | 198 | # Create the sub-graph 199 | sub_G = nx.DiGraph() 200 | sub_G.add_nodes_from(sub_graph_nodes) 201 | sub_G.add_edges_from(edges) 202 | 203 | return sub_G 204 | 205 | @click.command() 206 | @click.option('--file_name', 
default='graph_desc_original.json', help='Path to the json file') 207 | @click.option('--sample_method', default='chain', help='Type of graph to generate') 208 | @click.option('--num_nodes', default=3, help='Number of nodes in the subgraph') 209 | @click.option('--save_figure', default=False, help='Save the figure') 210 | def sample_subgraph(file_name, sample_method, num_nodes, save_figure): 211 | # Create a graph sampler 212 | random.seed(0) 213 | sampler = GraphSampler(file_name=file_name) 214 | 215 | # Sample a sub-graph 216 | sub_G = sampler.sample_subgraph(num_nodes, sample_method=sample_method) 217 | print("Sub-graph nodes:", sub_G.nodes) 218 | print("Sub-graph edges:", sub_G.edges) 219 | 220 | # Visualize the sub-graph 221 | if save_figure: 222 | pos = nx.circular_layout(sub_G) 223 | nx.draw_networkx_nodes(sub_G, pos, node_color="skyblue", node_size=300) 224 | nx.draw_networkx_edges(sub_G, pos, arrows=True) 225 | nx.draw_networkx_labels(sub_G, pos, font_size=8) 226 | plt.axis("off") 227 | plt.tight_layout() 228 | plt.savefig("test.png") 229 | 230 | 231 | 232 | if __name__ == "__main__": 233 | sample_subgraph() -------------------------------------------------------------------------------- /taskbench/requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.28.1 2 | click==8.0.4 3 | emoji==2.8.0 4 | networkx==2.8.4 5 | aiohttp==3.8.1 6 | matplotlib==3.7.1 7 | pandas==1.2.4 8 | numpy==1.23.5 9 | scikit-learn==1.0.2 10 | Levenshtein==0.21.1 11 | scipy==1.10.0 12 | datasets==2.14.5 -------------------------------------------------------------------------------- /taskbench/visualize_graph.py: -------------------------------------------------------------------------------- 1 | import json 2 | import networkx as nx 3 | import matplotlib.pyplot as plt 4 | import click 5 | 6 | @click.command() 7 | @click.option('--data_dir') 8 | def visualize_graph(data_dir): 9 | graph_file = f"{data_dir}/graph_desc.json" 10 | with open(graph_file, "r") as f: 11 | data = json.load(f) 12 | 13 | G = nx.DiGraph() 14 | 15 | for node in data["nodes"]: 16 | G.add_node(node["id"]) 17 | 18 | for link in data["links"]: 19 | G.add_edge(link["source"], link["target"]) 20 | 21 | # Kamada-Kawai gives a readable layout for these dense tool graphs 22 | pos = nx.kamada_kawai_layout(G) 23 | 24 | # Draw first, then save the figure next to the source JSON and show it 25 | plt.figure(figsize=(60, 60), dpi=80) 26 | nx.draw_networkx_nodes(G, pos, node_color="skyblue", node_size=1200) 27 | nx.draw_networkx_edges(G, pos, arrows=True, arrowsize=40) 28 | nx.draw_networkx_labels(G, pos, font_size=50, font_color="green", font_weight="bold") 29 | plt.axis("off") 30 | plt.tight_layout() 31 | plt.savefig(graph_file.replace(".json", ".pdf")) 32 | plt.show() 33 | 34 | if __name__ == "__main__": 35 | visualize_graph() --------------------------------------------------------------------------------
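Taken together, generate_graph.py, graph_sampler.py, and visualize_graph.py cover the loop of building the tool-dependency graph, sampling sub-graphs from it, and rendering the result. The sketch below ties these pieces together programmatically. It is a minimal illustration rather than a file in this repository: it assumes it is run from the taskbench/ directory, that data_huggingface/graph_desc.json has the "nodes"/"links" structure GraphSampler expects, and the helper name demo_sample_and_plot and the output filename sampled_subgraph.png are invented for this example.

# demo_sample_and_plot.py -- illustrative sketch, not shipped with the repository.
# Assumes it is run from taskbench/ with the data_huggingface split present; the
# helper name and output filename are invented for this example.
import matplotlib.pyplot as plt
import networkx as nx

from graph_sampler import GraphSampler  # module-level random.seed(0) keeps sampling reproducible


def demo_sample_and_plot(num_nodes=3, method="dag"):
    # Load the resource-dependency graph shipped with the HuggingFace split
    sampler = GraphSampler(file_name="data_huggingface/graph_desc.json")

    # Sample a small tool sub-graph ("single", "chain", or "dag", as defined above)
    sub_g = sampler.sample_subgraph(num_nodes=num_nodes, sample_method=method)
    print("Sub-graph nodes:", list(sub_g.nodes))
    print("Sub-graph edges:", list(sub_g.edges))

    # Render and save the sampled sub-graph; spring_layout copes with isolated nodes
    pos = nx.spring_layout(sub_g, seed=0)
    nx.draw_networkx(sub_g, pos, node_color="skyblue", font_size=8, arrows=True)
    plt.axis("off")
    plt.savefig("sampled_subgraph.png")


if __name__ == "__main__":
    demo_sample_and_plot()

The shipped CLI should cover roughly the same ground, e.g. python graph_sampler.py --file_name data_huggingface/graph_desc.json --sample_method dag --num_nodes 3 --save_figure True, which prints the sampled nodes and edges and writes test.png when --save_figure is set.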