├── .env.example
├── .gitignore
├── README.md
├── devbox.json
├── devbox.lock
├── dify
│   ├── dify_to_openai.py
│   ├── git_traverse.py
│   ├── linkedin.py
│   ├── quack_memo.py
│   ├── twitter.py
│   └── yt_transcribe.py
├── infinity_mxbai_embed_large_v1.py
├── infinity_mxbai_rerank_large_v1.py
├── infinity_snowflake_arctic_embed_l_335m.py
├── outlines_llama3_8b.py
├── vllm_arctic_480b.py
├── vllm_aya_8b.py
├── vllm_codeqwen_110b_v1_5.py
├── vllm_deepseek_coder_33b.py
├── vllm_duckdb_nsql_7b.py
├── vllm_llama3_70b.py
├── vllm_llama3_8b.py
├── vllm_seallm_7b_v2_5.py
└── vllm_sqlcoder_7b_2.py
/.env.example:
--------------------------------------------------------------------------------
1 | INFINITY_API_KEY=""
2 | VLLM_API_KEY=""
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # IPython notebook
33 | .ipynb_checkpoints
34 |
35 | # pyenv
36 | .python-version
37 |
38 | # pipenv
39 | # According to pypa/pipenv#598, it is recommended to include Pipfile;
40 | # however, in case of a broad build, you might prefer to avoid the
41 | # implicit modification of shell environment during the build.
42 | Pipfile.lock
43 |
44 | # venv
45 | .venv
46 | env/
47 | venv/
48 | .env/
49 | .venv/
50 |
51 | # virtualenv
52 | .virtualenv
53 |
54 | # PyCharm
55 | .idea/
56 | .vscode/
57 | .history/
58 |
59 | # Environments
60 | .env
61 | .envrc
62 |
63 | # Spyder project settings
64 | .spyder-py3
65 |
66 | # Rope project settings
67 | .ropeproject
68 |
69 | # Mr Developer
70 | .mr.developer.cfg
71 | .project
72 | .pydevproject
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # Local Configuration for pytest
78 | pytest.ini
79 |
80 | # To avoid sphinx and others to use the docutils from system
81 | .local/share/docutils/
82 |
83 | # Log Files
84 | *.log
85 | *.log.*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | This repository deploys and manages server processes on Modal: embedding and reranking servers built on the Infinity engine, and Large Language Models served through an OpenAI-compatible vLLM server.
3 |
4 | ## Key Components
5 | 1. **vllm_llama3_70b.py, vllm_deepseek_coder_33b.py, vllm_llama3_8b.py, vllm_seallm_7b_v2_5.py, vllm_sqlcoder_7b_2.py, vllm_duckdb_nsql_7b.py, vllm_codeqwen_110b_v1_5.py**
6 |    - These scripts contain the function `openai_compatible_server()`, which launches vLLM's OpenAI-compatible FastAPI server (`vllm.entrypoints.openai.api_server`) on port 8000.
7 |    - The `BASE_MODEL` variable defines the Hugging Face model that is downloaded into the container image and served.
8 |
9 | 2. **infinity_mxbai_embed_large_v1.py, infinity_mxbai_rerank_large_v1.py, infinity_snowflake_arctic_embed_l_335m.py**
10 |    - These scripts contain the function `infinity_embeddings_server()`, which starts the Infinity embedding server with the specified options (CUDA device and Torch engine).
11 |    - The `BASE_MODEL` variable defines the Hugging Face model ID that the Infinity server loads.
12 |
13 | 3. **devbox.json**
14 |    - This configuration file specifies the development environment for the repository, including the versions of Python, pip, and Node.js.
15 |    - It also defines shell initialization hooks that activate a Python virtual environment and install the required Python packages, among other administration scripts.
16 |
17 | 4. **.env.example**
18 |    - This template lists the environment variables the project needs to run (the API keys for the Infinity and vLLM servers).
19 |
20 | ## Prerequisites
21 | Before diving into the project setup, make sure to:
22 | - [Have Devbox installed](https://www.jetify.com/devbox/docs/installing_devbox/), as it manages the development and operation environment for this project.
23 | - Set up the necessary API keys by copying `.env.example` to `.env` and filling in the required values for `INFINITY_API_KEY` and `VLLM_API_KEY`.
24 |
25 | ## Environment Setup
26 | 1. **Initializing Development Environment with Devbox:**
27 |    - Enter the Devbox shell environment by running:
28 |    ```bash
29 |    devbox shell
30 |    ```
31 |    - This sets up the environment according to the `init_hook` specified in `devbox.json`, which activates the Python virtual environment and installs the required packages.
32 |
33 | ## Deployment
34 | The scripts in the repository are deployed with the [Modal](https://modal.com/docs/examples/hello_world) tool. Deploy a script by running the corresponding command:
35 | ```bash
36 | modal deploy infinity_mxbai_embed_large_v1.py
37 | modal deploy infinity_mxbai_rerank_large_v1.py
38 | modal deploy infinity_snowflake_arctic_embed_l_335m.py
39 |
40 | modal deploy vllm_llama3_70b.py
41 | modal deploy vllm_deepseek_coder_33b.py
42 | modal deploy vllm_llama3_8b.py
43 | modal deploy vllm_seallm_7b_v2_5.py
44 | modal deploy vllm_sqlcoder_7b_2.py
45 | modal deploy vllm_duckdb_nsql_7b.py
46 | modal deploy vllm_codeqwen_110b_v1_5.py
47 | ```
48 | Each command deploys the respective script, launching either the Infinity embeddings server or an OpenAI-compatible vLLM server configured per the script's specifications.
49 |
50 | ## Inference
51 |
52 | Expect cold starts of roughly 30 seconds to 1 minute on Modal. Both the vLLM and Infinity servers expect the API key specified in your `.env` file, passed as a bearer token, when you make inference requests:
53 |
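Any client that speaks the OpenAI wire format can talk to the deployed vLLM servers. Below is a minimal sketch using the official `openai` Python package (assuming it is installed); the base URL is a placeholder for the endpoint URL that `modal deploy` prints for your workspace, and the key is the `VLLM_API_KEY` value from your `.env`:

```python
# Minimal sketch of a chat completion request against a deployed vLLM server.
# The base_url below is a placeholder -- substitute the URL printed by `modal deploy`.
from openai import OpenAI

client = OpenAI(
    base_url="https://<your-workspace>--<app-name>-openai-compatible-server.modal.run/v1",  # placeholder
    api_key="<VLLM_API_KEY from .env>",
)

response = client.chat.completions.create(
    model="TheBloke/deepseek-coder-33B-instruct-AWQ",
    messages=[{"role": "user", "content": "Write me a python snake game."}],
    temperature=0,
    max_tokens=1024,
)
print(response.choices[0].message.content)
```

The embeddings and reranking servers can be queried the same way over plain HTTP, as the `curl` examples below show.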
54 | **Querying LLMs**:
55 | ```bash
56 | time curl \
57 |   -H "Content-Type: application/json" \
58 |   -H "Authorization: Bearer " \
59 |   -d '{
60 |     "model": "TheBloke/deepseek-coder-33B-instruct-AWQ",
61 |     "messages": [
62 |       {
63 |         "role": "user",
64 |         "content": "Write me a python snake game."
65 |       }
66 |     ],
67 |     "temperature": 0,
68 |     "max_tokens": 1024
69 |   }'
70 | ```
71 |
72 | **Querying Embeddings**:
73 | ```bash
74 | time curl \
75 |   -H "Content-Type: application/json" \
76 |   -H "Authorization: Bearer " \
77 |   -d '{
78 |     "model": "Snowflake/snowflake-arctic-embed-l",
79 |     "input": ["The quick brown fox jumps over the lazy dog."]
80 |   }'
81 | ```
82 |
83 | **Querying Rerankings**:
84 | ```bash
85 | time curl -X 'POST' \
86 |   \
87 |   -H 'accept: application/json' \
88 |   -H "Authorization: Bearer " \
89 |   -H 'Content-Type: application/json' \
90 |   -d '{
91 |     "model": "mixedbread-ai/mxbai-rerank-large-v1",
92 |     "query": "What is the python package infinity_emb?",
93 |     "documents": [
94 |       "This is a document not related to the python package infinity_emb, hence...",
95 |       "Paris is in France!",
96 |       "infinity_emb is a package for sentence embeddings and rerankings using transformer models in Python!"
97 |     ],
98 |     "return_documents": true
99 |   }'
100 | ```
101 |
--------------------------------------------------------------------------------
/devbox.json:
--------------------------------------------------------------------------------
1 | {
2 |   "packages": [
3 |     "python@3.10.13",
4 |     "python310Packages.pip@23.2.1",
5 |     "nodejs@21.5.0"
6 |   ],
7 |   "shell": {
8 |     "init_hook": [
9 |       ". $VENV_DIR/bin/activate",
10 |       "pip install modal python-dotenv",
11 |       "modal profile activate dwarvesf"
12 |     ],
13 |     "scripts": {
14 |       "test": [
15 |         "echo \"Error: no test specified\" && exit 1"
16 |       ]
17 |     }
18 |   }
19 | }
20 |
--------------------------------------------------------------------------------
/devbox.lock:
--------------------------------------------------------------------------------
1 | { 2 | "lockfile_version": "1", 3 | "packages": { 4 | "nodejs@21.5.0": { 5 | "last_modified": "2024-01-13T22:55:27-05:00", 6 | "plugin_version": "0.0.2", 7 | "resolved": "github:NixOS/nixpkgs/dd5621df6dcb90122b50da5ec31c411a0de3e538#nodejs_21", 8 | "source": "devbox-search", 9 | "version": "21.5.0", 10 | "systems": { 11 | "aarch64-darwin": { 12 | "outputs": [ 13 | { 14 | "name": "out", 15 | "path": "/nix/store/ybpqk26vz7k9grapsgx0sd900s0sp4sa-nodejs-21.5.0", 16 | "default": true 17 | }, 18 | { 19 | "name": "libv8", 20 | "path": "/nix/store/p72cdykz2wpc4v23kyjh7p640l9lsxpw-nodejs-21.5.0-libv8" 21 | } 22 | ], 23 | "store_path": "/nix/store/ybpqk26vz7k9grapsgx0sd900s0sp4sa-nodejs-21.5.0" 24 | }, 25 | "aarch64-linux": { 26 | "outputs": [ 27 | { 28 | "name": "out", 29 | "path": "/nix/store/brnzb5xxgdx6bbicygz83ybi5inqp09v-nodejs-21.5.0", 30 | "default": true 31 | }, 32 | { 33 | "name": "libv8", 34 | "path": "/nix/store/nb5gk97p2r1lvn06af24hir58589dgxh-nodejs-21.5.0-libv8" 35 | } 36 | ], 37 | "store_path": "/nix/store/brnzb5xxgdx6bbicygz83ybi5inqp09v-nodejs-21.5.0" 38 | }, 39 | "x86_64-darwin": { 40 | "outputs": [ 41 | { 42 | "name": "out", 43 | "path": "/nix/store/yvgnx3lj8am9mqn30yr09sb4ia7qy3w8-nodejs-21.5.0", 44 | "default": true 45 | }, 46 | { 47 | "name": "libv8", 48 | "path": "/nix/store/ln1hcnch8rm7scsskrpp38irjba1cj17-nodejs-21.5.0-libv8" 49 | } 50 | ], 51 | "store_path": "/nix/store/yvgnx3lj8am9mqn30yr09sb4ia7qy3w8-nodejs-21.5.0" 52 | }, 53 | "x86_64-linux": { 54 | "outputs": [ 55 | { 56 | "name": "out", 57 | "path": "/nix/store/nxfirpvaycr7wqzwl6wqifpdrqn7is7x-nodejs-21.5.0", 58 | "default": true 59 | }, 60 | { 61 | "name": "libv8", 62 | "path": "/nix/store/6bixqqylqn5bpzdhymsa21i9qf3z7ms2-nodejs-21.5.0-libv8" 63 | } 64 | ], 65 | "store_path":
"/nix/store/nxfirpvaycr7wqzwl6wqifpdrqn7is7x-nodejs-21.5.0" 66 | } 67 | } 68 | }, 69 | "python310Packages.pip@23.2.1": { 70 | "last_modified": "2023-12-13T17:54:10-05:00", 71 | "plugin_version": "0.0.2", 72 | "resolved": "github:NixOS/nixpkgs/fd04bea4cbf76f86f244b9e2549fca066db8ddff#python310Packages.pip", 73 | "source": "devbox-search", 74 | "version": "23.2.1", 75 | "systems": { 76 | "aarch64-darwin": { 77 | "outputs": [ 78 | { 79 | "name": "out", 80 | "path": "/nix/store/vjyrxbxqsadvr9g6mzig6y406dhwcrqi-python3.10-pip-23.2.1", 81 | "default": true 82 | }, 83 | { 84 | "name": "man", 85 | "path": "/nix/store/2ra46imgas5srgxx8mgs424akh5j1msv-python3.10-pip-23.2.1-man", 86 | "default": true 87 | }, 88 | { 89 | "name": "dist", 90 | "path": "/nix/store/xjj6xjh21k08rkyg6hnkbyrygqhqgn8y-python3.10-pip-23.2.1-dist" 91 | } 92 | ], 93 | "store_path": "/nix/store/vjyrxbxqsadvr9g6mzig6y406dhwcrqi-python3.10-pip-23.2.1" 94 | }, 95 | "aarch64-linux": { 96 | "outputs": [ 97 | { 98 | "name": "out", 99 | "path": "/nix/store/j8pxwv7vyjm8z2fqglijjvabbkmxbv9r-python3.10-pip-23.2.1", 100 | "default": true 101 | }, 102 | { 103 | "name": "man", 104 | "path": "/nix/store/bcfliw00z123ddyi8hsfxvfn2npdcpdq-python3.10-pip-23.2.1-man", 105 | "default": true 106 | }, 107 | { 108 | "name": "dist", 109 | "path": "/nix/store/2326b2sr9p2z9bsghd2pzs346g2qjn7f-python3.10-pip-23.2.1-dist" 110 | } 111 | ], 112 | "store_path": "/nix/store/j8pxwv7vyjm8z2fqglijjvabbkmxbv9r-python3.10-pip-23.2.1" 113 | }, 114 | "x86_64-darwin": { 115 | "outputs": [ 116 | { 117 | "name": "out", 118 | "path": "/nix/store/7lyvqf8wl47wzgsqmlcz39ycmwxyg9zx-python3.10-pip-23.2.1", 119 | "default": true 120 | }, 121 | { 122 | "name": "man", 123 | "path": "/nix/store/0j02s5hbsdhfxvvay5dm9j68nhalm39v-python3.10-pip-23.2.1-man", 124 | "default": true 125 | }, 126 | { 127 | "name": "dist", 128 | "path": "/nix/store/8dvqh9ai83lqaaixs3bhmf1n4jxgp4v7-python3.10-pip-23.2.1-dist" 129 | } 130 | ], 131 | "store_path": "/nix/store/7lyvqf8wl47wzgsqmlcz39ycmwxyg9zx-python3.10-pip-23.2.1" 132 | }, 133 | "x86_64-linux": { 134 | "outputs": [ 135 | { 136 | "name": "out", 137 | "path": "/nix/store/gdhvfi5zaqzpa5l3kk0spmv71r549slf-python3.10-pip-23.2.1", 138 | "default": true 139 | }, 140 | { 141 | "name": "man", 142 | "path": "/nix/store/p37y2fhm6l3nd624jmr2r680wf4p0544-python3.10-pip-23.2.1-man", 143 | "default": true 144 | }, 145 | { 146 | "name": "dist", 147 | "path": "/nix/store/r4qj7psryk532rkh0srhn25ygv8yk541-python3.10-pip-23.2.1-dist" 148 | } 149 | ], 150 | "store_path": "/nix/store/gdhvfi5zaqzpa5l3kk0spmv71r549slf-python3.10-pip-23.2.1" 151 | } 152 | } 153 | }, 154 | "python@3.10.13": { 155 | "last_modified": "2024-03-22T07:26:23-04:00", 156 | "plugin_version": "0.0.4", 157 | "resolved": "github:NixOS/nixpkgs/a3ed7406349a9335cb4c2a71369b697cecd9d351#python310", 158 | "source": "devbox-search", 159 | "version": "3.10.13", 160 | "systems": { 161 | "aarch64-darwin": { 162 | "outputs": [ 163 | { 164 | "name": "out", 165 | "path": "/nix/store/p6fpa9wj3a4g2r0is803z35gvb00vrqh-python3-3.10.13", 166 | "default": true 167 | } 168 | ], 169 | "store_path": "/nix/store/p6fpa9wj3a4g2r0is803z35gvb00vrqh-python3-3.10.13" 170 | }, 171 | "aarch64-linux": { 172 | "outputs": [ 173 | { 174 | "name": "out", 175 | "path": "/nix/store/h3i2nr5lij71z112sh0j368gd8idqhyd-python3-3.10.13", 176 | "default": true 177 | }, 178 | { 179 | "name": "debug", 180 | "path": "/nix/store/rw8pxxh7gc95iplzs98g9dzgmm9qh017-python3-3.10.13-debug" 181 | } 182 | ], 183 | "store_path": 
"/nix/store/h3i2nr5lij71z112sh0j368gd8idqhyd-python3-3.10.13" 184 | }, 185 | "x86_64-darwin": { 186 | "outputs": [ 187 | { 188 | "name": "out", 189 | "path": "/nix/store/a1j6kh88inyisjbbvkkhxx3c7xbfwmlf-python3-3.10.13", 190 | "default": true 191 | } 192 | ], 193 | "store_path": "/nix/store/a1j6kh88inyisjbbvkkhxx3c7xbfwmlf-python3-3.10.13" 194 | }, 195 | "x86_64-linux": { 196 | "outputs": [ 197 | { 198 | "name": "out", 199 | "path": "/nix/store/bj6brw67kppcpdk2gb92l1rf3zirx5jc-python3-3.10.13", 200 | "default": true 201 | }, 202 | { 203 | "name": "debug", 204 | "path": "/nix/store/4r4gxff34w58gan55jgna0jfjqav0xga-python3-3.10.13-debug" 205 | } 206 | ], 207 | "store_path": "/nix/store/bj6brw67kppcpdk2gb92l1rf3zirx5jc-python3-3.10.13" 208 | } 209 | } 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /dify/dify_to_openai.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import string 4 | import time 5 | from typing import List, Optional 6 | from pydantic import BaseModel 7 | from fastapi import FastAPI, HTTPException, Header, Request 8 | from fastapi.responses import StreamingResponse, JSONResponse 9 | 10 | from modal import Image, Mount, asgi_app, Secret, web_endpoint, App 11 | 12 | # Set up Modal image 13 | image = Image.debian_slim().pip_install("fastapi", "httpx", "python-dotenv") 14 | 15 | # Create Modal App 16 | app = App(name="dify-to-openai") 17 | 18 | # Create FastAPI app 19 | fastapi_app = FastAPI() 20 | 21 | # Helper functions 22 | def generate_id(): 23 | return ''.join(random.choices(string.ascii_letters + string.digits, k=29)) 24 | 25 | class ChatCompletionRequest(BaseModel): 26 | model: str 27 | messages: List[dict] 28 | stream: Optional[bool] = False 29 | 30 | @fastapi_app.get("/") 31 | async def root(): 32 | return {"message": "Dify2OpenAI service is running"} 33 | 34 | @fastapi_app.get("/v1/models") 35 | async def get_models(): 36 | models = { 37 | "object": "list", 38 | "data": [ 39 | { 40 | "id": "dify", 41 | "object": "model", 42 | "owned_by": "dify", 43 | "permission": None, 44 | } 45 | ] 46 | } 47 | return JSONResponse(content=models) 48 | 49 | @fastapi_app.post("/v1/chat/completions") 50 | async def chat_completions(request: ChatCompletionRequest, authorization: str = Header(None)): 51 | import httpx 52 | import os 53 | 54 | # Access environment variables from secrets 55 | DIFY_API_URL = os.environ.get("DIFY_API_URL") 56 | BOT_TYPE = os.environ.get("BOT_TYPE", "Chat") 57 | INPUT_VARIABLE = os.environ.get("INPUT_VARIABLE", "") 58 | OUTPUT_VARIABLE = os.environ.get("OUTPUT_VARIABLE", "") 59 | MODELS_NAME = os.environ.get("MODELS_NAME", "dify") 60 | 61 | if not DIFY_API_URL: 62 | raise ValueError("DIFY API URL is required.") 63 | 64 | if not authorization: 65 | raise HTTPException(status_code=401, detail="Unauthorized") 66 | 67 | token = authorization.split(" ")[1] 68 | if not token: 69 | raise HTTPException(status_code=401, detail="Unauthorized") 70 | 71 | messages = request.messages 72 | stream = request.stream 73 | 74 | query_string = "" 75 | if BOT_TYPE == "Chat": 76 | last_message = messages[-1] 77 | history = "\n".join([f"{m['role']}: {m['content']}" for m in messages[:-1]]) 78 | query_string = f"here is our talk history:\n'''\n{history}\n'''\n\nhere is my question:\n{last_message['content']}" 79 | elif BOT_TYPE in ["Completion", "Workflow"]: 80 | query_string = messages[-1]["content"] 81 | 82 | api_path = { 83 | "Chat": 
"/chat-messages", 84 | "Completion": "/completion-messages", 85 | "Workflow": "/workflows/run" 86 | }.get(BOT_TYPE) 87 | 88 | if not api_path: 89 | raise ValueError("Invalid bot type in the environment variable.") 90 | 91 | request_body = { 92 | "inputs": {INPUT_VARIABLE: query_string} if INPUT_VARIABLE else {}, 93 | "query": query_string if not INPUT_VARIABLE else None, 94 | "response_mode": "streaming", 95 | "conversation_id": "", 96 | "user": "apiuser", 97 | "auto_generate_name": False 98 | } 99 | 100 | async def generate_stream(): 101 | async with httpx.AsyncClient() as client: 102 | async with client.stream("POST", f"{DIFY_API_URL}{api_path}", 103 | json=request_body, 104 | headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"}) as response: 105 | buffer = "" 106 | async for chunk in response.aiter_bytes(): 107 | buffer += chunk.decode() 108 | lines = buffer.split("\n") 109 | for line in lines[:-1]: 110 | line = line.strip() 111 | if line.startswith("data:"): 112 | try: 113 | chunk_obj = json.loads(line[5:].strip()) 114 | if chunk_obj["event"] in ["message", "agent_message", "text_chunk"]: 115 | chunk_content = chunk_obj.get("data", {}).get("text", "") or chunk_obj.get("answer", "") 116 | chunk_id = f"chatcmpl-{generate_id()}" 117 | yield f"data: {json.dumps({'id': chunk_id, 'object': 'chat.completion.chunk', 'created': chunk_obj.get('created_at'), 'model': request.model, 'choices': [{'index': 0, 'delta': {'content': chunk_content}, 'finish_reason': None}]})}\n\n" 118 | elif chunk_obj["event"] in ["workflow_finished", "message_end"]: 119 | chunk_id = f"chatcmpl-{generate_id()}" 120 | yield f"data: {json.dumps({'id': chunk_id, 'object': 'chat.completion.chunk', 'created': chunk_obj.get('created_at'), 'model': request.model, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n" 121 | yield "data: [DONE]\n\n" 122 | return 123 | except json.JSONDecodeError: 124 | continue 125 | buffer = lines[-1] 126 | 127 | if stream: 128 | return StreamingResponse(generate_stream(), media_type="text/event-stream") 129 | else: 130 | full_response = "" 131 | usage_data = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0} 132 | async for chunk in generate_stream(): 133 | if chunk.startswith("data: "): 134 | chunk_data = json.loads(chunk[6:]) 135 | if "choices" in chunk_data and chunk_data["choices"]: 136 | delta = chunk_data["choices"][0].get("delta", {}) 137 | if "content" in delta: 138 | full_response += delta["content"] 139 | if chunk_data["choices"][0].get("finish_reason") == "stop": 140 | break 141 | 142 | formatted_response = { 143 | "id": f"chatcmpl-{generate_id()}", 144 | "object": "chat.completion", 145 | "created": int(time.time()), 146 | "model": request.model, 147 | "choices": [ 148 | { 149 | "index": 0, 150 | "message": { 151 | "role": "assistant", 152 | "content": full_response.strip(), 153 | }, 154 | "finish_reason": "stop", 155 | } 156 | ], 157 | "usage": usage_data, 158 | } 159 | return JSONResponse(content=formatted_response) 160 | 161 | @app.function(image=image, secrets=[Secret.from_name("dify-secret")]) 162 | @asgi_app() 163 | def dify_to_openai_app(): 164 | return fastapi_app 165 | -------------------------------------------------------------------------------- /dify/git_traverse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fnmatch 3 | import shutil 4 | from pydantic import BaseModel 5 | from fastapi import HTTPException, Header 6 | from fastapi.responses import 
JSONResponse 7 | from modal import Image, App, web_endpoint, Secret, Volume, method, enter, exit 8 | from typing import Optional, List, Union 9 | from enum import Enum 10 | from urllib.parse import urlparse 11 | from asyncio import Lock 12 | 13 | # Create Modal Image with required dependencies 14 | image = ( 15 | Image.debian_slim() 16 | .apt_install("git") 17 | .pip_install("gitpython") 18 | ) 19 | 20 | # Create Modal App 21 | app = App(name="git-traverser") 22 | 23 | # Create a volume to store cloned repositories 24 | repo_volume = Volume.from_name("repo-volume", create_if_missing=True) 25 | 26 | class RepoType(str, Enum): 27 | GITHUB = "github" 28 | GITLAB = "gitlab" 29 | 30 | class GitRepoRequest(BaseModel): 31 | repo_url: str 32 | branch: Optional[str] = "main" 33 | type: Optional[Union[RepoType, str]] = None 34 | file_patterns: Optional[Union[List[str], str]] = None 35 | git_token: Optional[str] = None 36 | 37 | @classmethod 38 | def parse_obj(cls, obj): 39 | # Convert "null" strings to None 40 | for key, value in obj.items(): 41 | if isinstance(value, str) and value.lower() == "null": 42 | obj[key] = None 43 | 44 | # Set default branch to "main" if it's None 45 | if obj.get('branch') is None: 46 | obj['branch'] = "main" 47 | 48 | return super().parse_obj(obj) 49 | 50 | # Directories and files to ignore 51 | IGNORE_PATTERNS = [ 52 | "node_modules", 53 | "__pycache__", 54 | "env", 55 | "venv", 56 | ".venv", 57 | "virtualenv", 58 | "target/dependency", 59 | "build/dependencies", 60 | "dist", 61 | "out", 62 | "bundle", 63 | "vendor", 64 | "tmp", 65 | "temp", 66 | "deps", 67 | "pkg", 68 | "Pods", 69 | ".git", 70 | ".*", 71 | "*.lock", # This will catch package-lock.json, yarn.lock, Gemfile.lock, etc. 72 | "package-lock.json", 73 | "yarn.lock", 74 | "pnpm-lock.yaml", 75 | "Gemfile.lock", 76 | "Pipfile.lock", 77 | "poetry.lock", 78 | "composer.lock", 79 | "Cargo.lock", 80 | "mix.lock", 81 | "shard.lock", 82 | "Podfile.lock", 83 | "gradle.lockfile", 84 | "pubspec.lock", 85 | "project.assets.json", 86 | "packages.lock.json", 87 | "*.pyc", 88 | "*.pyo", 89 | "*.pyd", 90 | "*.so", 91 | "*.dll", 92 | "*.exe", 93 | "*.bin", 94 | "*.obj", 95 | "*.o", 96 | "*.a", 97 | "*.lib", 98 | "*.log", 99 | "*.cache", 100 | "*.bak", 101 | "*.swp", 102 | "*.swo", 103 | "*.tmp", 104 | "*.temp", 105 | "*.DS_Store", 106 | "Thumbs.db", 107 | "desktop.ini", 108 | "go.sum", 109 | ] 110 | 111 | # Important file patterns 112 | DEFAULT_IMPORTANT_FILE_PATTERNS = [ 113 | "*.md", 114 | "README*", 115 | "CONTRIBUTING*", 116 | "CHANGELOG*", 117 | "go.mod", 118 | "go.sum", 119 | "package.json", 120 | "package-lock.json", 121 | "yarn.lock", 122 | "Gemfile", 123 | "Gemfile.lock", 124 | "requirements.txt", 125 | "setup.py", 126 | "Pipfile", 127 | "Pipfile.lock", 128 | "pom.xml", 129 | "build.gradle", 130 | "Cargo.toml", 131 | "Cargo.lock", 132 | "devbox.json", 133 | "Dockerfile", 134 | ".gitignore", 135 | ".dockerignore", 136 | "docker-compose.yml", 137 | "docker-compose.yaml", 138 | ".env.example", 139 | "Makefile", 140 | "*.config.js", 141 | "tsconfig.json", 142 | "tslint.json", 143 | "eslintrc.*", 144 | "prettierrc.*", 145 | ] 146 | 147 | def should_ignore(path: str) -> bool: 148 | path_parts = path.split(os.sep) 149 | for part in path_parts: 150 | if any(fnmatch.fnmatch(part, pattern) for pattern in IGNORE_PATTERNS): 151 | return True 152 | return False 153 | 154 | def is_important_file(rel_path: str, custom_patterns: Optional[List[str]] = None) -> bool: 155 | patterns = custom_patterns if custom_patterns is not None else 
DEFAULT_IMPORTANT_FILE_PATTERNS 156 | return any(fnmatch.fnmatch(rel_path, pattern) for pattern in patterns) 157 | 158 | def validate_bearer_token(bearer_token: str, valid_token: str) -> bool: 159 | return bearer_token == f"Bearer {valid_token}" 160 | 161 | def detect_repo_type(repo_url: str) -> RepoType: 162 | if "github.com" in repo_url: 163 | return RepoType.GITHUB 164 | elif "gitlab.com" in repo_url: 165 | return RepoType.GITLAB 166 | else: 167 | raise ValueError("Unable to detect repository type. Please specify 'type' in the request.") 168 | 169 | @app.cls(image=image, container_idle_timeout=30, allow_concurrent_inputs=10, volumes={"/repos": repo_volume}) 170 | class GitTraverser: 171 | @enter() 172 | def initialize(self): 173 | self.clone_dir = "/repos" 174 | if not os.path.exists(self.clone_dir): 175 | os.makedirs(self.clone_dir) 176 | repo_volume.reload() 177 | self.repo_locks = {} 178 | 179 | @exit() 180 | def cleanup(self): 181 | print("Cleaning up repository directory...") 182 | for item in os.listdir(self.clone_dir): 183 | item_path = os.path.join(self.clone_dir, item) 184 | if os.path.islink(item_path): 185 | os.unlink(item_path) 186 | elif os.path.isdir(item_path): 187 | shutil.rmtree(item_path) 188 | else: 189 | os.remove(item_path) 190 | 191 | # Commit changes to the volume 192 | repo_volume.commit() 193 | print("Repository directory cleared.") 194 | 195 | @method() 196 | async def traverse_git_repo(self, repo_url: str, branch: Optional[str] = None, repo_type: RepoType = None, token: Optional[str] = None, file_patterns: Optional[List[str]] = None) -> dict: 197 | """ 198 | Clone a git repository blobless if it doesn't exist, traverse it, and return its directory structure. 199 | """ 200 | 201 | import git 202 | 203 | # Detect repo type if not provided 204 | if repo_type is None: 205 | repo_type = detect_repo_type(repo_url) 206 | 207 | # Extract repo name from the URL 208 | repo_name = os.path.splitext(os.path.basename(urlparse(repo_url).path))[0] 209 | clone_dir = os.path.join(self.clone_dir, repo_name) 210 | 211 | # Get or create a lock for this repository 212 | if repo_name not in self.repo_locks: 213 | self.repo_locks[repo_name] = Lock() 214 | 215 | async with self.repo_locks[repo_name]: 216 | def prepare_clone_url(): 217 | if token: 218 | if repo_type == RepoType.GITHUB: 219 | return repo_url.replace('https://', f'https://{token}@') 220 | elif repo_type == RepoType.GITLAB: 221 | return repo_url.replace('https://', f'https://oauth2:{token}@') 222 | return repo_url 223 | 224 | try: 225 | clone_url = prepare_clone_url() 226 | 227 | # Check if the repository exists 228 | git.cmd.Git().ls_remote(clone_url) 229 | 230 | if os.path.exists(clone_dir): 231 | print(f"Repository directory already exists: {clone_dir}") 232 | repo = git.Repo(clone_dir) 233 | else: 234 | print(f"Cloning repository: {repo_url}") 235 | 236 | # Clone without specifying a branch first 237 | repo = git.Repo.clone_from(clone_url, clone_dir, filter='blob:none') 238 | print(f"Successfully cloned repository") 239 | 240 | # After cloning, try to checkout the specified branch if it exists 241 | if branch: 242 | try: 243 | repo.git.checkout(branch) 244 | print(f"Checked out branch: {branch}") 245 | except git.GitCommandError: 246 | print(f"Branch '{branch}' not found, staying on default branch") 247 | 248 | def traverse_directory(path='.'): 249 | result = {} 250 | try: 251 | items = repo.git.ls_tree('-r', '--name-only', 'HEAD', path).splitlines() 252 | except git.GitCommandError as e: 253 | print(f"Git error in 
traverse_directory: {str(e)}") 254 | return result 255 | 256 | for item in items: 257 | if should_ignore(item): 258 | continue 259 | 260 | parts = item.split('/') 261 | current = result 262 | for part in parts[:-1]: 263 | if part not in current: 264 | current[part] = {} 265 | current = current[part] 266 | 267 | if is_important_file(item, file_patterns): 268 | try: 269 | content = repo.git.show(f'HEAD:{item}') 270 | current[parts[-1]] = content 271 | except git.GitCommandError as e: 272 | print(f"Error reading file {item}: {str(e)}") 273 | current[parts[-1]] = "Error reading file" 274 | else: 275 | current[parts[-1]] = "file" 276 | 277 | return result 278 | 279 | # Traverse the repository 280 | structure = traverse_directory() 281 | 282 | # Commit changes to the volume 283 | repo_volume.commit() 284 | 285 | return {"structure": structure} 286 | 287 | except git.GitCommandError as e: 288 | if "Repository not found" in str(e): 289 | raise Exception(f"Repository not found: {repo_url}") 290 | elif "Remote branch not found" in str(e): 291 | raise Exception(f"Branch '{branch}' not found in the repository") 292 | else: 293 | raise Exception(f"Git command error: {str(e)}") 294 | except Exception as e: 295 | raise Exception(f"Error traversing repository: {str(e)}") 296 | 297 | @app.function(image=image, secrets=[Secret.from_name("git-traverser-secret")]) 298 | @web_endpoint(method="POST") 299 | def get_git_structure( 300 | request: GitRepoRequest, 301 | authorization: str = Header(None), 302 | x_git_token: Optional[str] = Header(None, alias="X-Git-Token") 303 | ): 304 | try: 305 | # Validate bearer token 306 | valid_token = os.environ["API_KEY"] 307 | if not authorization or not validate_bearer_token(authorization, valid_token): 308 | raise HTTPException(status_code=401, detail="Invalid or missing bearer token") 309 | 310 | # Detect or use provided repo type 311 | repo_type = request.type if request.type not in (None, "null") else detect_repo_type(request.repo_url) 312 | 313 | # Use git_token from request body if provided, otherwise use x_git_token from header 314 | git_token = request.git_token or x_git_token 315 | 316 | # Convert file_patterns to List[str] if it's a string 317 | file_patterns = request.file_patterns 318 | if isinstance(file_patterns, str): 319 | file_patterns = [pattern.strip() for pattern in file_patterns.split(',')] 320 | 321 | # Use the branch from the request, which will default to "main" if not provided or set to "null" 322 | branch = request.branch 323 | 324 | structure = GitTraverser().traverse_git_repo.remote( 325 | request.repo_url, 326 | branch, 327 | repo_type, 328 | git_token, 329 | file_patterns 330 | ) 331 | return JSONResponse(content=structure) 332 | except HTTPException as he: 333 | return JSONResponse(content={"error": he.detail}, status_code=he.status_code) 334 | except Exception as e: 335 | return JSONResponse(content={"error": str(e)}, status_code=400) 336 | 337 | if __name__ == "__main__": 338 | app.serve() 339 | -------------------------------------------------------------------------------- /dify/linkedin.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from fastapi.responses import JSONResponse 3 | from modal import Image, App, web_endpoint 4 | from typing import List, Optional 5 | 6 | # Create Modal Image with required dependencies 7 | image = Image.debian_slim().pip_install("playwright").run_commands( 8 | "apt-get update", 9 | "apt-get install -y software-properties-common", 10 | 
"apt-add-repository non-free", 11 | "apt-add-repository contrib", 12 | "playwright install-deps chromium", 13 | "playwright install chromium", 14 | ) 15 | 16 | # Create Modal App 17 | app = App(name="linkedin-job-scraper") 18 | 19 | class JobRequest(BaseModel): 20 | location: str 21 | keywords: Optional[str] = None 22 | limit: Optional[int] = 10 23 | 24 | @app.function(image=image) 25 | def scrape_linkedin_jobs(location: str, keywords: Optional[str] = None, limit: int = 10) -> List[dict]: 26 | """ 27 | Scrape LinkedIn job postings based on location and optional keywords 28 | """ 29 | from playwright.sync_api import sync_playwright 30 | import re 31 | 32 | search_url = f"https://www.linkedin.com/jobs/search/?location={location}" 33 | if keywords: 34 | search_url += f"&keywords={keywords}" 35 | 36 | jobs = [] 37 | 38 | with sync_playwright() as p: 39 | browser = p.chromium.launch(headless=True) 40 | context = browser.new_context(viewport={"width": 1920, "height": 1080}) 41 | context.set_default_timeout(120000) 42 | page = context.new_page() 43 | page.goto(search_url) 44 | 45 | # Wait for job listings to load 46 | page.wait_for_selector(".jobs-search__results-list") 47 | 48 | # Scroll to load more jobs 49 | for _ in range(limit // 25 + 1): # LinkedIn loads ~25 jobs per scroll 50 | page.evaluate("window.scrollTo(0, document.body.scrollHeight)") 51 | 52 | job_cards = page.query_selector_all(".jobs-search__results-list > li") 53 | 54 | for card in job_cards[:limit]: 55 | title_elem = card.query_selector(".base-search-card__title") 56 | company_elem = card.query_selector(".base-search-card__subtitle") 57 | location_elem = card.query_selector(".job-search-card__location") 58 | link_elem = card.query_selector("a.base-card__full-link") 59 | 60 | if title_elem and company_elem and location_elem and link_elem: 61 | job = { 62 | "title": title_elem.inner_text(), 63 | "company": company_elem.inner_text(), 64 | "location": location_elem.inner_text(), 65 | "link": link_elem.get_attribute("href"), 66 | } 67 | jobs.append(job) 68 | 69 | browser.close() 70 | 71 | return jobs 72 | 73 | @app.function(image=image) 74 | @web_endpoint(method="POST") 75 | def get_linkedin_jobs(request: JobRequest): 76 | try: 77 | jobs = scrape_linkedin_jobs.remote(request.location, request.keywords, request.limit) 78 | return JSONResponse(content={"jobs": jobs}) 79 | except Exception as e: 80 | return JSONResponse(content={"error": str(e)}, status_code=400) 81 | 82 | if __name__ == "__main__": 83 | app.serve() -------------------------------------------------------------------------------- /dify/quack_memo.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from fastapi import FastAPI 3 | from fastapi.middleware.cors import CORSMiddleware 4 | import json 5 | from datetime import date, datetime 6 | from modal import Image, App, asgi_app 7 | 8 | image = Image.debian_slim().pip_install("duckdb==1.1.0", "fastapi") 9 | app = App(name="quack") 10 | 11 | class Query(BaseModel): 12 | sql: str 13 | 14 | def json_serial(obj): 15 | if isinstance(obj, (datetime, date)): 16 | return obj.isoformat() 17 | raise TypeError(f"Type {type(obj)} not serializable") 18 | 19 | @app.function(image=image, min_containers=1, scaledown_window=300, allow_concurrent_inputs=10) 20 | def preload_and_query_duckdb(query: str): 21 | import duckdb 22 | conn = duckdb.connect('vault.duckdb') 23 | 24 | # Check if the 'vault' table exists 25 | table_exists = conn.execute("SELECT name FROM 
sqlite_master WHERE type='table' AND name='vault'").fetchone() is not None 26 | 27 | # Preload the data only if the 'vault' table doesn't exist 28 | if not table_exists: 29 | conn.execute("INSTALL httpfs") 30 | conn.execute("LOAD httpfs") 31 | conn.execute("IMPORT DATABASE 'https://memo.d.foundation/db'") 32 | conn.execute("INSTALL fts") 33 | conn.execute("LOAD fts") 34 | conn.execute("PRAGMA create_fts_index('vault', 'file_path', 'title', 'md_content', 'tags', 'authors')") 35 | conn.execute("INSTALL vss") 36 | conn.execute("LOAD vss") 37 | conn.execute("SET hnsw_enable_experimental_persistence = true") 38 | conn.execute("CREATE INDEX emb_openai_hnsw_index ON vault USING HNSW (embeddings_openai)") 39 | conn.execute("CREATE INDEX emb_spr_custom_hnsw_index ON vault USING HNSW (embeddings_spr_custom)") 40 | 41 | try: 42 | result = conn.execute(query).fetchall() 43 | column_names = [desc[0] for desc in conn.description] 44 | formatted_result = [dict(zip(column_names, row)) for row in result] 45 | return formatted_result 46 | finally: 47 | conn.close() 48 | 49 | # Create FastAPI app 50 | web_app = FastAPI( 51 | title="Quack DuckDB Query API", 52 | version="1.0", 53 | description="API for querying DuckDB database.", 54 | ) 55 | 56 | # Add CORS middleware 57 | web_app.add_middleware( 58 | CORSMiddleware, 59 | allow_origins=["*"], 60 | allow_credentials=True, 61 | allow_methods=["*"], 62 | allow_headers=["*"], 63 | ) 64 | 65 | @web_app.post("/") 66 | async def query_duckdb(query: Query): 67 | try: 68 | result = preload_and_query_duckdb.remote(query.sql) 69 | json_compatible_result = json.loads( 70 | json.dumps(result, default=json_serial) 71 | ) 72 | return {"result": json_compatible_result} 73 | except Exception as e: 74 | return {"error": str(e)} 75 | 76 | @app.function(image=image) 77 | @asgi_app() 78 | def serve(): 79 | return web_app 80 | -------------------------------------------------------------------------------- /dify/twitter.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from fastapi.responses import JSONResponse 3 | from modal import Image, App, web_endpoint 4 | 5 | # Create Modal Image with required dependencies 6 | image = Image.debian_slim().pip_install("playwright").run_commands( 7 | "apt-get update", 8 | "apt-get install -y software-properties-common", 9 | "apt-add-repository non-free", 10 | "apt-add-repository contrib", 11 | "playwright install-deps chromium", 12 | "playwright install chromium", 13 | ) 14 | 15 | # Create Modal App 16 | app = App(name="twitter-scraper") 17 | 18 | class TweetRequest(BaseModel): 19 | url: str 20 | 21 | @app.function(image=image) 22 | def scrape_tweet(url: str) -> dict: 23 | """ 24 | Scrape a single tweet page for Tweet thread 25 | Return parent tweet, reply tweets and recommended tweets 26 | """ 27 | from playwright.sync_api import sync_playwright 28 | 29 | _xhr_calls = [] 30 | 31 | def intercept_response(response): 32 | """capture all background requests and save them""" 33 | if response.request.resource_type == "xhr": 34 | _xhr_calls.append(response) 35 | return response 36 | 37 | with sync_playwright() as pw: 38 | browser = pw.chromium.launch(headless=True) 39 | context = browser.new_context(viewport={"width": 1920, "height": 1080}) 40 | context.set_default_timeout(120000) 41 | page = context.new_page() 42 | 43 | # enable background request intercepting: 44 | page.on("response", intercept_response) 45 | # go to url and wait for the page to load 46 | page.goto(url) 47 | 
page.wait_for_selector("[data-testid='tweet']") 48 | 49 | # find all tweet background requests: 50 | tweet_calls = [f for f in _xhr_calls if "TweetResultByRestId" in f.url] 51 | for xhr in tweet_calls: 52 | data = xhr.json() 53 | return data['data']['tweetResult']['result'] 54 | 55 | @app.function(image=image) 56 | @web_endpoint(method="POST") 57 | def get_tweet(request: TweetRequest): 58 | try: 59 | tweet_data = scrape_tweet.remote(request.url) 60 | return JSONResponse(content={"tweet_data": tweet_data}) 61 | except Exception as e: 62 | return JSONResponse(content={"error": str(e)}, status_code=400) 63 | -------------------------------------------------------------------------------- /dify/yt_transcribe.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from fastapi.responses import JSONResponse 3 | from modal import Image, App, web_endpoint 4 | from typing import List, Optional 5 | 6 | image = Image.debian_slim().pip_install("youtube_transcript_api") 7 | app = App(name="youtube-transcript") 8 | 9 | class TranscriptRequest(BaseModel): 10 | video_id: str 11 | languages: Optional[List[str]] = None 12 | 13 | @app.function(image=image) 14 | def get_youtube_transcript(video_id: str, languages: Optional[List[str]] = None): 15 | from youtube_transcript_api import YouTubeTranscriptApi 16 | try: 17 | if languages: 18 | return YouTubeTranscriptApi.get_transcript(video_id, languages=languages) 19 | else: 20 | return YouTubeTranscriptApi.get_transcript(video_id) 21 | except Exception as e: 22 | raise Exception(f"Error fetching transcript: {str(e)}") 23 | 24 | @app.function(image=image) 25 | @web_endpoint(method="POST") 26 | def get_transcript(request: TranscriptRequest): 27 | try: 28 | transcript = get_youtube_transcript.remote(request.video_id, request.languages) 29 | formatted_transcript = [f"{entry['start']}: {entry['text']}" for entry in transcript] 30 | return JSONResponse(content={"transcript": formatted_transcript}) 31 | except Exception as e: 32 | return JSONResponse(content={"error": str(e)}, status_code=400) 33 | 34 | if __name__ == "__main__": 35 | app.serve() -------------------------------------------------------------------------------- /infinity_mxbai_embed_large_v1.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with Infinity (mixedbread-ai/mxbai-embed-large-v1) 2 | 3 | import os 4 | import subprocess 5 | import secrets 6 | 7 | from modal import Image, Secret, App, enter, gpu, method, web_server 8 | 9 | MODEL_DIR = "/model" 10 | BASE_MODEL = "mixedbread-ai/mxbai-embed-large-v1" 11 | 12 | # ## Define a container image 13 | 14 | 15 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 16 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 17 | # advantage of Modal's internal filesystem for faster cold starts. 18 | # 19 | # ### Download the weights 20 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 21 | # 22 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 
23 | def download_model_to_folder(): 24 | from huggingface_hub import snapshot_download 25 | from transformers.utils import move_cache 26 | 27 | os.makedirs(MODEL_DIR, exist_ok=True) 28 | 29 | snapshot_download( 30 | BASE_MODEL, 31 | local_dir=MODEL_DIR, 32 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 33 | ) 34 | move_cache() 35 | 36 | 37 | # ### Image definition 38 | # We'll start from a recommended Docker Hub image and install `vLLM`. 39 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 40 | # the model are saved within the container image. 41 | image = ( 42 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 43 | .pip_install( 44 | "wheel==0.44.0", 45 | "huggingface_hub==0.25.0", 46 | "hf-transfer==0.1.8", 47 | "torch==2.4.1", 48 | "transformers==4.44.2", 49 | "sentence-transformers==3.1.0", 50 | "infinity_emb[all]==0.0.56" 51 | ) 52 | .apt_install("git") 53 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 54 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 55 | .run_function( 56 | download_model_to_folder, 57 | secrets=[Secret.from_name("huggingface")], 58 | timeout=60 * 20, 59 | ) 60 | ) 61 | 62 | app = App("infinity-mxbai-embed-large-v1", image=image) 63 | GPU_CONFIG = gpu.T4(count=1) 64 | 65 | 66 | # Run a web server on port 7997 and expose the Infinity embedding server 67 | @app.function( 68 | allow_concurrent_inputs=100, 69 | container_idle_timeout=15, 70 | gpu=GPU_CONFIG, 71 | secrets=[ 72 | Secret.from_name("huggingface"), 73 | Secret.from_dotenv(), 74 | ], 75 | ) 76 | @web_server(7997, startup_timeout=300) 77 | def infinity_embeddings_server(): 78 | cmd = f"infinity_emb v2 --device cuda --engine torch --model-id {BASE_MODEL}" 79 | subprocess.Popen(cmd, shell=True) 80 | -------------------------------------------------------------------------------- /infinity_mxbai_rerank_large_v1.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with Infinity (mixedbread-ai/mxbai-rerank-large-v1) 2 | 3 | import os 4 | import subprocess 5 | import secrets 6 | 7 | from modal import Image, Secret, App, enter, gpu, method, web_server 8 | 9 | MODEL_DIR = "/model" 10 | BASE_MODEL = "mixedbread-ai/mxbai-rerank-large-v1" 11 | 12 | # ## Define a container image 13 | 14 | 15 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 16 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 17 | # advantage of Modal's internal filesystem for faster cold starts. 18 | # 19 | # ### Download the weights 20 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 21 | # 22 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 23 | def download_model_to_folder(): 24 | from huggingface_hub import snapshot_download 25 | from transformers.utils import move_cache 26 | 27 | os.makedirs(MODEL_DIR, exist_ok=True) 28 | 29 | snapshot_download( 30 | BASE_MODEL, 31 | local_dir=MODEL_DIR, 32 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 33 | ) 34 | move_cache() 35 | 36 | 37 | # ### Image definition 38 | # We'll start from a recommended Docker Hub image and install `vLLM`. 
39 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 40 | # the model are saved within the container image. 41 | image = ( 42 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 43 | .pip_install( 44 | "wheel==0.44.0", 45 | "huggingface_hub==0.25.0", 46 | "hf-transfer==0.1.8", 47 | "torch==2.4.1", 48 | "transformers==4.44.2", 49 | "sentence-transformers==3.1.0", 50 | "infinity_emb[all]==0.0.56" 51 | ) 52 | .apt_install("git") 53 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 54 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 55 | .run_function( 56 | download_model_to_folder, 57 | secrets=[Secret.from_name("huggingface")], 58 | timeout=60 * 20, 59 | ) 60 | ) 61 | 62 | app = App("infinity-mxbai-rerank-large-v1", image=image) 63 | GPU_CONFIG = gpu.T4(count=1) 64 | 65 | 66 | # Run a web server on port 7997 and expose the Infinity embedding server 67 | @app.function( 68 | allow_concurrent_inputs=100, 69 | container_idle_timeout=15, 70 | gpu=GPU_CONFIG, 71 | secrets=[ 72 | Secret.from_name("huggingface"), 73 | Secret.from_dotenv(), 74 | ], 75 | ) 76 | @web_server(7997, startup_timeout=300) 77 | def infinity_embeddings_server(): 78 | cmd = f"infinity_emb v2 --device cuda --engine torch --model-id {BASE_MODEL}" 79 | subprocess.Popen(cmd, shell=True) 80 | -------------------------------------------------------------------------------- /infinity_snowflake_arctic_embed_l_335m.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with Infinity (Snowflake/snowflake-arctic-embed-l) 2 | 3 | import os 4 | import subprocess 5 | import secrets 6 | 7 | from modal import Image, Secret, App, enter, gpu, method, web_server 8 | 9 | MODEL_DIR = "/model" 10 | BASE_MODEL = "Snowflake/snowflake-arctic-embed-l" 11 | 12 | # ## Define a container image 13 | 14 | 15 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 16 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 17 | # advantage of Modal's internal filesystem for faster cold starts. 18 | # 19 | # ### Download the weights 20 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 21 | # 22 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 23 | def download_model_to_folder(): 24 | from huggingface_hub import snapshot_download 25 | from transformers.utils import move_cache 26 | 27 | os.makedirs(MODEL_DIR, exist_ok=True) 28 | 29 | snapshot_download( 30 | BASE_MODEL, 31 | local_dir=MODEL_DIR, 32 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 33 | ) 34 | move_cache() 35 | 36 | 37 | # ### Image definition 38 | # We'll start from a recommended Docker Hub image and install `vLLM`. 39 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 40 | # the model are saved within the container image. 
41 | image = ( 42 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 43 | .pip_install( 44 | "wheel==0.44.0", 45 | "huggingface_hub==0.25.0", 46 | "hf-transfer==0.1.8", 47 | "torch==2.4.1", 48 | "transformers==4.44.2", 49 | "sentence-transformers==3.1.0", 50 | "infinity_emb[all]==0.0.56" 51 | ) 52 | .apt_install("git") 53 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 54 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 55 | .run_function( 56 | download_model_to_folder, 57 | secrets=[Secret.from_name("huggingface")], 58 | timeout=60 * 20, 59 | ) 60 | ) 61 | 62 | app = App("infinity-snowflake-arctic-embed-l-335m", image=image) 63 | GPU_CONFIG = gpu.T4(count=1) 64 | 65 | 66 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server 67 | @app.function( 68 | allow_concurrent_inputs=100, 69 | container_idle_timeout=15, 70 | gpu=GPU_CONFIG, 71 | secrets=[ 72 | Secret.from_name("huggingface"), 73 | Secret.from_dotenv(), 74 | ], 75 | ) 76 | @web_server(7997, startup_timeout=300) 77 | def infinity_embeddings_server(): 78 | cmd = f"infinity_emb v2 --device cuda --engine torch --model-id {BASE_MODEL}" 79 | subprocess.Popen(cmd, shell=True) 80 | -------------------------------------------------------------------------------- /outlines_llama3_8b.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with vLLM (meta-llama/Meta-Llama-3-8B-Instruct) 2 | # 3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) 4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. 5 | 6 | import os 7 | import subprocess 8 | import secrets 9 | 10 | 11 | from modal import Image, Secret, App, enter, gpu, method, web_server 12 | 13 | MODEL_DIR = "/model" 14 | BASE_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct" 15 | 16 | # ## Define a container image 17 | 18 | 19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 21 | # advantage of Modal's internal filesystem for faster cold starts. 22 | # 23 | # ### Download the weights 24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 25 | # 26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 27 | def download_model_to_folder(): 28 | from huggingface_hub import snapshot_download 29 | from transformers.utils import move_cache 30 | 31 | os.makedirs(MODEL_DIR, exist_ok=True) 32 | 33 | snapshot_download( 34 | BASE_MODEL, 35 | local_dir=MODEL_DIR, 36 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 37 | ) 38 | move_cache() 39 | 40 | 41 | # ### Image definition 42 | # We'll start from a recommended Docker Hub image and install `vLLM`. 43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 44 | # the model are saved within the container image. 
45 | image = ( 46 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 47 | .pip_install( 48 | "vllm==0.4.3", 49 | "wheel==0.43.0", 50 | "packaging==24.0", 51 | "huggingface_hub==0.23.3", 52 | "hf-transfer==0.1.6", 53 | "torch==2.3.0", 54 | "autoawq==0.2.5", 55 | "outlines[serve]==0.0.34", 56 | ) 57 | .apt_install("git") 58 | .run_commands( 59 | "pip install flash-attn==2.5.8 --no-build-isolation", 60 | ) 61 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 62 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 63 | .run_function( 64 | download_model_to_folder, 65 | secrets=[Secret.from_name("huggingface")], 66 | timeout=60 * 20, 67 | ) 68 | ) 69 | 70 | app = App("outlines-llama3-8b", image=image) 71 | GPU_CONFIG = gpu.A100(size="40GB", count=1) 72 | 73 | 74 | # Run a web server on port 7997 and expose the Infinity embedding server 75 | @app.function( 76 | allow_concurrent_inputs=100, 77 | container_idle_timeout=15, 78 | gpu=GPU_CONFIG, 79 | secrets=[ 80 | Secret.from_name("huggingface"), 81 | Secret.from_dotenv(), 82 | ], 83 | ) 84 | @web_server(8000, startup_timeout=300) 85 | def outlines_server(): 86 | target = BASE_MODEL 87 | cmd = f"python -m outlines.serve.serve --model {target} --port 8000" 88 | subprocess.Popen(cmd, shell=True) 89 | -------------------------------------------------------------------------------- /vllm_arctic_480b.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with vLLM (Snowflake/snowflake-arctic-instruct) 2 | # 3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) 4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. 5 | 6 | import os 7 | import subprocess 8 | import secrets 9 | 10 | 11 | from modal import Image, Secret, App, enter, gpu, method, web_server 12 | 13 | MODEL_DIR = "/model" 14 | BASE_MODEL = "Snowflake/snowflake-arctic-instruct" 15 | 16 | # ## Define a container image 17 | 18 | 19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 21 | # advantage of Modal's internal filesystem for faster cold starts. 22 | # 23 | # ### Download the weights 24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 25 | # 26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 27 | def download_model_to_folder(): 28 | from huggingface_hub import snapshot_download 29 | from transformers.utils import move_cache 30 | 31 | os.makedirs(MODEL_DIR, exist_ok=True) 32 | 33 | snapshot_download( 34 | BASE_MODEL, 35 | local_dir=MODEL_DIR, 36 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 37 | ) 38 | move_cache() 39 | 40 | 41 | # ### Image definition 42 | # We'll start from a recommended Docker Hub image and install `vLLM`. 43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 44 | # the model are saved within the container image. 
45 | image = ( 46 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 47 | .pip_install( 48 | "vllm==0.6.1.post2", 49 | "wheel==0.44.0", 50 | "packaging==24.1", 51 | "huggingface_hub==0.25.0", 52 | "hf-transfer==0.1.8", 53 | "torch==2.4.0", 54 | ) 55 | .apt_install("git") 56 | .run_commands( 57 | "pip install flash-attn==2.6.3 --no-build-isolation", 58 | ) # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 59 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 60 | .run_function( 61 | download_model_to_folder, 62 | secrets=[Secret.from_name("huggingface")], 63 | timeout=60 * 60, 64 | ) 65 | ) 66 | 67 | app = App("vllm-arctic", image=image) 68 | GPU_CONFIG = gpu.A100(size="40GB", count=1) 69 | 70 | 71 | # Run a web server on port 7997 and expose the Infinity embedding server 72 | @app.function( 73 | allow_concurrent_inputs=100, 74 | container_idle_timeout=15, 75 | gpu=GPU_CONFIG, 76 | secrets=[ 77 | Secret.from_name("huggingface"), 78 | Secret.from_dotenv(), 79 | ], 80 | ) 81 | @web_server(8000, startup_timeout=300) 82 | def openai_compatible_server(): 83 | target = BASE_MODEL 84 | cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000" 85 | subprocess.Popen(cmd, shell=True) 86 | -------------------------------------------------------------------------------- /vllm_aya_8b.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with vLLM (CohereForAI/aya-23-8B) 2 | # 3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) 4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. 5 | 6 | import os 7 | import subprocess 8 | import secrets 9 | 10 | 11 | from modal import Image, Secret, App, enter, gpu, method, web_server 12 | 13 | MODEL_DIR = "/model" 14 | BASE_MODEL = "CohereForAI/aya-23-8B" 15 | 16 | # ## Define a container image 17 | 18 | 19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 21 | # advantage of Modal's internal filesystem for faster cold starts. 22 | # 23 | # ### Download the weights 24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 25 | # 26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 27 | def download_model_to_folder(): 28 | from huggingface_hub import snapshot_download 29 | from transformers.utils import move_cache 30 | 31 | os.makedirs(MODEL_DIR, exist_ok=True) 32 | 33 | snapshot_download( 34 | BASE_MODEL, 35 | local_dir=MODEL_DIR, 36 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 37 | ) 38 | move_cache() 39 | 40 | 41 | # ### Image definition 42 | # We'll start from a recommended Docker Hub image and install `vLLM`. 43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 44 | # the model are saved within the container image. 
45 | image = (
46 |     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
47 |     .pip_install(
48 |         "vllm==0.6.1.post2",
49 |         "wheel==0.44.0",
50 |         "packaging==24.1",
51 |         "huggingface_hub==0.25.0",
52 |         "hf-transfer==0.1.8",
53 |         "torch==2.4.0",
54 |     )
55 |     .apt_install("git")
56 |     .run_commands(
57 |         "pip install flash-attn==2.6.3 --no-build-isolation",
58 |     ) # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
59 |     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
60 |     .run_function(
61 |         download_model_to_folder,
62 |         secrets=[Secret.from_name("huggingface")],
63 |         timeout=60 * 20,
64 |     )
65 | )
66 | 
67 | app = App("vllm-aya-8b", image=image)
68 | GPU_CONFIG = gpu.A100(size="40GB", count=1)
69 | 
70 | 
71 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server
72 | @app.function(
73 |     allow_concurrent_inputs=100,
74 |     container_idle_timeout=15,
75 |     gpu=GPU_CONFIG,
76 |     secrets=[
77 |         Secret.from_name("huggingface"),
78 |         Secret.from_dotenv(),
79 |     ],
80 | )
81 | @web_server(8000, startup_timeout=300)
82 | def openai_compatible_server():
83 |     target = BASE_MODEL
84 |     cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000"
85 |     subprocess.Popen(cmd, shell=True)
86 | 
--------------------------------------------------------------------------------
/vllm_codeqwen_110b_v1_5.py:
--------------------------------------------------------------------------------
1 | # # Fast inference with vLLM (Qwen/Qwen1.5-110B-Chat-AWQ)
2 | #
3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching.
5 | 
6 | import os
7 | import subprocess
8 | import secrets
9 | 
10 | 
11 | from modal import Image, Secret, App, enter, gpu, method, web_server
12 | 
13 | MODEL_DIR = "/model"
14 | BASE_MODEL = "Qwen/Qwen1.5-110B-Chat-AWQ"
15 | 
16 | # ## Define a container image
17 | 
18 | 
19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take
21 | # advantage of Modal's internal filesystem for faster cold starts.
22 | #
23 | # ### Download the weights
24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`.
25 | #
26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
27 | def download_model_to_folder():
28 |     from huggingface_hub import snapshot_download
29 |     from transformers.utils import move_cache
30 | 
31 |     os.makedirs(MODEL_DIR, exist_ok=True)
32 | 
33 |     snapshot_download(
34 |         BASE_MODEL,
35 |         local_dir=MODEL_DIR,
36 |         ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
37 |     )
38 |     move_cache()
39 | 
40 | 
41 | # ### Image definition
42 | # We'll start from a recommended Docker Hub image and install `vLLM`.
43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of
44 | # the model are saved within the container image.
45 | image = ( 46 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 47 | .pip_install( 48 | "vllm==0.5.2", 49 | "wheel==0.43.0", 50 | "packaging==24.1", 51 | "huggingface_hub==0.24.0", 52 | "hf-transfer==0.1.6", 53 | "torch==2.3.1", 54 | "autoawq==0.2.5", 55 | ) 56 | .apt_install("git") 57 | .run_commands( 58 | "pip install flash-attn==2.6.1 --no-build-isolation", 59 | ) # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 60 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 61 | .run_function( 62 | download_model_to_folder, 63 | secrets=[Secret.from_name("huggingface")], 64 | timeout=60 * 20, 65 | ) 66 | ) 67 | 68 | app = App("vllm-codeqwen-110b-v1.5", image=image) 69 | GPU_CONFIG = gpu.A100(memory=80, count=2) 70 | 71 | 72 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server 73 | @app.function( 74 | allow_concurrent_inputs=100, 75 | container_idle_timeout=15, 76 | gpu=GPU_CONFIG, 77 | secrets=[ 78 | Secret.from_name("huggingface"), 79 | Secret.from_dotenv(), 80 | ], 81 | ) 82 | @web_server(8000, startup_timeout=900) 83 | def openai_compatible_server(): 84 | target = BASE_MODEL 85 | cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000 --quantization awq" 86 | subprocess.Popen(cmd, shell=True) 87 | -------------------------------------------------------------------------------- /vllm_deepseek_coder_33b.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with vLLM (TheBloke/deepseek-coder-33B-instruct-AWQ) 2 | # 3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) 4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. 5 | 6 | import os 7 | import subprocess 8 | import secrets 9 | 10 | 11 | from modal import Image, Secret, App, enter, gpu, method, web_server 12 | 13 | MODEL_DIR = "/model" 14 | BASE_MODEL = "TheBloke/deepseek-coder-33B-instruct-AWQ" 15 | 16 | # ## Define a container image 17 | 18 | 19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 21 | # advantage of Modal's internal filesystem for faster cold starts. 22 | # 23 | # ### Download the weights 24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 25 | # 26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 27 | def download_model_to_folder(): 28 | from huggingface_hub import snapshot_download 29 | from transformers.utils import move_cache 30 | 31 | os.makedirs(MODEL_DIR, exist_ok=True) 32 | 33 | snapshot_download( 34 | BASE_MODEL, 35 | local_dir=MODEL_DIR, 36 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 37 | ) 38 | move_cache() 39 | 40 | 41 | # ### Image definition 42 | # We'll start from a recommended Docker Hub image and install `vLLM`. 43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 44 | # the model are saved within the container image. 
45 | image = ( 46 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 47 | .pip_install( 48 | "vllm==0.5.2", 49 | "wheel==0.43.0", 50 | "packaging==24.1", 51 | "huggingface_hub==0.24.0", 52 | "hf-transfer==0.1.6", 53 | "torch==2.3.1", 54 | "autoawq==0.2.5", 55 | ) 56 | .apt_install("git") 57 | .run_commands( 58 | "pip install flash-attn==2.6.1 --no-build-isolation", 59 | ) 60 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 61 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 62 | .run_function( 63 | download_model_to_folder, 64 | secrets=[Secret.from_name("huggingface")], 65 | timeout=60 * 20, 66 | ) 67 | ) 68 | 69 | app = App("vllm-deepseek-coder-33b", image=image) 70 | GPU_CONFIG = gpu.A100(memory=80, count=1) 71 | 72 | 73 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server 74 | @app.function( 75 | allow_concurrent_inputs=100, 76 | container_idle_timeout=15, 77 | gpu=GPU_CONFIG, 78 | secrets=[ 79 | Secret.from_name("huggingface"), 80 | Secret.from_dotenv(), 81 | ], 82 | ) 83 | @web_server(8000, startup_timeout=300) 84 | def openai_compatible_server(): 85 | target = BASE_MODEL 86 | cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000 --quantization awq" 87 | subprocess.Popen(cmd, shell=True) 88 | -------------------------------------------------------------------------------- /vllm_duckdb_nsql_7b.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with vLLM (motherduckdb/DuckDB-NSQL-7B-v0.1) 2 | # 3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) 4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. 5 | 6 | import os 7 | import subprocess 8 | import secrets 9 | 10 | 11 | from modal import Image, Secret, App, enter, gpu, method, web_server 12 | 13 | MODEL_DIR = "/model" 14 | BASE_MODEL = "motherduckdb/DuckDB-NSQL-7B-v0.1" 15 | 16 | # ## Define a container image 17 | 18 | 19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 21 | # advantage of Modal's internal filesystem for faster cold starts. 22 | # 23 | # ### Download the weights 24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 25 | # 26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 27 | def download_model_to_folder(): 28 | from huggingface_hub import snapshot_download 29 | from transformers.utils import move_cache 30 | 31 | os.makedirs(MODEL_DIR, exist_ok=True) 32 | 33 | snapshot_download( 34 | BASE_MODEL, 35 | local_dir=MODEL_DIR, 36 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 37 | ) 38 | move_cache() 39 | 40 | 41 | # ### Image definition 42 | # We'll start from a recommended Docker Hub image and install `vLLM`. 43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 44 | # the model are saved within the container image. 
45 | image = ( 46 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 47 | .pip_install( 48 | "vllm==0.6.1.post2", 49 | "wheel==0.44.0", 50 | "packaging==24.1", 51 | "huggingface_hub==0.25.0", 52 | "hf-transfer==0.1.8", 53 | "torch==2.4.0", 54 | ) 55 | .apt_install("git") 56 | .run_commands( 57 | "pip install flash-attn==2.6.3 --no-build-isolation", 58 | ) 59 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 60 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 61 | .run_function( 62 | download_model_to_folder, 63 | secrets=[Secret.from_name("huggingface")], 64 | timeout=60 * 20, 65 | ) 66 | ) 67 | 68 | app = App("vllm-duckdb-nsql-7b", image=image) 69 | GPU_CONFIG = gpu.A100(size="40GB", count=1) 70 | 71 | 72 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server 73 | @app.function( 74 | allow_concurrent_inputs=100, 75 | container_idle_timeout=15, 76 | gpu=GPU_CONFIG, 77 | secrets=[ 78 | Secret.from_name("huggingface"), 79 | Secret.from_dotenv(), 80 | ], 81 | ) 82 | @web_server(8000, startup_timeout=300) 83 | def openai_compatible_server(): 84 | target = BASE_MODEL 85 | cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000" 86 | subprocess.Popen(cmd, shell=True) 87 | -------------------------------------------------------------------------------- /vllm_llama3_70b.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with vLLM (PrunaAI/Meta-Llama-3-70b-instruct-AWQ-smashed) 2 | # 3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) 4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. 5 | 6 | import os 7 | import subprocess 8 | import secrets 9 | 10 | 11 | from modal import Image, Secret, App, enter, gpu, method, web_server 12 | 13 | MODEL_DIR = "/model" 14 | BASE_MODEL = "PrunaAI/Meta-Llama-3-70b-instruct-AWQ-smashed" 15 | 16 | # ## Define a container image 17 | 18 | 19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 21 | # advantage of Modal's internal filesystem for faster cold starts. 22 | # 23 | # ### Download the weights 24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 25 | # 26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 27 | def download_model_to_folder(): 28 | from huggingface_hub import snapshot_download 29 | from transformers.utils import move_cache 30 | 31 | os.makedirs(MODEL_DIR, exist_ok=True) 32 | 33 | snapshot_download( 34 | BASE_MODEL, 35 | local_dir=MODEL_DIR, 36 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 37 | ) 38 | move_cache() 39 | 40 | 41 | # ### Image definition 42 | # We'll start from a recommended Docker Hub image and install `vLLM`. 43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 44 | # the model are saved within the container image. 
45 | image = ( 46 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 47 | .pip_install( 48 | "vllm==0.6.1.post2", 49 | "wheel==0.44.0", 50 | "packaging==24.1", 51 | "huggingface_hub==0.25.0", 52 | "hf-transfer==0.1.8", 53 | "torch==2.4.0", 54 | ) 55 | .apt_install("git") 56 | .run_commands( 57 | "pip install flash-attn==2.6.3 --no-build-isolation", 58 | ) 59 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 60 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 61 | .run_function( 62 | download_model_to_folder, 63 | secrets=[Secret.from_name("huggingface")], 64 | timeout=60 * 20, 65 | ) 66 | ) 67 | 68 | app = App("vllm-llama-3-70b", image=image) 69 | GPU_CONFIG = gpu.A100(memory=80, count=1) 70 | 71 | 72 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server 73 | @app.function( 74 | allow_concurrent_inputs=100, 75 | container_idle_timeout=15, 76 | gpu=GPU_CONFIG, 77 | secrets=[ 78 | Secret.from_name("huggingface"), 79 | Secret.from_dotenv(), 80 | ], 81 | ) 82 | @web_server(8000, startup_timeout=300) 83 | def openai_compatible_server(): 84 | target = BASE_MODEL 85 | cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000 --quantization awq" 86 | subprocess.Popen(cmd, shell=True) 87 | -------------------------------------------------------------------------------- /vllm_llama3_8b.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with vLLM (meta-llama/Meta-Llama-3-8B-Instruct) 2 | # 3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) 4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. 5 | 6 | import os 7 | import subprocess 8 | import secrets 9 | 10 | 11 | from modal import Image, Secret, App, enter, gpu, method, web_server 12 | 13 | MODEL_DIR = "/model" 14 | BASE_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct" 15 | 16 | # ## Define a container image 17 | 18 | 19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 21 | # advantage of Modal's internal filesystem for faster cold starts. 22 | # 23 | # ### Download the weights 24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 25 | # 26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 27 | def download_model_to_folder(): 28 | from huggingface_hub import snapshot_download 29 | from transformers.utils import move_cache 30 | 31 | os.makedirs(MODEL_DIR, exist_ok=True) 32 | 33 | snapshot_download( 34 | BASE_MODEL, 35 | local_dir=MODEL_DIR, 36 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 37 | ) 38 | move_cache() 39 | 40 | 41 | # ### Image definition 42 | # We'll start from a recommended Docker Hub image and install `vLLM`. 43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 44 | # the model are saved within the container image. 
45 | image = (
46 |     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
47 |     .pip_install(
48 |         "vllm==0.6.1.post2",
49 |         "wheel==0.44.0",
50 |         "packaging==24.1",
51 |         "huggingface_hub==0.25.0",
52 |         "hf-transfer==0.1.8",
53 |         "torch==2.4.0",
54 |     )
55 |     .apt_install("git")
56 |     .run_commands(
57 |         "pip install flash-attn==2.6.3 --no-build-isolation",
58 |     )
59 |     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
60 |     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
61 |     .run_function(
62 |         download_model_to_folder,
63 |         secrets=[Secret.from_name("huggingface")],
64 |         timeout=60 * 20,
65 |     )
66 | )
67 | 
68 | app = App("vllm-llama3-8b", image=image)
69 | GPU_CONFIG = gpu.A100(size="40GB", count=1)
70 | 
71 | 
72 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server
73 | @app.function(
74 |     allow_concurrent_inputs=100,
75 |     container_idle_timeout=15,
76 |     gpu=GPU_CONFIG,
77 |     secrets=[
78 |         Secret.from_name("huggingface"),
79 |         Secret.from_dotenv(),
80 |     ],
81 | )
82 | @web_server(8000, startup_timeout=300)
83 | def openai_compatible_server():
84 |     target = BASE_MODEL
85 |     cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000"
86 |     subprocess.Popen(cmd, shell=True)
87 | 
--------------------------------------------------------------------------------
/vllm_seallm_7b_v2_5.py:
--------------------------------------------------------------------------------
1 | # # Fast inference with vLLM (SeaLLMs/SeaLLM-7B-v2.5)
2 | #
3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching.
5 | 
6 | import os
7 | import subprocess
8 | import secrets
9 | 
10 | 
11 | from modal import Image, Secret, App, enter, gpu, method, web_server
12 | 
13 | MODEL_DIR = "/model"
14 | BASE_MODEL = "SeaLLMs/SeaLLM-7B-v2.5"
15 | 
16 | # ## Define a container image
17 | 
18 | 
19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take
21 | # advantage of Modal's internal filesystem for faster cold starts.
22 | #
23 | # ### Download the weights
24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`.
25 | #
26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
27 | def download_model_to_folder():
28 |     from huggingface_hub import snapshot_download
29 |     from transformers.utils import move_cache
30 | 
31 |     os.makedirs(MODEL_DIR, exist_ok=True)
32 | 
33 |     snapshot_download(
34 |         BASE_MODEL,
35 |         local_dir=MODEL_DIR,
36 |         ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
37 |     )
38 |     move_cache()
39 | 
40 | 
41 | # ### Image definition
42 | # We'll start from a recommended Docker Hub image and install `vLLM`.
43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of
44 | # the model are saved within the container image.
45 | image = ( 46 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 47 | .pip_install( 48 | "vllm==0.6.1.post2", 49 | "wheel==0.44.0", 50 | "packaging==24.1", 51 | "huggingface_hub==0.25.0", 52 | "hf-transfer==0.1.8", 53 | "torch==2.4.0", 54 | ) 55 | .apt_install("git") 56 | .run_commands( 57 | "pip install flash-attn==2.6.3 --no-build-isolation", 58 | ) 59 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 60 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 61 | .run_function( 62 | download_model_to_folder, 63 | secrets=[Secret.from_name("huggingface")], 64 | timeout=60 * 20, 65 | ) 66 | ) 67 | 68 | app = App("vllm-seallm-7b-v2.5", image=image) 69 | GPU_CONFIG = gpu.A100(size="40GB", count=1) 70 | 71 | 72 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server 73 | @app.function( 74 | allow_concurrent_inputs=100, 75 | container_idle_timeout=15, 76 | gpu=GPU_CONFIG, 77 | secrets=[ 78 | Secret.from_name("huggingface"), 79 | Secret.from_dotenv(), 80 | ], 81 | ) 82 | @web_server(8000, startup_timeout=300) 83 | def openai_compatible_server(): 84 | target = BASE_MODEL 85 | cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000" 86 | subprocess.Popen(cmd, shell=True) 87 | -------------------------------------------------------------------------------- /vllm_sqlcoder_7b_2.py: -------------------------------------------------------------------------------- 1 | # # Fast inference with vLLM (defog/sqlcoder-7b-2) 2 | # 3 | # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) 4 | # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. 5 | 6 | import os 7 | import subprocess 8 | import secrets 9 | 10 | 11 | from modal import Image, Secret, App, enter, gpu, method, web_server 12 | 13 | MODEL_DIR = "/model" 14 | BASE_MODEL = "defog/sqlcoder-7b-2" 15 | 16 | # ## Define a container image 17 | 18 | 19 | # We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this 20 | # is that the container no longer has to re-download the model from Huggingface - instead, it will take 21 | # advantage of Modal's internal filesystem for faster cold starts. 22 | # 23 | # ### Download the weights 24 | # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. 25 | # 26 | # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 27 | def download_model_to_folder(): 28 | from huggingface_hub import snapshot_download 29 | from transformers.utils import move_cache 30 | 31 | os.makedirs(MODEL_DIR, exist_ok=True) 32 | 33 | snapshot_download( 34 | BASE_MODEL, 35 | local_dir=MODEL_DIR, 36 | ignore_patterns=["*.pt", "*.bin"], # Using safetensors 37 | ) 38 | move_cache() 39 | 40 | 41 | # ### Image definition 42 | # We'll start from a recommended Docker Hub image and install `vLLM`. 43 | # Then we'll use `run_function` to run the function defined above to ensure the weights of 44 | # the model are saved within the container image. 
45 | image = ( 46 | Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") 47 | .pip_install( 48 | "vllm==0.6.1.post2", 49 | "wheel==0.44.0", 50 | "packaging==24.1", 51 | "huggingface_hub==0.25.0", 52 | "hf-transfer==0.1.8", 53 | "torch==2.4.0", 54 | ) 55 | .apt_install("git") 56 | .run_commands( 57 | "pip install flash-attn==2.6.3 --no-build-isolation", 58 | ) 59 | # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 60 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 61 | .run_function( 62 | download_model_to_folder, 63 | secrets=[Secret.from_name("huggingface")], 64 | timeout=60 * 20, 65 | ) 66 | ) 67 | 68 | app = App("vllm-defog-sqlcoder-7b-2", image=image) 69 | GPU_CONFIG = gpu.A100(size="40GB", count=1) 70 | 71 | 72 | # Run a web server on port 8000 and expose vLLM OpenAI compatible server 73 | @app.function( 74 | allow_concurrent_inputs=100, 75 | container_idle_timeout=15, 76 | gpu=GPU_CONFIG, 77 | secrets=[ 78 | Secret.from_name("huggingface"), 79 | Secret.from_dotenv(), 80 | ], 81 | ) 82 | @web_server(8000, startup_timeout=300) 83 | def openai_compatible_server(): 84 | target = BASE_MODEL 85 | cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000" 86 | subprocess.Popen(cmd, shell=True) 87 | --------------------------------------------------------------------------------
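
A note on calling the deployed servers from Python (an illustrative sketch, not a file in this repository): every `openai_compatible_server()` above exposes a standard OpenAI-compatible REST API, so any OpenAI SDK can query it once the client is pointed at the Modal web endpoint. `modal deploy` prints the exact URL; the base URL, workspace name, and model below are placeholder assumptions, and the API key is the `VLLM_API_KEY` value from `.env`.

```python
# Illustrative client for one of the OpenAI-compatible vLLM servers deployed above.
# The base_url is a placeholder -- use the endpoint URL that `modal deploy` prints for your workspace.
import os

from openai import OpenAI

client = OpenAI(
    base_url="https://your-workspace--vllm-llama3-8b-openai-compatible-server.modal.run/v1",
    api_key=os.environ["VLLM_API_KEY"],  # the key the server expects, taken from .env
)

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # must match the BASE_MODEL served by that deployment
    messages=[{"role": "user", "content": "Write a SQL query that counts orders per day."}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```

The Outlines server launched by `outlines_llama3_8b.py` is the one exception: it serves Outlines' own generation endpoint rather than the OpenAI schema, so it needs a different request format (see the Outlines serve documentation for the pinned 0.0.34 release).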