├── .gitignore ├── LICENSE ├── README.md ├── README_CN.md ├── client_configs.py ├── model_server.py └── serve_llm_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Chayenne
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ModelServer
2 |
3 | A ModelServer class based on the SGLang framework. It is fully self-built, and suggestions for further optimization are welcome. It currently uses SGLang v0.2.15.
4 |
5 | You can also refer to the [Chinese Readme](./README_CN.md).
6 |
7 | The ModelServer framework implements efficient, flexible, and highly fault-tolerant model service management. It can adapt to models of different scales and diverse task requirements, providing a reliable infrastructure for the deployment and application of large-scale language models.
8 |
9 | After configuration, you can get a completion or an embedding with commands like the following:
10 |
11 | ```python
12 | from model_server import ModelServer
13 | message = [
14 |     {
15 |         "role": "user",
16 |         "content": "Interact with a household to solve a task. Here are a few examples.\nYou are in the middle of a room. 
Looking quickly around you, you see a cabinet 4, a cabinet 3, a cabinet 2, a cabinet 1, a coffeemachine 1, a countertop 1, a diningtable 3, a diningtable 2, a diningtable 1, a drawer 1, a fridge 1, a garbagecan 1, a microwave 1, a sidetable 1, a sinkbasin 1, a stoveburner 4, a stoveburner 3, a stoveburner 2, a stoveburner 1, and a toaster 1.\nYour task is to: find some apple and put it in sidetable.\n> think: To solve the task, I need to find and take an apple, then put it in sidetable.\nOK.\n> think: First I need to find an apple. An apple is more likely to appear in fridges (1), diningtables (1-3), sidetables (1), countertops (1), sinkbasins (1), garbagecan (1). I can check one by one, starting with fridge 1.\nOK.\n> go to fridge 1\nThe fridge 1 is closed.\n> open fridge 1\nYou open the fridge 1. The fridge 1 is open. In it, you see a lettuce 2, a mug 2, a potato 2, and a tomato 1.\n> go to diningtable 1\nOn the diningtable 1, you see nothing.\n> go to diningtable 2\nOn the diningtable 2, you see a pineapple 1.\n> go to diningtable 3\nOn the diningtable 3, you see a bowl 2, a bowl 1, a bread 2, and a bread 1.\n> go to sidetable 1\nOn the sidetable 1, you see a cup 1, a lettuce 1, a peppershaker 3, a potato 1, and a saltshaker 1.\n> go to countertop 1\nOn the countertop 1, you see a butterknife 1, a dishsponge 1, and a soapbottle 1.\n> go to sinkbasin 1\nOn the sinkbasin 1, you see a cup 2, a egg 1, a mug 1, and a potato 3.\n> go to garbagecan 1\nOn the garbagecan 1, you see a apple 3, and a egg 3.\n> think: Now I find an apple (3). Next, I need to take it.\n> take apple 3 from garbagecan 1\nYou pick up the apple 3 from the garbagecan 1.\n> think: Now I find an apple (3). Next, I need to put it in/on sidetable 1.\nOK.\n> go to sidetable 1\nOn the sidetable 1, you see a cup 1, a lettuce 1, a peppershaker 3, a potato 1, and a saltshaker 1.\n> put apple 3 in/on sidetable 1\nYou put the apple 3 in/on the sidetable 1.\nYou are in the middle of a room. Looking quickly around you, you see a cabinet 4, a cabinet 3, a cabinet 2, a cabinet 1, a countertop 1, a garbagecan 1, a handtowelholder 2, a handtowelholder 1, a sinkbasin 2, a sinkbasin 1, a toilet 1, a toiletpaperhanger 1, and a towelholder 1.\nYour task is to: put some spraybottle on toilet.\n> think: To solve the task, I need to find and take a sparybottle, then put it on toilet.\nOK.\n> think: First I need to find a spraybottle. A spraybottle is more likely to appear in cabinet (1-4), countertop (1), toilet (1), sinkbasin (1-2), garbagecan (1). I can check one by one, starting with cabinet 1.\nOK.\n> go to cabinet 1\nOn the cabinet 1, you see a cloth 1, a soapbar 1, a soapbottle 1.\n> go to cabinet 2\nThe cabinet 2 is closed.\n> open cabinet 2\nYou open the cabinet 2. The cabinet 2 is open. In it, you see a candle 1, and a spraybottle 2.\n> think: Now I find a spraybottle (2). Next, I need to take it.\n> take spraybottle 2 from cabinet 2\nYou pick up the spraybottle 2 from the cabinet 2.\n> think: Now I take a spraybottle (2). Next, I need to put it in/on toilet 1.\nOK.\n> go to toilet 1\nOn the toilet 1, you see a soapbottle 2.\n> put spraybottle 2 in/on toilet 1\nYou put the spraybottle 2 in/on the toilet 1.\n\n\nHere is the task:\nYou are in the middle of a room. 
Looking quickly around you, you see a bed 1, a desk 2, a desk 1, a drawer 6, a drawer 5, a drawer 4, a drawer 3, a drawer 2, a drawer 1, a garbagecan 1, a laundryhamper 1, a safe 1, a shelf 6, a shelf 5, a shelf 4, a shelf 3, a shelf 2, and a shelf 1.\nYour task is to: put a mug in desk.\n>",
17 |     },
18 |     {
19 |         "role": "assistant",
20 |         "content": "think: To solve the task, I need to find and take a mug, then put it in/on desk.",
21 |     },
22 |     {"role": "user", "content": "OK."},
23 |     {
24 |         "role": "assistant",
25 |         "content": "think: First I need to find a mug. A mug is more likely to appear in desk (1-2), drawer (1-6), shelf (1-6), garbagecan (1), laundryhamper (1), safe (1). I can check one by one, starting with desk 1.",
26 |     },
27 |     {"role": "user", "content": "OK."},
28 |     {"role": "assistant", "content": "go to desk 1"},
29 |     {
30 |         "role": "user",
31 |         "content": "On the desk 1, you see a bowl 1, a cd 2, a creditcard 2, a desklamp 1, a keychain 1, a laptop 2, a pen 2, and a pencil 2.",
32 |     },
33 |     {
34 |         "role": "assistant",
35 |         "content": "think: Now I don't find a mug in desk 1. Next, I need to check desk 2.",
36 |     },
37 |     {"role": "user", "content": "OK."},
38 |     {"role": "assistant", "content": "go to desk 2"},
39 |     {
40 |         "role": "user",
41 |         "content": "On the desk 2, you see a alarmclock 1, a book 3, a bowl 2, a creditcard 3, and a keychain 3.",
42 |     },
43 | ]
44 |
45 | server = ModelServer()
46 | # `8` here is the completion model size configured in `client_configs.py`
47 | completion = server.get_completion_or_embedding("8", message)
48 | # get the completion from an 8B instruct model, i.e. Llama 3.1 8B
49 | print(completion)
50 |
51 | # `7` here is the embedding model size configured in `client_configs.py`
52 | embedding = server.get_completion_or_embedding(
53 |     "7",
54 |     message="As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
55 |     get_embedding=True,
56 | )
57 | # get the embedding from a 7B embedding model, i.e. `Alibaba-NLP/gte-Qwen1.5-7B-instruct`
58 | print(embedding[:10])
59 | ```
60 |
61 | ## Get Started
62 |
63 | ### Install SGLang
64 |
65 | Below are the dependencies this framework currently uses for SGLang; they will be updated later.
66 |
67 | ```bash
68 | pip install sglang==0.2.15
69 | pip install flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu121/torch2.3/
70 |
71 | # lower versions of vllm lead to errors about multimodal-config
72 | pip install vllm==0.5.5
73 |
74 | pip install triton==2.3.1
75 |
76 | # change the CUDA version according to your local device
77 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
78 | ```
79 | It is recommended to follow the versions specified above to avoid potential errors.
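To double-check that the pinned versions are what actually got installed, you can run a minimal sketch like the one below. The distribution names are assumed from the pip commands above (flashinfer is omitted because its wheel name may differ on your setup); this check is optional and not part of the repository.

```python
# Optional sanity check for the pinned dependency versions listed above.
# The distribution names are assumptions based on the pip commands in this README;
# adjust them if your wheels are named differently.
from importlib.metadata import PackageNotFoundError, version

EXPECTED = {
    "sglang": "0.2.15",
    "vllm": "0.5.5",
    "triton": "2.3.1",
    "torch": "2.4.0",
}

for package, wanted in EXPECTED.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (expected {wanted})")
        continue
    # Ignore local version tags such as "+cu121" when comparing.
    if installed.split("+")[0] == wanted:
        print(f"{package}: {installed} OK")
    else:
        print(f"{package}: {installed} (expected {wanted})")
```

If any package reports a different version, reinstall it with the exact pin before launching `serve_llm_pipeline.py`.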
80 |
81 | ### Modify `client_configs.py`
82 |
83 | Modify the IP address of the server and the model path in `client_configs.py`:
84 |
85 | ```python
86 | SERVER_IP = "[SECRET IP, REPLACE WITH YOURS]"
87 | MODEL_NAME_8B = "8bins"
88 | MODEL_NAME_70B = "70bins"
89 | EMBEDDING_7B = "7embed"
90 | ```
91 |
92 | ### Run the Server Engine
93 |
94 | ```bash
95 | python serve_llm_pipeline.py
96 | ```
97 |
98 | ### Test the Server Latency
99 |
100 | ```bash
101 | python client_configs.py
102 | ```
103 |
104 | ### Test the ModelServer
105 | ```bash
106 | python model_server.py
107 | ```
108 |
109 | ## Code Structure
110 |
111 | ### `client_configs.py`
112 |
113 | #### Constants
114 |
115 | - **Server Configuration**: Configurations for all servers, hosting models of different sizes (e.g., `8B` and `70B`) as well as embedding models. Each server is represented by a `Server` named tuple, containing attributes such as `ip` (IP address), `port` (port number), `model_size` (model size), `model_path` (model path), and `gpus` (GPU configuration).
116 | - **BENCHMAK_MESSAGE**: A benchmark message used to test the performance of different servers.
117 | - **Completion_Servers**: List of server configurations for dialogue models.
118 | - **Embedding_Servers**: List of server configurations for embedding models (newly added).
119 |
120 | #### Functions
121 |
122 | - `get_fastest_server`: Tests the latency of each server and returns the fastest server along with its latency. Servers with latency higher than the current lowest latency are skipped, which matters most when server latencies are highly uneven. (Support for embedding model servers has been added.)
123 | - `get_all_latency`: Checks and prints the latency of all configured servers, including both completion and embedding model servers.
124 | - `get_running_server_sizes`: Returns a list of model sizes currently running on servers.
125 |
126 | ### `serve_llm_pipeline.py`
127 |
128 | #### Functions
129 |
130 | - `get_eno1_inet_address`: Retrieves the IP address associated with the `eno1` network interface.
131 | - `is_gpu_free`: Checks if the specified GPU is free (memory usage below a certain threshold).
132 | - `get_gpu_memory_info`: Gets total and available memory information for specified GPUs.
133 | - `get_free_memory_ratio`: Calculates the ratio of available memory to total memory for GPUs.
134 | - `get_comond_infos`: Dynamically constructs commands to start servers, generating appropriate startup parameters based on server configurations.
135 | - `main`: Main function that manages GPU availability and starts model servers. Uses `ThreadPoolExecutor` to concurrently manage multiple servers.
136 |
137 | #### Features
138 |
139 | - **Dynamic Resource Management:** Dynamically checks and manages GPU resources, ensuring servers are only started when resources are sufficient.
140 | - **Server Initialization:** Automatically starts servers, ensuring correct configuration and resource allocation.
141 | - **Concurrency:** Uses `ThreadPoolExecutor` to concurrently manage multiple servers, maximizing resource utilization.
142 | - **GPU Memory Management:** Real-time monitoring of GPU memory usage, dynamically adjusting server startup strategies.
143 |
144 | ### `model_server.py`
145 |
146 | #### Constants
147 |
148 | - `LATENCY_GROWING_RATE`: Latency growth rate, used for dynamically adjusting latency thresholds.
149 | - `MAX_RETRY`: Maximum number of retry attempts, improving system fault tolerance.
150 | - `INF`: Represents infinity, used for initializing latency comparisons.
151 |
152 | #### `ModelServer` Class
153 |
154 | Manages the creation of and interaction with the different model servers (both completion and embedding models), with automated restart and fault recovery.
155 |
156 | ##### Latency Management and Automatic Restart
157 |
158 | - **Latency Monitoring:** Real-time monitoring of server response time in the `get_completion_or_embedding` method.
159 | - **Dynamic Threshold Adjustment:** Uses `LATENCY_GROWING_RATE` to dynamically adjust acceptable latency thresholds.
160 | - **Automatic Restart:** Triggers the `_manage_model_server` method to rebuild server connections when response times become too long.
161 |
162 | ##### Fault Tolerance Mechanism
163 |
164 | - **Multiple Attempts:** Uses the `MAX_RETRY` mechanism to attempt requests multiple times in case of errors.
165 | - **Error Handling:** Captures and logs exceptions, attempting to rebuild server connections.
166 | - **Graceful Degradation:** Gracefully shuts down the service through the `turn_off_running_flag` method when all attempts fail.
167 |
168 | ##### Server Construction and Rebuilding Logic
169 |
170 | - **Initial Construction:** Selects the fastest server based on current configurations.
171 | - **Dynamic Rebuilding:** Automatically selects a new, faster server when server performance degrades.
172 | - **Resource Optimization:** Balances service quality and resource utilization through reasonable rebuilding strategies.
173 |
174 | ##### New Features
175 |
176 | - **Embedding Model Support:** Added support for embedding models, capable of handling text embedding requests.
177 | - **Configuration File Support:** Introduced configuration file management for server running states, increasing flexibility.
178 | - **Separate Completion and Embedding Methods:** The `get_completion_or_embedding` method handles completion and embedding tasks separately based on request type.
179 |
180 | ## Usage Suggestions
181 |
182 | 1. Set `LATENCY_GROWING_RATE` and `MAX_RETRY` reasonably to balance system response speed and stability.
183 | 2. Monitor GPU resource usage and adjust server configurations as needed.
184 | 3. Regularly check and update `BENCHMAK_MESSAGE` to ensure it effectively tests server performance.
185 | 4. Consider adding more servers or optimizing existing server configurations under high load conditions.
186 | 5. Utilize embedding model functionality for text analysis and similarity calculation tasks.
187 |
188 |
189 | ## Troubleshooting
190 |
191 | 1. If you encounter an error that `eno1` is not found, you can remove `get_eno1_inet_address` in `serve_llm_pipeline.py` and set the IP address manually. The IP address is used to differentiate clusters when you run engines on multiple clusters with different IPs; if you only have one cluster/node, simply set its IP address by hand.
192 |
193 | 2. If you encounter the error:
194 |
195 | ```
196 | RuntimeError: Tried to instantiate class '_core_C.ScalarType', but it does not exist! Ensure that it is registered via torch::class_::declare("torch._C.ScalarType");
197 | ```
198 |
199 | You can solve it by installing the correct version of torch:
200 |
201 | ```bash
202 | pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
203 | ```
204 |
205 | 3. 
If you encounter the error: 206 | 207 | ``` 208 | ImportError: /usr/lib/x86_64-linux-gnu/libc.so.6: version 'GLIBC_2.34' not found (required by /xxx/.triton/cache/41ce1f58e0a8aa9865e66b90d58b3307bb64c5a006830e49543444faf56202fc/cuda_utils.so) 209 | ``` 210 | 211 | You can solve it by deleting the cache: 212 | 213 | ```bash 214 | rm -rf /xxx/.triton/cache/* 215 | ``` -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | # ModelServer 2 | 3 | 基于 SGLang 框架的 ModelServer 类。完全自建,还请提出更多优化方式。采用的 SGLang 版本为 v0.2.15。 4 | 5 | ModelServer 框架实现了高效、灵活且具有强大容错能力的模型服务管理。它能够适应不同规模的模型和多样的任务需求,为大规模语言模型的部署和应用提供了可靠的基础设施。 6 | 7 | ## 快速使用 8 | 9 | ### 安装 SGLang 10 | 11 | 参考我当前的配置,安装 SGLang 和依赖项。 12 | 13 | ```bash 14 | pip install sglang==0.2.15 15 | pip install flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu121/torch2.3/ 16 | 17 | # 较低版本的vllm可能导致关于multimodal-config的错误 18 | pip install vllm==0.5.5 19 | 20 | pip install triton==2.3.1 21 | 22 | # 根据您的本地设备更改cuda版本 23 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 24 | ``` 25 | 26 | ### 修改 `client_config.py` 27 | 28 | 在 `client_config.py` 中配置你本地的服务器的 IP 地址和模型路径等等: 29 | 30 | ```python 31 | SERVER_IP = "[SECRET IP, REPLACE WITH YOURS]" 32 | MODEL_NAME_8B = "8bins" 33 | MODEL_NAME_70B = "70bins" 34 | EMBEDDING_7B = "7embed" 35 | ``` 36 | 37 | ### 启动 Model Engine 38 | 39 | ```bash 40 | python serve_llm_pipeline.py 41 | ``` 42 | 43 | ### 测试服务器延迟 44 | 45 | ```python 46 | python client_config.py 47 | ``` 48 | 49 | ### 测试 ModelServer 50 | 51 | ```bash 52 | python model_server.py 53 | ``` 54 | 55 | ## 代码结构 56 | 57 | ### `client_configs.py` 58 | 59 | #### 常量 60 | 61 | - **Server 配置**:全部服务器配置,托管了不同大小的模型(如 `8B` 和 `70B`)以及嵌入模型。每个服务器通过一个 `Server` 命名元组来表示,包含属性如 `ip`(IP 地址)、`port`(端口)、`model_size`(模型大小)、`model_path`(模型路径)和 `gpus`(GPU 配置)。 62 | - **BENCHMAK_MESSAGE**: 定义了一个基准消息(`BENCHMAK_MESSAGE`),用于测试不同服务器的性能。 63 | - **Completion_Servers**:对话模型的服务器配置列表。 64 | - **Embedding_Servers**:嵌入模型的服务器配置列表(新增)。 65 | 66 | #### 函数 67 | 68 | - `get_fastest_server`: 测试各个服务器的延迟,并返回最快的服务器及其延迟。延迟长于当前最低 latency 的服务器将被跳过,这在服务器 latency 非常不均衡时有显著意义。(新增了对嵌入模型服务器的支持) 69 | - `get_all_latency`: 检查并打印所有配置服务器的延迟,包括完成模型和嵌入模型服务器。 70 | - `get_running_server_sizes`: 返回当前运行中的服务器模型大小列表。 71 | 72 | ### `serve_llm_pipeline.py` 73 | 74 | #### 函数 75 | 76 | - `get_eno1_inet_address`: 获取网络接口 `eno1` 相关的 IP 地址。 77 | - `is_gpu_free`: 检查指定的 GPU 是否空闲(内存使用量低于某个阈值)。 78 | - `get_gpu_memory_info`: 获取指定 GPU 的总内存和可用内存信息。 79 | - `get_free_memory_ratio`: 计算 GPU 可用内存与总内存的比例。 80 | - `get_comond_infos`: 动态构建用于启动服务器的命令,根据服务器的配置生成适当的启动参数。 81 | - `main`: 主函数,管理 GPU 的可用性并启动模型服务器。使用 `ThreadPoolExecutor` 实现并发管理多个服务器。 82 | 83 | #### features 84 | 85 | - **动态资源管理:** 动态检查和管理 GPU 资源,确保只有在资源充足时才启动服务器。 86 | - **服务器初始化:** 自动化地启动服务器,确保使用正确的配置和资源分配。 87 | - **并发性:** 使用 `ThreadPoolExecutor` 并发地管理多个服务器,充分利用资源。 88 | - **GPU 内存管理:** 实时监控 GPU 内存使用情况,动态调整服务器启动策略。 89 | 90 | ### `model_server.py` 91 | 92 | #### 常量 93 | 94 | - `LATENCY_GROWING_RATE`:延迟增长率,用于动态调整延迟阈值。 95 | - `MAX_RETRY`:最大重试次数,提高系统容错能力。 96 | - `INF`:表示无穷大,用于初始化延迟比较。 97 | 98 | #### `ModelServer` 类 99 | 100 | 采用自动化重启和容错恢复的方式管理不同模型服务器(包括完成模型和嵌入模型)的创建和交互。 101 | 102 | ##### 延迟管理和自动重启 103 | 104 | - **延迟监测:** 在 `get_completion_or_embedding` 方法中实时监测服务器响应时间。 105 | - **动态阈值调整:** 使用 `LATENCY_GROWING_RATE` 动态调整可接受的延迟阈值。 106 | - **自动重启:** 当检测到响应时间过长时,触发 `_manage_model_server` 方法重建服务器连接。 107 | 108 | ##### 容错机制 109 | 110 
| - **多次尝试:** 使用 `MAX_RETRY` 机制,在发生错误时多次尝试请求。 111 | - **错误处理:** 捕获并记录异常,尝试重建服务器连接。 112 | - **优雅降级:** 当所有尝试失败时,通过 `turn_off_running_flag` 方法优雅地关闭服务。 113 | 114 | ##### 服务器构建与重建逻辑 115 | 116 | - **初始化构建:** 根据当前配置选择最快的服务器。 117 | - **动态重建:** 当服务器性能下降时,自动选择新的更快服务器。 118 | - **资源优化:** 通过合理的重建策略,平衡服务质量和资源利用。 119 | 120 | ##### 新特性 121 | 122 | - **嵌入模型支持:** 新增对嵌入模型的支持,可以处理文本嵌入请求。 123 | - **配置文件支持:** 引入配置文件管理服务器运行状态,提高灵活性。 124 | - **分离的完成和嵌入方法:** `get_completion_or_embedding` 方法根据请求类型分别处理完成和嵌入任务。 125 | 126 | ## 使用建议 127 | 128 | 1. 合理设置 `LATENCY_GROWING_RATE` 和 `MAX_RETRY`,以平衡系统响应速度和稳定性。 129 | 2. 监控 GPU 资源使用情况,适时调整服务器配置。 130 | 3. 定期检查和更新 `BENCHMAK_MESSAGE`,确保其能够有效测试服务器性能。 131 | 4. 在高负载情况下,考虑增加更多的服务器或优化现有服务器配置。 132 | 5. 利用嵌入模型功能进行文本分析和相似度计算任务。 133 | 134 | ## Trouble Shooting 135 | 136 | 1. 如果遇到`eno1`未找到的错误,可以直接在`serve_llm_pipeline.py`中移除`get_eno1_inet_address`并手动设置IP地址。(为了在具有不同 IP 的多个集群上运行 Model Engine 时,我采用 IP 地址来区分集群。如果不需要在多个集群上运行,就不用区分。) 137 | 2. 如果遇到以下错误: 138 | 139 | ```bash 140 | RuntimeError: Tried to instantiate class '_core_C.ScalarType', but it does not exist! Ensure that it is registered via torch::class_::declare("torch._C.ScalarType"); 141 | ``` 142 | 143 | 可以通过安装正确版本的torch来解决: 144 | 145 | ```bash 146 | pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 147 | ``` 148 | 149 | 1. 如果遇到以下错误: 150 | 151 | ```bash 152 | ImportError: /usr/lib/x86_64-linux-gnu/libc.so.6: version 'GLIBC_2.34' not found (required by /xxx/.triton/cache/41ce1f58e0a8aa9865e66b90d58b3307bb64c5a006830e49543444faf56202fc/cuda_utils.so) 153 | ``` 154 | 155 | 可以通过删除缓存来解决: 156 | 157 | ```bash 158 | rm -rf /xxx/.triton/cache/* 159 | ``` 160 | -------------------------------------------------------------------------------- /client_configs.py: -------------------------------------------------------------------------------- 1 | """ 2 | client_configs.py 3 | """ 4 | 5 | from collections import namedtuple 6 | import openai 7 | from typing import List, Optional 8 | from IPython import embed 9 | import time 10 | import multiprocessing 11 | from typing import List, Optional 12 | 13 | SERVER_IP = "[SECRET IP, REPLACE WITH YOURS]" 14 | MODEL_NAME_8B = "8bins" 15 | MODEL_NAME_70B = "70bins" 16 | EMBEDDING_7B = "7embed" 17 | INF = 100 18 | 19 | Server = namedtuple("Server", ["ip", "port", "model_size", "model_path", "gpus"]) 20 | BENCHMAK_MESSAGE = [ 21 | { 22 | "role": "user", 23 | "content": "Interact with a household to solve a task. Here are a few examples.\nYou are in the middle of a room. Looking quickly around you, you see a cabinet 4, a cabinet 3, a cabinet 2, a cabinet 1, a coffeemachine 1, a countertop 1, a diningtable 3, a diningtable 2, a diningtable 1, a drawer 1, a fridge 1, a garbagecan 1, a microwave 1, a sidetable 1, a sinkbasin 1, a stoveburner 4, a stoveburner 3, a stoveburner 2, a stoveburner 1, and a toaster 1.\nYour task is to: find some apple and put it in sidetable.\n> think: To solve the task, I need to find and take an apple, then put it in sidetable.\nOK.\n> think: First I need to find an apple. An apple is more likely to appear in fridges (1), diningtables (1-3), sidetables (1), countertops (1), sinkbasins (1), garbagecan (1). I can check one by one, starting with fridge 1.\nOK.\n> go to fridge 1\nThe fridge 1 is closed.\n> open fridge 1\nYou open the fridge 1. The fridge 1 is open. 
In it, you see a lettuce 2, a mug 2, a potato 2, and a tomato 1.\n> go to diningtable 1\nOn the diningtable 1, you see nothing.\n> go to diningtable 2\nOn the diningtable 2, you see a pineapple 1.\n> go to diningtable 3\nOn the diningtable 3, you see a bowl 2, a bowl 1, a bread 2, and a bread 1.\n> go to sidetable 1\nOn the sidetable 1, you see a cup 1, a lettuce 1, a peppershaker 3, a potato 1, and a saltshaker 1.\n> go to countertop 1\nOn the countertop 1, you see a butterknife 1, a dishsponge 1, and a soapbottle 1.\n> go to sinkbasin 1\nOn the sinkbasin 1, you see a cup 2, a egg 1, a mug 1, and a potato 3.\n> go to garbagecan 1\nOn the garbagecan 1, you see a apple 3, and a egg 3.\n> think: Now I find an apple (3). Next, I need to take it.\n> take apple 3 from garbagecan 1\nYou pick up the apple 3 from the garbagecan 1.\n> think: Now I find an apple (3). Next, I need to put it in/on sidetable 1.\nOK.\n> go to sidetable 1\nOn the sidetable 1, you see a cup 1, a lettuce 1, a peppershaker 3, a potato 1, and a saltshaker 1.\n> put apple 3 in/on sidetable 1\nYou put the apple 3 in/on the sidetable 1.\nYou are in the middle of a room. Looking quickly around you, you see a cabinet 4, a cabinet 3, a cabinet 2, a cabinet 1, a countertop 1, a garbagecan 1, a handtowelholder 2, a handtowelholder 1, a sinkbasin 2, a sinkbasin 1, a toilet 1, a toiletpaperhanger 1, and a towelholder 1.\nYour task is to: put some spraybottle on toilet.\n> think: To solve the task, I need to find and take a sparybottle, then put it on toilet.\nOK.\n> think: First I need to find a spraybottle. A spraybottle is more likely to appear in cabinet (1-4), countertop (1), toilet (1), sinkbasin (1-2), garbagecan (1). I can check one by one, starting with cabinet 1.\nOK.\n> go to cabinet 1\nOn the cabinet 1, you see a cloth 1, a soapbar 1, a soapbottle 1.\n> go to cabinet 2\nThe cabinet 2 is closed.\n> open cabinet 2\nYou open the cabinet 2. The cabinet 2 is open. In it, you see a candle 1, and a spraybottle 2.\n> think: Now I find a spraybottle (2). Next, I need to take it.\n> take spraybottle 2 from cabinet 2\nYou pick up the spraybottle 2 from the cabinet 2.\n> think: Now I take a spraybottle (2). Next, I need to put it in/on toilet 1.\nOK.\n> go to toilet 1\nOn the toilet 1, you see a soapbottle 2.\n> put spraybottle 2 in/on toilet 1\nYou put the spraybottle 2 in/on the toilet 1.\n\n\nHere is the task:\nYou are in the middle of a room. Looking quickly around you, you see a bed 1, a desk 2, a desk 1, a drawer 6, a drawer 5, a drawer 4, a drawer 3, a drawer 2, a drawer 1, a garbagecan 1, a laundryhamper 1, a safe 1, a shelf 6, a shelf 5, a shelf 4, a shelf 3, a shelf 2, and a shelf 1.\nYour task is to: put a mug in desk.\n>", 24 | }, 25 | { 26 | "role": "assistant", 27 | "content": "think: To solve the task, I need to find and take a mug, then put it in/on desk.", 28 | }, 29 | {"role": "user", "content": "OK."}, 30 | { 31 | "role": "assistant", 32 | "content": "think: First I need to find a mug. A mug is more likely to appear in desk (1-2), drawer (1-6), shelf (1-6), garbagecan (1), laundryhamper (1), safe (1). I can check one by one, starting with desk 1.", 33 | }, 34 | {"role": "user", "content": "OK."}, 35 | {"role": "assistant", "content": "go to desk 1"}, 36 | { 37 | "role": "user", 38 | "content": "On the desk 1, you see a bowl 1, a cd 2, a creditcard 2, a desklamp 1, a keychain 1, a laptop 2, a pen 2, and a pencil 2.", 39 | }, 40 | { 41 | "role": "assistant", 42 | "content": "think: Now I don't find a mug in desk 1. 
Next, I need to check desk 2.", 43 | }, 44 | {"role": "user", "content": "OK."}, 45 | {"role": "assistant", "content": "go to desk 2"}, 46 | { 47 | "role": "user", 48 | "content": "On the desk 2, you see a alarmclock 1, a book 3, a bowl 2, a creditcard 3, and a keychain 3.", 49 | }, 50 | ] 51 | 52 | 53 | Completion_Servers = [ 54 | Server( 55 | ip=SERVER_IP, 56 | port=8056, 57 | model_size="8", 58 | model_path=MODEL_NAME_8B, 59 | gpus=[1], 60 | ), 61 | Server( 62 | ip=SERVER_IP, 63 | port=8064, 64 | model_size="8", 65 | model_path=MODEL_NAME_8B, 66 | gpus=[2], 67 | ), 68 | Server( 69 | ip=SERVER_IP, 70 | port=8072, 71 | model_size="8", 72 | model_path=MODEL_NAME_8B, 73 | gpus=[3], 74 | ), 75 | Server( 76 | ip=SERVER_IP, 77 | port=8080, 78 | model_size="8", 79 | model_path=MODEL_NAME_8B, 80 | gpus=[4], 81 | ), 82 | Server( 83 | ip=SERVER_IP, 84 | port=8088, 85 | model_size="8", 86 | model_path=MODEL_NAME_8B, 87 | gpus=[5], 88 | ), 89 | Server( 90 | ip=SERVER_IP, 91 | port=8096, 92 | model_size="8", 93 | model_path=MODEL_NAME_8B, 94 | gpus=[6], 95 | ), 96 | Server( 97 | ip=SERVER_IP, 98 | port=8104, 99 | model_size="8", 100 | model_path=MODEL_NAME_8B, 101 | gpus=[7], 102 | ), 103 | #! 以下是 70B model 的 config,请不要同时开启 104 | # Server( 105 | # ip=SERVER_IP, 106 | # port=8400, 107 | # model_size="70", 108 | # model_path=MODEL_NAME_70B, 109 | # gpus=[0, 1, 2, 3], 110 | # ), 111 | ] 112 | 113 | Embedding_Servers = [ 114 | Server( 115 | ip=SERVER_IP, 116 | port=7777, 117 | model_size="7", 118 | model_path=EMBEDDING_7B, 119 | gpus=[0], 120 | ), 121 | ] 122 | 123 | 124 | def get_fastest_server( 125 | initial_latency=10, model_size="8", test_embedding_servers: bool = False 126 | ): 127 | SERVERS = Embedding_Servers if test_embedding_servers else Completion_Servers 128 | min_latency = initial_latency 129 | fastest_server = None 130 | 131 | def test_server(server: Server): 132 | def get_completion_or_embedding( 133 | client, 134 | message: List, 135 | temperature: float = 0.0, 136 | max_tokens: int = 256, 137 | model_name: Optional[str] = None, 138 | ) -> str: 139 | def target(queue): 140 | try: 141 | if not test_embedding_servers: 142 | completion = client.chat.completions.create( 143 | model=model_name, 144 | messages=message, 145 | max_tokens=max_tokens, 146 | temperature=temperature, 147 | stop=["<|eot_id|>"], 148 | ) 149 | queue.put(completion) 150 | else: 151 | embedding = client.embeddings.create( 152 | input=message[0]["content"], model=model_name 153 | ) 154 | queue.put(embedding) 155 | 156 | except Exception as e: 157 | queue.put(e) 158 | 159 | start_time = time.time() 160 | queue = multiprocessing.Queue() 161 | process = multiprocessing.Process(target=target, args=(queue,)) 162 | process.start() 163 | 164 | process.join(timeout=min_latency) # 等待进程完成或超时 165 | 166 | if process.is_alive(): 167 | print( 168 | f"Timeout: server {server.ip}:{server.port} took longer than {min_latency:.3f} seconds." 
169 | ) 170 | process.terminate() # 终止进程 171 | process.join() # 确保进程已终止 172 | return None, INF 173 | else: 174 | result = queue.get() 175 | if isinstance(result, Exception): 176 | raise result 177 | latency = time.time() - start_time 178 | print(f"Connection Time: {latency:.3f} s") 179 | if not test_embedding_servers: 180 | return str(result.choices[0].message.content), latency 181 | else: 182 | return (list(result.data[0].embedding), latency) 183 | 184 | client = openai.OpenAI( 185 | base_url=(f"http://{server.ip}:{server.port}/v1"), 186 | api_key=("sk-1dwqsdv4r3wef3rvefg34ef1dwRv"), 187 | ) 188 | 189 | try: 190 | response, latency = get_completion_or_embedding( 191 | client, 192 | BENCHMAK_MESSAGE, 193 | 0.0, 194 | 256, 195 | server.model_path, 196 | ) 197 | print( 198 | f"Get response: {response}" 199 | if not test_embedding_servers 200 | else f"Get embedding: {response[:10]}" 201 | ) 202 | if response is not None and len(response) > 0: 203 | print( 204 | f""" 205 | ============================================================ 206 | Cluster: {server.ip} 207 | Port: {server.port} 208 | Model: {server.model_path} 209 | Size: {server.model_size} 210 | GPUs: {server.gpus} 211 | Latency: {latency:.3f} s 212 | ============================================================ 213 | """ 214 | ) 215 | return True, latency 216 | else: 217 | return False, INF 218 | 219 | except Exception as e: 220 | print( 221 | f"Could not connect to server {server.ip}:{server.port} due to error: {e}" 222 | ) 223 | return False, INF 224 | 225 | for server in SERVERS: 226 | if server.model_size == model_size: 227 | print( 228 | f"Testing Completion" 229 | if not test_embedding_servers 230 | else "Testing Embedding" 231 | ) 232 | status, latency = test_server(server) 233 | if status and (latency < min_latency): 234 | min_latency = latency 235 | fastest_server = server 236 | 237 | if fastest_server: 238 | print( 239 | f"Fastest server is {fastest_server.ip}:{fastest_server.port} with latency {min_latency:.3f} s" 240 | ) 241 | return fastest_server, min_latency 242 | else: 243 | print("No servers responded in a timely manner.") 244 | return None, INF 245 | 246 | 247 | def get_all_latency(test_embedding_servers: bool = False): 248 | SERVERS = Embedding_Servers if test_embedding_servers else Completion_Servers 249 | 250 | def test_server(server: Server): 251 | client = openai.OpenAI( 252 | base_url=(f"http://{server.ip}:{server.port}/v1"), 253 | api_key=("sk-1dwqsdv4r3wef3rvefg34ef1dwRv"), 254 | ) 255 | 256 | try: 257 | start_time = time.time() 258 | if not test_embedding_servers: 259 | completion = client.chat.completions.create( 260 | model=server.model_path, 261 | messages=BENCHMAK_MESSAGE, 262 | max_tokens=256, 263 | temperature=0.9, 264 | stop=["<|eot_id|>", "\nObservation", "Observation"], 265 | ) 266 | response = str(completion.choices[0].message.content) 267 | else: 268 | embedding = client.embeddings.create( 269 | input=BENCHMAK_MESSAGE[0]["content"], model=server.model_path 270 | ) 271 | client.embeddings.create(input="how are you", model=server.model_path) 272 | response = str(embedding.data[0].embedding[:10]) 273 | duration = time.time() - start_time 274 | print( 275 | f""" 276 | ============================================================ 277 | TESTING SERVER 278 | Cluster: {server.ip} 279 | Port: {server.port} 280 | Model: {server.model_path} 281 | Size: {server.model_size} 282 | GPUs: {server.gpus} 283 | ============================================================ 284 | """ 285 | ) 286 | print(f"Connection 
Time: {duration:.3f}s for {server.ip}:{server.port}") 287 | print( 288 | f"Get response: {response}" 289 | if not test_embedding_servers 290 | else f"GOT EMBEDDING: {response}" 291 | ) 292 | return True 293 | except Exception as e: 294 | print( 295 | f""" 296 | ============================================================ 297 | TESTING SERVER 298 | Cluster: {server.ip} 299 | Port: {server.port} 300 | Model: {server.model_path} 301 | Size: {server.model_size} 302 | GPUs: {server.gpus} 303 | ============================================================ 304 | """ 305 | ) 306 | print( 307 | f"Could not connect to server {server.ip}:{server.port} due to error: {e}" 308 | ) 309 | return False 310 | 311 | for server in SERVERS: 312 | test_server(server) 313 | 314 | 315 | def get_running_server_sizes(SERVERS=Completion_Servers + Embedding_Servers): 316 | server_sizes = [server.model_size for server in SERVERS] 317 | return server_sizes 318 | 319 | 320 | if __name__ == "__main__": 321 | server, min_latency = get_fastest_server( 322 | initial_latency=10, model_size="8", test_embedding_servers=False 323 | ) 324 | print(server) 325 | server, min_latency = get_fastest_server( 326 | initial_latency=10, model_size="7", test_embedding_servers=True 327 | ) 328 | print(server) 329 | get_all_latency(test_embedding_servers=True) 330 | get_all_latency(test_embedding_servers=False) 331 | -------------------------------------------------------------------------------- /model_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | model_server.py 3 | """ 4 | 5 | import os 6 | 7 | os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" 8 | os.environ["PYTHONUTF8"] = "1" 9 | import time 10 | from typing import Dict, List 11 | import openai 12 | import json 13 | from client_configs import ( 14 | get_fastest_server, 15 | get_running_server_sizes, 16 | MODEL_NAME_70B, 17 | MODEL_NAME_8B, 18 | EMBEDDING_7B, 19 | EMBEDDING_2B, 20 | BENCHMAK_MESSAGE, 21 | ) 22 | 23 | LATENCY_GROWING_RATE = 20 24 | MAX_RETRY = 20 25 | INF = 200 26 | 27 | 28 | class ModelServer: 29 | def __init__(self, config_path: str = None) -> None: 30 | running_server_sizes = get_running_server_sizes() 31 | ( 32 | self.completion_client_70b, 33 | self.completion_client_8b, 34 | self.embedding_client_7b, 35 | self.embedding_client_2b, 36 | ) = (None, None, None, None) 37 | self.latency_70b, self.latency_8b, self.latency_7b, self.latency_2b = ( 38 | INF, 39 | INF, 40 | INF, 41 | INF, 42 | ) 43 | self.config_path = config_path 44 | # Turn the running flag in config path when the server failed to get response 45 | if "70" in running_server_sizes: 46 | self._manage_model_server(latency_bound=3, model_size="70") 47 | if "8" in running_server_sizes: 48 | self._manage_model_server(latency_bound=3, model_size="8") 49 | if "7" in running_server_sizes: 50 | self._manage_model_server( 51 | latency_bound=3, model_size="7", get_embedding=True 52 | ) 53 | if "2" in running_server_sizes: 54 | self._manage_model_server( 55 | latency_bound=3, model_size="2", get_embedding=True 56 | ) 57 | 58 | def turn_off_running_flag(self) -> None: 59 | with open(self.config_path, "r", encoding="utf-8") as rf: 60 | info_dict = json.load(rf) 61 | info_dict["is_running"] = False 62 | with open(self.config_path, "w", encoding="utf-8") as wf: 63 | json.dump(info_dict, wf, indent=4) 64 | 65 | def _manage_model_server( 66 | self, latency_bound, model_size: str, get_embedding: bool = False 67 | ) -> None: 68 | build_latency = latency_bound 69 | 
build_count = 0 70 | status = False 71 | while not status: 72 | server, latency_bound = get_fastest_server( 73 | initial_latency=build_latency, 74 | model_size=model_size, 75 | test_embedding_servers=get_embedding, 76 | ) 77 | # latency_bound+=10 78 | if server is not None: 79 | client = openai.OpenAI( 80 | base_url=(f"http://{server.ip}:{server.port}/v1"), 81 | api_key=("sk-1dwqsdv4r3wef3rvefg34ef1dwRv"), 82 | ) 83 | if model_size == "70" and not get_embedding: 84 | self.completion_client_70b, self.latency_70b = client, latency_bound 85 | elif model_size == "8" and not get_embedding: 86 | self.completion_client_8b, self.latency_8b = client, latency_bound 87 | elif model_size == "7" and get_embedding: 88 | self.embedding_client_7b, self.latency_7b = client, latency_bound 89 | elif model_size == "2" and get_embedding: 90 | self.embedding_client_2b, self.latency_2b = client, latency_bound 91 | else: 92 | raise NotImplementedError 93 | print( 94 | f"Model server {model_size}B built with latency_bound {latency_bound}." 95 | ) 96 | status = True 97 | else: 98 | build_latency *= LATENCY_GROWING_RATE 99 | build_count += 1 100 | print( 101 | f"Attempt {build_count} to build model server {model_size}B failed." 102 | ) 103 | if build_count > MAX_RETRY: 104 | assert self.config_path is not None, "Config path is required." 105 | self.turn_off_running_flag() 106 | raise RuntimeError( 107 | f"Could not build model server after {MAX_RETRY} attempts." 108 | ) 109 | 110 | def get_completion_or_embedding( 111 | self, 112 | model_size: str, 113 | message, 114 | temperature: float = 0.0, 115 | max_tokens: int = 256, 116 | get_embedding: bool = False, 117 | ) -> str: 118 | # print(f"Message: {message}") 119 | assert model_size in ["70", "8", "7", "2"] 120 | if not get_embedding: 121 | model_name = MODEL_NAME_70B if model_size == "70" else MODEL_NAME_8B 122 | else: 123 | model_name = EMBEDDING_7B if model_size == "7" else EMBEDDING_2B 124 | 125 | for attempt in range(MAX_RETRY): 126 | try: 127 | assert ( 128 | (self.completion_client_70b is not None and model_size == "70") 129 | or (self.completion_client_8b is not None and model_size == "8") 130 | or (self.embedding_client_7b is not None and model_size == "7") 131 | or (self.embedding_client_2b is not None and model_size == "2") 132 | ), "Model server not initialized." 133 | 134 | if not get_embedding: 135 | client = ( 136 | self.completion_client_70b 137 | if model_size == "70" 138 | else self.completion_client_8b 139 | ) 140 | latency_bound = ( 141 | self.latency_70b if model_size == "70" else self.latency_8b 142 | ) 143 | else: 144 | client = ( 145 | self.embedding_client_7b 146 | if model_size == "7" 147 | else self.embedding_client_2b 148 | ) 149 | latency_bound = ( 150 | self.latency_7b if model_size == "7" else self.latency_2b 151 | ) 152 | # print( 153 | # f"Using client {client.base_url} with latency bound {latency_bound}." 154 | # ) 155 | start_time = time.time() 156 | if not get_embedding: 157 | assert type(message) == list, "Message should be a list." 158 | response = client.chat.completions.create( 159 | model=model_name, 160 | messages=message, 161 | max_tokens=max_tokens, 162 | temperature=temperature, 163 | stop=["<|eot_id|>"], 164 | ) 165 | else: 166 | assert type(message) == str, "Message should be a string." 
167 | response = client.embeddings.create( 168 | model=model_name, 169 | input=message, 170 | ) 171 | elapsed_time = time.time() - start_time 172 | # elapsed_time = 50 173 | # print(f"Connection Time: {elapsed_time:.3f} s") 174 | 175 | if elapsed_time >= LATENCY_GROWING_RATE * latency_bound: 176 | print( 177 | f"Rebuilding model seed due to response delay ({elapsed_time:.3f}) longer than {LATENCY_GROWING_RATE} * latency bound ({latency_bound:.3f})." 178 | ) 179 | self._manage_model_server( 180 | latency_bound=LATENCY_GROWING_RATE * latency_bound, 181 | model_size=model_size, 182 | get_embedding=get_embedding, 183 | ) 184 | 185 | return ( 186 | str(response.choices[0].message.content) 187 | if not get_embedding 188 | else response.data[0].embedding 189 | ) 190 | 191 | except Exception as e: 192 | print(f"Attempt {attempt + 1} to get response failed with error: {e}") 193 | print(f"Rebuilding model server {model_size}B.") 194 | self._manage_model_server( 195 | latency_bound=INF, 196 | model_size=model_size, 197 | get_embedding=get_embedding, 198 | ) 199 | 200 | error_message = ( 201 | f"All clients failed to produce a completion after {MAX_RETRY} attempts." 202 | ) 203 | print(error_message) 204 | print(message) 205 | assert self.config_path is not None, "Config path is required." 206 | self.turn_off_running_flag() 207 | raise RuntimeError(error_message) 208 | 209 | 210 | if __name__ == "__main__": 211 | #! Test the model server 212 | server = ModelServer() 213 | message = BENCHMAK_MESSAGE 214 | # message = [] 215 | for i in range(10): 216 | print(f"Completion {i}:") 217 | complition = server.get_completion_or_embedding("8", message) 218 | print(complition) 219 | 220 | embedding = None 221 | for i in range(10): 222 | print(f"Embedding {i}:") 223 | embedding = server.get_completion_or_embedding( 224 | "7", 225 | message="As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", 226 | get_embedding=True, 227 | ) 228 | print(embedding[:10]) 229 | -------------------------------------------------------------------------------- /serve_llm_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | serve_llm_pipeline.py 3 | """ 4 | 5 | import subprocess 6 | import time 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | import re 9 | import socket 10 | from client_configs import Server, Completion_Servers, Embedding_Servers 11 | from IPython import embed 12 | 13 | #! 
如果 GPU ulitization 较低的话,开不了这么长的 context length 14 | MAX_CONTEXT_LENGTH = 65536 * 2 15 | CHUNKED_PREFILL_SIZE = int(MAX_CONTEXT_LENGTH / 8) 16 | 17 | 18 | def get_eno1_inet_address(): 19 | result = subprocess.run(["ifconfig"], capture_output=True, text=True) 20 | output = result.stdout 21 | match = re.search(r"eno1:.*?(inet\s+(\d+\.\d+\.\d+\.\d+))", output, re.DOTALL) 22 | if match: 23 | return match.group(2) 24 | return None 25 | 26 | 27 | def is_gpu_free(gpu_ids): 28 | gpu_ids_string = ",".join(map(str, gpu_ids)) 29 | command = f"nvidia-smi --query-gpu=memory.used --format=csv,nounits,noheader -i {gpu_ids_string}" 30 | output = subprocess.check_output(command, shell=True).decode("utf-8").strip() 31 | used_memory = [int(x) for x in output.split("\n")] 32 | return all( 33 | used < 10000 for used in used_memory 34 | ) # GPUs with usage below 10000 MB are considered free 35 | 36 | 37 | def get_gpu_memory_info(gpu_ids): 38 | gpu_ids_string = ",".join(map(str, gpu_ids)) 39 | # 获取总内存 40 | command_total = f"nvidia-smi --query-gpu=memory.total --format=csv,nounits,noheader -i {gpu_ids_string}" 41 | output_total = ( 42 | subprocess.check_output(command_total, shell=True).decode("utf-8").strip() 43 | ) 44 | total_memory = [int(x) for x in output_total.split("\n")] 45 | # 获取剩余内存 46 | command_free = f"nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader -i {gpu_ids_string}" 47 | output_free = ( 48 | subprocess.check_output(command_free, shell=True).decode("utf-8").strip() 49 | ) 50 | free_memory = [int(x) for x in output_free.split("\n")] 51 | return total_memory, free_memory 52 | 53 | 54 | def get_free_memory_ratio(gpu_ids): 55 | total_memory, free_memory = get_gpu_memory_info(gpu_ids) 56 | free_memory_ratio = [free / total for free, total in zip(free_memory, total_memory)] 57 | return free_memory_ratio 58 | 59 | 60 | def get_comond_infos(server: Server): 61 | assert len(server.gpus) > 0, "No GPUs assigned to the server." 62 | assert ( 63 | server.port % int(server.model_size) == 0 64 | ), "Port should be divisible by model size." 65 | assert ( 66 | server.model_size in server.model_path 67 | ), "Model size should be in the model name." 68 | group_gpu_string = ",".join(map(str, server.gpus)) 69 | tensor_parallel_size = len(server.gpus) 70 | command = f""" 71 | CUDA_VISIBLE_DEVICES={group_gpu_string} python -m sglang.launch_server --enable-p2p-check --model-path {server.model_path} \ 72 | --dtype auto --tensor-parallel-size {tensor_parallel_size} \ 73 | --context-length {MAX_CONTEXT_LENGTH if server.model_size != "7" else 32768} --chunked-prefill-size {CHUNKED_PREFILL_SIZE if server.model_size != "7" else int(32768 / 8)} \ 74 | --port {server.port} --host 0.0.0.0 --api-key sk-1dwqsdv4r3wef3rvefg34ef1dwRv """ 75 | #! host 0.0.0.0 可以用于广播 76 | # if server.model_size == "8" or server.model_size == "7": 77 | # command += " --enable-torch-compile " 78 | # if server.model_size == "7": 79 | # command += " --is-embedding " 80 | #! 8b 模型需要开启 torch compile,70b 还没优化 81 | return (group_gpu_string, command, server.port, server.model_size) 82 | 83 | 84 | def main(): 85 | eno1_ip_address = get_eno1_inet_address() 86 | ServerID = int(eno1_ip_address[-1]) 87 | assert ServerID in [3, 4], "Model should be served on server 3 or 4." 88 | assert str(ServerID) in socket.gethostname(), "ServerID should be in the hostname." 
89 | 90 | print( 91 | f""" 92 | ============================================================ 93 | Cluster: {socket.gethostname()} 94 | IP: {get_eno1_inet_address()} 95 | ============================================================ 96 | """ 97 | ) 98 | 99 | command_infos = [ 100 | get_comond_infos(server) 101 | for server in (Completion_Servers + Embedding_Servers) 102 | if (server.ip == eno1_ip_address) 103 | ] 104 | 105 | def run_with_gpu_check(command_info): 106 | group_id, command, port, model_size = command_info 107 | 108 | while not is_gpu_free(group_id): 109 | print(f"Waiting for GPU(s) {group_id} to be free...") 110 | time.sleep(10) 111 | 112 | print( 113 | f"Serving {model_size}b model on server {ServerID} port {port} with GPUs {group_id}" 114 | ) 115 | free_gpu_ration = min(get_free_memory_ratio(group_id)) 116 | gpu_ultization = None 117 | if free_gpu_ration >= 0.95: 118 | gpu_ultization = 0.80 119 | elif free_gpu_ration >= 0.85: 120 | gpu_ultization = 0.70 121 | else: 122 | raise ValueError("GPU memory is not enough.") 123 | command = command + f" --mem-fraction-static {gpu_ultization} " 124 | print(command) 125 | subprocess.run(command, shell=True) 126 | 127 | with ThreadPoolExecutor(max_workers=len(command_infos)) as executor: 128 | futures = [ 129 | executor.submit(run_with_gpu_check, cmd_info) for cmd_info in command_infos 130 | ] 131 | for future in as_completed(futures): 132 | future.result() 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | --------------------------------------------------------------------------------