├── .gitignore ├── LICENSE ├── README.md ├── README_CN.md ├── client_configs.py ├── model_server.py └── serve_llm_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Chayenne
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ModelServer
2 |
3 | A ModelServer class based on the SGLang framework. It is fully self-built, and suggestions for further optimization are welcome. It currently uses SGLang v0.2.15.
4 |
5 | You can also refer to the [Chinese Readme](./README_CN.md).
6 |
7 | The ModelServer framework implements efficient, flexible, and highly fault-tolerant model service management. It can adapt to models of different scales and diverse task requirements, providing a reliable infrastructure for the deployment and application of large-scale language models.
8 |
9 | After configuration, you can get a completion or an embedding with commands like the following:
10 |
11 | ```python
12 | from model_server import ModelServer
13 | message = [
14 |     {
15 |         "role": "user",
16 |         "content": "Interact with a household to solve a task. Here are a few examples.\nYou are in the middle of a room. 
Looking quickly around you, you see a cabinet 4, a cabinet 3, a cabinet 2, a cabinet 1, a coffeemachine 1, a countertop 1, a diningtable 3, a diningtable 2, a diningtable 1, a drawer 1, a fridge 1, a garbagecan 1, a microwave 1, a sidetable 1, a sinkbasin 1, a stoveburner 4, a stoveburner 3, a stoveburner 2, a stoveburner 1, and a toaster 1.\nYour task is to: find some apple and put it in sidetable.\n> think: To solve the task, I need to find and take an apple, then put it in sidetable.\nOK.\n> think: First I need to find an apple. An apple is more likely to appear in fridges (1), diningtables (1-3), sidetables (1), countertops (1), sinkbasins (1), garbagecan (1). I can check one by one, starting with fridge 1.\nOK.\n> go to fridge 1\nThe fridge 1 is closed.\n> open fridge 1\nYou open the fridge 1. The fridge 1 is open. In it, you see a lettuce 2, a mug 2, a potato 2, and a tomato 1.\n> go to diningtable 1\nOn the diningtable 1, you see nothing.\n> go to diningtable 2\nOn the diningtable 2, you see a pineapple 1.\n> go to diningtable 3\nOn the diningtable 3, you see a bowl 2, a bowl 1, a bread 2, and a bread 1.\n> go to sidetable 1\nOn the sidetable 1, you see a cup 1, a lettuce 1, a peppershaker 3, a potato 1, and a saltshaker 1.\n> go to countertop 1\nOn the countertop 1, you see a butterknife 1, a dishsponge 1, and a soapbottle 1.\n> go to sinkbasin 1\nOn the sinkbasin 1, you see a cup 2, a egg 1, a mug 1, and a potato 3.\n> go to garbagecan 1\nOn the garbagecan 1, you see a apple 3, and a egg 3.\n> think: Now I find an apple (3). Next, I need to take it.\n> take apple 3 from garbagecan 1\nYou pick up the apple 3 from the garbagecan 1.\n> think: Now I find an apple (3). Next, I need to put it in/on sidetable 1.\nOK.\n> go to sidetable 1\nOn the sidetable 1, you see a cup 1, a lettuce 1, a peppershaker 3, a potato 1, and a saltshaker 1.\n> put apple 3 in/on sidetable 1\nYou put the apple 3 in/on the sidetable 1.\nYou are in the middle of a room. Looking quickly around you, you see a cabinet 4, a cabinet 3, a cabinet 2, a cabinet 1, a countertop 1, a garbagecan 1, a handtowelholder 2, a handtowelholder 1, a sinkbasin 2, a sinkbasin 1, a toilet 1, a toiletpaperhanger 1, and a towelholder 1.\nYour task is to: put some spraybottle on toilet.\n> think: To solve the task, I need to find and take a sparybottle, then put it on toilet.\nOK.\n> think: First I need to find a spraybottle. A spraybottle is more likely to appear in cabinet (1-4), countertop (1), toilet (1), sinkbasin (1-2), garbagecan (1). I can check one by one, starting with cabinet 1.\nOK.\n> go to cabinet 1\nOn the cabinet 1, you see a cloth 1, a soapbar 1, a soapbottle 1.\n> go to cabinet 2\nThe cabinet 2 is closed.\n> open cabinet 2\nYou open the cabinet 2. The cabinet 2 is open. In it, you see a candle 1, and a spraybottle 2.\n> think: Now I find a spraybottle (2). Next, I need to take it.\n> take spraybottle 2 from cabinet 2\nYou pick up the spraybottle 2 from the cabinet 2.\n> think: Now I take a spraybottle (2). Next, I need to put it in/on toilet 1.\nOK.\n> go to toilet 1\nOn the toilet 1, you see a soapbottle 2.\n> put spraybottle 2 in/on toilet 1\nYou put the spraybottle 2 in/on the toilet 1.\n\n\nHere is the task:\nYou are in the middle of a room. 
Looking quickly around you, you see a bed 1, a desk 2, a desk 1, a drawer 6, a drawer 5, a drawer 4, a drawer 3, a drawer 2, a drawer 1, a garbagecan 1, a laundryhamper 1, a safe 1, a shelf 6, a shelf 5, a shelf 4, a shelf 3, a shelf 2, and a shelf 1.\nYour task is to: put a mug in desk.\n>",
17 |     },
18 |     {
19 |         "role": "assistant",
20 |         "content": "think: To solve the task, I need to find and take a mug, then put it in/on desk.",
21 |     },
22 |     {"role": "user", "content": "OK."},
23 |     {
24 |         "role": "assistant",
25 |         "content": "think: First I need to find a mug. A mug is more likely to appear in desk (1-2), drawer (1-6), shelf (1-6), garbagecan (1), laundryhamper (1), safe (1). I can check one by one, starting with desk 1.",
26 |     },
27 |     {"role": "user", "content": "OK."},
28 |     {"role": "assistant", "content": "go to desk 1"},
29 |     {
30 |         "role": "user",
31 |         "content": "On the desk 1, you see a bowl 1, a cd 2, a creditcard 2, a desklamp 1, a keychain 1, a laptop 2, a pen 2, and a pencil 2.",
32 |     },
33 |     {
34 |         "role": "assistant",
35 |         "content": "think: Now I don't find a mug in desk 1. Next, I need to check desk 2.",
36 |     },
37 |     {"role": "user", "content": "OK."},
38 |     {"role": "assistant", "content": "go to desk 2"},
39 |     {
40 |         "role": "user",
41 |         "content": "On the desk 2, you see a alarmclock 1, a book 3, a bowl 2, a creditcard 3, and a keychain 3.",
42 |     },
43 | ]
44 |
45 | server = ModelServer()
46 | # `8` here is the completion model size configured in `client_configs.py`
47 | completion = server.get_completion_or_embedding("8", message)
48 | # get the completion from an 8B instruct model, i.e. Llama 3.1 8B
49 | print(completion)
50 |
51 | # `7` here is the embedding model size configured in `client_configs.py`
52 | embedding = server.get_completion_or_embedding(
53 |     "7",
54 |     message="As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
55 |     get_embedding=True,
56 | )
57 | # get the embedding from a 7B embedding model, i.e. `Alibaba-NLP/gte-Qwen1.5-7B-instruct`
58 | print(embedding[:10])
59 | ```
60 |
61 | ## Get Started
62 |
63 | ### Install SGLang
64 |
65 | Below are the dependencies this framework currently uses for SGLang; they will be updated later.
66 |
67 | ```bash
68 | pip install sglang==0.2.15
69 | pip install flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu121/torch2.3/
70 |
71 | # lower versions of vllm lead to errors about multimodal-config
72 | pip install vllm==0.5.5
73 |
74 | pip install triton==2.3.1
75 |
76 | # change the CUDA version according to your local device
77 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
78 | ```
79 | It is recommended to follow the versions specified above to avoid potential errors.
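To double-check that the pinned versions are what actually got installed, you can run a minimal sketch like the one below. The distribution names are assumed from the pip commands above (flashinfer is omitted because its wheel name may differ on your setup); this check is optional and not part of the repository.

```python
# Optional sanity check for the pinned dependency versions listed above.
# The distribution names are assumptions based on the pip commands in this README;
# adjust them if your wheels are named differently.
from importlib.metadata import PackageNotFoundError, version

EXPECTED = {
    "sglang": "0.2.15",
    "vllm": "0.5.5",
    "triton": "2.3.1",
    "torch": "2.4.0",
}

for package, wanted in EXPECTED.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (expected {wanted})")
        continue
    # Ignore local version tags such as "+cu121" when comparing.
    if installed.split("+")[0] == wanted:
        print(f"{package}: {installed} OK")
    else:
        print(f"{package}: {installed} (expected {wanted})")
```

If any package reports a different version, reinstall it with the exact pin before launching `serve_llm_pipeline.py`.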
80 |
81 | ### Modify `client_configs.py`
82 |
83 | Modify the IP address of the server and the model path in `client_configs.py`:
84 |
85 | ```python
86 | SERVER_IP = "[SECRET IP, REPLACE WITH YOURS]"
87 | MODEL_NAME_8B = "8bins"
88 | MODEL_NAME_70B = "70bins"
89 | EMBEDDING_7B = "7embed"
90 | ```
91 |
92 | ### Run the Server Engine
93 |
94 | ```bash
95 | python serve_llm_pipeline.py
96 | ```
97 |
98 | ### Test the Server Latency
99 |
100 | ```bash
101 | python client_configs.py
102 | ```
103 |
104 | ### Test the ModelServer
105 | ```bash
106 | python model_server.py
107 | ```
108 |
109 | ## Code Structure
110 |
111 | ### `client_configs.py`
112 |
113 | #### Constants
114 |
115 | - **Server Configuration**: Configurations for all servers, hosting models of different sizes (e.g., `8B` and `70B`) as well as embedding models. Each server is represented by a `Server` named tuple, containing attributes such as `ip` (IP address), `port` (port number), `model_size` (model size), `model_path` (model path), and `gpus` (GPU configuration).
116 | - **BENCHMAK_MESSAGE**: A benchmark message used to test the performance of different servers.
117 | - **Completion_Servers**: List of server configurations for dialogue models.
118 | - **Embedding_Servers**: List of server configurations for embedding models (newly added).
119 |
120 | #### Functions
121 |
122 | - `get_fastest_server`: Tests the latency of each server and returns the fastest server along with its latency. Servers with latency higher than the current lowest latency are skipped, which matters most when server latencies are highly uneven. (Support for embedding model servers has been added.)
123 | - `get_all_latency`: Checks and prints the latency of all configured servers, including both completion and embedding model servers.
124 | - `get_running_server_sizes`: Returns a list of model sizes currently running on servers.
125 |
126 | ### `serve_llm_pipeline.py`
127 |
128 | #### Functions
129 |
130 | - `get_eno1_inet_address`: Retrieves the IP address associated with the `eno1` network interface.
131 | - `is_gpu_free`: Checks if the specified GPU is free (memory usage below a certain threshold).
132 | - `get_gpu_memory_info`: Gets total and available memory information for specified GPUs.
133 | - `get_free_memory_ratio`: Calculates the ratio of available memory to total memory for GPUs.
134 | - `get_comond_infos`: Dynamically constructs commands to start servers, generating appropriate startup parameters based on server configurations.
135 | - `main`: Main function that manages GPU availability and starts model servers. Uses `ThreadPoolExecutor` to concurrently manage multiple servers.
136 |
137 | #### Features
138 |
139 | - **Dynamic Resource Management:** Dynamically checks and manages GPU resources, ensuring servers are only started when resources are sufficient.
140 | - **Server Initialization:** Automatically starts servers, ensuring correct configuration and resource allocation.
141 | - **Concurrency:** Uses `ThreadPoolExecutor` to concurrently manage multiple servers, maximizing resource utilization.
142 | - **GPU Memory Management:** Real-time monitoring of GPU memory usage, dynamically adjusting server startup strategies.
143 |
144 | ### `model_server.py`
145 |
146 | #### Constants
147 |
148 | - `LATENCY_GROWING_RATE`: Latency growth rate, used for dynamically adjusting latency thresholds.
149 | - `MAX_RETRY`: Maximum number of retry attempts, improving system fault tolerance.
150 | - `INF`: Represents infinity, used for initializing latency comparisons.
151 |
152 | #### `ModelServer` Class
153 |
154 | Manages the creation of and interaction with the different model servers (both completion and embedding models), with automated restart and fault recovery.
155 |
156 | ##### Latency Management and Automatic Restart
157 |
158 | - **Latency Monitoring:** Real-time monitoring of server response time in the `get_completion_or_embedding` method.
159 | - **Dynamic Threshold Adjustment:** Uses `LATENCY_GROWING_RATE` to dynamically adjust acceptable latency thresholds.
160 | - **Automatic Restart:** Triggers the `_manage_model_server` method to rebuild server connections when response times become too long.
161 |
162 | ##### Fault Tolerance Mechanism
163 |
164 | - **Multiple Attempts:** Uses the `MAX_RETRY` mechanism to attempt requests multiple times in case of errors.
165 | - **Error Handling:** Captures and logs exceptions, attempting to rebuild server connections.
166 | - **Graceful Degradation:** Gracefully shuts down the service through the `turn_off_running_flag` method when all attempts fail.
167 |
168 | ##### Server Construction and Rebuilding Logic
169 |
170 | - **Initial Construction:** Selects the fastest server based on current configurations.
171 | - **Dynamic Rebuilding:** Automatically selects a new, faster server when server performance degrades.
172 | - **Resource Optimization:** Balances service quality and resource utilization through reasonable rebuilding strategies.
173 |
174 | ##### New Features
175 |
176 | - **Embedding Model Support:** Added support for embedding models, capable of handling text embedding requests.
177 | - **Configuration File Support:** Introduced configuration file management for server running states, increasing flexibility.
178 | - **Separate Completion and Embedding Methods:** The `get_completion_or_embedding` method handles completion and embedding tasks separately based on request type.
179 |
180 | ## Usage Suggestions
181 |
182 | 1. Set `LATENCY_GROWING_RATE` and `MAX_RETRY` reasonably to balance system response speed and stability.
183 | 2. Monitor GPU resource usage and adjust server configurations as needed.
184 | 3. Regularly check and update `BENCHMAK_MESSAGE` to ensure it effectively tests server performance.
185 | 4. Consider adding more servers or optimizing existing server configurations under high load conditions.
186 | 5. Utilize embedding model functionality for text analysis and similarity calculation tasks.
187 |
188 |
189 | ## Troubleshooting
190 |
191 | 1. If you encounter an error that `eno1` is not found, you can remove `get_eno1_inet_address` in `serve_llm_pipeline.py` and set the IP address manually. The IP address is used to differentiate clusters when you run engines on multiple clusters with different IPs; if you only have one cluster/node, simply set its IP address by hand.
192 |
193 | 2. If you encounter the error:
194 |
195 | ```
196 | RuntimeError: Tried to instantiate class '_core_C.ScalarType', but it does not exist! Ensure that it is registered via torch::class_::declare("torch._C.ScalarType");
197 | ```
198 |
199 | You can solve it by installing the correct version of torch:
200 |
201 | ```bash
202 | pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
203 | ```
204 |
205 | 3. 
If you encounter the error: 206 | 207 | ``` 208 | ImportError: /usr/lib/x86_64-linux-gnu/libc.so.6: version 'GLIBC_2.34' not found (required by /xxx/.triton/cache/41ce1f58e0a8aa9865e66b90d58b3307bb64c5a006830e49543444faf56202fc/cuda_utils.so) 209 | ``` 210 | 211 | You can solve it by deleting the cache: 212 | 213 | ```bash 214 | rm -rf /xxx/.triton/cache/* 215 | ``` -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | # ModelServer 2 | 3 | 基于 SGLang 框架的 ModelServer 类。完全自建,还请提出更多优化方式。采用的 SGLang 版本为 v0.2.15。 4 | 5 | ModelServer 框架实现了高效、灵活且具有强大容错能力的模型服务管理。它能够适应不同规模的模型和多样的任务需求,为大规模语言模型的部署和应用提供了可靠的基础设施。 6 | 7 | ## 快速使用 8 | 9 | ### 安装 SGLang 10 | 11 | 参考我当前的配置,安装 SGLang 和依赖项。 12 | 13 | ```bash 14 | pip install sglang==0.2.15 15 | pip install flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu121/torch2.3/ 16 | 17 | # 较低版本的vllm可能导致关于multimodal-config的错误 18 | pip install vllm==0.5.5 19 | 20 | pip install triton==2.3.1 21 | 22 | # 根据您的本地设备更改cuda版本 23 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 24 | ``` 25 | 26 | ### 修改 `client_config.py` 27 | 28 | 在 `client_config.py` 中配置你本地的服务器的 IP 地址和模型路径等等: 29 | 30 | ```python 31 | SERVER_IP = "[SECRET IP, REPLACE WITH YOURS]" 32 | MODEL_NAME_8B = "8bins" 33 | MODEL_NAME_70B = "70bins" 34 | EMBEDDING_7B = "7embed" 35 | ``` 36 | 37 | ### 启动 Model Engine 38 | 39 | ```bash 40 | python serve_llm_pipeline.py 41 | ``` 42 | 43 | ### 测试服务器延迟 44 | 45 | ```python 46 | python client_config.py 47 | ``` 48 | 49 | ### 测试 ModelServer 50 | 51 | ```bash 52 | python model_server.py 53 | ``` 54 | 55 | ## 代码结构 56 | 57 | ### `client_configs.py` 58 | 59 | #### 常量 60 | 61 | - **Server 配置**:全部服务器配置,托管了不同大小的模型(如 `8B` 和 `70B`)以及嵌入模型。每个服务器通过一个 `Server` 命名元组来表示,包含属性如 `ip`(IP 地址)、`port`(端口)、`model_size`(模型大小)、`model_path`(模型路径)和 `gpus`(GPU 配置)。 62 | - **BENCHMAK_MESSAGE**: 定义了一个基准消息(`BENCHMAK_MESSAGE`),用于测试不同服务器的性能。 63 | - **Completion_Servers**:对话模型的服务器配置列表。 64 | - **Embedding_Servers**:嵌入模型的服务器配置列表(新增)。 65 | 66 | #### 函数 67 | 68 | - `get_fastest_server`: 测试各个服务器的延迟,并返回最快的服务器及其延迟。延迟长于当前最低 latency 的服务器将被跳过,这在服务器 latency 非常不均衡时有显著意义。(新增了对嵌入模型服务器的支持) 69 | - `get_all_latency`: 检查并打印所有配置服务器的延迟,包括完成模型和嵌入模型服务器。 70 | - `get_running_server_sizes`: 返回当前运行中的服务器模型大小列表。 71 | 72 | ### `serve_llm_pipeline.py` 73 | 74 | #### 函数 75 | 76 | - `get_eno1_inet_address`: 获取网络接口 `eno1` 相关的 IP 地址。 77 | - `is_gpu_free`: 检查指定的 GPU 是否空闲(内存使用量低于某个阈值)。 78 | - `get_gpu_memory_info`: 获取指定 GPU 的总内存和可用内存信息。 79 | - `get_free_memory_ratio`: 计算 GPU 可用内存与总内存的比例。 80 | - `get_comond_infos`: 动态构建用于启动服务器的命令,根据服务器的配置生成适当的启动参数。 81 | - `main`: 主函数,管理 GPU 的可用性并启动模型服务器。使用 `ThreadPoolExecutor` 实现并发管理多个服务器。 82 | 83 | #### features 84 | 85 | - **动态资源管理:** 动态检查和管理 GPU 资源,确保只有在资源充足时才启动服务器。 86 | - **服务器初始化:** 自动化地启动服务器,确保使用正确的配置和资源分配。 87 | - **并发性:** 使用 `ThreadPoolExecutor` 并发地管理多个服务器,充分利用资源。 88 | - **GPU 内存管理:** 实时监控 GPU 内存使用情况,动态调整服务器启动策略。 89 | 90 | ### `model_server.py` 91 | 92 | #### 常量 93 | 94 | - `LATENCY_GROWING_RATE`:延迟增长率,用于动态调整延迟阈值。 95 | - `MAX_RETRY`:最大重试次数,提高系统容错能力。 96 | - `INF`:表示无穷大,用于初始化延迟比较。 97 | 98 | #### `ModelServer` 类 99 | 100 | 采用自动化重启和容错恢复的方式管理不同模型服务器(包括完成模型和嵌入模型)的创建和交互。 101 | 102 | ##### 延迟管理和自动重启 103 | 104 | - **延迟监测:** 在 `get_completion_or_embedding` 方法中实时监测服务器响应时间。 105 | - **动态阈值调整:** 使用 `LATENCY_GROWING_RATE` 动态调整可接受的延迟阈值。 106 | - **自动重启:** 当检测到响应时间过长时,触发 `_manage_model_server` 方法重建服务器连接。 107 | 108 | ##### 容错机制 109 | 110 
| - **多次尝试:** 使用 `MAX_RETRY` 机制,在发生错误时多次尝试请求。 111 | - **错误处理:** 捕获并记录异常,尝试重建服务器连接。 112 | - **优雅降级:** 当所有尝试失败时,通过 `turn_off_running_flag` 方法优雅地关闭服务。 113 | 114 | ##### 服务器构建与重建逻辑 115 | 116 | - **初始化构建:** 根据当前配置选择最快的服务器。 117 | - **动态重建:** 当服务器性能下降时,自动选择新的更快服务器。 118 | - **资源优化:** 通过合理的重建策略,平衡服务质量和资源利用。 119 | 120 | ##### 新特性 121 | 122 | - **嵌入模型支持:** 新增对嵌入模型的支持,可以处理文本嵌入请求。 123 | - **配置文件支持:** 引入配置文件管理服务器运行状态,提高灵活性。 124 | - **分离的完成和嵌入方法:** `get_completion_or_embedding` 方法根据请求类型分别处理完成和嵌入任务。 125 | 126 | ## 使用建议 127 | 128 | 1. 合理设置 `LATENCY_GROWING_RATE` 和 `MAX_RETRY`,以平衡系统响应速度和稳定性。 129 | 2. 监控 GPU 资源使用情况,适时调整服务器配置。 130 | 3. 定期检查和更新 `BENCHMAK_MESSAGE`,确保其能够有效测试服务器性能。 131 | 4. 在高负载情况下,考虑增加更多的服务器或优化现有服务器配置。 132 | 5. 利用嵌入模型功能进行文本分析和相似度计算任务。 133 | 134 | ## Trouble Shooting 135 | 136 | 1. 如果遇到`eno1`未找到的错误,可以直接在`serve_llm_pipeline.py`中移除`get_eno1_inet_address`并手动设置IP地址。(为了在具有不同 IP 的多个集群上运行 Model Engine 时,我采用 IP 地址来区分集群。如果不需要在多个集群上运行,就不用区分。) 137 | 2. 如果遇到以下错误: 138 | 139 | ```bash 140 | RuntimeError: Tried to instantiate class '_core_C.ScalarType', but it does not exist! Ensure that it is registered via torch::class_::declare("torch._C.ScalarType"); 141 | ``` 142 | 143 | 可以通过安装正确版本的torch来解决: 144 | 145 | ```bash 146 | pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 147 | ``` 148 | 149 | 1. 如果遇到以下错误: 150 | 151 | ```bash 152 | ImportError: /usr/lib/x86_64-linux-gnu/libc.so.6: version 'GLIBC_2.34' not found (required by /xxx/.triton/cache/41ce1f58e0a8aa9865e66b90d58b3307bb64c5a006830e49543444faf56202fc/cuda_utils.so) 153 | ``` 154 | 155 | 可以通过删除缓存来解决: 156 | 157 | ```bash 158 | rm -rf /xxx/.triton/cache/* 159 | ``` 160 | -------------------------------------------------------------------------------- /client_configs.py: -------------------------------------------------------------------------------- 1 | """ 2 | client_configs.py 3 | """ 4 | 5 | from collections import namedtuple 6 | import openai 7 | from typing import List, Optional 8 | from IPython import embed 9 | import time 10 | import multiprocessing 11 | from typing import List, Optional 12 | 13 | SERVER_IP = "[SECRET IP, REPLACE WITH YOURS]" 14 | MODEL_NAME_8B = "8bins" 15 | MODEL_NAME_70B = "70bins" 16 | EMBEDDING_7B = "7embed" 17 | INF = 100 18 | 19 | Server = namedtuple("Server", ["ip", "port", "model_size", "model_path", "gpus"]) 20 | BENCHMAK_MESSAGE = [ 21 | { 22 | "role": "user", 23 | "content": "Interact with a household to solve a task. Here are a few examples.\nYou are in the middle of a room. Looking quickly around you, you see a cabinet 4, a cabinet 3, a cabinet 2, a cabinet 1, a coffeemachine 1, a countertop 1, a diningtable 3, a diningtable 2, a diningtable 1, a drawer 1, a fridge 1, a garbagecan 1, a microwave 1, a sidetable 1, a sinkbasin 1, a stoveburner 4, a stoveburner 3, a stoveburner 2, a stoveburner 1, and a toaster 1.\nYour task is to: find some apple and put it in sidetable.\n> think: To solve the task, I need to find and take an apple, then put it in sidetable.\nOK.\n> think: First I need to find an apple. An apple is more likely to appear in fridges (1), diningtables (1-3), sidetables (1), countertops (1), sinkbasins (1), garbagecan (1). I can check one by one, starting with fridge 1.\nOK.\n> go to fridge 1\nThe fridge 1 is closed.\n> open fridge 1\nYou open the fridge 1. The fridge 1 is open. 
In it, you see a lettuce 2, a mug 2, a potato 2, and a tomato 1.\n> go to diningtable 1\nOn the diningtable 1, you see nothing.\n> go to diningtable 2\nOn the diningtable 2, you see a pineapple 1.\n> go to diningtable 3\nOn the diningtable 3, you see a bowl 2, a bowl 1, a bread 2, and a bread 1.\n> go to sidetable 1\nOn the sidetable 1, you see a cup 1, a lettuce 1, a peppershaker 3, a potato 1, and a saltshaker 1.\n> go to countertop 1\nOn the countertop 1, you see a butterknife 1, a dishsponge 1, and a soapbottle 1.\n> go to sinkbasin 1\nOn the sinkbasin 1, you see a cup 2, a egg 1, a mug 1, and a potato 3.\n> go to garbagecan 1\nOn the garbagecan 1, you see a apple 3, and a egg 3.\n> think: Now I find an apple (3). Next, I need to take it.\n> take apple 3 from garbagecan 1\nYou pick up the apple 3 from the garbagecan 1.\n> think: Now I find an apple (3). Next, I need to put it in/on sidetable 1.\nOK.\n> go to sidetable 1\nOn the sidetable 1, you see a cup 1, a lettuce 1, a peppershaker 3, a potato 1, and a saltshaker 1.\n> put apple 3 in/on sidetable 1\nYou put the apple 3 in/on the sidetable 1.\nYou are in the middle of a room. Looking quickly around you, you see a cabinet 4, a cabinet 3, a cabinet 2, a cabinet 1, a countertop 1, a garbagecan 1, a handtowelholder 2, a handtowelholder 1, a sinkbasin 2, a sinkbasin 1, a toilet 1, a toiletpaperhanger 1, and a towelholder 1.\nYour task is to: put some spraybottle on toilet.\n> think: To solve the task, I need to find and take a sparybottle, then put it on toilet.\nOK.\n> think: First I need to find a spraybottle. A spraybottle is more likely to appear in cabinet (1-4), countertop (1), toilet (1), sinkbasin (1-2), garbagecan (1). I can check one by one, starting with cabinet 1.\nOK.\n> go to cabinet 1\nOn the cabinet 1, you see a cloth 1, a soapbar 1, a soapbottle 1.\n> go to cabinet 2\nThe cabinet 2 is closed.\n> open cabinet 2\nYou open the cabinet 2. The cabinet 2 is open. In it, you see a candle 1, and a spraybottle 2.\n> think: Now I find a spraybottle (2). Next, I need to take it.\n> take spraybottle 2 from cabinet 2\nYou pick up the spraybottle 2 from the cabinet 2.\n> think: Now I take a spraybottle (2). Next, I need to put it in/on toilet 1.\nOK.\n> go to toilet 1\nOn the toilet 1, you see a soapbottle 2.\n> put spraybottle 2 in/on toilet 1\nYou put the spraybottle 2 in/on the toilet 1.\n\n\nHere is the task:\nYou are in the middle of a room. Looking quickly around you, you see a bed 1, a desk 2, a desk 1, a drawer 6, a drawer 5, a drawer 4, a drawer 3, a drawer 2, a drawer 1, a garbagecan 1, a laundryhamper 1, a safe 1, a shelf 6, a shelf 5, a shelf 4, a shelf 3, a shelf 2, and a shelf 1.\nYour task is to: put a mug in desk.\n>", 24 | }, 25 | { 26 | "role": "assistant", 27 | "content": "think: To solve the task, I need to find and take a mug, then put it in/on desk.", 28 | }, 29 | {"role": "user", "content": "OK."}, 30 | { 31 | "role": "assistant", 32 | "content": "think: First I need to find a mug. A mug is more likely to appear in desk (1-2), drawer (1-6), shelf (1-6), garbagecan (1), laundryhamper (1), safe (1). I can check one by one, starting with desk 1.", 33 | }, 34 | {"role": "user", "content": "OK."}, 35 | {"role": "assistant", "content": "go to desk 1"}, 36 | { 37 | "role": "user", 38 | "content": "On the desk 1, you see a bowl 1, a cd 2, a creditcard 2, a desklamp 1, a keychain 1, a laptop 2, a pen 2, and a pencil 2.", 39 | }, 40 | { 41 | "role": "assistant", 42 | "content": "think: Now I don't find a mug in desk 1. 
Next, I need to check desk 2.", 43 | }, 44 | {"role": "user", "content": "OK."}, 45 | {"role": "assistant", "content": "go to desk 2"}, 46 | { 47 | "role": "user", 48 | "content": "On the desk 2, you see a alarmclock 1, a book 3, a bowl 2, a creditcard 3, and a keychain 3.", 49 | }, 50 | ] 51 | 52 | 53 | Completion_Servers = [ 54 | Server( 55 | ip=SERVER_IP, 56 | port=8056, 57 | model_size="8", 58 | model_path=MODEL_NAME_8B, 59 | gpus=[1], 60 | ), 61 | Server( 62 | ip=SERVER_IP, 63 | port=8064, 64 | model_size="8", 65 | model_path=MODEL_NAME_8B, 66 | gpus=[2], 67 | ), 68 | Server( 69 | ip=SERVER_IP, 70 | port=8072, 71 | model_size="8", 72 | model_path=MODEL_NAME_8B, 73 | gpus=[3], 74 | ), 75 | Server( 76 | ip=SERVER_IP, 77 | port=8080, 78 | model_size="8", 79 | model_path=MODEL_NAME_8B, 80 | gpus=[4], 81 | ), 82 | Server( 83 | ip=SERVER_IP, 84 | port=8088, 85 | model_size="8", 86 | model_path=MODEL_NAME_8B, 87 | gpus=[5], 88 | ), 89 | Server( 90 | ip=SERVER_IP, 91 | port=8096, 92 | model_size="8", 93 | model_path=MODEL_NAME_8B, 94 | gpus=[6], 95 | ), 96 | Server( 97 | ip=SERVER_IP, 98 | port=8104, 99 | model_size="8", 100 | model_path=MODEL_NAME_8B, 101 | gpus=[7], 102 | ), 103 | #! 以下是 70B model 的 config,请不要同时开启 104 | # Server( 105 | # ip=SERVER_IP, 106 | # port=8400, 107 | # model_size="70", 108 | # model_path=MODEL_NAME_70B, 109 | # gpus=[0, 1, 2, 3], 110 | # ), 111 | ] 112 | 113 | Embedding_Servers = [ 114 | Server( 115 | ip=SERVER_IP, 116 | port=7777, 117 | model_size="7", 118 | model_path=EMBEDDING_7B, 119 | gpus=[0], 120 | ), 121 | ] 122 | 123 | 124 | def get_fastest_server( 125 | initial_latency=10, model_size="8", test_embedding_servers: bool = False 126 | ): 127 | SERVERS = Embedding_Servers if test_embedding_servers else Completion_Servers 128 | min_latency = initial_latency 129 | fastest_server = None 130 | 131 | def test_server(server: Server): 132 | def get_completion_or_embedding( 133 | client, 134 | message: List, 135 | temperature: float = 0.0, 136 | max_tokens: int = 256, 137 | model_name: Optional[str] = None, 138 | ) -> str: 139 | def target(queue): 140 | try: 141 | if not test_embedding_servers: 142 | completion = client.chat.completions.create( 143 | model=model_name, 144 | messages=message, 145 | max_tokens=max_tokens, 146 | temperature=temperature, 147 | stop=["<|eot_id|>"], 148 | ) 149 | queue.put(completion) 150 | else: 151 | embedding = client.embeddings.create( 152 | input=message[0]["content"], model=model_name 153 | ) 154 | queue.put(embedding) 155 | 156 | except Exception as e: 157 | queue.put(e) 158 | 159 | start_time = time.time() 160 | queue = multiprocessing.Queue() 161 | process = multiprocessing.Process(target=target, args=(queue,)) 162 | process.start() 163 | 164 | process.join(timeout=min_latency) # 等待进程完成或超时 165 | 166 | if process.is_alive(): 167 | print( 168 | f"Timeout: server {server.ip}:{server.port} took longer than {min_latency:.3f} seconds." 
169 | ) 170 | process.terminate() # 终止进程 171 | process.join() # 确保进程已终止 172 | return None, INF 173 | else: 174 | result = queue.get() 175 | if isinstance(result, Exception): 176 | raise result 177 | latency = time.time() - start_time 178 | print(f"Connection Time: {latency:.3f} s") 179 | if not test_embedding_servers: 180 | return str(result.choices[0].message.content), latency 181 | else: 182 | return (list(result.data[0].embedding), latency) 183 | 184 | client = openai.OpenAI( 185 | base_url=(f"http://{server.ip}:{server.port}/v1"), 186 | api_key=("sk-1dwqsdv4r3wef3rvefg34ef1dwRv"), 187 | ) 188 | 189 | try: 190 | response, latency = get_completion_or_embedding( 191 | client, 192 | BENCHMAK_MESSAGE, 193 | 0.0, 194 | 256, 195 | server.model_path, 196 | ) 197 | print( 198 | f"Get response: {response}" 199 | if not test_embedding_servers 200 | else f"Get embedding: {response[:10]}" 201 | ) 202 | if response is not None and len(response) > 0: 203 | print( 204 | f""" 205 | ============================================================ 206 | Cluster: {server.ip} 207 | Port: {server.port} 208 | Model: {server.model_path} 209 | Size: {server.model_size} 210 | GPUs: {server.gpus} 211 | Latency: {latency:.3f} s 212 | ============================================================ 213 | """ 214 | ) 215 | return True, latency 216 | else: 217 | return False, INF 218 | 219 | except Exception as e: 220 | print( 221 | f"Could not connect to server {server.ip}:{server.port} due to error: {e}" 222 | ) 223 | return False, INF 224 | 225 | for server in SERVERS: 226 | if server.model_size == model_size: 227 | print( 228 | f"Testing Completion" 229 | if not test_embedding_servers 230 | else "Testing Embedding" 231 | ) 232 | status, latency = test_server(server) 233 | if status and (latency < min_latency): 234 | min_latency = latency 235 | fastest_server = server 236 | 237 | if fastest_server: 238 | print( 239 | f"Fastest server is {fastest_server.ip}:{fastest_server.port} with latency {min_latency:.3f} s" 240 | ) 241 | return fastest_server, min_latency 242 | else: 243 | print("No servers responded in a timely manner.") 244 | return None, INF 245 | 246 | 247 | def get_all_latency(test_embedding_servers: bool = False): 248 | SERVERS = Embedding_Servers if test_embedding_servers else Completion_Servers 249 | 250 | def test_server(server: Server): 251 | client = openai.OpenAI( 252 | base_url=(f"http://{server.ip}:{server.port}/v1"), 253 | api_key=("sk-1dwqsdv4r3wef3rvefg34ef1dwRv"), 254 | ) 255 | 256 | try: 257 | start_time = time.time() 258 | if not test_embedding_servers: 259 | completion = client.chat.completions.create( 260 | model=server.model_path, 261 | messages=BENCHMAK_MESSAGE, 262 | max_tokens=256, 263 | temperature=0.9, 264 | stop=["<|eot_id|>", "\nObservation", "Observation"], 265 | ) 266 | response = str(completion.choices[0].message.content) 267 | else: 268 | embedding = client.embeddings.create( 269 | input=BENCHMAK_MESSAGE[0]["content"], model=server.model_path 270 | ) 271 | client.embeddings.create(input="how are you", model=server.model_path) 272 | response = str(embedding.data[0].embedding[:10]) 273 | duration = time.time() - start_time 274 | print( 275 | f""" 276 | ============================================================ 277 | TESTING SERVER 278 | Cluster: {server.ip} 279 | Port: {server.port} 280 | Model: {server.model_path} 281 | Size: {server.model_size} 282 | GPUs: {server.gpus} 283 | ============================================================ 284 | """ 285 | ) 286 | print(f"Connection 
Time: {duration:.3f}s for {server.ip}:{server.port}") 287 | print( 288 | f"Get response: {response}" 289 | if not test_embedding_servers 290 | else f"GOT EMBEDDING: {response}" 291 | ) 292 | return True 293 | except Exception as e: 294 | print( 295 | f""" 296 | ============================================================ 297 | TESTING SERVER 298 | Cluster: {server.ip} 299 | Port: {server.port} 300 | Model: {server.model_path} 301 | Size: {server.model_size} 302 | GPUs: {server.gpus} 303 | ============================================================ 304 | """ 305 | ) 306 | print( 307 | f"Could not connect to server {server.ip}:{server.port} due to error: {e}" 308 | ) 309 | return False 310 | 311 | for server in SERVERS: 312 | test_server(server) 313 | 314 | 315 | def get_running_server_sizes(SERVERS=Completion_Servers + Embedding_Servers): 316 | server_sizes = [server.model_size for server in SERVERS] 317 | return server_sizes 318 | 319 | 320 | if __name__ == "__main__": 321 | server, min_latency = get_fastest_server( 322 | initial_latency=10, model_size="8", test_embedding_servers=False 323 | ) 324 | print(server) 325 | server, min_latency = get_fastest_server( 326 | initial_latency=10, model_size="7", test_embedding_servers=True 327 | ) 328 | print(server) 329 | get_all_latency(test_embedding_servers=True) 330 | get_all_latency(test_embedding_servers=False) 331 | -------------------------------------------------------------------------------- /model_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | model_server.py 3 | """ 4 | 5 | import os 6 | 7 | os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" 8 | os.environ["PYTHONUTF8"] = "1" 9 | import time 10 | from typing import Dict, List 11 | import openai 12 | import json 13 | from client_configs import ( 14 | get_fastest_server, 15 | get_running_server_sizes, 16 | MODEL_NAME_70B, 17 | MODEL_NAME_8B, 18 | EMBEDDING_7B, 19 | EMBEDDING_2B, 20 | BENCHMAK_MESSAGE, 21 | ) 22 | 23 | LATENCY_GROWING_RATE = 20 24 | MAX_RETRY = 20 25 | INF = 200 26 | 27 | 28 | class ModelServer: 29 | def __init__(self, config_path: str = None) -> None: 30 | running_server_sizes = get_running_server_sizes() 31 | ( 32 | self.completion_client_70b, 33 | self.completion_client_8b, 34 | self.embedding_client_7b, 35 | self.embedding_client_2b, 36 | ) = (None, None, None, None) 37 | self.latency_70b, self.latency_8b, self.latency_7b, self.latency_2b = ( 38 | INF, 39 | INF, 40 | INF, 41 | INF, 42 | ) 43 | self.config_path = config_path 44 | # Turn the running flag in config path when the server failed to get response 45 | if "70" in running_server_sizes: 46 | self._manage_model_server(latency_bound=3, model_size="70") 47 | if "8" in running_server_sizes: 48 | self._manage_model_server(latency_bound=3, model_size="8") 49 | if "7" in running_server_sizes: 50 | self._manage_model_server( 51 | latency_bound=3, model_size="7", get_embedding=True 52 | ) 53 | if "2" in running_server_sizes: 54 | self._manage_model_server( 55 | latency_bound=3, model_size="2", get_embedding=True 56 | ) 57 | 58 | def turn_off_running_flag(self) -> None: 59 | with open(self.config_path, "r", encoding="utf-8") as rf: 60 | info_dict = json.load(rf) 61 | info_dict["is_running"] = False 62 | with open(self.config_path, "w", encoding="utf-8") as wf: 63 | json.dump(info_dict, wf, indent=4) 64 | 65 | def _manage_model_server( 66 | self, latency_bound, model_size: str, get_embedding: bool = False 67 | ) -> None: 68 | build_latency = latency_bound 69 | 
build_count = 0 70 | status = False 71 | while not status: 72 | server, latency_bound = get_fastest_server( 73 | initial_latency=build_latency, 74 | model_size=model_size, 75 | test_embedding_servers=get_embedding, 76 | ) 77 | # latency_bound+=10 78 | if server is not None: 79 | client = openai.OpenAI( 80 | base_url=(f"http://{server.ip}:{server.port}/v1"), 81 | api_key=("sk-1dwqsdv4r3wef3rvefg34ef1dwRv"), 82 | ) 83 | if model_size == "70" and not get_embedding: 84 | self.completion_client_70b, self.latency_70b = client, latency_bound 85 | elif model_size == "8" and not get_embedding: 86 | self.completion_client_8b, self.latency_8b = client, latency_bound 87 | elif model_size == "7" and get_embedding: 88 | self.embedding_client_7b, self.latency_7b = client, latency_bound 89 | elif model_size == "2" and get_embedding: 90 | self.embedding_client_2b, self.latency_2b = client, latency_bound 91 | else: 92 | raise NotImplementedError 93 | print( 94 | f"Model server {model_size}B built with latency_bound {latency_bound}." 95 | ) 96 | status = True 97 | else: 98 | build_latency *= LATENCY_GROWING_RATE 99 | build_count += 1 100 | print( 101 | f"Attempt {build_count} to build model server {model_size}B failed." 102 | ) 103 | if build_count > MAX_RETRY: 104 | assert self.config_path is not None, "Config path is required." 105 | self.turn_off_running_flag() 106 | raise RuntimeError( 107 | f"Could not build model server after {MAX_RETRY} attempts." 108 | ) 109 | 110 | def get_completion_or_embedding( 111 | self, 112 | model_size: str, 113 | message, 114 | temperature: float = 0.0, 115 | max_tokens: int = 256, 116 | get_embedding: bool = False, 117 | ) -> str: 118 | # print(f"Message: {message}") 119 | assert model_size in ["70", "8", "7", "2"] 120 | if not get_embedding: 121 | model_name = MODEL_NAME_70B if model_size == "70" else MODEL_NAME_8B 122 | else: 123 | model_name = EMBEDDING_7B if model_size == "7" else EMBEDDING_2B 124 | 125 | for attempt in range(MAX_RETRY): 126 | try: 127 | assert ( 128 | (self.completion_client_70b is not None and model_size == "70") 129 | or (self.completion_client_8b is not None and model_size == "8") 130 | or (self.embedding_client_7b is not None and model_size == "7") 131 | or (self.embedding_client_2b is not None and model_size == "2") 132 | ), "Model server not initialized." 133 | 134 | if not get_embedding: 135 | client = ( 136 | self.completion_client_70b 137 | if model_size == "70" 138 | else self.completion_client_8b 139 | ) 140 | latency_bound = ( 141 | self.latency_70b if model_size == "70" else self.latency_8b 142 | ) 143 | else: 144 | client = ( 145 | self.embedding_client_7b 146 | if model_size == "7" 147 | else self.embedding_client_2b 148 | ) 149 | latency_bound = ( 150 | self.latency_7b if model_size == "7" else self.latency_2b 151 | ) 152 | # print( 153 | # f"Using client {client.base_url} with latency bound {latency_bound}." 154 | # ) 155 | start_time = time.time() 156 | if not get_embedding: 157 | assert type(message) == list, "Message should be a list." 158 | response = client.chat.completions.create( 159 | model=model_name, 160 | messages=message, 161 | max_tokens=max_tokens, 162 | temperature=temperature, 163 | stop=["<|eot_id|>"], 164 | ) 165 | else: 166 | assert type(message) == str, "Message should be a string." 
167 | response = client.embeddings.create( 168 | model=model_name, 169 | input=message, 170 | ) 171 | elapsed_time = time.time() - start_time 172 | # elapsed_time = 50 173 | # print(f"Connection Time: {elapsed_time:.3f} s") 174 | 175 | if elapsed_time >= LATENCY_GROWING_RATE * latency_bound: 176 | print( 177 | f"Rebuilding model seed due to response delay ({elapsed_time:.3f}) longer than {LATENCY_GROWING_RATE} * latency bound ({latency_bound:.3f})." 178 | ) 179 | self._manage_model_server( 180 | latency_bound=LATENCY_GROWING_RATE * latency_bound, 181 | model_size=model_size, 182 | get_embedding=get_embedding, 183 | ) 184 | 185 | return ( 186 | str(response.choices[0].message.content) 187 | if not get_embedding 188 | else response.data[0].embedding 189 | ) 190 | 191 | except Exception as e: 192 | print(f"Attempt {attempt + 1} to get response failed with error: {e}") 193 | print(f"Rebuilding model server {model_size}B.") 194 | self._manage_model_server( 195 | latency_bound=INF, 196 | model_size=model_size, 197 | get_embedding=get_embedding, 198 | ) 199 | 200 | error_message = ( 201 | f"All clients failed to produce a completion after {MAX_RETRY} attempts." 202 | ) 203 | print(error_message) 204 | print(message) 205 | assert self.config_path is not None, "Config path is required." 206 | self.turn_off_running_flag() 207 | raise RuntimeError(error_message) 208 | 209 | 210 | if __name__ == "__main__": 211 | #! Test the model server 212 | server = ModelServer() 213 | message = BENCHMAK_MESSAGE 214 | # message = [] 215 | for i in range(10): 216 | print(f"Completion {i}:") 217 | complition = server.get_completion_or_embedding("8", message) 218 | print(complition) 219 | 220 | embedding = None 221 | for i in range(10): 222 | print(f"Embedding {i}:") 223 | embedding = server.get_completion_or_embedding( 224 | "7", 225 | message="As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", 226 | get_embedding=True, 227 | ) 228 | print(embedding[:10]) 229 | -------------------------------------------------------------------------------- /serve_llm_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | serve_llm_pipeline.py 3 | """ 4 | 5 | import subprocess 6 | import time 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | import re 9 | import socket 10 | from client_configs import Server, Completion_Servers, Embedding_Servers 11 | from IPython import embed 12 | 13 | #! 
如果 GPU ulitization 较低的话,开不了这么长的 context length 14 | MAX_CONTEXT_LENGTH = 65536 * 2 15 | CHUNKED_PREFILL_SIZE = int(MAX_CONTEXT_LENGTH / 8) 16 | 17 | 18 | def get_eno1_inet_address(): 19 | result = subprocess.run(["ifconfig"], capture_output=True, text=True) 20 | output = result.stdout 21 | match = re.search(r"eno1:.*?(inet\s+(\d+\.\d+\.\d+\.\d+))", output, re.DOTALL) 22 | if match: 23 | return match.group(2) 24 | return None 25 | 26 | 27 | def is_gpu_free(gpu_ids): 28 | gpu_ids_string = ",".join(map(str, gpu_ids)) 29 | command = f"nvidia-smi --query-gpu=memory.used --format=csv,nounits,noheader -i {gpu_ids_string}" 30 | output = subprocess.check_output(command, shell=True).decode("utf-8").strip() 31 | used_memory = [int(x) for x in output.split("\n")] 32 | return all( 33 | used < 10000 for used in used_memory 34 | ) # GPUs with usage below 10000 MB are considered free 35 | 36 | 37 | def get_gpu_memory_info(gpu_ids): 38 | gpu_ids_string = ",".join(map(str, gpu_ids)) 39 | # 获取总内存 40 | command_total = f"nvidia-smi --query-gpu=memory.total --format=csv,nounits,noheader -i {gpu_ids_string}" 41 | output_total = ( 42 | subprocess.check_output(command_total, shell=True).decode("utf-8").strip() 43 | ) 44 | total_memory = [int(x) for x in output_total.split("\n")] 45 | # 获取剩余内存 46 | command_free = f"nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader -i {gpu_ids_string}" 47 | output_free = ( 48 | subprocess.check_output(command_free, shell=True).decode("utf-8").strip() 49 | ) 50 | free_memory = [int(x) for x in output_free.split("\n")] 51 | return total_memory, free_memory 52 | 53 | 54 | def get_free_memory_ratio(gpu_ids): 55 | total_memory, free_memory = get_gpu_memory_info(gpu_ids) 56 | free_memory_ratio = [free / total for free, total in zip(free_memory, total_memory)] 57 | return free_memory_ratio 58 | 59 | 60 | def get_comond_infos(server: Server): 61 | assert len(server.gpus) > 0, "No GPUs assigned to the server." 62 | assert ( 63 | server.port % int(server.model_size) == 0 64 | ), "Port should be divisible by model size." 65 | assert ( 66 | server.model_size in server.model_path 67 | ), "Model size should be in the model name." 68 | group_gpu_string = ",".join(map(str, server.gpus)) 69 | tensor_parallel_size = len(server.gpus) 70 | command = f""" 71 | CUDA_VISIBLE_DEVICES={group_gpu_string} python -m sglang.launch_server --enable-p2p-check --model-path {server.model_path} \ 72 | --dtype auto --tensor-parallel-size {tensor_parallel_size} \ 73 | --context-length {MAX_CONTEXT_LENGTH if server.model_size != "7" else 32768} --chunked-prefill-size {CHUNKED_PREFILL_SIZE if server.model_size != "7" else int(32768 / 8)} \ 74 | --port {server.port} --host 0.0.0.0 --api-key sk-1dwqsdv4r3wef3rvefg34ef1dwRv """ 75 | #! host 0.0.0.0 可以用于广播 76 | # if server.model_size == "8" or server.model_size == "7": 77 | # command += " --enable-torch-compile " 78 | # if server.model_size == "7": 79 | # command += " --is-embedding " 80 | #! 8b 模型需要开启 torch compile,70b 还没优化 81 | return (group_gpu_string, command, server.port, server.model_size) 82 | 83 | 84 | def main(): 85 | eno1_ip_address = get_eno1_inet_address() 86 | ServerID = int(eno1_ip_address[-1]) 87 | assert ServerID in [3, 4], "Model should be served on server 3 or 4." 88 | assert str(ServerID) in socket.gethostname(), "ServerID should be in the hostname." 
89 | 90 | print( 91 | f""" 92 | ============================================================ 93 | Cluster: {socket.gethostname()} 94 | IP: {get_eno1_inet_address()} 95 | ============================================================ 96 | """ 97 | ) 98 | 99 | command_infos = [ 100 | get_comond_infos(server) 101 | for server in (Completion_Servers + Embedding_Servers) 102 | if (server.ip == eno1_ip_address) 103 | ] 104 | 105 | def run_with_gpu_check(command_info): 106 | group_id, command, port, model_size = command_info 107 | 108 | while not is_gpu_free(group_id): 109 | print(f"Waiting for GPU(s) {group_id} to be free...") 110 | time.sleep(10) 111 | 112 | print( 113 | f"Serving {model_size}b model on server {ServerID} port {port} with GPUs {group_id}" 114 | ) 115 | free_gpu_ration = min(get_free_memory_ratio(group_id)) 116 | gpu_ultization = None 117 | if free_gpu_ration >= 0.95: 118 | gpu_ultization = 0.80 119 | elif free_gpu_ration >= 0.85: 120 | gpu_ultization = 0.70 121 | else: 122 | raise ValueError("GPU memory is not enough.") 123 | command = command + f" --mem-fraction-static {gpu_ultization} " 124 | print(command) 125 | subprocess.run(command, shell=True) 126 | 127 | with ThreadPoolExecutor(max_workers=len(command_infos)) as executor: 128 | futures = [ 129 | executor.submit(run_with_gpu_check, cmd_info) for cmd_info in command_infos 130 | ] 131 | for future in as_completed(futures): 132 | future.result() 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | --------------------------------------------------------------------------------