├── LICENSE ├── R1-mem-good1.md ├── README.md ├── requirements.txt ├── vllm-gui-server-r1-loggood.py └── vllm-gui-server-r1-mem-good4.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | Requirements from open sourcers 6 | The following uses of this software are prohibited: 7 | - Direct or indirect commercial services (such as API charges, selling model weights, public cloud). 8 | - Integration into commercial products. 9 | - Private cloud deployment services for overall sales. 10 | 11 | 12 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 13 | 14 | 1. Definitions. 15 | 16 | "License" shall mean the terms and conditions for use, reproduction, 17 | and distribution as defined by Sections 1 through 9 of this document. 18 | 19 | "Licensor" shall mean the copyright owner or entity authorized by 20 | the copyright owner that is granting the License. 21 | 22 | "Legal Entity" shall mean the union of the acting entity and all 23 | other entities that control, are controlled by, or are under common 24 | control with that entity. For the purposes of this definition, 25 | "control" means (i) the power, direct or indirect, to cause the 26 | direction or management of such entity, whether by contract or 27 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 28 | outstanding shares, or (iii) beneficial ownership of such entity. 29 | 30 | "You" (or "Your") shall mean an individual or Legal Entity 31 | exercising permissions granted by this License. 32 | 33 | "Source" form shall mean the preferred form for making modifications, 34 | including but not limited to software source code, documentation 35 | source, and configuration files. 36 | 37 | "Object" form shall mean any form resulting from mechanical 38 | transformation or translation of a Source form, including but 39 | not limited to compiled object code, generated documentation, 40 | and conversions to other media types. 41 | 42 | "Work" shall mean the work of authorship, whether in Source or 43 | Object form, made available under the License, as indicated by a 44 | copyright notice that is included in or attached to the work 45 | (an example is provided in the Appendix below). 46 | 47 | "Derivative Works" shall mean any work, whether in Source or Object 48 | form, that is based on (or derived from) the Work and for which the 49 | editorial revisions, annotations, elaborations, or other modifications 50 | represent, as a whole, an original work of authorship. For the purposes 51 | of this License, Derivative Works shall not include works that remain 52 | separable from, or merely link (or bind by name) to the interfaces of, 53 | the Work and Derivative Works thereof. 54 | 55 | "Contribution" shall mean any work of authorship, including 56 | the original version of the Work and any modifications or additions 57 | to that Work or Derivative Works thereof, that is intentionally 58 | submitted to Licensor for inclusion in the Work by the copyright owner 59 | or by an individual or Legal Entity authorized to submit on behalf of 60 | the copyright owner. 
For the purposes of this definition, "submitted" 61 | means any form of electronic, verbal, or written communication sent 62 | to the Licensor or its representatives, including but not limited to 63 | communication on electronic mailing lists, source code control systems, 64 | and issue tracking systems that are managed by, or on behalf of, the 65 | Licensor for the purpose of discussing and improving the Work, but 66 | excluding communication that is conspicuously marked or otherwise 67 | designated in writing by the copyright owner as "Not a Contribution." 68 | 69 | "Contributor" shall mean Licensor and any individual or Legal Entity 70 | on behalf of whom a Contribution has been received by Licensor and 71 | subsequently incorporated within the Work. 72 | 73 | 2. Grant of Copyright License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | copyright license to reproduce, prepare Derivative Works of, 77 | publicly display, publicly perform, sublicense, and distribute the 78 | Work and such Derivative Works in Source or Object form. 79 | 80 | 3. Grant of Patent License. Subject to the terms and conditions of 81 | this License, each Contributor hereby grants to You a perpetual, 82 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 83 | (except as stated in this section) patent license to make, have made, 84 | use, offer to sell, sell, import, and otherwise transfer the Work, 85 | where such license applies only to those patent claims licensable 86 | by such Contributor that are necessarily infringed by their 87 | Contribution(s) alone or by combination of their Contribution(s) 88 | with the Work to which such Contribution(s) was submitted. If You 89 | institute patent litigation against any entity (including a 90 | cross-claim or counterclaim in a lawsuit) alleging that the Work 91 | or a Contribution incorporated within the Work constitutes direct 92 | or contributory patent infringement, then any patent licenses 93 | granted to You under this License for that Work shall terminate 94 | as of the date such litigation is filed. 95 | 96 | 4. Redistribution. 
You may reproduce and distribute copies of the 97 | Work or Derivative Works thereof in any medium, with or without 98 | modifications, and in Source or Object form, provided that You 99 | meet the following conditions: 100 | 101 | (a) You must give any other recipients of the Work or 102 | Derivative Works a copy of this License; and 103 | 104 | (b) You must cause any modified files to carry prominent notices 105 | stating that You changed the files; and 106 | 107 | (c) You must retain, in the Source form of any Derivative Works 108 | that You distribute, all copyright, patent, trademark, and 109 | attribution notices from the Source form of the Work, 110 | excluding those notices that do not pertain to any part of 111 | the Derivative Works; and 112 | 113 | (d) If the Work includes a "NOTICE" text file as part of its 114 | distribution, then any Derivative Works that You distribute must 115 | include a readable copy of the attribution notices contained 116 | within such NOTICE file, excluding those notices that do not 117 | pertain to any part of the Derivative Works, in at least one 118 | of the following places: within a NOTICE text file distributed 119 | as part of the Derivative Works; within the Source form or 120 | documentation, if provided along with the Derivative Works; or, 121 | within a display generated by the Derivative Works, if and 122 | wherever such third-party notices normally appear. The contents 123 | of the NOTICE file are for informational purposes only and 124 | do not modify the License. You may add Your own attribution 125 | notices within Derivative Works that You distribute, alongside 126 | or as an addendum to the NOTICE text from the Work, provided 127 | that such additional attribution notices cannot be construed 128 | as modifying the License. 129 | 130 | You may add Your own copyright statement to Your modifications and 131 | may provide additional or different license terms and conditions 132 | for use, reproduction, or distribution of Your modifications, or 133 | for any such Derivative Works as a whole, provided Your use, 134 | reproduction, and distribution of the Work otherwise complies with 135 | the conditions stated in this License. 136 | 137 | 5. Submission of Contributions. Unless You explicitly state otherwise, 138 | any Contribution intentionally submitted for inclusion in the Work 139 | by You to the Licensor shall be under the terms and conditions of 140 | this License, without any additional terms or conditions. 141 | Notwithstanding the above, nothing herein shall supersede or modify 142 | the terms of any separate license agreement you may have executed 143 | with Licensor regarding such Contributions. 144 | 145 | 6. Trademarks. This License does not grant permission to use the trade 146 | names, trademarks, service marks, or product names of the Licensor, 147 | except as required for reasonable and customary use in describing the 148 | origin of the Work and reproducing the content of the NOTICE file. 149 | 150 | 7. Disclaimer of Warranty. Unless required by applicable law or 151 | agreed to in writing, Licensor provides the Work (and each 152 | Contributor provides its Contributions) on an "AS IS" BASIS, 153 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 154 | implied, including, without limitation, any warranties or conditions 155 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 156 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 157 | appropriateness of using or redistributing the Work and assume any 158 | risks associated with Your exercise of permissions under this License. 159 | 160 | 8. Limitation of Liability. In no event and under no legal theory, 161 | whether in tort (including negligence), contract, or otherwise, 162 | unless required by applicable law (such as deliberate and grossly 163 | negligent acts) or agreed to in writing, shall any Contributor be 164 | liable to You for damages, including any direct, indirect, special, 165 | incidental, or consequential damages of any character arising as a 166 | result of this License or out of the use or inability to use the 167 | Work (including but not limited to damages for loss of goodwill, 168 | work stoppage, computer failure or malfunction, or any and all 169 | other commercial damages or losses), even if such Contributor 170 | has been advised of the possibility of such damages. 171 | 172 | 9. Accepting Warranty or Additional Liability. While redistributing 173 | the Work or Derivative Works thereof, You may choose to offer, 174 | and charge a fee for, acceptance of support, warranty, indemnity, 175 | or other liability obligations and/or rights consistent with this 176 | License. However, in accepting such obligations, You may act only 177 | on Your own behalf and on Your sole responsibility, not on behalf 178 | of any other Contributor, and only if You agree to indemnify, 179 | defend, and hold each Contributor harmless for any liability 180 | incurred by, or claims asserted against, such Contributor by reason 181 | of your accepting any such warranty or additional liability. 182 | 183 | END OF TERMS AND CONDITIONS 184 | 185 | APPENDIX: How to apply the Apache License to your work. 186 | 187 | To apply the Apache License to your work, attach the following 188 | boilerplate notice, with the fields enclosed by brackets "[]" 189 | replaced with your own identifying information. (Don't include 190 | the brackets!) The text should be enclosed in the appropriate 191 | comment syntax for the file format. We also recommend that a 192 | file or class name and description of purpose be included on the 193 | same "printed page" as the copyright notice for easier 194 | identification within third-party archives. 195 | 196 | Copyright [yyyy] [name of copyright owner] 197 | 198 | Licensed under the Apache License, Version 2.0 (the "License"); 199 | you may not use this file except in compliance with the License. 200 | You may obtain a copy of the License at 201 | 202 | http://www.apache.org/licenses/LICENSE-2.0 203 | 204 | Unless required by applicable law or agreed to in writing, software 205 | distributed under the License is distributed on an "AS IS" BASIS, 206 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 207 | See the License for the specific language governing permissions and 208 | limitations under the License. 
--------------------------------------------------------------------------------
/R1-mem-good1.md:
--------------------------------------------------------------------------------
# Usage Agreement (Non-Commercial Use; Learning and Exploration Welcome)

The following uses of this software are prohibited; if you must use it in these ways, obtain authorization first:
- Direct or indirect commercial services (such as paid APIs, selling model weights, public cloud offerings).
- Integration into commercial products.
- Private-cloud deployment services sold as a whole.

# Author and Contact

Author: 老谭 (Lao Tan)
Email: 10267672@qq.com
Bilibili: https://space.bilibili.com/328484347

# Programs

vllm-gui-server-r1-loggood.py (VRAM-only mode; suited to users with multiple NVIDIA cards with large VRAM)
vllm-gui-server-r1-mem-good4.py (the version with "mem" in its name; supports mixed DRAM + VRAM mode)

# Development Notes: vLLM + VRAM + DRAM

Since DeepSeek appeared, demand for personal and private deployments of the full or distilled DeepSeek models has exploded. Many people do not yet know what they will use the deployment for (I build industry applications, so I do), but they are embracing AI enthusiastically, which is great.
Most people, however, only have consumer-grade GPUs with very limited VRAM, so getting the most out of limited VRAM becomes the real challenge.

vLLM is a solid platform, stronger and faster than Ollama, but stock vLLM does not support mixed-memory model deployment.

Besides adding more GPUs, the other option is to use VRAM and system RAM together. Some call this unified memory (Unified Memory Management), but NVIDIA's hardware requirements for true unified memory are high; an ordinary PC cannot meet them without buying a new machine with at least DDR5 memory.

This project was developed on DDR3 memory: first make sure the model runs at all, then worry about making it run fast.

There is existing software that runs large models in mixed DRAM/VRAM mode, each with its own strengths. This software's characteristics are:
1. The model is first loaded completely into system RAM and then moved from RAM into VRAM, which avoids the errors that can occur when loading directly into VRAM.
2. After loading into RAM, dynamic optimization is applied at runtime.
3. A graphical interface removes the need for complicated command-line operations.
4. Cross-platform: both Windows and Ubuntu are supported.

# VLLM Server Manager (Memory-Optimized Edition)

A high-performance, GUI-based server management system built on vLLM and tuned for large language models. It provides intelligent memory management, multi-GPU support, and VRAM optimization, and is especially suited to deploying large models in resource-constrained environments.

![VLLM Server Manager](https://example.com/path/to/screenshot.png)

## Core Features

- 🚀 Intelligent memory management and CPU offloading
- 💾 Model memory swapping to break through VRAM limits
- 🖥️ Multi-GPU tensor parallelism (1-4 GPU configurations)
- 📊 Real-time GPU and system memory monitoring
- ⚙️ Intelligent parameter recommendation
- 🔄 Support for models in multiple precision formats
- 🛠️ Compatibility with command-line arguments across vLLM versions

## Memory Optimization Highlights

- **Memory swapping**: load models larger than the available GPU VRAM
- **Intelligent memory preallocation**: reduce fragmentation and speed up loading of large models
- **Real-time system resource monitoring**: adjust parameters dynamically to avoid OOM errors
- **CPU offloading**: use system RAM as a cache for model weights

## Interface Guide

### Basic Configuration
- **Model path**: select the local model folder
- **IP address / port**: set the server's listen address
- **GPU count**: number of GPUs used for inference
- **VRAM ratio**: memory utilization per GPU (0.0-1.0)
- **Max tokens**: maximum number of tokens per batch
- **Max sequence length**: largest supported context window

### KV Cache Configuration
- **Cache precision**: numeric type of the KV cache (float16/float32)
- **Block size**: number of tokens per cache block
- **Max blocks**: upper limit on the blocks allocated per GPU
- **Dynamic scaling**: enable scaling optimization across batches

### Memory Optimization Settings
- **CPU offload size**: amount of model data offloaded to CPU memory (GB)
- **Swap space**: size of the disk swap space (GB)
- **Enforce eager**: avoid out-of-memory errors during CUDA graph capture
- **Memory buffer preallocation**: preallocate memory to reduce fragmentation

## Memory and VRAM Sizing Guide

### Model Size Estimates

| Parameters | FP16 size | INT8 size | Minimum GPUs | Recommended GPUs |
|------------|-----------|-----------|--------------|------------------|
| 7B         | ~14GB     | ~7GB      | 16GB         | 1× 24GB          |
| 13B        | ~26GB     | ~13GB     | 2× 24GB      | 1× 32GB          |
| 32B        | ~64GB     | ~32GB     | 2× 40GB      | 1× 80GB          |
| 70B        | ~140GB    | ~70GB     | 2× 80GB      | 4× 80GB          |

### VRAM Usage Breakdown

For a 32B model (FP16), VRAM is allocated roughly as follows:

```
Model weights:              64GB
KV cache (2048 context):    ~2GB
Optimizer state:            not needed for inference
Gradients:                  not needed for inference
Activations:                ~1GB
CUDA kernels:               ~0.5GB
--------------------------
Total:                      ~67.5GB
```

### Memory Swap and CPU Offload Sizing

When memory swapping is enabled, you can estimate the required resources with these formulas:

```
Required GPU VRAM   = model size × (1 - CPU offload ratio) × (1 - VRAM ratio/100)
Required system RAM = model size × CPU offload ratio + buffer (~2GB)
Recommended swap    = model size × 0.2 (about 20% headroom)
```

For example, loading a 70B model (FP16) on an RTX 4090 (24GB):

```
CPU offload:  ~100GB
GPU VRAM:     ~21GB (portion processed on the GPU)
System RAM:   ~120GB
Swap space:   ~28GB
```
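As a quick aid, here is a minimal sketch that simply evaluates the formulas above as written. The function name, defaults, and the example numbers are illustrative only, and the results should be treated as rough planning estimates rather than guarantees:

```python
# Rough resource estimator based on the sizing formulas above (illustrative only).
def estimate_resources(model_size_gb: float, cpu_offload_ratio: float, vram_ratio_pct: float) -> dict:
    """Return rough GPU VRAM / system RAM / swap estimates in GB."""
    required_vram = model_size_gb * (1 - cpu_offload_ratio) * (1 - vram_ratio_pct / 100)
    required_ram = model_size_gb * cpu_offload_ratio + 2.0   # ~2GB buffer
    recommended_swap = model_size_gb * 0.2                   # ~20% headroom
    return {
        "gpu_vram_gb": round(required_vram, 1),
        "system_ram_gb": round(required_ram, 1),
        "swap_gb": round(recommended_swap, 1),
    }

if __name__ == "__main__":
    # Example: a 70B FP16 model (~140GB) with 70% CPU offload and a 90% VRAM ratio.
    print(estimate_resources(140.0, 0.70, 90))
```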
## Quick Start

```bash
# Create a virtual environment
python -m venv vvvip

# Activate it (Windows)
vvvip\Scripts\activate

# Activate it (Linux)
source vvvip/bin/activate

# Install dependencies
pip install -r requirements.txt

# On Ubuntu, tkinter must be installed separately
sudo apt-get install python3-tk

# Start the program
python vllm-gui-server-r1-mem-good4.py
```

## Recommended Configurations

### Consumer GPU (RTX 4090)
- Largest model: 13B (full FP16)
- VRAM ratio: 0.85
- CPU offload: required for larger models
- Recommended settings: use the "推荐设置" (recommended settings) button in the GUI

### Professional GPU (A100-80GB)
- Largest model: 70B (single-card FP16)
- VRAM ratio: 0.9
- Memory swap: optional, for very long contexts
- KV cache: prefer float16

### Multi-GPU (RTX 4090 × 2)
- Largest model: 35B (tensor parallel)
- GPU count: 2
- VRAM ratio: 0.8
- KV cache block size: 16

## Advanced Tips

1. **Loading large models**
   - Enable "enforce eager" to avoid out-of-memory errors during CUDA graph capture
   - Use a lower VRAM ratio (0.75-0.85) to leave headroom for the system

2. **Memory optimization**
   - Enable "memory buffer preallocation" for large models to reduce fragmentation
   - Have at least 2× the model size in system RAM

3. **Performance trade-offs**
   - Increasing the block size reduces cache-management overhead
   - Lowering the max sequence length reduces per-request memory usage

## Troubleshooting

| Problem | Solution |
|---------|----------|
| CUDA OOM error | 1. Lower the VRAM ratio 2. Enable CPU offload 3. Use "recommended settings" |
| Model fails to load | Check that the model path contains the complete weight files |
| Server fails to start | Try the fallback startup method |
| KV cache overflow | Reduce "max tokens" or increase "max blocks" |
| Out of system memory | Enable disk swap space or reduce the CPU offload ratio |

## Support

If you run into problems, open an issue or contact:
- Email: 10267672@qq.com
- [Official documentation](https://api-docs.deepseek.com/)

---

*Note: this edition is specialized for memory optimization and focuses on loading very large models beyond VRAM limits. The parameters in the GUI directly affect model loading and inference performance; set them carefully for your hardware.*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# deepseek-on-vllm-VRAMandDRAM

A vLLM-based, GUI-driven toolkit for deploying large models in mixed VRAM + DRAM mode. The VRAMandDRAM mode is somewhat slower, but it solves the problem of deploying very large models on an ordinary home computer.

# Usage Agreement (Non-Commercial Use; Learning and Exploration Welcome)

The following uses of this software are prohibited; if you must use it in these ways, obtain authorization first:
- Direct or indirect commercial services (such as paid APIs, selling model weights, public cloud offerings).
- Integration into commercial products.
- Private-cloud deployment services sold as a whole.

# Author and Contact

Author: 老谭 (Lao Tan)
Email: 10267672@qq.com
Bilibili: https://space.bilibili.com/328484347

# Programs

vllm-gui-server-r1-loggood.py (VRAM-only mode; suited to users with multiple NVIDIA cards with large VRAM)
vllm-gui-server-r1-mem-good4.py (the version with "mem" in its name; supports mixed DRAM + VRAM mode)

# Development Notes: vLLM + VRAM + DRAM

Since DeepSeek appeared, demand for personal and private deployments of the full or distilled DeepSeek models has exploded. Many people do not yet know what they will use the deployment for (I build industry applications, so I do), but they are embracing AI enthusiastically, which is great.
Most people, however, only have consumer-grade GPUs with very limited VRAM, so getting the most out of limited VRAM becomes the real challenge.

vLLM is a solid platform, stronger and faster than Ollama, but stock vLLM does not support mixed-memory model deployment.

Besides adding more GPUs, the other option is to use VRAM and system RAM together. Some call this unified memory (Unified Memory Management), but NVIDIA's hardware requirements for true unified memory are high; an ordinary PC cannot meet them without buying a new machine with at least DDR5 memory.

This project was developed on DDR3 memory: first make sure the model runs at all, then worry about making it run fast.

There is existing software that runs large models in mixed DRAM/VRAM mode, each with its own strengths. This software's characteristics are:
1. The model is first loaded completely into system RAM and then moved from RAM into VRAM, which avoids the errors that can occur when loading directly into VRAM.
2. After loading into RAM, dynamic optimization is applied at runtime.
3. A graphical interface removes the need for complicated command-line operations.
4. Cross-platform: both Windows and Ubuntu are supported.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# Core dependencies
torch>=2.2.0
vllm>=0.3.3
transformers>=4.38.1
accelerate>=0.27.0
tiktoken>=0.6.0

# GUI dependency
# tkinter (bundled with Python on Windows; on Ubuntu install the system package below)

# System packages - run before installing:
# sudo apt-get update
# sudo apt-get install python3-tk

numpy>=1.26.4
sentencepiece>=0.1.99
tqdm>=4.66.1
fsspec>=2024.2.0
typing-extensions>=4.9.0
psutil>=5.9.8

requests>=2.31.0

# Build tools, if compilation is required:
# sudo apt-get update
# sudo apt-get install python3-dev build-essential
# or: sudo apt install python3.10-dev

# System monitoring
GPUtil>=1.4.0

# The GUI also imports pynvml for GPU stats (pip install nvidia-ml-py3)

# pip install bitsandbytes==0.41.1
--------------------------------------------------------------------------------
/vllm-gui-server-r1-loggood.py:
--------------------------------------------------------------------------------
1 | """
2 | VLLM GUI服务器 - 版本 R1
3 | 最近更新: 2025-03-06
4 
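Example client call (illustrative): once the server is running, it serves vLLM's
OpenAI-compatible HTTP API at http://<ip>:<port>/v1. The snippet below is a minimal
sketch using the `requests` package from requirements.txt; the host, port, and model
name are placeholders (the served model name normally matches the path passed to
`vllm serve` and can be listed via GET /v1/models):

    import requests

    api_base = "http://127.0.0.1:8000/v1"          # host/port as configured in the GUI
    model = requests.get(f"{api_base}/models").json()["data"][0]["id"]
    resp = requests.post(
        f"{api_base}/chat/completions",
        json={
            "model": model,
            "messages": [{"role": "user", "content": "Hello"}],
            "max_tokens": 128,
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])

Note that a plain GET on /v1 itself returns 404 by design; use the POST endpoints above.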
| 5 | 更新内容: 6 | 1. 修复了内存交换参数设置,确保与VLLM最新文档一致 7 | 2. 修正了--swap-space参数,使用正确的GiB单位 8 | 3. 更新了CPU卸载参数,使用--cpu-offload-gb替代旧版参数 9 | 4. 改进了参数检测逻辑,支持不同版本的VLLM 10 | 5. 增强了备用启动方法,提供更多参数组合选项 11 | 6. 修复了model_size变量未定义的问题 12 | 7. 添加了--enforce-eager参数,避免CUDA图捕获阶段的内存不足 13 | 8. 优化了update_config函数,确保配置参数正确应用 14 | 9. 改进了推荐设置功能,根据模型大小和硬件条件提供更合理的配置 15 | 16 | 注意: 此版本支持VLLM的内存交换功能,允许加载超出GPU显存大小的模型 17 | 18 | 预期效果 19 | 这些优化应该能够显著提高您的LLM推理速度,特别是: 20 | 多通道加载器优化应该能减轻DDR3内存的带宽限制 21 | KV缓存优化应该能提高缓存命中率,减少重复计算 22 | GPU利用率优化应该能让GPU发挥更大作用 23 | 启动参数优化应该能更好地配置vLLM服务器 24 | 如果您在使用过程中发现任何问题,或者需要进一步的优化,请随时告诉我 25 | """ 26 | 27 | import tkinter as tk 28 | from tkinter import ttk, filedialog, messagebox 29 | import socket 30 | import json 31 | import threading 32 | from vllm import AsyncLLMEngine, SamplingParams 33 | import subprocess 34 | import GPUtil 35 | import time 36 | import os 37 | import torch 38 | import psutil 39 | import mmap 40 | import sys 41 | from datetime import datetime 42 | import re 43 | import pynvml 44 | 45 | class VLLMServerGUI: 46 | def __init__(self, master): 47 | self.master = master 48 | master.title("VLLM-DRAM-VRAM Server Manager") 49 | 50 | # 配置参数存储 51 | self.config = { 52 | 'model_path': '', 53 | 'ip': self.get_local_ip(), 54 | 'port': 8000, 55 | 'gpu_count': 1, 56 | 'mem_ratio': 95, # 提高显存使用率 57 | 'max_tokens': 4096, # 增加最大token数 58 | 'kv_dtype': 'float16', 59 | 'block_size': 16, 60 | 'max_blocks': '', 61 | 'calculate_scales': True, 62 | 'max_model_len': 4096, # 减小max_model_len以节省内存 63 | # 内存交换相关配置 64 | 'enable_memory_offload': True, # 默认启用内存交换 65 | 'memory_offload_ratio': 70, # 增加内存交换比例 66 | 'memory_channels': 4, 67 | 'reserved_memory': 20 68 | } 69 | 70 | # 服务器进程 71 | self.server_process = None 72 | 73 | # API地址 74 | self.api_address = None 75 | 76 | # 主界面布局 77 | self.create_widgets() 78 | 79 | # 加载配置 80 | self.load_config() 81 | 82 | # 专业监控标志 83 | self.monitoring = True 84 | # 启动GPU监控线程 85 | threading.Thread(target=self.update_gpu_stats, daemon=True).start() 86 | 87 | self.api_server_started = False 88 | self.model_loaded = False 89 | self.model_path = "" 90 | self.performance_optimized = False 91 | self.memory_channel_info_displayed = False # 新增标志,用于跟踪内存交换通道信息是否已显示 92 | self.cache_hit_info_displayed = False # 新增标志,用于跟踪缓存命中率信息是否已显示 93 | self.kv_cache_info_displayed = False # 新增标志,用于跟踪KV缓存命中率信息是否已显示 94 | 95 | def get_local_ip(self): 96 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 97 | try: 98 | s.connect(('10.255.255.255', 1)) 99 | IP = s.getsockname()[0] 100 | except Exception: 101 | IP = '127.0.0.1' 102 | finally: 103 | s.close() 104 | return IP 105 | 106 | def create_widgets(self): 107 | # 基本配置区域 108 | self.config_frame = ttk.LabelFrame(self.master, text="基本配置") 109 | self.config_frame.pack(padx=10, pady=5, fill='x') 110 | 111 | # 模型路径 112 | ttk.Label(self.config_frame, text="模型路径:").grid(row=0, column=0) 113 | self.model_path_entry = ttk.Entry(self.config_frame, width=50) 114 | self.model_path_entry.grid(row=0, column=1) 115 | self.model_path_entry.insert(0, self.config['model_path']) 116 | ttk.Button(self.config_frame, text="浏览", command=self.select_model_path).grid(row=0, column=2) 117 | 118 | # 添加保存配置按钮和推荐设置按钮 119 | save_config_button = ttk.Button(self.config_frame, text="保存配置", command=self.save_config_with_message) 120 | save_config_button.grid(row=0, column=3, padx=5) 121 | recommend_button = ttk.Button(self.config_frame, text="推荐设置", command=self.recommend_settings) 122 | recommend_button.grid(row=0, column=4, padx=5) 123 | 124 | # IP地址 125 | ttk.Label(self.config_frame, 
text="IP地址:").grid(row=1, column=0) 126 | self.ip_entry = ttk.Entry(self.config_frame) 127 | self.ip_entry.grid(row=1, column=1, sticky='w') 128 | self.ip_entry.insert(0, self.config['ip']) 129 | 130 | # 端口 131 | ttk.Label(self.config_frame, text="端口:").grid(row=2, column=0) 132 | self.port_entry = ttk.Entry(self.config_frame) 133 | self.port_entry.grid(row=2, column=1, sticky='w') 134 | self.port_entry.insert(0, str(self.config['port'])) 135 | 136 | # GPU数量 137 | ttk.Label(self.config_frame, text="GPU数量:").grid(row=3, column=0) 138 | self.gpu_count_var = tk.StringVar(value=str(self.config['gpu_count'])) 139 | gpu_count_combo = ttk.Combobox(self.config_frame, textvariable=self.gpu_count_var, 140 | values=["1", "2", "3", "4"], width=5) 141 | gpu_count_combo.grid(row=3, column=1, sticky='w') 142 | 143 | # 显存比例 144 | ttk.Label(self.config_frame, text="显存比例(%):").grid(row=4, column=0) 145 | self.mem_ratio_entry = ttk.Entry(self.config_frame) 146 | self.mem_ratio_entry.grid(row=4, column=1, sticky='w') 147 | self.mem_ratio_entry.insert(0, str(self.config['mem_ratio'])) 148 | 149 | # 最大Token数 150 | ttk.Label(self.config_frame, text="最大Token数:").grid(row=5, column=0) 151 | self.max_tokens_var = tk.StringVar(value=str(self.config['max_tokens'])) 152 | ttk.Entry(self.config_frame, textvariable=self.max_tokens_var, width=8).grid(row=5, column=1) 153 | ttk.Label(self.config_frame, text="(回复token数应不小于整体序列长度)", foreground="gray").grid(row=6, column=0, columnspan=2, sticky='w') 154 | 155 | # 最大序列长度 156 | ttk.Label(self.config_frame, text="最大序列长度:").grid(row=5, column=2) 157 | self.max_model_len_var = tk.StringVar(value=str(self.config['max_model_len'])) 158 | max_model_len_combo = ttk.Combobox(self.config_frame, textvariable=self.max_model_len_var, 159 | values=["2048", "4096", "8192", "16384", "32768", "65536"], width=8) 160 | max_model_len_combo.grid(row=5, column=3) 161 | ttk.Label(self.config_frame, text="(请根据硬件条件选择合适参数)", foreground="gray").grid(row=6, column=2, columnspan=2, sticky='w') 162 | 163 | # KV缓存配置 164 | cache_frame = ttk.LabelFrame(self.config_frame, text="KV缓存配置") 165 | cache_frame.grid(row=7, column=0, columnspan=3, sticky="ew", pady=5) 166 | 167 | # 缓存精度 168 | ttk.Label(cache_frame, text="缓存精度:").grid(row=0, column=0) 169 | self.kv_dtype_var = tk.StringVar(value=self.config['kv_dtype']) 170 | ttk.Combobox(cache_frame, textvariable=self.kv_dtype_var, 171 | values=["float16", "float32"], width=8).grid(row=0, column=1) 172 | 173 | # 缓存块大小 174 | ttk.Label(cache_frame, text="块大小(tokens):").grid(row=0, column=2) 175 | self.block_size_var = tk.StringVar(value=str(self.config['block_size'])) 176 | ttk.Entry(cache_frame, textvariable=self.block_size_var, width=8).grid(row=0, column=3) 177 | 178 | # 最大缓存块数 179 | ttk.Label(cache_frame, text="最大块数:").grid(row=1, column=0) 180 | self.max_blocks_var = tk.StringVar(value=str(self.config['max_blocks'])) 181 | ttk.Entry(cache_frame, textvariable=self.max_blocks_var, width=8).grid(row=1, column=1) 182 | ttk.Label(cache_frame, text="(留空为自动)").grid(row=1, column=2) 183 | 184 | # 动态缩放选项 185 | self.calculate_scales_var = tk.BooleanVar(value=self.config['calculate_scales']) 186 | ttk.Checkbutton(cache_frame, text="启用动态缩放", 187 | variable=self.calculate_scales_var).grid(row=1, column=3) 188 | 189 | # 添加高级性能设置区域 190 | self.create_advanced_settings() 191 | 192 | # 监控面板 193 | monitor_frame = ttk.LabelFrame(self.master, text="GPU监控") 194 | monitor_frame.pack(padx=10, pady=5, fill='both', expand=True) 195 | 196 | # GPU状态显示 197 | columns = ('GPU', '显存使用率', 
'GPU使用率', '温度', '功耗', 'KV缓存命中率') 198 | self.gpu_tree = ttk.Treeview(monitor_frame, columns=columns, show='headings') 199 | for col in columns: 200 | self.gpu_tree.heading(col, text=col) 201 | self.gpu_tree.column(col, width=100) 202 | self.gpu_tree.pack(fill='both', expand=True) 203 | 204 | # 状态显示区域 205 | self.status_text = tk.Text(monitor_frame, height=10) 206 | self.status_text.pack(fill='both') 207 | 208 | # 服务器控制按钮 209 | button_frame = ttk.Frame(self.config_frame) 210 | button_frame.grid(row=8, column=0, columnspan=3, pady=5) 211 | ttk.Button(button_frame, text="启动服务器", command=self.start_server).grid(row=0, column=0, padx=5) 212 | ttk.Button(button_frame, text="停止服务器", command=self.stop_server).grid(row=0, column=1, padx=5) 213 | 214 | # API地址显示 215 | self.api_label = ttk.Label(self.config_frame, text="API地址:") 216 | self.api_label.grid(row=9, column=0, columnspan=3) 217 | 218 | # 添加内存交换配置框架 219 | offload_frame = ttk.LabelFrame(self.config_frame, text="内存交换配置") 220 | offload_frame.grid(row=10, column=0, columnspan=3, sticky="ew", pady=5) 221 | 222 | # 启用内存交换选项 223 | self.enable_offload_var = tk.BooleanVar(value=self.config['enable_memory_offload']) 224 | ttk.Checkbutton(offload_frame, text="启用内存交换", 225 | variable=self.enable_offload_var).grid(row=0, column=0) 226 | 227 | # 内存通道数量 228 | ttk.Label(offload_frame, text="内存通道数:").grid(row=0, column=1) 229 | self.memory_channels_var = tk.StringVar(value=str(self.config['memory_channels'])) 230 | ttk.Combobox(offload_frame, textvariable=self.memory_channels_var, 231 | values=["2", "4", "8", "16"], width=5).grid(row=0, column=2) 232 | 233 | # 内存交换比例 234 | ttk.Label(offload_frame, text="内存交换比例(%):").grid(row=1, column=0) 235 | self.memory_offload_ratio_var = tk.StringVar(value=str(self.config['memory_offload_ratio'])) 236 | ttk.Entry(offload_frame, textvariable=self.memory_offload_ratio_var, width=5).grid(row=1, column=1) 237 | 238 | # 预留系统内存比例 239 | ttk.Label(offload_frame, text="系统内存预留(%):").grid(row=1, column=2) 240 | self.reserved_memory_var = tk.StringVar(value=str(self.config['reserved_memory'])) 241 | ttk.Entry(offload_frame, textvariable=self.reserved_memory_var, width=5).grid(row=1, column=3) 242 | 243 | # 添加高级说明 244 | ttk.Label(offload_frame, text="(启用后可加载超出显存的大模型,但会降低推理速度)", 245 | foreground="gray").grid(row=2, column=0, columnspan=4, sticky='w') 246 | 247 | # 添加"检查兼容性"按钮 248 | self.check_compatibility_button = ttk.Button( 249 | self.config_frame, 250 | text="检查兼容性", 251 | command=self.check_model_compatibility 252 | ) 253 | self.check_compatibility_button.grid(row=1, column=3, padx=5, pady=5, sticky="w") 254 | 255 | # 添加性能监控面板 256 | self.add_performance_monitoring() 257 | 258 | def select_model_path(self): 259 | path = filedialog.askdirectory() 260 | if path: 261 | self.config['model_path'] = path 262 | self.model_path_entry.delete(0, tk.END) # 清除当前内容 263 | self.model_path_entry.insert(0, path) # 插入新路径 264 | 265 | def start_server(self): 266 | """启动VLLM服务器""" 267 | if not self.config['model_path']: 268 | messagebox.showerror("错误", "请先选择模型路径") 269 | return 270 | 271 | if hasattr(self, 'server_process') and self.server_process and self.server_process.poll() is None: 272 | messagebox.showinfo("提示", "服务器已经在运行") 273 | return 274 | 275 | # 检查模型兼容性 276 | if not self.check_model_compatibility(): 277 | if not messagebox.askokcancel("警告", "模型兼容性检查发现潜在问题,是否继续启动服务器?"): 278 | return 279 | 280 | # 清理GPU内存 281 | self.clean_gpu_memory() 282 | 283 | # 设置环境变量以避免内存碎片问题 284 | env = os.environ.copy() 285 | 286 | # 应用高级设置中的CUDA内存分块大小 287 | 
cuda_split_size = self.config.get('advanced_cuda_split_size', 128) # 默认128MB 288 | env['PYTORCH_CUDA_ALLOC_CONF'] = f'expandable_segments:True,max_split_size_mb:{cuda_split_size}' 289 | self.status_text.insert(tk.END, f"CUDA内存分块大小: {cuda_split_size}MB\n") 290 | 291 | env['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in range(self.config['gpu_count'])]) 292 | env['OMP_NUM_THREADS'] = '4' # 限制OpenMP线程数 293 | env['MKL_NUM_THREADS'] = '4' # 限制MKL线程数 294 | 295 | # 添加性能优化环境变量 296 | env['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' # 优化CUDA连接 297 | env['NCCL_P2P_DISABLE'] = '1' # 对于单GPU,禁用P2P可能提高性能 298 | env['CUDA_AUTO_BOOST'] = '1' # 启用GPU自动提升频率 299 | env['VLLM_USE_ASYNC_CUDA_MALLOC'] = '1' # 使用异步CUDA内存分配 300 | # 获取系统内存大小 301 | system_memory = psutil.virtual_memory().total / (1024 * 1024 * 1024) # GB 302 | # 根据硬件情况选择是否启用内存高效线性层 303 | if system_memory > 16: # 只有在系统内存足够时才启用 304 | env['VLLM_ENABLE_MEMORY_EFFICIENT_LINEAR'] = '1' # 启用内存高效线性层 305 | 306 | # 记录启动信息 307 | self.status_text.insert(tk.END, "\n===== 启动服务器 =====\n") 308 | self.status_text.insert(tk.END, f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") 309 | self.status_text.insert(tk.END, f"模型路径: {self.config['model_path']}\n") 310 | self.status_text.insert(tk.END, f"GPU数量: {self.config['gpu_count']}\n") 311 | self.status_text.insert(tk.END, f"显存比例: {self.config['mem_ratio']}%\n") 312 | 313 | # 检查GPU监控线程 314 | if not self.monitoring: 315 | self.monitoring = True 316 | threading.Thread(target=self.update_gpu_stats, daemon=True).start() 317 | 318 | # 保存配置 319 | self.save_config() 320 | 321 | # 预先分配内存空间,防止运行时内存不足 322 | self.preallocate_memory_buffer() 323 | 324 | # 初始化KV缓存监控 325 | self.kv_cache_hits = 0 326 | self.kv_cache_misses = 0 327 | 328 | # 检查是否需要内存交换 329 | if self.config['enable_memory_offload']: 330 | try: 331 | self.status_text.insert(tk.END, "正在设置内存交换...\n") 332 | 333 | # 计算模型大小 334 | model_size = self.estimate_model_size() 335 | 336 | # 获取可用显存 337 | available_vram = self.get_available_vram(use_ratio=self.config['mem_ratio'] / 100) 338 | 339 | self.status_text.insert(tk.END, f"模型大小: {model_size:.2f}GB, 可用显存: {available_vram:.2f}GB\n") 340 | 341 | # 计算需要卸载的内存大小 342 | offload_ratio = self.config['memory_offload_ratio'] / 100 343 | initial_offload_size = model_size * offload_ratio 344 | 345 | self.status_text.insert(tk.END, f"将卸载 {initial_offload_size:.2f}GB 到系统内存 (比例: {self.config['memory_offload_ratio']}%)\n") 346 | 347 | # 设置内存映射文件 348 | self.setup_memory_offload(model_size, offload_ratio) 349 | 350 | # 检查VLLM支持的参数 351 | self.status_text.insert(tk.END, "检查VLLM支持的参数...\n") 352 | 353 | # 计算可用系统内存(考虑预留比例) 354 | available_memory = self.get_available_system_memory() 355 | reserved_ratio = self.config['reserved_memory'] / 100 356 | safe_memory = available_memory * (1 - reserved_ratio) 357 | 358 | # 获取实际分配的内存大小 359 | actual_offload_size = 0 360 | if hasattr(self, 'mm') and self.mm: 361 | try: 362 | # 获取内存映射文件大小 363 | map_file = os.path.join(os.getcwd(), "model_offload", "model_offload.bin") 364 | if os.path.exists(map_file): 365 | actual_offload_size = os.path.getsize(map_file) / (1024 * 1024 * 1024) 366 | self.status_text.insert(tk.END, f"实际分配的内存映射大小: {actual_offload_size:.2f}GB\n") 367 | except Exception as e: 368 | self.status_text.insert(tk.END, f"获取内存映射大小失败: {str(e)}\n") 369 | 370 | # 动态调整所需的内存大小 371 | min_required_size = min(18, model_size * 0.8) # 至少需要模型大小的80% 372 | 373 | if actual_offload_size < min_required_size: 374 | self.status_text.insert(tk.END, f"警告: 实际分配的内存映射大小不足{min_required_size:.1f}GB,可能无法加载模型\n") 375 | if not 
messagebox.askokcancel("警告", 376 | f"实际分配的内存映射大小仅为{actual_offload_size:.2f}GB,建议至少{min_required_size:.1f}GB。\n是否继续?"): 377 | return False 378 | 379 | # 计算合理的交换空间大小 - 根据模型大小动态调整 380 | # 对于小模型(<10GB),使用较小的交换空间 381 | if model_size < 10: 382 | swap_size = max(2.0, model_size * 0.1) 383 | else: 384 | # 对于大模型,使用更大的交换空间 385 | swap_size = max(4.0, model_size * 0.15) 386 | 387 | # 确保不超过安全内存的20% 388 | swap_size = min(swap_size, safe_memory * 0.2) 389 | 390 | # 计算合理的CPU卸载大小 - 根据模型大小和可用显存动态调整 391 | available_vram = self.get_available_vram(use_ratio=self.config['mem_ratio'] / 100) 392 | 393 | # 如果模型大小超过可用显存,计算需要卸载的部分 394 | if model_size > available_vram: 395 | # 需要卸载的大小 = 模型大小 - 可用显存 + 额外缓冲区(1GB) 396 | min_offload_size = model_size - available_vram + 1.0 397 | # 确保至少卸载模型的60% 398 | offload_size = max(min_offload_size, model_size * 0.6) 399 | else: 400 | # 如果模型可以完全放入显存,仍然卸载一部分以提高稳定性 401 | offload_size = model_size * 0.3 402 | 403 | # 确保不超过安全内存的70% 404 | offload_size = min(offload_size, safe_memory * 0.7) 405 | 406 | # 计算总内存使用 407 | total_mem_usage = swap_size + offload_size 408 | mem_usage_ratio = total_mem_usage / safe_memory * 100 409 | 410 | self.status_text.insert(tk.END, f"可用系统内存: {available_memory:.2f}GB, 安全内存: {safe_memory:.2f}GB\n") 411 | self.status_text.insert(tk.END, f"计算交换空间: {swap_size:.2f}GB, CPU卸载: {offload_size:.2f}GB\n") 412 | self.status_text.insert(tk.END, f"总内存使用: {total_mem_usage:.2f}GB (安全内存的{mem_usage_ratio:.1f}%)\n") 413 | 414 | # 确保max_num_batched_tokens大于等于max_num_seqs 415 | max_tokens = max(self.config['max_tokens'], 256) # 确保至少为256 416 | 417 | # 构建命令 418 | cmd = [ 419 | 'vllm', 'serve', 420 | self.config['model_path'], 421 | '--host', self.config['ip'], 422 | '--port', str(self.config['port']), 423 | '--tensor-parallel-size', str(self.config['gpu_count']), 424 | '--gpu-memory-utilization', str(self.config['mem_ratio'] / 100), 425 | '--max-num-batched-tokens', str(max_tokens), 426 | '--block-size', str(self.config['block_size']), 427 | '--max-model-len', str(self.config['max_model_len']), 428 | '--dtype', 'half' # 强制使用half精度 429 | ] 430 | 431 | # 添加最大块数(如果指定) 432 | if self.config['max_blocks']: 433 | cmd.extend(['--num-gpu-blocks', self.config['max_blocks']]) 434 | 435 | # 添加交换空间参数 436 | swap_param = f"{swap_size:.2f}" # 移除GiB单位,只使用数字 437 | cmd.extend(['--swap-space', swap_param]) 438 | self.status_text.insert(tk.END, f"添加交换空间参数: --swap-space {swap_param} (GB)\n") 439 | 440 | # 添加CPU卸载参数 441 | offload_param = f"{offload_size:.2f}" # 移除GB单位,只使用数字 442 | cmd.extend(['--cpu-offload-gb', offload_param]) 443 | self.status_text.insert(tk.END, f"添加CPU卸载参数: --cpu-offload-gb {offload_param} (GB)\n") 444 | 445 | # 添加强制使用eager模式,避免CUDA图捕获阶段的内存不足 446 | cmd.append('--enforce-eager') 447 | self.status_text.insert(tk.END, "添加强制eager模式参数: --enforce-eager\n") 448 | 449 | self.status_text.insert(tk.END, f"已启用内存交换,可用CPU内存: {safe_memory:.2f}GB\n") 450 | 451 | # 记录完整命令 452 | cmd_str = ' '.join(cmd) 453 | self.status_text.insert(tk.END, f"完整命令: {cmd_str}\n") 454 | self.status_text.see(tk.END) 455 | 456 | except Exception as e: 457 | self.status_text.insert(tk.END, f"设置内存交换时出错: {str(e)}\n") 458 | import traceback 459 | self.status_text.insert(tk.END, traceback.format_exc()) 460 | if not messagebox.askokcancel("错误", 461 | f"设置内存交换时出错: {str(e)}\n是否继续启动服务器(不使用内存交换)?"): 462 | return 463 | 464 | # 如果内存交换设置失败,使用基本命令 465 | max_tokens = max(self.config['max_tokens'], 256) # 确保至少为256 466 | cmd = [ 467 | 'vllm', 'serve', 468 | self.config['model_path'], 469 | '--host', self.config['ip'], 470 | '--port', 
str(self.config['port']), 471 | '--tensor-parallel-size', str(self.config['gpu_count']), 472 | '--gpu-memory-utilization', str(self.config['mem_ratio'] / 100), 473 | '--max-num-batched-tokens', str(max_tokens), 474 | '--block-size', str(self.config['block_size']), 475 | '--max-model-len', str(self.config['max_model_len']), 476 | '--dtype', 'half', # 强制使用half精度 477 | '--enforce-eager' # 添加强制使用eager模式,避免CUDA图捕获阶段的内存不足 478 | ] 479 | else: 480 | # 如果不需要内存交换,使用基本命令 481 | max_tokens = max(self.config['max_tokens'], 256) # 确保至少为256 482 | cmd = [ 483 | 'vllm', 'serve', 484 | self.config['model_path'], 485 | '--host', self.config['ip'], 486 | '--port', str(self.config['port']), 487 | '--tensor-parallel-size', str(self.config['gpu_count']), 488 | '--gpu-memory-utilization', str(self.config['mem_ratio'] / 100), 489 | '--max-num-batched-tokens', str(max_tokens), 490 | '--block-size', str(self.config['block_size']), 491 | '--max-model-len', str(self.config['max_model_len']), 492 | '--dtype', 'half', # 强制使用half精度 493 | '--enforce-eager' # 添加强制使用eager模式,避免CUDA图捕获阶段的内存不足 494 | ] 495 | 496 | # 添加性能优化参数 497 | performance_args = [ 498 | '--max-num-seqs', '32', # 增加最大序列数 499 | '--disable-log-stats', # 禁用统计日志,减少开销 500 | '--kv-cache-dtype', 'auto', # 使用自动选择KV缓存精度 501 | '--trust-remote-code' # 信任远程代码,支持更多模型 502 | ] 503 | 504 | # 应用高级设置中的批处理大小 505 | batch_size = self.config.get('advanced_batch_size', 16) # 默认16 506 | performance_args.extend(['--max-num-batched-tokens', str(max(batch_size * 256, max_tokens))]) 507 | self.status_text.insert(tk.END, f"批处理大小: {batch_size}\n") 508 | 509 | # 添加内存带宽优化参数 510 | if int(self.block_size_var.get()) < 32: 511 | # 如果块大小小于32,建议增加到32以提高内存带宽利用率 512 | self.status_text.insert(tk.END, f"注意: 当前块大小({self.block_size_var.get()})较小,可能影响内存带宽利用率\n") 513 | self.status_text.insert(tk.END, "建议使用更大的块大小(32-64)以提高内存带宽利用率\n") 514 | 515 | # 检查是否支持Flash Attention 516 | if self.check_flash_attention_support(): 517 | performance_args.append('--enable-chunked-prefill') 518 | self.status_text.insert(tk.END, "启用分块预填充优化\n") 519 | 520 | # 添加性能参数到命令 521 | cmd.extend(performance_args) 522 | 523 | # 异步启动服务器 524 | try: 525 | self.status_text.insert(tk.END, "正在启动服务器进程...\n") 526 | 527 | self.server_process = subprocess.Popen( 528 | cmd, 529 | stdout=subprocess.PIPE, 530 | stderr=subprocess.STDOUT, 531 | env=env # 使用修改后的环境变量 532 | ) 533 | 534 | # 等待一小段时间,检查进程是否立即退出 535 | time.sleep(1) 536 | if self.server_process.poll() is not None: 537 | # 进程已退出,获取输出 538 | output, _ = self.server_process.communicate() 539 | error_msg = f"启动服务器失败: {output.decode()}" 540 | self.status_text.insert(tk.END, f"{error_msg}\n") 541 | 542 | # 尝试使用备用方法 543 | return self.fallback_start_server(error_msg) 544 | 545 | # 启动监控线程 546 | threading.Thread(target=self.monitor_server_output).start() 547 | 548 | # 更新API地址 549 | # 说明:GET /v1返回404是正常现象,请使用支持POST的具体API endpoint进行请求 550 | api_base = f"http://{self.config['ip']}:{self.config['port']}/v1" 551 | self.api_label.config(text=f"API地址: {api_base}") 552 | self.status_text.insert(tk.END, f"\n服务器启动中...\nAPI地址: {api_base}\n") 553 | self.status_text.see(tk.END) 554 | 555 | return True 556 | 557 | except Exception as e: 558 | error_msg = f"启动服务器失败: {str(e)}" 559 | self.status_text.insert(tk.END, f"{error_msg}\n") 560 | import traceback 561 | self.status_text.insert(tk.END, traceback.format_exc()) 562 | 563 | # 尝试使用备用方法 564 | return self.fallback_start_server(error_msg) 565 | 566 | def stop_server(self): 567 | try: 568 | # 先停止所有监控线程 569 | self.monitoring = False 570 | # 等待一小段时间让线程有机会退出 571 | 
time.sleep(0.5) 572 | 573 | if hasattr(self, 'server_process') and self.server_process and self.server_process.poll() is None: 574 | self.server_process.terminate() 575 | try: 576 | self.server_process.wait(timeout=5) 577 | self.status_text.insert(tk.END, "\n服务器已停止.\n") 578 | except subprocess.TimeoutExpired: 579 | self.status_text.insert(tk.END, "\n停止服务器超时,但服务器可能已停止.\n") 580 | else: 581 | self.status_text.insert(tk.END, "\n服务器未在运行.\n") 582 | 583 | # 清理内存映射资源 584 | self.cleanup_memory_offload() 585 | 586 | except Exception as e: 587 | messagebox.showerror("错误", f"停止服务器失败: {str(e)}") 588 | finally: 589 | # 确保监控标志被设置为False 590 | self.monitoring = False 591 | # 禁用自动调优 592 | if hasattr(self, 'auto_tune_var'): 593 | self.auto_tune_var.set(False) 594 | self.api_label.config(text="API地址: 服务器未启动") 595 | 596 | def cleanup_memory_offload(self): 597 | """清理内存映射资源""" 598 | try: 599 | # 清理内存缓冲区 600 | self.cleanup_memory_buffer() 601 | 602 | # 清理多通道加载器 603 | if hasattr(self, 'multi_channel_loader') and self.multi_channel_loader is not None: 604 | try: 605 | # 调用加载器的close方法 606 | if hasattr(self.multi_channel_loader, 'close'): 607 | self.multi_channel_loader.close() 608 | self.multi_channel_loader = None 609 | self.status_text.insert(tk.END, "多通道加载器已关闭\n") 610 | except Exception as e: 611 | self.status_text.insert(tk.END, f"关闭多通道加载器时出错: {str(e)}\n") 612 | elif hasattr(self, 'channel_loaders'): 613 | # 兼容旧版本的代码 614 | for loader in self.channel_loaders: 615 | if hasattr(loader, 'mm') and loader.mm: 616 | loader.mm.close() 617 | if hasattr(loader, 'mm_file') and loader.mm_file: 618 | loader.mm_file.close() 619 | self.channel_loaders = [] 620 | 621 | # 清理内存映射 622 | if hasattr(self, 'mm') and self.mm: 623 | self.mm.close() 624 | self.mm = None 625 | 626 | if hasattr(self, 'mm_file') and self.mm_file: 627 | self.mm_file.close() 628 | self.mm_file = None 629 | 630 | self.status_text.insert(tk.END, "内存映射资源已释放\n") 631 | except Exception as e: 632 | self.status_text.insert(tk.END, f"释放内存映射资源时出错: {str(e)}\n") 633 | 634 | def monitor_server_output(self): 635 | """监控服务器输出并检测错误""" 636 | error_patterns = [ 637 | # 内存不足错误 638 | (r"CUDA out of memory", "GPU内存不足"), 639 | (r"OutOfMemoryError", "内存不足"), 640 | (r"OOM", "内存不足"), 641 | # 模型加载错误 642 | (r"Error loading model", "模型加载错误"), 643 | (r"Failed to load", "模型加载失败"), 644 | # 参数错误 645 | (r"ValueError", "参数错误"), 646 | (r"TypeError", "类型错误"), 647 | # 权限错误 648 | (r"PermissionError", "权限错误"), 649 | # 网络错误 650 | (r"ConnectionError", "连接错误"), 651 | (r"Address already in use", "端口已被占用"), 652 | # 通用错误 653 | (r"Error:", "发生错误"), 654 | (r"Exception:", "发生异常"), 655 | (r"Traceback", "程序崩溃") 656 | ] 657 | 658 | # Token生成模式 659 | token_pattern = r"Processed (\d+) tokens" 660 | 661 | # 记录启动时间 662 | start_time = time.time() 663 | error_detected = False 664 | error_message = "" 665 | server_started = False 666 | show_process_indicator = False 667 | last_indicator_time = time.time() 668 | api_info_displayed = False 669 | 670 | # 显示基础信息 671 | self.status_text.insert(tk.END, "开始启动服务器...\n") 672 | 673 | while True: 674 | if not hasattr(self, 'server_process') or self.server_process is None: 675 | self.status_text.insert(tk.END, "服务器进程不存在\n") 676 | break 677 | 678 | if self.server_process.poll() is not None: 679 | self.status_text.insert(tk.END, f"服务器进程已退出,退出码: {self.server_process.poll()}\n") 680 | break 681 | 682 | # 如果API信息已显示,不再显示任何后续日志 683 | if api_info_displayed: 684 | # 只静默监控服务器进程,但不显示任何输出 685 | time.sleep(0.5) 686 | continue 687 | 688 | # 每2秒动态显示一个进度指示器(仅在API信息显示前) 689 | current_time = 
time.time() 690 | if current_time - last_indicator_time > 2 and not server_started and not api_info_displayed: 691 | self.status_text.insert(tk.END, "=====\n") 692 | last_indicator_time = current_time 693 | 694 | try: 695 | output = self.server_process.stdout.readline() 696 | if not output: 697 | time.sleep(0.1) 698 | continue 699 | 700 | output_text = output.decode(errors='replace') 701 | 702 | # 检查是否包含API服务器信息 703 | if ("API server" in output_text or "Uvicorn running on http://" in output_text) and not api_info_displayed: 704 | # 显示API信息 705 | self.status_text.insert(tk.END, output_text) 706 | self.status_text.insert(tk.END, "服务器已成功启动\n") 707 | self.status_text.see(tk.END) 708 | 709 | # 标记服务器已启动且API信息已显示 710 | server_started = True 711 | api_info_displayed = True 712 | 713 | # 服务器成功启动后,静默执行自动性能优化 714 | if not hasattr(self, 'performance_optimized') or not self.performance_optimized: 715 | threading.Thread(target=self.auto_optimize_performance, daemon=True).start() 716 | self.performance_optimized = True 717 | 718 | # 成功显示API信息后,不再显示任何后续日志 719 | continue 720 | 721 | # 如果API信息已显示,不再处理任何输出 722 | if api_info_displayed: 723 | continue 724 | 725 | # 仅显示最关键信息,不显示详细的中间过程 726 | critical_patterns = [ 727 | "API server", "http://", "Model loaded", "model loaded successfully" 728 | ] 729 | 730 | is_critical = any(pattern in output_text.lower() for pattern in critical_patterns) 731 | is_error = any(re.search(pattern, output_text, re.IGNORECASE) for pattern, _ in error_patterns) 732 | 733 | # 只显示关键信息和错误信息 734 | if is_critical or is_error: 735 | self.status_text.insert(tk.END, output_text) 736 | self.status_text.see(tk.END) 737 | 738 | # 检查是否有token生成信息 739 | token_match = re.search(token_pattern, output_text) 740 | if token_match: 741 | tokens = int(token_match.group(1)) 742 | self.update_token_count(tokens) 743 | 744 | # 检查是否包含错误信息 745 | for pattern, error_type in error_patterns: 746 | if re.search(pattern, output_text, re.IGNORECASE): 747 | error_detected = True 748 | error_message = f"{error_type}: {output_text.strip()}" 749 | self.status_text.insert(tk.END, f"检测到错误: {error_type}\n") 750 | break 751 | 752 | # 如果检测到错误,等待一段时间收集更多日志,然后尝试恢复 753 | if error_detected: 754 | # 继续读取一些输出以获取更多错误信息 755 | for _ in range(10): # 读取最多10行额外输出 756 | try: 757 | more_output = self.server_process.stdout.readline() 758 | if more_output: 759 | more_text = more_output.decode(errors='replace') 760 | self.status_text.insert(tk.END, more_text) 761 | error_message += "\n" + more_text.strip() 762 | except: 763 | break 764 | time.sleep(0.1) 765 | 766 | # 如果是内存不足错误,尝试使用备用启动方法 767 | if "内存不足" in error_message: 768 | self.status_text.insert(tk.END, "检测到内存不足错误,尝试使用备用启动方法...\n") 769 | # 停止当前进程 770 | try: 771 | self.server_process.terminate() 772 | self.server_process.wait(timeout=5) 773 | except: 774 | pass 775 | # 尝试使用备用方法启动 776 | self.fallback_start_server(error_message) 777 | return 778 | # 如果是端口被占用,尝试使用不同端口 779 | elif "端口已被占用" in error_message: 780 | self.status_text.insert(tk.END, "检测到端口被占用,尝试使用不同端口...\n") 781 | # 停止当前进程 782 | try: 783 | self.server_process.terminate() 784 | self.server_process.wait(timeout=5) 785 | except: 786 | pass 787 | # 尝试使用不同端口 788 | self.config['port'] += 1 789 | self.status_text.insert(tk.END, f"尝试使用新端口: {self.config['port']}\n") 790 | self.start_server() 791 | return 792 | else: 793 | # 其他错误,显示错误信息并询问用户是否尝试备用方法 794 | if messagebox.askokcancel("错误", f"服务器启动时发生错误:\n{error_message}\n\n是否尝试使用备用方法启动?"): 795 | # 停止当前进程 796 | try: 797 | self.server_process.terminate() 798 | self.server_process.wait(timeout=5) 
799 | except: 800 | pass 801 | # 尝试使用备用方法启动 802 | self.fallback_start_server(error_message) 803 | return 804 | 805 | except Exception as e: 806 | if not api_info_displayed: 807 | self.status_text.insert(tk.END, f"监控服务器输出时出错: {str(e)}\n") 808 | time.sleep(1) 809 | 810 | def update_gpu_stats(self): 811 | while self.monitoring: 812 | try: 813 | gpus = GPUtil.getGPUs() 814 | self.gpu_tree.delete(*self.gpu_tree.get_children()) 815 | for gpu in gpus: 816 | # 使用nvidia-smi获取功耗信息 817 | try: 818 | power_info = subprocess.run( 819 | ['nvidia-smi', f'--id={gpu.id}', '--query-gpu=power.draw', '--format=csv,noheader,nounits'], 820 | capture_output=True, 821 | text=True 822 | ) 823 | power_draw = power_info.stdout.strip() 824 | except: 825 | power_draw = "N/A" 826 | 827 | self.gpu_tree.insert('', 'end', values=( 828 | gpu.id, 829 | f"{gpu.memoryUsed}MB/{gpu.memoryTotal}MB", 830 | f"{gpu.load*100:.1f}%", 831 | f"{gpu.temperature}°C", 832 | f"{power_draw}W" if power_draw and power_draw != "N/A" else "N/A", 833 | "0.0%" # KV缓存命中率暂时不支持 834 | )) 835 | time.sleep(2) 836 | except Exception as e: 837 | self.status_text.insert(tk.END, f"GPU监控错误: {e}\n") 838 | self.status_text.see(tk.END) 839 | time.sleep(5) 840 | 841 | def get_gpu_stats(self): 842 | """获取GPU统计信息,返回字典列表""" 843 | try: 844 | # 使用pynvml库代替执行nvidia-smi命令 845 | pynvml.nvmlInit() 846 | 847 | gpu_count = pynvml.nvmlDeviceGetCount() 848 | gpu_stats = [] 849 | 850 | for i in range(gpu_count): 851 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 852 | 853 | # 获取GPU利用率 854 | utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) 855 | gpu_util = f"{utilization.gpu} %" 856 | mem_util = f"{utilization.memory} %" 857 | 858 | # 获取温度 859 | temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) 860 | 861 | # 获取功耗 862 | try: 863 | power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0 864 | power_draw = f"{power:.1f} W" 865 | except: 866 | power_draw = "N/A" 867 | 868 | gpu_stat = { 869 | 'utilization.gpu': gpu_util, 870 | 'utilization.memory': mem_util, 871 | 'temperature.gpu': f"{temp} C", 872 | 'power.draw': power_draw 873 | } 874 | gpu_stats.append(gpu_stat) 875 | 876 | pynvml.nvmlShutdown() 877 | return gpu_stats 878 | except ImportError: 879 | # 如果pynvml未安装,返回一个模拟的状态信息并记录警告 880 | self.status_text.insert(tk.END, "警告: pynvml未安装,无法获取GPU信息。请执行 pip install nvidia-ml-py3 安装。\n") 881 | # 返回一个包含默认值的字典,避免程序崩溃 882 | return [{'utilization.gpu': '0 %', 'utilization.memory': '0 %', 'temperature.gpu': '0 C', 'power.draw': 'N/A'}] 883 | except Exception as e: 884 | # 记录错误但返回一个空结果集而不是抛出异常 885 | self.status_text.insert(tk.END, f"获取GPU统计信息错误: {str(e)}\n") 886 | return [] 887 | 888 | def load_config(self): 889 | try: 890 | with open('server_config.json', 'r') as f: 891 | loaded_config = json.load(f) 892 | self.config.update(loaded_config) 893 | 894 | # 更新界面上的值 895 | self.model_path_entry.delete(0, tk.END) 896 | self.model_path_entry.insert(0, self.config['model_path']) 897 | 898 | self.ip_entry.delete(0, tk.END) 899 | self.ip_entry.insert(0, self.config['ip']) 900 | 901 | self.port_entry.delete(0, tk.END) 902 | self.port_entry.insert(0, str(self.config['port'])) 903 | 904 | self.gpu_count_var.set(str(self.config['gpu_count'])) 905 | 906 | self.mem_ratio_entry.delete(0, tk.END) 907 | self.mem_ratio_entry.insert(0, str(self.config['mem_ratio'])) 908 | 909 | self.max_tokens_var.set(str(self.config['max_tokens'])) 910 | 911 | self.max_model_len_var.set(str(self.config['max_model_len'])) # 加载max_model_len 912 | 913 | # 加载内存交换配置 914 | if 
'enable_memory_offload' in self.config: 915 | self.enable_offload_var.set(self.config['enable_memory_offload']) 916 | if 'memory_channels' in self.config: 917 | self.memory_channels_var.set(str(self.config['memory_channels'])) 918 | if 'memory_offload_ratio' in self.config: 919 | self.memory_offload_ratio_var.set(str(self.config['memory_offload_ratio'])) 920 | if 'reserved_memory' in self.config: 921 | self.reserved_memory_var.set(str(self.config['reserved_memory'])) 922 | 923 | except FileNotFoundError: 924 | pass 925 | 926 | def save_config(self): 927 | with open('server_config.json', 'w') as f: 928 | json.dump(self.config, f, indent=4) 929 | 930 | def save_config_with_message(self): 931 | # 先调用update_config确保配置已更新 932 | if self.update_config(): 933 | # 保存配置 934 | self.save_config() 935 | messagebox.showinfo("成功", "配置已保存到server_config.json") 936 | 937 | def select_calibrated_model(self): 938 | path = filedialog.askdirectory(title="选择校准模型目录") 939 | if path: 940 | self.calibrated_model_var.set(path) 941 | self.config['calibrated_model'] = path 942 | 943 | def check_fp8_support(self): 944 | try: 945 | if not torch.cuda.is_available(): 946 | return False 947 | capability = torch.cuda.get_device_capability() 948 | # 需要Ampere或更新架构(计算能力 >= 8.0) 949 | return capability[0] >= 8 950 | except Exception as e: 951 | print(f"检查FP8支持失败: {e}") 952 | return False 953 | 954 | def run_calibration(self): 955 | if not self.check_fp8_support(): 956 | messagebox.showerror("错误", "当前GPU不支持FP8量化") 957 | return 958 | 959 | if not self.config['model_path']: 960 | messagebox.showerror("错误", "请先选择模型路径") 961 | return 962 | 963 | # 生成校准脚本 964 | calibration_script = f""" 965 | from datasets import load_dataset 966 | from transformers import AutoModelForCausalLM, AutoTokenizer 967 | from llmcompressor.transformers import oneshot 968 | 969 | # 加载模型 970 | model = AutoModelForCausalLM.from_pretrained("{self.config['model_path']}", 971 | device_map="auto", 972 | torch_dtype="auto") 973 | tokenizer = AutoTokenizer.from_pretrained("{self.config['model_path']}") 974 | 975 | # 配置校准参数 976 | NUM_CALIBRATION_SAMPLES = 512 977 | MAX_SEQUENCE_LENGTH = 2048 978 | 979 | # 加载数据集 980 | ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") 981 | ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) 982 | 983 | def process_and_tokenize(example): 984 | text = tokenizer.apply_chat_template(example["messages"], tokenize=False) 985 | return tokenizer( 986 | text, 987 | padding=False, 988 | max_length=MAX_SEQUENCE_LENGTH, 989 | truncation=True, 990 | add_special_tokens=False, 991 | ) 992 | 993 | ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) 994 | 995 | # 量化配置 996 | recipe = ''' 997 | quant_stage: 998 | quant_modifiers: 999 | QuantizationModifier: 1000 | kv_cache_scheme: 1001 | num_bits: 8 1002 | type: float 1003 | strategy: tensor 1004 | dynamic: false 1005 | symmetric: true 1006 | ''' 1007 | 1008 | # 应用量化 1009 | oneshot( 1010 | model=model, 1011 | dataset=ds, 1012 | recipe=recipe, 1013 | max_seq_length=MAX_SEQUENCE_LENGTH, 1014 | num_calibration_samples=NUM_CALIBRATION_SAMPLES, 1015 | ) 1016 | 1017 | # 保存量化模型 1018 | SAVE_DIR = "{os.path.basename(self.config['model_path'])}-FP8-KV" 1019 | model.save_pretrained(SAVE_DIR, save_compressed=True) 1020 | tokenizer.save_pretrained(SAVE_DIR) 1021 | """ 1022 | 1023 | # 保存并运行校准脚本 1024 | with open("run_calibration.py", "w") as f: 1025 | f.write(calibration_script) 1026 | 1027 | # 检测操作系统,使用适当的方式启动进程 1028 | try: 1029 | if sys.platform == 'win32': 1030 | # 
Windows系统 1031 | subprocess.Popen(["python", "run_calibration.py"], 1032 | cwd=os.getcwd(), 1033 | creationflags=subprocess.CREATE_NEW_CONSOLE) 1034 | else: 1035 | # Linux/Mac系统 1036 | subprocess.Popen(["python", "run_calibration.py"], 1037 | cwd=os.getcwd()) 1038 | 1039 | messagebox.showinfo("校准", "校准进程已启动,请等待完成...") 1040 | except Exception as e: 1041 | self.status_text.insert(tk.END, f"启动校准进程失败: {str(e)}\n") 1042 | messagebox.showerror("错误", f"启动校准进程失败: {str(e)}") 1043 | 1044 | def get_available_system_memory(self): 1045 | """获取可用系统内存(GB)""" 1046 | mem = psutil.virtual_memory() 1047 | # 返回可用内存(GB) 1048 | return mem.available / (1024 * 1024 * 1024) 1049 | 1050 | def get_available_vram(self, use_ratio=None): 1051 | """获取可用显存(GB)""" 1052 | try: 1053 | gpus = GPUtil.getGPUs() 1054 | if not gpus: 1055 | return 0 1056 | 1057 | # 如果使用多GPU,计算总显存 1058 | if self.config['gpu_count'] > 1: 1059 | total_vram = sum([gpu.memoryTotal for gpu in gpus[:self.config['gpu_count']]]) 1060 | else: 1061 | total_vram = gpus[0].memoryTotal 1062 | 1063 | # 转换为GB并应用显存比例 1064 | ratio = use_ratio if use_ratio is not None else (self.config['mem_ratio'] / 100) 1065 | return total_vram * ratio / 1024 1066 | except Exception as e: 1067 | self.status_text.insert(tk.END, f"获取显存信息错误: {e}\n") 1068 | return 0 1069 | 1070 | def estimate_model_size(self): 1071 | """估算模型大小(GB)""" 1072 | try: 1073 | # 简单估算:检查模型目录中的.bin文件大小总和 1074 | model_path = self.config['model_path'] 1075 | total_size = 0 1076 | 1077 | # 检查是否有model.safetensors文件 1078 | safetensors_path = os.path.join(model_path, "model.safetensors") 1079 | if os.path.exists(safetensors_path): 1080 | total_size = os.path.getsize(safetensors_path) 1081 | self.status_text.insert(tk.END, f"找到model.safetensors文件,大小: {total_size/(1024*1024*1024):.2f}GB\n") 1082 | # 转换为GB 1083 | return total_size / (1024 * 1024 * 1024) 1084 | 1085 | # 检查是否有pytorch_model.bin文件 1086 | pytorch_model_path = os.path.join(model_path, "pytorch_model.bin") 1087 | if os.path.exists(pytorch_model_path): 1088 | total_size = os.path.getsize(pytorch_model_path) 1089 | self.status_text.insert(tk.END, f"找到pytorch_model.bin文件,大小: {total_size/(1024*1024*1024):.2f}GB\n") 1090 | # 转换为GB 1091 | return total_size / (1024 * 1024 * 1024) 1092 | 1093 | # 如果是分片模型,计算所有分片的大小 1094 | for root, dirs, files in os.walk(model_path): 1095 | for file in files: 1096 | if file.endswith('.bin') or file.endswith('.safetensors'): 1097 | file_path = os.path.join(root, file) 1098 | file_size = os.path.getsize(file_path) 1099 | total_size += file_size 1100 | self.status_text.insert(tk.END, f"找到模型文件: {file}, 大小: {file_size/(1024*1024*1024):.2f}GB\n") 1101 | 1102 | # 如果没有找到任何模型文件,使用默认值 1103 | if total_size == 0: 1104 | self.status_text.insert(tk.END, "未找到模型文件,使用默认值29.5GB\n") 1105 | return 29.5 # 默认值为29.5GB 1106 | 1107 | # 转换为GB 1108 | model_size_gb = total_size / (1024 * 1024 * 1024) 1109 | self.status_text.insert(tk.END, f"估算模型总大小: {model_size_gb:.2f}GB\n") 1110 | return model_size_gb 1111 | except Exception as e: 1112 | self.status_text.insert(tk.END, f"估算模型大小错误: {e}\n") 1113 | # 返回默认值 1114 | return 29.5 # 默认值为29.5GB 1115 | 1116 | def setup_memory_offload(self, model_size, offload_ratio): 1117 | """设置内存交换功能""" 1118 | if not self.config['enable_memory_offload']: 1119 | return False 1120 | 1121 | try: 1122 | # 计算需要卸载到内存的部分 1123 | offload_size = model_size * offload_ratio 1124 | 1125 | self.status_text.insert(tk.END, f"将卸载 {offload_size:.2f}GB 到系统内存 (比例: {offload_ratio*100:.0f}%)\n") 1126 | 1127 | # 创建内存映射文件目录 1128 | offload_dir = 
os.path.join(os.getcwd(), "model_offload") 1129 | os.makedirs(offload_dir, exist_ok=True) 1130 | 1131 | # 创建内存映射文件 1132 | map_file = os.path.join(offload_dir, "model_offload.bin") 1133 | 1134 | # 转换为字节 1135 | offload_size_bytes = int(offload_size * 1024 * 1024 * 1024) 1136 | 1137 | # 检查是否有足够的磁盘空间 1138 | disk_usage = psutil.disk_usage(os.getcwd()) 1139 | if disk_usage.free < offload_size_bytes: 1140 | self.status_text.insert(tk.END, f"警告: 磁盘空间不足,需要 {offload_size:.2f}GB,但只有 {disk_usage.free/(1024*1024*1024):.2f}GB 可用\n") 1141 | return False 1142 | 1143 | # 获取系统内存信息 1144 | mem = psutil.virtual_memory() 1145 | available_memory = mem.available / (1024 * 1024 * 1024) # 可用内存(GB) 1146 | 1147 | # 确保至少有2GB的系统内存预留 1148 | safe_memory = available_memory - 2.0 1149 | 1150 | # 检查是否有足够的内存 1151 | if safe_memory < offload_size: 1152 | # 调整大小到可用安全内存的90% 1153 | adjusted_size = safe_memory * 0.9 1154 | self.status_text.insert(tk.END, f"警告: 可用内存不足,需要 {offload_size:.2f}GB,但安全可用内存只有 {safe_memory:.2f}GB\n") 1155 | self.status_text.insert(tk.END, f"自动调整卸载大小到 {adjusted_size:.2f}GB (安全内存的90%)\n") 1156 | offload_size = adjusted_size 1157 | offload_size_bytes = int(offload_size * 1024 * 1024 * 1024) 1158 | 1159 | # 创建内存映射文件 1160 | self.status_text.insert(tk.END, f"正在创建内存映射文件,大小: {offload_size:.2f}GB...\n") 1161 | 1162 | # 记录内存使用情况 1163 | mem_before = psutil.virtual_memory() 1164 | self.status_text.insert(tk.END, f"创建前系统内存: 已用 {mem_before.percent}% ({mem_before.used/1024/1024/1024:.2f}GB/{mem_before.total/1024/1024/1024:.2f}GB)\n") 1165 | 1166 | # 使用fallocate预分配文件空间(如果可用) 1167 | try: 1168 | import subprocess 1169 | self.status_text.insert(tk.END, f"尝试使用fallocate快速分配 {offload_size:.2f}GB 空间...\n") 1170 | result = subprocess.run(['fallocate', '-l', f"{offload_size_bytes}", map_file], 1171 | check=True, capture_output=True) 1172 | self.status_text.insert(tk.END, "使用fallocate成功预分配空间\n") 1173 | 1174 | # 验证文件大小 1175 | actual_size = os.path.getsize(map_file) 1176 | self.status_text.insert(tk.END, f"验证文件大小: {actual_size/(1024*1024*1024):.2f}GB\n") 1177 | 1178 | if actual_size < offload_size_bytes * 0.99: # 允许1%的误差 1179 | self.status_text.insert(tk.END, f"警告: 文件大小不足,将使用传统方法分配\n") 1180 | os.remove(map_file) # 删除不完整的文件 1181 | raise Exception("文件大小不足") 1182 | 1183 | except Exception as e: 1184 | self.status_text.insert(tk.END, f"fallocate失败: {str(e)},将使用传统方法分配空间\n") 1185 | 1186 | # 传统方法: 分块写入 1187 | with open(map_file, "wb") as f: 1188 | # 写入全零数据以分配空间 1189 | chunk_size = 1024 * 1024 * 128 # 减小到128MB块,降低内存压力 1190 | remaining = offload_size_bytes 1191 | 1192 | try: 1193 | while remaining > 0: 1194 | # 每写入512MB检查一次内存状态,更频繁地检查 1195 | if (offload_size_bytes - remaining) % (512*1024*1024) < chunk_size: 1196 | mem_check = psutil.virtual_memory() 1197 | # 如果可用内存低于1.5GB,停止写入 1198 | if mem_check.available < 1.5 * 1024 * 1024 * 1024: 1199 | self.status_text.insert(tk.END, f"警告: 可用内存低于1.5GB,停止分配更多内存\n") 1200 | break 1201 | 1202 | write_size = min(chunk_size, remaining) 1203 | f.write(b'\0' * write_size) 1204 | remaining -= write_size 1205 | # 更新进度 1206 | progress = (offload_size_bytes - remaining) / offload_size_bytes * 100 1207 | self.status_text.delete("end-2l", "end-1l") # 删除上一行进度 1208 | self.status_text.insert(tk.END, f"创建内存映射文件: {progress:.1f}% ({(offload_size_bytes-remaining)/(1024*1024*1024):.2f}GB/{offload_size:.2f}GB)\n") 1209 | self.status_text.see(tk.END) 1210 | 1211 | # 添加小延迟,让系统有时间释放内存 1212 | time.sleep(0.01) 1213 | 1214 | except MemoryError: 1215 | self.status_text.insert(tk.END, f"内存不足,无法完成映射文件创建\n") 1216 | # 记录已分配的大小 1217 | 
actual_size = offload_size_bytes - remaining 1218 | self.status_text.insert(tk.END, f"已分配 {actual_size/(1024*1024*1024):.2f}GB\n") 1219 | # 截断文件到已写入的大小 1220 | f.flush() 1221 | f.truncate(actual_size) 1222 | 1223 | # 记录内存使用情况 1224 | mem_after = psutil.virtual_memory() 1225 | self.status_text.insert(tk.END, f"创建后系统内存: 已用 {mem_after.percent}% ({mem_after.used/1024/1024/1024:.2f}GB/{mem_after.total/1024/1024/1024:.2f}GB)\n") 1226 | 1227 | # 验证最终文件大小 1228 | final_size = os.path.getsize(map_file) 1229 | self.status_text.insert(tk.END, f"内存映射文件最终大小: {final_size/(1024*1024*1024):.2f}GB\n") 1230 | 1231 | # 不再强制要求18GB,而是根据模型大小动态调整 1232 | min_required_size = min(18, model_size * 0.8) # 至少需要模型大小的80% 1233 | 1234 | if final_size < min_required_size * 1024 * 1024 * 1024: 1235 | self.status_text.insert(tk.END, f"警告: 内存映射文件大小不足{min_required_size:.1f}GB,可能无法加载模型\n") 1236 | if not messagebox.askokcancel("警告", 1237 | f"内存映射文件大小仅为{final_size/(1024*1024*1024):.2f}GB,建议至少{min_required_size:.1f}GB。\n是否继续?"): 1238 | return False 1239 | 1240 | self.status_text.insert(tk.END, "内存映射文件创建完成\n") 1241 | 1242 | # 创建内存映射 1243 | self.mm_file = open(map_file, "r+b") 1244 | self.mm = mmap.mmap(self.mm_file.fileno(), 0) 1245 | 1246 | # 使用用户设置的内存通道数,不再自动增加 1247 | channels = self.config['memory_channels'] 1248 | self.status_text.insert(tk.END, f"使用用户设置的内存通道数: {channels}\n") 1249 | 1250 | self.setup_multi_channel_loader() 1251 | 1252 | # 创建配置文件 1253 | offload_config = { 1254 | 'enabled': True, 1255 | 'offload_dir': offload_dir, 1256 | 'offload_ratio': offload_ratio, 1257 | 'channels': channels, 1258 | 'reserved_memory': self.config['reserved_memory'] / 100, 1259 | 'actual_size_gb': final_size/(1024*1024*1024) 1260 | } 1261 | 1262 | offload_config_path = os.path.join(offload_dir, "offload_config.json") 1263 | with open(offload_config_path, 'w') as f: 1264 | json.dump(offload_config, f, indent=4) 1265 | 1266 | self.status_text.insert(tk.END, f"内存交换配置已保存到 {offload_config_path}\n") 1267 | 1268 | return True 1269 | except Exception as e: 1270 | self.status_text.insert(tk.END, f"设置内存交换错误: {str(e)}\n") 1271 | import traceback 1272 | self.status_text.insert(tk.END, traceback.format_exc()) 1273 | return False 1274 | 1275 | def setup_multi_channel_loader(self): 1276 | """设置多通道加载器""" 1277 | class MultiChannelLoader: 1278 | def __init__(self, memory_map, num_channels=4, cache_size=32): # 添加cache_size参数 1279 | self.memory_map = memory_map 1280 | self.num_channels = num_channels 1281 | self.channel_locks = [threading.Lock() for _ in range(num_channels)] 1282 | self.channel_positions = [0] * num_channels 1283 | self.channel_usage = [0] * num_channels # 记录每个通道的使用次数 1284 | self.channel_last_access = [time.time()] * num_channels # 记录每个通道的最后访问时间 1285 | self.cache = {} # 简单的内存缓存 1286 | self.cache_hits = 0 1287 | self.cache_misses = 0 1288 | self.max_cache_size = cache_size # 使用传入的缓存大小 1289 | self.prefetch_queue = [] # 预取队列 1290 | self.prefetch_lock = threading.Lock() 1291 | self.prefetch_thread_running = True 1292 | # 启动预取线程 1293 | threading.Thread(target=self._prefetch_worker, daemon=True).start() 1294 | 1295 | def read_chunk(self, offset, size, channel_id=None): 1296 | # 检查缓存 1297 | cache_key = (offset, size) 1298 | if cache_key in self.cache: 1299 | self.cache_hits += 1 1300 | # 更新缓存访问时间 1301 | self.cache[cache_key]['last_access'] = time.time() 1302 | return self.cache[cache_key]['data'] 1303 | 1304 | self.cache_misses += 1 1305 | 1306 | # 如果未指定通道,选择最佳通道 1307 | if channel_id is None: 1308 | channel_id = self._get_best_channel(offset) 1309 | 1310 
| with self.channel_locks[channel_id]: 1311 | # 记录访问时间 1312 | self.channel_last_access[channel_id] = time.time() 1313 | 1314 | # 如果当前位置接近请求的偏移量,可以减少寻址时间 1315 | if abs(self.channel_positions[channel_id] - offset) < 1024*1024: # 如果在1MB范围内 1316 | # 已经接近目标位置,直接读取 1317 | pass 1318 | else: 1319 | # 需要重新定位 1320 | self.memory_map.seek(offset) 1321 | 1322 | data = self.memory_map.read(size) 1323 | self.channel_positions[channel_id] = offset + size 1324 | self.channel_usage[channel_id] += 1 1325 | 1326 | # 更新缓存 1327 | if len(self.cache) >= self.max_cache_size: 1328 | # 删除最旧的缓存项 1329 | oldest_key = min(self.cache.keys(), key=lambda k: self.cache[k]['last_access']) 1330 | del self.cache[oldest_key] 1331 | 1332 | self.cache[cache_key] = { 1333 | 'data': data, 1334 | 'last_access': time.time() 1335 | } 1336 | 1337 | # 预测性预取 - 预取下一个可能的块 1338 | next_offset = offset + size 1339 | self.prefetch(next_offset, size) 1340 | 1341 | return data 1342 | 1343 | def _get_best_channel(self, target_offset): 1344 | # 优先选择位置接近的通道,其次考虑使用频率 1345 | best_channel = 0 1346 | best_score = float('inf') 1347 | 1348 | for i in range(self.num_channels): 1349 | # 计算位置接近度分数 1350 | position_score = abs(self.channel_positions[i] - target_offset) / (1024*1024) # MB为单位 1351 | 1352 | # 计算使用频率分数 1353 | usage_score = self.channel_usage[i] * 0.1 1354 | 1355 | # 计算时间分数(越久未使用越好) 1356 | time_score = -10 * (time.time() - self.channel_last_access[i]) 1357 | 1358 | # 综合评分(越低越好) 1359 | total_score = position_score + usage_score + time_score 1360 | 1361 | if total_score < best_score: 1362 | best_score = total_score 1363 | best_channel = i 1364 | 1365 | return best_channel 1366 | 1367 | def _get_least_busy_channel(self): 1368 | # 选择使用次数最少的通道 1369 | return self.channel_usage.index(min(self.channel_usage)) 1370 | 1371 | def get_stats(self): 1372 | return { 1373 | 'positions': self.channel_positions, 1374 | 'usage': self.channel_usage, 1375 | 'cache_hits': self.cache_hits, 1376 | 'cache_misses': self.cache_misses, 1377 | 'hit_ratio': self.cache_hits / (self.cache_hits + self.cache_misses + 0.001) * 100, 1378 | 'prefetch_queue_size': len(self.prefetch_queue) 1379 | } 1380 | 1381 | def prefetch(self, offset, size): 1382 | """预取数据到缓存""" 1383 | # 检查是否已经在缓存中 1384 | cache_key = (offset, size) 1385 | if cache_key in self.cache: 1386 | return 1387 | 1388 | # 检查是否已经在预取队列中 1389 | with self.prefetch_lock: 1390 | for item in self.prefetch_queue: 1391 | if item[0] == offset and item[1] == size: 1392 | return 1393 | 1394 | # 添加到预取队列,最多保留10个预取请求 1395 | self.prefetch_queue.append((offset, size)) 1396 | if len(self.prefetch_queue) > 10: 1397 | self.prefetch_queue.pop(0) 1398 | 1399 | def _prefetch_worker(self): 1400 | """预取线程""" 1401 | while self.prefetch_thread_running: 1402 | try: 1403 | # 检查预取队列 1404 | with self.prefetch_lock: 1405 | if self.prefetch_queue: 1406 | offset, size = self.prefetch_queue.pop(0) 1407 | else: 1408 | offset, size = None, None 1409 | 1410 | # 如果有预取请求,执行预取 1411 | if offset is not None and size is not None: 1412 | # 检查是否已经在缓存中 1413 | cache_key = (offset, size) 1414 | if cache_key not in self.cache: 1415 | # 选择最佳通道 1416 | channel_id = self._get_best_channel(offset) 1417 | # 执行预取 1418 | self.read_chunk(offset, size, channel_id) 1419 | except Exception as e: 1420 | print(f"预取错误: {e}") 1421 | 1422 | # 短暂休眠,避免占用过多CPU 1423 | time.sleep(0.01) 1424 | 1425 | def close(self): 1426 | """关闭加载器""" 1427 | self.prefetch_thread_running = False 1428 | self.cache.clear() 1429 | 1430 | # 创建多通道加载器 1431 | num_channels = max(4, int(self.config['memory_channels'])) # 
确保至少有4个通道 1432 | 1433 | # 应用高级设置中的缓存大小 1434 | cache_size = self.config.get('advanced_cache_size', 32) # 默认32 1435 | self.status_text.insert(tk.END, f"内存缓存大小: {cache_size}\n") 1436 | 1437 | self.multi_channel_loader = MultiChannelLoader( 1438 | self.mm, 1439 | num_channels=num_channels, 1440 | cache_size=cache_size # 传入缓存大小 1441 | ) 1442 | 1443 | self.status_text.insert(tk.END, f"已创建 {num_channels} 个内存通道加载器,带缓存和预取功能\n") 1444 | 1445 | # 启动内存监控线程 1446 | self.memory_monitor_thread_running = True 1447 | threading.Thread(target=self.memory_monitor_thread, daemon=True).start() 1448 | 1449 | def update_system_memory_stats(self): 1450 | """更新系统内存统计信息""" 1451 | try: 1452 | # 检查监控标志,如果已关闭则直接返回 1453 | if not self.monitoring: 1454 | return False 1455 | 1456 | # 获取系统内存信息 1457 | mem = psutil.virtual_memory() 1458 | 1459 | # 更新到界面 1460 | self.status_text.insert(tk.END, f"系统内存: 已用 {mem.percent}% ({mem.used/1024/1024/1024:.2f}GB/{mem.total/1024/1024/1024:.2f}GB)\n") 1461 | 1462 | # 如果启用了内存交换,监控交换性能 1463 | if hasattr(self, 'multi_channel_loader') and self.multi_channel_loader is not None: 1464 | try: 1465 | stats = self.multi_channel_loader.get_stats() 1466 | 1467 | # 只有在第一次时显示内存交换通道信息 1468 | if not self.memory_channel_info_displayed: 1469 | channel_stats = [f"通道{i}: {pos/1024/1024:.2f}MB" for i, pos in enumerate(stats['positions'])] 1470 | usage_stats = [f"通道{i}: {usage}次" for i, usage in enumerate(stats['usage'])] 1471 | 1472 | self.status_text.insert(tk.END, f"内存交换通道状态: {', '.join(channel_stats)}\n") 1473 | self.status_text.insert(tk.END, f"内存交换通道使用: {', '.join(usage_stats)}\n") 1474 | 1475 | # 设置标志,表示已显示过内存交换通道信息 1476 | self.memory_channel_info_displayed = True 1477 | 1478 | # 显示缓存命中率(也只显示一次) 1479 | if not self.cache_hit_info_displayed and 'cache_hits' in stats and 'cache_misses' in stats: 1480 | total_requests = stats['cache_hits'] + stats['cache_misses'] 1481 | if total_requests > 0: 1482 | hit_ratio = stats['cache_hits'] / total_requests * 100 1483 | self.status_text.insert(tk.END, f"内存缓存命中率: {hit_ratio:.2f}% (命中: {stats['cache_hits']}, 未命中: {stats['cache_misses']})\n") 1484 | self.cache_hit_info_displayed = True 1485 | except Exception as e: 1486 | # 捕获获取统计信息时的错误,但不中断监控 1487 | self.status_text.insert(tk.END, f"获取内存交换统计信息错误: {str(e)}\n") 1488 | 1489 | # 更新GPU KV缓存命中率(如果有,也只显示一次) 1490 | if not self.kv_cache_info_displayed and hasattr(self, 'kv_cache_hits') and hasattr(self, 'kv_cache_misses'): 1491 | total_kv_requests = self.kv_cache_hits + self.kv_cache_misses 1492 | if total_kv_requests > 0: 1493 | kv_hit_ratio = self.kv_cache_hits / total_kv_requests * 100 1494 | self.status_text.insert(tk.END, f"KV缓存命中率: {kv_hit_ratio:.2f}% (命中: {self.kv_cache_hits}, 未命中: {self.kv_cache_misses})\n") 1495 | self.kv_cache_info_displayed = True 1496 | 1497 | self.status_text.see(tk.END) 1498 | return True 1499 | except Exception as e: 1500 | self.status_text.insert(tk.END, f"内存监控错误: {e}\n") 1501 | return False 1502 | 1503 | def memory_monitor_thread(self): 1504 | """内存监控线程""" 1505 | try: 1506 | # 设置本地变量,避免频繁访问self属性 1507 | monitoring = True 1508 | 1509 | while monitoring and self.monitoring: 1510 | try: 1511 | if hasattr(self, 'server_process') and self.server_process is not None and self.server_process.poll() is None: 1512 | # 检查是否所有信息都已经显示过一次 1513 | all_info_displayed = (self.memory_channel_info_displayed and 1514 | self.cache_hit_info_displayed and 1515 | self.kv_cache_info_displayed) 1516 | 1517 | # 如果所有信息都已显示过,则降低更新频率,且不输出系统内存使用信息 1518 | if all_info_displayed: 1519 | # 只静默更新状态,不显示到界面 1520 | pass 1521 | else: 1522 
| # 仍有未显示的信息,正常更新并显示 1523 | self.update_system_memory_stats() 1524 | 1525 | # 增加更新间隔 1526 | time.sleep(15) # 每15秒更新一次 1527 | 1528 | # 检查监控标志是否已更改 1529 | monitoring = self.monitoring 1530 | except Exception as e: 1531 | # 出错时不显示错误信息,静默处理 1532 | time.sleep(5) # 出错时等待5秒再继续 1533 | except Exception as e: 1534 | # 捕获线程启动时的异常,静默处理 1535 | pass 1536 | 1537 | def check_vllm_supported_args(self): 1538 | """检查VLLM支持的命令行参数""" 1539 | supported_args = { 1540 | 'swap_space': '--swap-space', 1541 | 'cpu_offload': '--cpu-offload-gb', 1542 | 'max_cpu_memory': '--max-cpu-memory' 1543 | } 1544 | 1545 | try: 1546 | # 尝试运行vllm help命令,增加超时时间 1547 | help_output = subprocess.run( 1548 | ['vllm', 'serve', '--help'], 1549 | capture_output=True, 1550 | text=True, 1551 | timeout=15 # 增加超时时间到15秒 1552 | ) 1553 | 1554 | # 检查输出中是否包含特定参数 1555 | output = help_output.stdout + help_output.stderr 1556 | self.status_text.insert(tk.END, f"检查VLLM支持的参数...\n") 1557 | 1558 | # 检查每个参数 1559 | if '--swap-space' not in output: 1560 | if '--swap' in output: 1561 | supported_args['swap_space'] = '--swap' 1562 | self.status_text.insert(tk.END, "未找到--swap-space参数,将使用--swap\n") 1563 | else: 1564 | supported_args['swap_space'] = None 1565 | self.status_text.insert(tk.END, "未找到交换空间相关参数\n") 1566 | 1567 | # 检查CPU卸载参数 1568 | if '--cpu-offload-gb' not in output: 1569 | if '--cpu-offload' in output: 1570 | supported_args['cpu_offload'] = '--cpu-offload' 1571 | self.status_text.insert(tk.END, "未找到--cpu-offload-gb参数,将使用--cpu-offload\n") 1572 | elif '--offload-params' in output: 1573 | supported_args['cpu_offload'] = '--offload-params' 1574 | self.status_text.insert(tk.END, "未找到--cpu-offload-gb参数,将使用--offload-params\n") 1575 | else: 1576 | supported_args['cpu_offload'] = None 1577 | self.status_text.insert(tk.END, "未找到CPU卸载相关参数\n") 1578 | 1579 | if '--max-cpu-memory' not in output: 1580 | supported_args['max_cpu_memory'] = None 1581 | self.status_text.insert(tk.END, "未找到--max-cpu-memory参数\n") 1582 | 1583 | return supported_args 1584 | 1585 | except subprocess.TimeoutExpired: 1586 | self.status_text.insert(tk.END, "检查VLLM参数超时,使用默认参数\n") 1587 | # 使用最常见的参数组合 1588 | return { 1589 | 'swap_space': '--swap-space', 1590 | 'cpu_offload': '--cpu-offload', 1591 | 'max_cpu_memory': None 1592 | } 1593 | except Exception as e: 1594 | self.status_text.insert(tk.END, f"检查VLLM参数失败: {str(e)}\n") 1595 | # 返回默认值 1596 | return supported_args 1597 | 1598 | def fallback_start_server(self, error_msg): 1599 | """备用启动方法,尝试使用不同的参数启动服务器""" 1600 | if not messagebox.askokcancel("错误", 1601 | f"{error_msg}\n\n是否尝试使用备用方法启动服务器?"): 1602 | return False 1603 | 1604 | self.status_text.insert(tk.END, "\n尝试使用备用方法启动服务器...\n") 1605 | 1606 | # 清理GPU内存 1607 | self.clean_gpu_memory() 1608 | 1609 | # 设置环境变量以避免内存碎片问题 1610 | env = os.environ.copy() 1611 | env['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128' 1612 | env['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in range(self.config['gpu_count'])]) 1613 | env['OMP_NUM_THREADS'] = '4' # 限制OpenMP线程数 1614 | env['MKL_NUM_THREADS'] = '4' # 限制MKL线程数 1615 | 1616 | # 添加VLLM特定的环境变量,优化内存使用 1617 | env['VLLM_USE_ASYNC_CUDA_MALLOC'] = '1' # 使用异步CUDA内存分配 1618 | env['VLLM_CPU_OFFLOAD_PIPELINE'] = '1' # 启用CPU卸载流水线 1619 | env['VLLM_ENABLE_STAGED_INIT'] = '1' # 启用分阶段初始化 1620 | 1621 | self.status_text.insert(tk.END, "已设置优化环境变量\n") 1622 | 1623 | # 临时降低模型参数 1624 | original_max_model_len = self.config['max_model_len'] 1625 | original_max_tokens = self.config['max_tokens'] 1626 | 1627 | # 降低序列长度以减少内存使用 1628 | self.config['max_model_len'] = 
min(self.config['max_model_len'], 2048) # 调整到2048 1629 | self.config['max_tokens'] = min(self.config['max_tokens'], 2048) # 调整到2048,确保大于max_num_seqs 1630 | 1631 | self.status_text.insert(tk.END, f"临时降低序列长度: {self.config['max_model_len']}, 最大token数: {self.config['max_tokens']}\n") 1632 | 1633 | # 获取模型大小 1634 | model_size = self.estimate_model_size() 1635 | 1636 | # 尝试不同的启动选项 1637 | options = [ 1638 | { 1639 | "desc": "使用最小内存配置", 1640 | "cmd": [ 1641 | 'vllm', 'serve', 1642 | self.config['model_path'], 1643 | '--host', self.config['ip'], 1644 | '--port', str(self.config['port']), 1645 | '--tensor-parallel-size', str(self.config['gpu_count']), 1646 | '--gpu-memory-utilization', '0.7', # 降低显存使用率 1647 | '--max-num-batched-tokens', str(self.config['max_tokens']), 1648 | '--block-size', str(self.config['block_size']), 1649 | '--max-model-len', str(self.config['max_model_len']), 1650 | '--dtype', 'half', 1651 | '--enforce-eager' # 添加强制使用eager模式 1652 | ] 1653 | }, 1654 | { 1655 | "desc": "使用量化配置", 1656 | "cmd": [ 1657 | 'vllm', 'serve', 1658 | self.config['model_path'], 1659 | '--host', self.config['ip'], 1660 | '--port', str(self.config['port']), 1661 | '--tensor-parallel-size', str(self.config['gpu_count']), 1662 | '--gpu-memory-utilization', '0.8', 1663 | '--max-num-batched-tokens', str(self.config['max_tokens']), 1664 | '--block-size', str(self.config['block_size']), 1665 | '--max-model-len', str(self.config['max_model_len']), 1666 | '--dtype', 'half', 1667 | '--quantization', 'awq', # 尝试使用AWQ量化 1668 | '--enforce-eager' # 添加强制使用eager模式 1669 | ] 1670 | }, 1671 | { 1672 | "desc": "使用最小内存交换配置", 1673 | "cmd": [ 1674 | 'vllm', 'serve', 1675 | self.config['model_path'], 1676 | '--host', self.config['ip'], 1677 | '--port', str(self.config['port']), 1678 | '--tensor-parallel-size', str(self.config['gpu_count']), 1679 | '--gpu-memory-utilization', '0.6', # 进一步降低显存使用率 1680 | '--max-num-batched-tokens', str(self.config['max_tokens']), 1681 | '--block-size', str(self.config['block_size']), 1682 | '--max-model-len', str(self.config['max_model_len']), 1683 | '--dtype', 'half', 1684 | '--swap-space', '2', # 移除GiB单位,只使用数字 1685 | '--cpu-offload-gb', '10', 1686 | '--enforce-eager' # 添加强制使用eager模式 1687 | ] 1688 | } 1689 | ] 1690 | 1691 | # 针对大型模型(>10GB)添加特殊选项 1692 | if model_size > 10: 1693 | # 添加分阶段加载选项 1694 | staged_option = { 1695 | "desc": "使用分阶段加载(适合大模型)", 1696 | "cmd": [ 1697 | 'vllm', 'serve', 1698 | self.config['model_path'], 1699 | '--host', self.config['ip'], 1700 | '--port', str(self.config['port']), 1701 | '--tensor-parallel-size', str(self.config['gpu_count']), 1702 | '--gpu-memory-utilization', '0.5', # 显著降低显存使用率 1703 | '--max-num-batched-tokens', str(min(self.config['max_tokens'], 1024)), # 降低批处理大小 1704 | '--block-size', str(min(self.config['block_size'], 8)), # 降低块大小 1705 | '--max-model-len', str(min(self.config['max_model_len'], 1024)), # 降低最大长度 1706 | '--dtype', 'half', 1707 | '--swap-space', '4', 1708 | '--cpu-offload-gb', str(max(10, int(model_size * 0.7))), # 至少卸载70%的模型 1709 | '--enforce-eager' # 添加强制使用eager模式 1710 | ] 1711 | } 1712 | options.insert(0, staged_option) # 将此选项放在首位 1713 | 1714 | # 添加8位量化选项 1715 | int8_option = { 1716 | "desc": "使用8位量化(适合大模型)", 1717 | "cmd": [ 1718 | 'vllm', 'serve', 1719 | self.config['model_path'], 1720 | '--host', self.config['ip'], 1721 | '--port', str(self.config['port']), 1722 | '--tensor-parallel-size', str(self.config['gpu_count']), 1723 | '--gpu-memory-utilization', '0.7', 1724 | '--max-num-batched-tokens', str(self.config['max_tokens']), 1725 | 
'--block-size', str(self.config['block_size']), 1726 | '--max-model-len', str(self.config['max_model_len']), 1727 | '--dtype', 'half', 1728 | '--quantization', 'int8', # 使用int8量化 1729 | '--enforce-eager' # 添加强制使用eager模式 1730 | ] 1731 | } 1732 | options.insert(1, int8_option) 1733 | 1734 | # 尝试每个选项 1735 | for i, option in enumerate(options): 1736 | self.status_text.insert(tk.END, f"\n尝试选项 {i+1}: {option['desc']}\n") 1737 | cmd_str = ' '.join(option['cmd']) 1738 | self.status_text.insert(tk.END, f"命令: {cmd_str}\n") 1739 | 1740 | try: 1741 | # 启动服务器 1742 | self.server_process = subprocess.Popen( 1743 | option['cmd'], 1744 | stdout=subprocess.PIPE, 1745 | stderr=subprocess.STDOUT, 1746 | env=env 1747 | ) 1748 | 1749 | # 等待一小段时间,检查进程是否立即退出 1750 | time.sleep(5) # 增加等待时间 1751 | if self.server_process.poll() is None: 1752 | # 进程仍在运行,启动成功 1753 | self.status_text.insert(tk.END, "服务器启动成功!\n") 1754 | 1755 | # 启动监控线程 1756 | threading.Thread(target=self.monitor_server_output).start() 1757 | 1758 | # 更新API地址 1759 | api_base = f"http://{self.config['ip']}:{self.config['port']}/v1" 1760 | self.api_label.config(text=f"API地址: {api_base}") 1761 | 1762 | return True 1763 | else: 1764 | # 进程已退出,获取输出 1765 | output, _ = self.server_process.communicate() 1766 | error_output = output.decode() 1767 | self.status_text.insert(tk.END, f"启动失败: {error_output}\n") 1768 | 1769 | # 分析错误原因 1770 | if "CUDA out of memory" in error_output: 1771 | self.status_text.insert(tk.END, "检测到错误: GPU内存不足\n") 1772 | elif "RuntimeError" in error_output: 1773 | self.status_text.insert(tk.END, "检测到错误: 程序崩溃\n") 1774 | 1775 | # 在选项之间添加额外的清理步骤 1776 | self.clean_gpu_memory() 1777 | time.sleep(2) # 等待GPU内存释放 1778 | 1779 | except Exception as e: 1780 | self.status_text.insert(tk.END, f"尝试选项 {i+1} 失败: {str(e)}\n") 1781 | 1782 | # 所有选项都失败,提供建议 1783 | self.status_text.insert(tk.END, "所有备用选项都失败,建议:\n") 1784 | self.status_text.insert(tk.END, "1. 关闭其他内存密集型应用程序\n") 1785 | self.status_text.insert(tk.END, "2. 重启系统以清理内存碎片\n") 1786 | self.status_text.insert(tk.END, "3. 尝试使用量化版本的模型\n") 1787 | self.status_text.insert(tk.END, "4. 
尝试使用更小的模型,如7B或更小的版本\n") 1788 | 1789 | # 恢复原始设置 1790 | self.config['max_model_len'] = original_max_model_len 1791 | self.config['max_tokens'] = original_max_tokens 1792 | 1793 | return False 1794 | 1795 | def clean_gpu_memory(self): 1796 | """清理GPU内存""" 1797 | try: 1798 | self.status_text.insert(tk.END, "正在清理GPU内存...\n") 1799 | 1800 | # 尝试释放PyTorch缓存 1801 | if torch.cuda.is_available(): 1802 | torch.cuda.empty_cache() 1803 | self.status_text.insert(tk.END, "已清理PyTorch缓存\n") 1804 | 1805 | # 获取当前GPU内存使用情况 1806 | gpu = GPUtil.getGPUs()[0] 1807 | free_mem = gpu.memoryFree 1808 | total_mem = gpu.memoryTotal 1809 | used_mem = total_mem - free_mem 1810 | 1811 | self.status_text.insert(tk.END, f"当前GPU内存: 已用 {used_mem}MB / 总计 {total_mem}MB\n") 1812 | 1813 | # 如果内存使用率过高,建议用户重启系统 1814 | if used_mem / total_mem > 0.5: # 如果使用超过50% 1815 | self.status_text.insert(tk.END, "警告: GPU内存使用率较高,可能影响模型加载\n") 1816 | self.status_text.insert(tk.END, "建议关闭其他使用GPU的应用程序或重启系统\n") 1817 | 1818 | # 尝试运行系统命令释放内存 1819 | os.system("sync") # 同步文件系统缓存 1820 | 1821 | # 尝试释放系统缓存 1822 | try: 1823 | with open("/proc/sys/vm/drop_caches", "w") as f: 1824 | f.write("1") 1825 | self.status_text.insert(tk.END, "已释放系统缓存\n") 1826 | except: 1827 | pass # 可能没有权限,忽略错误 1828 | 1829 | self.status_text.insert(tk.END, "GPU内存清理完成\n") 1830 | 1831 | except Exception as e: 1832 | self.status_text.insert(tk.END, f"清理GPU内存时出错: {str(e)}\n") 1833 | 1834 | def preallocate_memory_buffer(self): 1835 | """预先分配内存缓冲区,防止运行时内存不足""" 1836 | try: 1837 | self.status_text.insert(tk.END, "正在预分配内存缓冲区...\n") 1838 | 1839 | # 获取模型大小 1840 | model_size = self.estimate_model_size() 1841 | 1842 | # 计算需要预分配的内存大小 - 根据模型大小动态调整 1843 | if model_size < 10: 1844 | # 小模型使用较小的缓冲区 1845 | buffer_size_gb = model_size * 0.2 1846 | buffer_size_gb = max(buffer_size_gb, 4.0) # 至少4GB 1847 | else: 1848 | # 大模型使用较大的缓冲区,但比例更小 1849 | buffer_size_gb = model_size * 0.15 1850 | buffer_size_gb = max(buffer_size_gb, 6.0) # 至少6GB 1851 | 1852 | # 检查可用内存 1853 | mem = psutil.virtual_memory() 1854 | available_gb = mem.available / (1024 * 1024 * 1024) 1855 | 1856 | # 确保缓冲区不超过可用内存的50% 1857 | max_buffer_size = available_gb * 0.5 1858 | if buffer_size_gb > max_buffer_size: 1859 | self.status_text.insert(tk.END, f"警告: 计算的缓冲区大小({buffer_size_gb:.2f}GB)超过可用内存的50%,调整大小\n") 1860 | buffer_size_gb = max_buffer_size 1861 | 1862 | # 保留至少5GB系统运行空间 1863 | if available_gb < buffer_size_gb + 5: 1864 | self.status_text.insert(tk.END, f"警告: 可用内存({available_gb:.2f}GB)不足,减小缓冲区大小\n") 1865 | buffer_size_gb = max(2.0, available_gb - 5) # 至少2GB,保留5GB系统运行空间 1866 | 1867 | self.status_text.insert(tk.END, f"预分配内存缓冲区大小: {buffer_size_gb:.2f}GB\n") 1868 | 1869 | # 创建内存缓冲区目录 1870 | buffer_dir = os.path.join(os.getcwd(), "memory_buffer") 1871 | os.makedirs(buffer_dir, exist_ok=True) 1872 | 1873 | # 创建内存缓冲区文件 1874 | buffer_file = os.path.join(buffer_dir, "memory_buffer.bin") 1875 | 1876 | # 如果文件已存在,检查大小是否足够 1877 | if os.path.exists(buffer_file): 1878 | current_size = os.path.getsize(buffer_file) / (1024 * 1024 * 1024) 1879 | if current_size >= buffer_size_gb: 1880 | self.status_text.insert(tk.END, f"使用现有内存缓冲区: {current_size:.2f}GB\n") 1881 | return 1882 | else: 1883 | self.status_text.insert(tk.END, f"现有内存缓冲区大小不足({current_size:.2f}GB),重新创建\n") 1884 | os.remove(buffer_file) 1885 | 1886 | # 创建新的内存缓冲区文件 1887 | self.status_text.insert(tk.END, f"创建内存缓冲区文件: {buffer_file}\n") 1888 | 1889 | # 计算缓冲区大小(字节) 1890 | buffer_size_bytes = int(buffer_size_gb * 1024 * 1024 * 1024) 1891 | 1892 | # 创建内存缓冲区文件 1893 | with open(buffer_file, "wb") as f: 1894 | # 
分块写入,避免一次性分配过多内存 1895 | chunk_size = 1024 * 1024 * 64 # 减小到64MB块,降低内存压力 1896 | remaining = buffer_size_bytes 1897 | 1898 | # 记录内存使用情况 1899 | mem_before = psutil.virtual_memory() 1900 | self.status_text.insert(tk.END, f"创建前系统内存: 已用 {mem_before.percent}% ({mem_before.used/1024/1024/1024:.2f}GB/{mem_before.total/1024/1024/1024:.2f}GB)\n") 1901 | 1902 | try: 1903 | while remaining > 0: 1904 | # 每写入256MB检查一次内存状态,更频繁地检查 1905 | if (buffer_size_bytes - remaining) % (256*1024*1024) < chunk_size: 1906 | mem_check = psutil.virtual_memory() 1907 | # 如果可用内存低于2.5GB,停止写入 1908 | if mem_check.available < 2.5 * 1024 * 1024 * 1024: 1909 | self.status_text.insert(tk.END, f"警告: 可用内存低于2.5GB,停止分配更多内存\n") 1910 | break 1911 | 1912 | write_size = min(chunk_size, remaining) 1913 | f.write(b'\0' * write_size) 1914 | remaining -= write_size 1915 | # 更新进度 1916 | progress = (buffer_size_bytes - remaining) / buffer_size_bytes * 100 1917 | self.status_text.delete("end-2l", "end-1l") # 删除上一行进度 1918 | self.status_text.insert(tk.END, f"创建内存缓冲区: {progress:.1f}% ({(buffer_size_bytes-remaining)/(1024*1024*1024):.2f}GB/{buffer_size_gb:.2f}GB)\n") 1919 | self.status_text.see(tk.END) 1920 | except MemoryError: 1921 | self.status_text.insert(tk.END, f"内存不足,无法完成缓冲区创建\n") 1922 | # 记录已分配的大小 1923 | actual_size = buffer_size_bytes - remaining 1924 | self.status_text.insert(tk.END, f"已分配 {actual_size/(1024*1024*1024):.2f}GB\n") 1925 | # 截断文件到已写入的大小 1926 | f.flush() 1927 | f.truncate(actual_size) 1928 | 1929 | # 记录内存使用情况 1930 | mem_after = psutil.virtual_memory() 1931 | self.status_text.insert(tk.END, f"创建后系统内存: 已用 {mem_after.percent}% ({mem_after.used/1024/1024/1024:.2f}GB/{mem_after.total/1024/1024/1024:.2f}GB)\n") 1932 | 1933 | # 验证最终文件大小 1934 | final_size = os.path.getsize(buffer_file) 1935 | self.status_text.insert(tk.END, f"内存缓冲区最终大小: {final_size/(1024*1024*1024):.2f}GB\n") 1936 | 1937 | # 打开文件并映射到内存 1938 | self.buffer_file = open(buffer_file, "r+b") 1939 | self.buffer_mm = mmap.mmap(self.buffer_file.fileno(), 0) 1940 | 1941 | self.status_text.insert(tk.END, f"内存缓冲区创建完成: {final_size/(1024*1024*1024):.2f}GB\n") 1942 | except Exception as e: 1943 | self.status_text.insert(tk.END, f"创建内存缓冲区时出错: {str(e)}\n") 1944 | import traceback 1945 | self.status_text.insert(tk.END, traceback.format_exc()) 1946 | 1947 | def cleanup_memory_buffer(self): 1948 | """清理内存缓冲区""" 1949 | try: 1950 | if hasattr(self, 'buffer_mm') and self.buffer_mm: 1951 | self.buffer_mm.close() 1952 | self.buffer_mm = None 1953 | 1954 | if hasattr(self, 'buffer_file') and self.buffer_file: 1955 | self.buffer_file.close() 1956 | self.buffer_file = None 1957 | 1958 | self.status_text.insert(tk.END, "内存缓冲区已释放\n") 1959 | except Exception as e: 1960 | self.status_text.insert(tk.END, f"释放内存缓冲区时出错: {str(e)}\n") 1961 | 1962 | def recommend_settings(self): 1963 | """根据模型大小和硬件条件推荐设置""" 1964 | try: 1965 | # 检查是否选择了模型 1966 | if not self.config['model_path']: 1967 | messagebox.showerror("错误", "请先选择模型路径") 1968 | return 1969 | 1970 | # 估算模型大小 1971 | model_size = self.estimate_model_size() 1972 | 1973 | # 获取GPU信息 1974 | gpus = GPUtil.getGPUs() 1975 | if not gpus: 1976 | messagebox.showerror("错误", "未检测到GPU") 1977 | return 1978 | 1979 | # 获取第一个GPU的显存大小(GB) 1980 | gpu_memory = gpus[0].memoryTotal / 1024 1981 | 1982 | # 获取系统内存大小(GB) 1983 | system_memory = psutil.virtual_memory().total / (1024 * 1024 * 1024) 1984 | 1985 | # 根据模型大小和硬件条件推荐设置 1986 | self.status_text.insert(tk.END, "\n===== 推荐设置 =====\n") 1987 | self.status_text.insert(tk.END, f"模型大小: {model_size:.2f}GB\n") 1988 | 
self.status_text.insert(tk.END, f"GPU显存: {gpu_memory:.2f}GB\n") 1989 | self.status_text.insert(tk.END, f"系统内存: {system_memory:.2f}GB\n") 1990 | 1991 | # 推荐显存比例 1992 | if model_size > gpu_memory * 0.9: 1993 | # 模型接近或超过显存大小,需要内存交换 1994 | mem_ratio = 85 # 降低到85%,给系统留出更多余量 1995 | self.status_text.insert(tk.END, f"推荐显存比例: {mem_ratio}% (模型较大,降低比例避免OOM)\n") 1996 | 1997 | # 启用内存交换 1998 | self.enable_offload_var.set(True) 1999 | 2000 | # 计算合理的内存交换比例 2001 | if model_size > gpu_memory * 1.5: 2002 | # 模型远大于显存,需要大量交换 2003 | offload_ratio = 70 # 降低到70%,避免系统内存压力过大 2004 | else: 2005 | # 模型略大于显存,适度交换 2006 | offload_ratio = 60 2007 | 2008 | self.memory_offload_ratio_var.set(str(offload_ratio)) 2009 | self.status_text.insert(tk.END, f"推荐内存交换比例: {offload_ratio}%\n") 2010 | 2011 | # 推荐内存通道数 - 根据系统内存大小调整 2012 | if system_memory > 64: # 只有大内存系统才推荐更多通道 2013 | channels = 8 2014 | else: 2015 | channels = 4 # 对于32GB内存系统,使用4个通道 2016 | 2017 | self.memory_channels_var.set(str(channels)) 2018 | self.status_text.insert(tk.END, f"推荐内存通道数: {channels}\n") 2019 | 2020 | # 推荐预留内存比例 2021 | reserved_memory = 20 2022 | self.reserved_memory_var.set(str(reserved_memory)) 2023 | self.status_text.insert(tk.END, f"推荐系统内存预留: {reserved_memory}%\n") 2024 | 2025 | # 推荐较小的序列长度 2026 | if model_size > 20: 2027 | max_model_len = 2048 2028 | else: 2029 | max_model_len = 4096 2030 | 2031 | self.max_model_len_var.set(str(max_model_len)) 2032 | self.status_text.insert(tk.END, f"推荐最大序列长度: {max_model_len}\n") 2033 | 2034 | # 推荐适中的块大小以提高内存带宽利用率 2035 | block_size = 32 # 对于普通硬件,32是较好的平衡点 2036 | self.block_size_var.set(str(block_size)) 2037 | self.status_text.insert(tk.END, f"推荐块大小: {block_size} (提高内存带宽利用率)\n") 2038 | 2039 | # 推荐使用--enforce-eager参数 2040 | self.status_text.insert(tk.END, "推荐使用强制eager模式,避免CUDA图捕获阶段的内存不足\n") 2041 | 2042 | else: 2043 | # 模型可以完全放入显存 2044 | mem_ratio = 90 2045 | self.status_text.insert(tk.END, f"推荐显存比例: {mem_ratio}% (模型可完全放入显存)\n") 2046 | 2047 | # 不需要内存交换 2048 | self.enable_offload_var.set(False) 2049 | self.status_text.insert(tk.END, "不需要启用内存交换\n") 2050 | 2051 | # 推荐较大的序列长度 2052 | max_model_len = 8192 2053 | self.max_model_len_var.set(str(max_model_len)) 2054 | self.status_text.insert(tk.END, f"推荐最大序列长度: {max_model_len}\n") 2055 | 2056 | # 推荐适中的块大小以提高内存带宽利用率 2057 | block_size = 32 # 对于普通硬件,32是较好的平衡点 2058 | self.block_size_var.set(str(block_size)) 2059 | self.status_text.insert(tk.END, f"推荐块大小: {block_size} (提高内存带宽利用率)\n") 2060 | 2061 | # 更新界面上的值 2062 | self.mem_ratio_entry.delete(0, tk.END) 2063 | self.mem_ratio_entry.insert(0, str(mem_ratio)) 2064 | 2065 | # 更新配置 2066 | self.update_config() 2067 | 2068 | self.status_text.insert(tk.END, "推荐设置已应用到界面\n") 2069 | self.status_text.see(tk.END) 2070 | 2071 | except Exception as e: 2072 | messagebox.showerror("错误", f"推荐设置失败: {str(e)}") 2073 | 2074 | def update_config(self): 2075 | """更新配置参数""" 2076 | try: 2077 | # 获取界面上的值 2078 | model_path = self.model_path_entry.get() 2079 | ip = self.ip_entry.get() 2080 | port = int(self.port_entry.get()) 2081 | gpu_count = int(self.gpu_count_var.get()) 2082 | mem_ratio = int(self.mem_ratio_entry.get()) 2083 | max_tokens = int(self.max_tokens_var.get()) 2084 | max_model_len = int(self.max_model_len_var.get()) 2085 | block_size = int(self.block_size_var.get()) 2086 | 2087 | # 获取内存交换配置 2088 | enable_memory_offload = self.enable_offload_var.get() 2089 | memory_channels = int(self.memory_channels_var.get()) 2090 | memory_offload_ratio = int(self.memory_offload_ratio_var.get()) 2091 | reserved_memory = int(self.reserved_memory_var.get()) 2092 | 2093 
| # 验证参数 2094 | if port < 1 or port > 65535: 2095 | messagebox.showerror("错误", "端口号必须在1-65535之间") 2096 | return False 2097 | 2098 | if gpu_count < 1: 2099 | messagebox.showerror("错误", "GPU数量必须大于0") 2100 | return False 2101 | 2102 | if mem_ratio < 10 or mem_ratio > 100: 2103 | messagebox.showerror("错误", "显存比例必须在10-100之间") 2104 | return False 2105 | 2106 | if max_tokens < 256: 2107 | messagebox.showerror("错误", "最大Token数不能小于256") 2108 | return False 2109 | 2110 | if max_model_len < 512: 2111 | messagebox.showerror("错误", "最大模型长度不能小于512") 2112 | return False 2113 | 2114 | if block_size < 1: 2115 | messagebox.showerror("错误", "块大小必须大于0") 2116 | return False 2117 | 2118 | # 验证内存交换配置 2119 | if enable_memory_offload: 2120 | if memory_channels < 1: 2121 | messagebox.showerror("错误", "内存通道数必须大于0") 2122 | return False 2123 | 2124 | if memory_offload_ratio < 10 or memory_offload_ratio > 100: 2125 | messagebox.showerror("错误", "内存交换比例必须在10-100之间") 2126 | return False 2127 | 2128 | if reserved_memory < 0 or reserved_memory > 50: 2129 | messagebox.showerror("错误", "预留内存比例必须在0-50之间") 2130 | return False 2131 | 2132 | # 更新配置 2133 | self.config['model_path'] = model_path 2134 | self.config['ip'] = ip 2135 | self.config['port'] = port 2136 | self.config['gpu_count'] = gpu_count 2137 | self.config['mem_ratio'] = mem_ratio 2138 | self.config['max_tokens'] = max_tokens 2139 | self.config['max_model_len'] = max_model_len 2140 | self.config['block_size'] = block_size 2141 | 2142 | # 更新内存交换配置 2143 | self.config['enable_memory_offload'] = enable_memory_offload 2144 | self.config['memory_channels'] = memory_channels 2145 | self.config['memory_offload_ratio'] = memory_offload_ratio 2146 | self.config['reserved_memory'] = reserved_memory 2147 | 2148 | # 保存配置到文件 2149 | self.save_config() 2150 | 2151 | # 在状态栏显示配置信息 2152 | self.status_text.insert(tk.END, "\n===== 配置已更新 =====\n") 2153 | self.status_text.insert(tk.END, f"模型路径: {model_path}\n") 2154 | self.status_text.insert(tk.END, f"IP地址: {ip}, 端口: {port}\n") 2155 | self.status_text.insert(tk.END, f"GPU数量: {gpu_count}, 显存比例: {mem_ratio}%\n") 2156 | self.status_text.insert(tk.END, f"最大Token数: {max_tokens}, 最大模型长度: {max_model_len}, 块大小: {block_size}\n") 2157 | 2158 | if enable_memory_offload: 2159 | self.status_text.insert(tk.END, f"已启用内存交换: 通道数={memory_channels}, 交换比例={memory_offload_ratio}%, 预留内存={reserved_memory}%\n") 2160 | else: 2161 | self.status_text.insert(tk.END, "未启用内存交换\n") 2162 | 2163 | self.status_text.see(tk.END) 2164 | 2165 | return True 2166 | 2167 | except Exception as e: 2168 | messagebox.showerror("错误", f"更新配置失败: {str(e)}") 2169 | return False 2170 | 2171 | def validate_config(self): 2172 | """验证配置参数""" 2173 | if self.config['max_tokens'] < self.config['max_model_len']: 2174 | if not messagebox.askokcancel("警告", 2175 | "最大回复token数小于整体序列长度,这可能会影响模型性能。\n建议将max_tokens设置为不小于max_model_len。\n是否继续?"): 2176 | return False 2177 | return True 2178 | 2179 | def check_model_compatibility(self): 2180 | """检查模型与VLLM的兼容性""" 2181 | if not self.config['model_path']: 2182 | self.status_text.insert(tk.END, "错误: 未选择模型路径\n") 2183 | return False 2184 | 2185 | self.status_text.insert(tk.END, "正在检查模型兼容性...\n") 2186 | 2187 | # 检查硬件配置 2188 | self.check_hardware_configuration() 2189 | 2190 | # 检查模型文件是否存在 2191 | model_path = self.config['model_path'] 2192 | if not os.path.exists(model_path): 2193 | self.status_text.insert(tk.END, f"错误: 模型路径不存在: {model_path}\n") 2194 | return False 2195 | 2196 | # 检查必要的模型文件 2197 | required_files = [] 2198 | safetensors_found = False 2199 | bin_files_found 
= False 2200 | 2201 | # 检查是否有.safetensors文件 2202 | for root, dirs, files in os.walk(model_path): 2203 | for file in files: 2204 | if file.endswith('.safetensors'): 2205 | safetensors_found = True 2206 | self.status_text.insert(tk.END, f"找到safetensors文件: {file}\n") 2207 | elif file.endswith('.bin'): 2208 | bin_files_found = True 2209 | self.status_text.insert(tk.END, f"找到bin文件: {file}\n") 2210 | 2211 | if not (safetensors_found or bin_files_found): 2212 | self.status_text.insert(tk.END, "错误: 未找到模型权重文件(.safetensors或.bin)\n") 2213 | return False 2214 | 2215 | # 检查config.json文件 2216 | config_path = os.path.join(model_path, "config.json") 2217 | if not os.path.exists(config_path): 2218 | self.status_text.insert(tk.END, "错误: 未找到config.json文件\n") 2219 | return False 2220 | 2221 | # 检查tokenizer文件 2222 | tokenizer_files = ["tokenizer.json", "tokenizer_config.json"] 2223 | tokenizer_found = False 2224 | for file in tokenizer_files: 2225 | if os.path.exists(os.path.join(model_path, file)): 2226 | tokenizer_found = True 2227 | break 2228 | 2229 | if not tokenizer_found: 2230 | self.status_text.insert(tk.END, "警告: 未找到标准tokenizer文件,VLLM可能无法正确加载\n") 2231 | 2232 | # 读取模型配置 2233 | try: 2234 | with open(config_path, 'r') as f: 2235 | config = json.load(f) 2236 | 2237 | # 检查模型类型 2238 | model_type = config.get('model_type', '') 2239 | self.status_text.insert(tk.END, f"模型类型: {model_type}\n") 2240 | 2241 | # 检查是否是支持的模型类型 2242 | supported_types = ["llama", "mistral", "falcon", "gpt_neox", "gpt2", "bloom", "qwen", "baichuan", "chatglm", "mpt"] 2243 | if model_type.lower() not in [t.lower() for t in supported_types]: 2244 | self.status_text.insert(tk.END, f"警告: 模型类型 '{model_type}' 可能不被VLLM完全支持\n") 2245 | 2246 | # 检查模型大小 2247 | hidden_size = config.get('hidden_size', 0) 2248 | num_layers = config.get('num_hidden_layers', 0) or config.get('num_layers', 0) 2249 | vocab_size = config.get('vocab_size', 0) 2250 | 2251 | if hidden_size and num_layers: 2252 | # 粗略估计模型参数量 2253 | params_billion = (hidden_size * hidden_size * 4 * num_layers + hidden_size * vocab_size) / 1e9 2254 | self.status_text.insert(tk.END, f"估计模型参数量: {params_billion:.2f}B\n") 2255 | 2256 | # 检查是否是大模型 2257 | if params_billion > 30: 2258 | self.status_text.insert(tk.END, "警告: 这是一个较大的模型,可能需要多GPU或内存交换\n") 2259 | 2260 | # 检查特殊注意力机制 2261 | attention_type = config.get('attention_type', '') 2262 | if attention_type and attention_type not in ['scaled_dot_product', 'eager']: 2263 | self.status_text.insert(tk.END, f"警告: 特殊注意力机制 '{attention_type}' 可能不被VLLM支持\n") 2264 | 2265 | # 检查激活函数 2266 | activation_function = config.get('hidden_act', '') 2267 | if activation_function and activation_function not in ['gelu', 'gelu_new', 'relu', 'silu', 'swish']: 2268 | self.status_text.insert(tk.END, f"警告: 激活函数 '{activation_function}' 可能不被VLLM完全支持\n") 2269 | 2270 | except Exception as e: 2271 | self.status_text.insert(tk.END, f"读取模型配置时出错: {str(e)}\n") 2272 | 2273 | # 检查VLLM版本 2274 | try: 2275 | vllm_version = subprocess.run(['vllm', '--version'], capture_output=True, text=True) 2276 | version_str = vllm_version.stdout.strip() or vllm_version.stderr.strip() 2277 | self.status_text.insert(tk.END, f"VLLM版本: {version_str}\n") 2278 | 2279 | # 检查CUDA版本 2280 | if torch.cuda.is_available(): 2281 | cuda_version = torch.version.cuda 2282 | self.status_text.insert(tk.END, f"CUDA版本: {cuda_version}\n") 2283 | 2284 | # 检查GPU计算能力 2285 | capability = torch.cuda.get_device_capability() 2286 | self.status_text.insert(tk.END, f"GPU计算能力: {capability[0]}.{capability[1]}\n") 2287 | 2288 | # 
检查是否支持当前GPU 2289 | if capability[0] < 7: 2290 | self.status_text.insert(tk.END, "警告: VLLM最佳支持计算能力7.0+的GPU (V100及更新)\n") 2291 | except Exception as e: 2292 | self.status_text.insert(tk.END, f"检查VLLM版本时出错: {str(e)}\n") 2293 | 2294 | # 检查GPU内存 2295 | try: 2296 | gpus = GPUtil.getGPUs() 2297 | if gpus: 2298 | gpu = gpus[0] 2299 | gpu_memory = gpu.memoryTotal / 1024 # GB 2300 | self.status_text.insert(tk.END, f"GPU显存: {gpu_memory:.2f}GB\n") 2301 | 2302 | # 估算模型大小 2303 | model_size = self.estimate_model_size() 2304 | self.status_text.insert(tk.END, f"估计模型大小: {model_size:.2f}GB\n") 2305 | 2306 | # 检查是否需要内存交换 2307 | if model_size > gpu_memory * 0.8: 2308 | self.status_text.insert(tk.END, f"警告: 模型大小({model_size:.2f}GB)接近或超过GPU显存({gpu_memory:.2f}GB)\n") 2309 | self.status_text.insert(tk.END, "建议启用内存交换或使用多GPU\n") 2310 | 2311 | # 检查系统内存 2312 | system_memory = psutil.virtual_memory().total / (1024 * 1024 * 1024) # GB 2313 | self.status_text.insert(tk.END, f"系统内存: {system_memory:.2f}GB\n") 2314 | 2315 | if system_memory < model_size * 1.5: 2316 | self.status_text.insert(tk.END, "警告: 系统内存可能不足以进行有效的内存交换\n") 2317 | 2318 | # 检查磁盘空间(用于内存映射文件) 2319 | disk_usage = psutil.disk_usage('/') 2320 | free_disk = disk_usage.free / (1024 * 1024 * 1024) # GB 2321 | self.status_text.insert(tk.END, f"可用磁盘空间: {free_disk:.2f}GB\n") 2322 | 2323 | if free_disk < model_size * 2: 2324 | self.status_text.insert(tk.END, "警告: 磁盘空间可能不足以创建内存映射文件\n") 2325 | except Exception as e: 2326 | self.status_text.insert(tk.END, f"检查GPU内存时出错: {str(e)}\n") 2327 | 2328 | self.status_text.insert(tk.END, "模型兼容性检查完成\n") 2329 | return True 2330 | 2331 | def check_hardware_configuration(self): 2332 | """检测用户硬件配置并提供相应的优化建议""" 2333 | self.status_text.insert(tk.END, "\n===== 硬件配置检测 =====\n") 2334 | 2335 | # 检测CPU 2336 | try: 2337 | cpu_count = psutil.cpu_count(logical=False) # 物理核心数 2338 | cpu_logical = psutil.cpu_count(logical=True) # 逻辑核心数 2339 | self.status_text.insert(tk.END, f"CPU: {cpu_count}核心/{cpu_logical}线程\n") 2340 | except Exception: 2341 | pass 2342 | 2343 | # 检测内存 2344 | try: 2345 | mem = psutil.virtual_memory() 2346 | total_memory = mem.total / (1024 * 1024 * 1024) # GB 2347 | self.status_text.insert(tk.END, f"系统内存: {total_memory:.2f}GB\n") 2348 | except Exception: 2349 | pass 2350 | 2351 | # 检测GPU 2352 | try: 2353 | if torch.cuda.is_available(): 2354 | gpu_count = torch.cuda.device_count() 2355 | self.status_text.insert(tk.END, f"检测到 {gpu_count} 个GPU\n") 2356 | 2357 | for i in range(gpu_count): 2358 | gpu_name = torch.cuda.get_device_name(i) 2359 | gpu_mem = torch.cuda.get_device_properties(i).total_memory / (1024 * 1024 * 1024) # GB 2360 | self.status_text.insert(tk.END, f"GPU {i}: {gpu_name}, 显存: {gpu_mem:.2f}GB\n") 2361 | else: 2362 | self.status_text.insert(tk.END, "未检测到支持CUDA的GPU\n") 2363 | except Exception: 2364 | pass 2365 | 2366 | self.status_text.insert(tk.END, "硬件配置检测完成\n") 2367 | self.status_text.see(tk.END) 2368 | 2369 | def check_flash_attention_support(self): 2370 | """检查是否支持Flash Attention""" 2371 | try: 2372 | import torch 2373 | has_support = hasattr(torch.nn.functional, 'scaled_dot_product_attention') 2374 | return False # 暂时禁用Flash Attention功能,避免兼容性问题 2375 | except Exception: 2376 | return False 2377 | 2378 | def add_performance_monitoring(self): 2379 | """添加性能监控与自动调优功能""" 2380 | # 创建性能监控面板 2381 | self.perf_frame = ttk.LabelFrame(self.master, text="性能监控") 2382 | self.perf_frame.pack(padx=10, pady=5, fill='both') 2383 | 2384 | # 添加性能指标显示 2385 | self.perf_labels = {} 2386 | metrics = ["GPU利用率", "内存带宽", "KV缓存命中率", 
"推理速度(token/s)"] 2387 | 2388 | for i, metric in enumerate(metrics): 2389 | ttk.Label(self.perf_frame, text=f"{metric}:").grid(row=i, column=0, sticky='w') 2390 | self.perf_labels[metric] = ttk.Label(self.perf_frame, text="N/A") 2391 | self.perf_labels[metric].grid(row=i, column=1, sticky='w') 2392 | 2393 | # 添加自动调优开关 2394 | self.auto_tune_var = tk.BooleanVar(value=True) 2395 | ttk.Checkbutton(self.perf_frame, text="启用自动性能调优", variable=self.auto_tune_var).grid(row=len(metrics), column=0, columnspan=2, sticky='w') 2396 | 2397 | # 初始化性能统计变量 2398 | self.total_tokens_generated = 0 2399 | self.kv_cache_hits = 0 2400 | self.kv_cache_misses = 0 2401 | 2402 | # 启动性能监控线程 2403 | self.start_performance_monitor() 2404 | 2405 | def start_performance_monitor(self): 2406 | """启动性能监控线程""" 2407 | def monitor_loop(): 2408 | last_tokens = 0 2409 | last_time = time.time() 2410 | 2411 | while hasattr(self, 'monitoring') and self.monitoring: 2412 | try: 2413 | if hasattr(self, 'server_process') and self.server_process is not None and self.server_process.poll() is None: 2414 | # 获取GPU统计信息 2415 | gpu_stats = self.get_gpu_stats() 2416 | if gpu_stats and len(gpu_stats) > 0: 2417 | # 安全获取GPU利用率和内存利用率 2418 | gpu_util_str = gpu_stats[0].get('utilization.gpu', '0 %').replace('%', '').strip() 2419 | mem_util_str = gpu_stats[0].get('utilization.memory', '0 %').replace('%', '').strip() 2420 | 2421 | # 转换为浮点数,处理可能的转换错误 2422 | try: 2423 | gpu_util = float(gpu_util_str) 2424 | except ValueError: 2425 | gpu_util = 0 2426 | 2427 | try: 2428 | mem_util = float(mem_util_str) 2429 | except ValueError: 2430 | mem_util = 0 2431 | 2432 | # 更新性能指标标签 2433 | if 'GPU利用率' in self.perf_labels: 2434 | self.perf_labels['GPU利用率'].config(text=f"{gpu_util:.1f}%") 2435 | if '内存带宽' in self.perf_labels: 2436 | self.perf_labels['内存带宽'].config(text=f"{mem_util:.1f}%") 2437 | 2438 | # 计算并更新推理速度 2439 | now = time.time() 2440 | if now - last_time >= 5: # 每5秒更新一次 2441 | tokens_per_sec = (self.total_tokens_generated - last_tokens) / (now - last_time) 2442 | last_tokens = self.total_tokens_generated 2443 | last_time = now 2444 | 2445 | if '推理速度(token/s)' in self.perf_labels: 2446 | self.perf_labels['推理速度(token/s)'].config(text=f"{tokens_per_sec:.2f}") 2447 | 2448 | # 无日志的自动调优逻辑 - 只在服务运行且启用自动调优时执行 2449 | if hasattr(self, 'monitoring') and self.monitoring and hasattr(self, 'auto_tune_var') and self.auto_tune_var.get() and tokens_per_sec < 5.0: 2450 | # 如果GPU利用率高但内存带宽低,说明存在内存瓶颈 2451 | if gpu_util > 90 and mem_util < 30: 2452 | # 静默优化内存访问 2453 | self.optimize_memory_access() 2454 | # 如果GPU利用率低,说明存在计算瓶颈 2455 | elif gpu_util < 30: 2456 | # 静默优化GPU利用率 2457 | self.optimize_for_low_gpu_utilization() 2458 | 2459 | # 更新KV缓存命中率 2460 | if hasattr(self, 'monitoring') and self.monitoring and hasattr(self, 'kv_cache_hits') and hasattr(self, 'kv_cache_misses'): 2461 | total_kv_requests = self.kv_cache_hits + self.kv_cache_misses 2462 | if total_kv_requests > 0: 2463 | kv_hit_ratio = self.kv_cache_hits / total_kv_requests * 100 2464 | if 'KV缓存命中率' in self.perf_labels: 2465 | self.perf_labels['KV缓存命中率'].config(text=f"{kv_hit_ratio:.2f}%") 2466 | except Exception: 2467 | # 静默处理错误,不显示错误信息 2468 | pass 2469 | 2470 | # 检查监控标志 2471 | if not hasattr(self, 'monitoring') or not self.monitoring: 2472 | break 2473 | 2474 | time.sleep(1) 2475 | 2476 | # 确保monitoring属性已设置 2477 | if not hasattr(self, 'monitoring'): 2478 | self.monitoring = True 2479 | 2480 | # 启动监控线程 2481 | self.perf_monitor_thread = threading.Thread(target=monitor_loop, daemon=True) 2482 | 
self.perf_monitor_thread.start() 2483 | 2484 | def optimize_for_low_gpu_utilization(self): 2485 | """针对低GPU利用率进行优化""" 2486 | # 这个方法会在GPU利用率低于30%时被调用 2487 | 2488 | # 1. 尝试增加批处理大小 2489 | if hasattr(self, 'batch_size'): 2490 | old_batch_size = self.batch_size 2491 | self.batch_size = min(self.batch_size * 2, 32) # 最大批大小32 2492 | 2493 | # 2. 尝试预热GPU 2494 | try: 2495 | # 创建一个小的张量并执行一些操作来预热GPU 2496 | import torch 2497 | if torch.cuda.is_available(): 2498 | device = torch.device("cuda") 2499 | # 创建一个大张量并执行一些操作 2500 | x = torch.randn(1000, 1000, device=device) 2501 | for _ in range(10): 2502 | x = torch.matmul(x, x) 2503 | # 强制同步 2504 | torch.cuda.synchronize() 2505 | except Exception: 2506 | pass 2507 | 2508 | # 3. 检查并优化内存访问模式 2509 | if hasattr(self, 'multi_channel_loader'): 2510 | # 增加缓存大小 2511 | if hasattr(self.multi_channel_loader, 'max_cache_size'): 2512 | old_cache_size = self.multi_channel_loader.max_cache_size 2513 | self.multi_channel_loader.max_cache_size = min(old_cache_size * 2, 128) 2514 | 2515 | def optimize_memory_access(self): 2516 | """优化内存访问模式""" 2517 | # 1. 尝试优化多通道加载器 2518 | if hasattr(self, 'multi_channel_loader') and self.multi_channel_loader is not None: 2519 | try: 2520 | # 获取当前通道数和缓存大小 2521 | old_channels = self.multi_channel_loader.num_channels 2522 | old_cache_size = self.multi_channel_loader.max_cache_size 2523 | 2524 | # 根据系统内存情况,适当增加通道数和缓存大小 2525 | # 对于普通硬件,最大增加到8个通道 2526 | self.multi_channel_loader.num_channels = min(old_channels * 2, 8) 2527 | # 对于普通硬件,最大增加到64 2528 | self.multi_channel_loader.max_cache_size = min(old_cache_size * 2, 64) 2529 | except Exception: 2530 | pass 2531 | 2532 | # 2. 尝试优化CUDA内存分配策略 2533 | try: 2534 | # 设置环境变量以优化CUDA内存分配,但使用较小的分块大小 2535 | os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128' 2536 | except Exception: 2537 | pass 2538 | 2539 | def update_token_count(self, new_tokens): 2540 | """更新生成的token计数""" 2541 | if not hasattr(self, 'total_tokens_generated'): 2542 | self.total_tokens_generated = 0 2543 | self.total_tokens_generated += new_tokens 2544 | 2545 | def auto_optimize_performance(self): 2546 | """自动性能优化""" 2547 | try: 2548 | # 等待一段时间,确保服务器已经稳定运行 2549 | time.sleep(10) 2550 | 2551 | if not self.monitoring or not hasattr(self, 'server_process') or self.server_process is None or self.server_process.poll() is not None: 2552 | return 2553 | 2554 | # 静默执行性能优化 2555 | self.warm_up_gpu() 2556 | self.optimize_memory_access() 2557 | 2558 | except Exception: 2559 | pass 2560 | 2561 | def warm_up_gpu(self): 2562 | """预热GPU,提高性能稳定性""" 2563 | try: 2564 | # 创建一个小的张量并执行一些操作来预热GPU 2565 | import torch 2566 | if torch.cuda.is_available(): 2567 | device = torch.device("cuda") 2568 | # 创建一个大张量并执行一些操作 2569 | x = torch.randn(2000, 2000, device=device) 2570 | for _ in range(20): 2571 | x = torch.matmul(x, x) 2572 | # 强制同步 2573 | torch.cuda.synchronize() 2574 | except Exception as e: 2575 | pass 2576 | 2577 | def create_advanced_settings(self): 2578 | """创建高级性能设置区域""" 2579 | # 创建高级设置框架 2580 | advanced_frame = ttk.LabelFrame(self.master, text="高级性能设置") 2581 | advanced_frame.pack(padx=10, pady=5, fill='x') 2582 | 2583 | # 添加说明 2584 | ttk.Label(advanced_frame, text="以下设置适用于高性能硬件,请根据您的实际硬件配置谨慎调整", 2585 | foreground="red").grid(row=0, column=0, columnspan=4, sticky='w') 2586 | 2587 | # 内存缓存大小 2588 | ttk.Label(advanced_frame, text="内存缓存大小:").grid(row=1, column=0) 2589 | self.cache_size_var = tk.StringVar(value="32") 2590 | cache_size_combo = ttk.Combobox(advanced_frame, textvariable=self.cache_size_var, 2591 | values=["16", "32", 
"64", "128", "256"], width=5) 2592 | cache_size_combo.grid(row=1, column=1) 2593 | ttk.Label(advanced_frame, text="(大内存系统可增大)").grid(row=1, column=2) 2594 | 2595 | # CUDA内存分配块大小 2596 | ttk.Label(advanced_frame, text="CUDA内存分块(MB):").grid(row=2, column=0) 2597 | self.cuda_split_size_var = tk.StringVar(value="128") 2598 | cuda_split_combo = ttk.Combobox(advanced_frame, textvariable=self.cuda_split_size_var, 2599 | values=["64", "128", "256", "512"], width=5) 2600 | cuda_split_combo.grid(row=2, column=1) 2601 | ttk.Label(advanced_frame, text="(大显存GPU可增大)").grid(row=2, column=2) 2602 | 2603 | # 批处理大小 2604 | ttk.Label(advanced_frame, text="批处理大小:").grid(row=3, column=0) 2605 | self.batch_size_var = tk.StringVar(value="16") 2606 | batch_size_combo = ttk.Combobox(advanced_frame, textvariable=self.batch_size_var, 2607 | values=["8", "16", "32", "64"], width=5) 2608 | batch_size_combo.grid(row=3, column=1) 2609 | ttk.Label(advanced_frame, text="(高性能GPU可增大)").grid(row=3, column=2) 2610 | 2611 | # 检测硬件按钮 2612 | detect_hardware_button = ttk.Button(advanced_frame, text="检测硬件配置", 2613 | command=self.check_hardware_configuration) 2614 | detect_hardware_button.grid(row=4, column=0, columnspan=2, pady=5) 2615 | 2616 | # 应用高级设置按钮 2617 | apply_advanced_button = ttk.Button(advanced_frame, text="应用高级设置", 2618 | command=self.apply_advanced_settings) 2619 | apply_advanced_button.grid(row=4, column=2, columnspan=2, pady=5) 2620 | 2621 | # 添加说明 2622 | ttk.Label(advanced_frame, text="注意: 高级设置将在下次启动服务器时生效", 2623 | foreground="blue").grid(row=5, column=0, columnspan=4, sticky='w') 2624 | 2625 | # 加载已保存的高级设置 2626 | self.load_advanced_settings() 2627 | 2628 | def load_advanced_settings(self): 2629 | """加载已保存的高级设置""" 2630 | try: 2631 | # 如果配置中有高级设置,则加载 2632 | if 'advanced_cache_size' in self.config: 2633 | self.cache_size_var.set(str(self.config['advanced_cache_size'])) 2634 | if 'advanced_cuda_split_size' in self.config: 2635 | self.cuda_split_size_var.set(str(self.config['advanced_cuda_split_size'])) 2636 | if 'advanced_batch_size' in self.config: 2637 | self.batch_size_var.set(str(self.config['advanced_batch_size'])) 2638 | except Exception as e: 2639 | self.status_text.insert(tk.END, f"加载高级设置失败: {str(e)}\n") 2640 | 2641 | def apply_advanced_settings(self): 2642 | """应用高级性能设置""" 2643 | try: 2644 | # 获取高级设置值 2645 | cache_size = int(self.cache_size_var.get()) 2646 | cuda_split_size = int(self.cuda_split_size_var.get()) 2647 | batch_size = int(self.batch_size_var.get()) 2648 | 2649 | # 保存到配置 2650 | self.config['advanced_cache_size'] = cache_size 2651 | self.config['advanced_cuda_split_size'] = cuda_split_size 2652 | self.config['advanced_batch_size'] = batch_size 2653 | 2654 | # 更新配置文件 2655 | self.save_config() 2656 | 2657 | # 显示确认信息 2658 | self.status_text.insert(tk.END, "\n===== 高级设置已应用 =====\n") 2659 | self.status_text.insert(tk.END, f"内存缓存大小: {cache_size}\n") 2660 | self.status_text.insert(tk.END, f"CUDA内存分块大小: {cuda_split_size}MB\n") 2661 | self.status_text.insert(tk.END, f"批处理大小: {batch_size}\n") 2662 | self.status_text.insert(tk.END, "这些设置将在下次启动服务器时生效\n") 2663 | self.status_text.see(tk.END) 2664 | 2665 | messagebox.showinfo("成功", "高级设置已应用,将在下次启动服务器时生效") 2666 | except Exception as e: 2667 | messagebox.showerror("错误", f"应用高级设置失败: {str(e)}") 2668 | 2669 | if __name__ == "__main__": 2670 | root = tk.Tk() 2671 | app = VLLMServerGUI(root) 2672 | root.mainloop() 2673 | --------------------------------------------------------------------------------