├── LICENSE ├── R1-mem-good1.md ├── README.md ├── requirements.txt ├── vllm-gui-server-r1-loggood.py └── vllm-gui-server-r1-mem-good4.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | Requirements from open sourcers 6 | The following uses of this software are prohibited: 7 | - Direct or indirect commercial services (such as API charges, selling model weights, public cloud). 8 | - Integration into commercial products. 9 | - Private cloud deployment services for overall sales. 10 | 11 | 12 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 13 | 14 | 1. Definitions. 15 | 16 | "License" shall mean the terms and conditions for use, reproduction, 17 | and distribution as defined by Sections 1 through 9 of this document. 18 | 19 | "Licensor" shall mean the copyright owner or entity authorized by 20 | the copyright owner that is granting the License. 21 | 22 | "Legal Entity" shall mean the union of the acting entity and all 23 | other entities that control, are controlled by, or are under common 24 | control with that entity. For the purposes of this definition, 25 | "control" means (i) the power, direct or indirect, to cause the 26 | direction or management of such entity, whether by contract or 27 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 28 | outstanding shares, or (iii) beneficial ownership of such entity. 29 | 30 | "You" (or "Your") shall mean an individual or Legal Entity 31 | exercising permissions granted by this License. 32 | 33 | "Source" form shall mean the preferred form for making modifications, 34 | including but not limited to software source code, documentation 35 | source, and configuration files. 36 | 37 | "Object" form shall mean any form resulting from mechanical 38 | transformation or translation of a Source form, including but 39 | not limited to compiled object code, generated documentation, 40 | and conversions to other media types. 41 | 42 | "Work" shall mean the work of authorship, whether in Source or 43 | Object form, made available under the License, as indicated by a 44 | copyright notice that is included in or attached to the work 45 | (an example is provided in the Appendix below). 46 | 47 | "Derivative Works" shall mean any work, whether in Source or Object 48 | form, that is based on (or derived from) the Work and for which the 49 | editorial revisions, annotations, elaborations, or other modifications 50 | represent, as a whole, an original work of authorship. For the purposes 51 | of this License, Derivative Works shall not include works that remain 52 | separable from, or merely link (or bind by name) to the interfaces of, 53 | the Work and Derivative Works thereof. 54 | 55 | "Contribution" shall mean any work of authorship, including 56 | the original version of the Work and any modifications or additions 57 | to that Work or Derivative Works thereof, that is intentionally 58 | submitted to Licensor for inclusion in the Work by the copyright owner 59 | or by an individual or Legal Entity authorized to submit on behalf of 60 | the copyright owner. 
For the purposes of this definition, "submitted" 61 | means any form of electronic, verbal, or written communication sent 62 | to the Licensor or its representatives, including but not limited to 63 | communication on electronic mailing lists, source code control systems, 64 | and issue tracking systems that are managed by, or on behalf of, the 65 | Licensor for the purpose of discussing and improving the Work, but 66 | excluding communication that is conspicuously marked or otherwise 67 | designated in writing by the copyright owner as "Not a Contribution." 68 | 69 | "Contributor" shall mean Licensor and any individual or Legal Entity 70 | on behalf of whom a Contribution has been received by Licensor and 71 | subsequently incorporated within the Work. 72 | 73 | 2. Grant of Copyright License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | copyright license to reproduce, prepare Derivative Works of, 77 | publicly display, publicly perform, sublicense, and distribute the 78 | Work and such Derivative Works in Source or Object form. 79 | 80 | 3. Grant of Patent License. Subject to the terms and conditions of 81 | this License, each Contributor hereby grants to You a perpetual, 82 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 83 | (except as stated in this section) patent license to make, have made, 84 | use, offer to sell, sell, import, and otherwise transfer the Work, 85 | where such license applies only to those patent claims licensable 86 | by such Contributor that are necessarily infringed by their 87 | Contribution(s) alone or by combination of their Contribution(s) 88 | with the Work to which such Contribution(s) was submitted. If You 89 | institute patent litigation against any entity (including a 90 | cross-claim or counterclaim in a lawsuit) alleging that the Work 91 | or a Contribution incorporated within the Work constitutes direct 92 | or contributory patent infringement, then any patent licenses 93 | granted to You under this License for that Work shall terminate 94 | as of the date such litigation is filed. 95 | 96 | 4. Redistribution. 
You may reproduce and distribute copies of the 97 | Work or Derivative Works thereof in any medium, with or without 98 | modifications, and in Source or Object form, provided that You 99 | meet the following conditions: 100 | 101 | (a) You must give any other recipients of the Work or 102 | Derivative Works a copy of this License; and 103 | 104 | (b) You must cause any modified files to carry prominent notices 105 | stating that You changed the files; and 106 | 107 | (c) You must retain, in the Source form of any Derivative Works 108 | that You distribute, all copyright, patent, trademark, and 109 | attribution notices from the Source form of the Work, 110 | excluding those notices that do not pertain to any part of 111 | the Derivative Works; and 112 | 113 | (d) If the Work includes a "NOTICE" text file as part of its 114 | distribution, then any Derivative Works that You distribute must 115 | include a readable copy of the attribution notices contained 116 | within such NOTICE file, excluding those notices that do not 117 | pertain to any part of the Derivative Works, in at least one 118 | of the following places: within a NOTICE text file distributed 119 | as part of the Derivative Works; within the Source form or 120 | documentation, if provided along with the Derivative Works; or, 121 | within a display generated by the Derivative Works, if and 122 | wherever such third-party notices normally appear. The contents 123 | of the NOTICE file are for informational purposes only and 124 | do not modify the License. You may add Your own attribution 125 | notices within Derivative Works that You distribute, alongside 126 | or as an addendum to the NOTICE text from the Work, provided 127 | that such additional attribution notices cannot be construed 128 | as modifying the License. 129 | 130 | You may add Your own copyright statement to Your modifications and 131 | may provide additional or different license terms and conditions 132 | for use, reproduction, or distribution of Your modifications, or 133 | for any such Derivative Works as a whole, provided Your use, 134 | reproduction, and distribution of the Work otherwise complies with 135 | the conditions stated in this License. 136 | 137 | 5. Submission of Contributions. Unless You explicitly state otherwise, 138 | any Contribution intentionally submitted for inclusion in the Work 139 | by You to the Licensor shall be under the terms and conditions of 140 | this License, without any additional terms or conditions. 141 | Notwithstanding the above, nothing herein shall supersede or modify 142 | the terms of any separate license agreement you may have executed 143 | with Licensor regarding such Contributions. 144 | 145 | 6. Trademarks. This License does not grant permission to use the trade 146 | names, trademarks, service marks, or product names of the Licensor, 147 | except as required for reasonable and customary use in describing the 148 | origin of the Work and reproducing the content of the NOTICE file. 149 | 150 | 7. Disclaimer of Warranty. Unless required by applicable law or 151 | agreed to in writing, Licensor provides the Work (and each 152 | Contributor provides its Contributions) on an "AS IS" BASIS, 153 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 154 | implied, including, without limitation, any warranties or conditions 155 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 156 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 157 | appropriateness of using or redistributing the Work and assume any 158 | risks associated with Your exercise of permissions under this License. 159 | 160 | 8. Limitation of Liability. In no event and under no legal theory, 161 | whether in tort (including negligence), contract, or otherwise, 162 | unless required by applicable law (such as deliberate and grossly 163 | negligent acts) or agreed to in writing, shall any Contributor be 164 | liable to You for damages, including any direct, indirect, special, 165 | incidental, or consequential damages of any character arising as a 166 | result of this License or out of the use or inability to use the 167 | Work (including but not limited to damages for loss of goodwill, 168 | work stoppage, computer failure or malfunction, or any and all 169 | other commercial damages or losses), even if such Contributor 170 | has been advised of the possibility of such damages. 171 | 172 | 9. Accepting Warranty or Additional Liability. While redistributing 173 | the Work or Derivative Works thereof, You may choose to offer, 174 | and charge a fee for, acceptance of support, warranty, indemnity, 175 | or other liability obligations and/or rights consistent with this 176 | License. However, in accepting such obligations, You may act only 177 | on Your own behalf and on Your sole responsibility, not on behalf 178 | of any other Contributor, and only if You agree to indemnify, 179 | defend, and hold each Contributor harmless for any liability 180 | incurred by, or claims asserted against, such Contributor by reason 181 | of your accepting any such warranty or additional liability. 182 | 183 | END OF TERMS AND CONDITIONS 184 | 185 | APPENDIX: How to apply the Apache License to your work. 186 | 187 | To apply the Apache License to your work, attach the following 188 | boilerplate notice, with the fields enclosed by brackets "[]" 189 | replaced with your own identifying information. (Don't include 190 | the brackets!) The text should be enclosed in the appropriate 191 | comment syntax for the file format. We also recommend that a 192 | file or class name and description of purpose be included on the 193 | same "printed page" as the copyright notice for easier 194 | identification within third-party archives. 195 | 196 | Copyright [yyyy] [name of copyright owner] 197 | 198 | Licensed under the Apache License, Version 2.0 (the "License"); 199 | you may not use this file except in compliance with the License. 200 | You may obtain a copy of the License at 201 | 202 | http://www.apache.org/licenses/LICENSE-2.0 203 | 204 | Unless required by applicable law or agreed to in writing, software 205 | distributed under the License is distributed on an "AS IS" BASIS, 206 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 207 | See the License for the specific language governing permissions and 208 | limitations under the License. 
--------------------------------------------------------------------------------
/R1-mem-good1.md:
--------------------------------------------------------------------------------
# Usage Agreement (Non-Commercial Use; Learning and Exploration Welcome)

The following uses of this software are prohibited; if you must use it in these ways, obtain authorization first:
- Direct or indirect commercial services (such as paid APIs, selling model weights, public cloud offerings).
- Integration into commercial products.
- Private-cloud deployment services sold as a whole.

# Author and Contact

Author: 老谭 (Lao Tan)
Email: 10267672@qq.com
Bilibili: https://space.bilibili.com/328484347

# Programs

vllm-gui-server-r1-loggood.py (VRAM-only mode; suited to users with multiple NVIDIA cards with large VRAM)
vllm-gui-server-r1-mem-good4.py (the version with "mem" in its name; supports mixed DRAM + VRAM mode)

# Development Notes: vLLM + VRAM + DRAM

Since DeepSeek appeared, demand for personal and private deployments of the full or distilled DeepSeek models has exploded. Many people do not yet know what they will use the deployment for (I build industry applications, so I do), but they are embracing AI enthusiastically, which is great.
Most people, however, only have consumer-grade GPUs with very limited VRAM, so getting the most out of limited VRAM becomes the real challenge.

vLLM is a solid platform, stronger and faster than Ollama, but stock vLLM does not support mixed-memory model deployment.

Besides adding more GPUs, the other option is to use VRAM and system RAM together. Some call this unified memory (Unified Memory Management), but NVIDIA's hardware requirements for true unified memory are high; an ordinary PC cannot meet them without buying a new machine with at least DDR5 memory.

This project was developed on DDR3 memory: first make sure the model runs at all, then worry about making it run fast.

There is existing software that runs large models in mixed DRAM/VRAM mode, each with its own strengths. This software's characteristics are:
1. The model is first loaded completely into system RAM and then moved from RAM into VRAM, which avoids the errors that can occur when loading directly into VRAM.
2. After loading into RAM, dynamic optimization is applied at runtime.
3. A graphical interface removes the need for complicated command-line operations.
4. Cross-platform: both Windows and Ubuntu are supported.

# VLLM Server Manager (Memory-Optimized Edition)

A high-performance, GUI-based server management system built on vLLM and tuned for large language models. It provides intelligent memory management, multi-GPU support, and VRAM optimization, and is especially suited to deploying large models in resource-constrained environments.

![VLLM Server Manager](https://example.com/path/to/screenshot.png)

## Core Features

- 🚀 Intelligent memory management and CPU offloading
- 💾 Model memory swapping to break through VRAM limits
- 🖥️ Multi-GPU tensor parallelism (1-4 GPU configurations)
- 📊 Real-time GPU and system memory monitoring
- ⚙️ Intelligent parameter recommendation
- 🔄 Support for models in multiple precision formats
- 🛠️ Compatibility with command-line arguments across vLLM versions

## Memory Optimization Highlights

- **Memory swapping**: load models larger than the available GPU VRAM
- **Intelligent memory preallocation**: reduce fragmentation and speed up loading of large models
- **Real-time system resource monitoring**: adjust parameters dynamically to avoid OOM errors
- **CPU offloading**: use system RAM as a cache for model weights

## Interface Guide

### Basic Configuration
- **Model path**: select the local model folder
- **IP address / port**: set the server's listen address
- **GPU count**: number of GPUs used for inference
- **VRAM ratio**: memory utilization per GPU (0.0-1.0)
- **Max tokens**: maximum number of tokens per batch
- **Max sequence length**: largest supported context window

### KV Cache Configuration
- **Cache precision**: numeric type of the KV cache (float16/float32)
- **Block size**: number of tokens per cache block
- **Max blocks**: upper limit on the blocks allocated per GPU
- **Dynamic scaling**: enable scaling optimization across batches

### Memory Optimization Settings
- **CPU offload size**: amount of model data offloaded to CPU memory (GB)
- **Swap space**: size of the disk swap space (GB)
- **Enforce eager**: avoid out-of-memory errors during CUDA graph capture
- **Memory buffer preallocation**: preallocate memory to reduce fragmentation

## Memory and VRAM Sizing Guide

### Model Size Estimates

| Parameters | FP16 size | INT8 size | Minimum GPUs | Recommended GPUs |
|------------|-----------|-----------|--------------|------------------|
| 7B         | ~14GB     | ~7GB      | 16GB         | 1× 24GB          |
| 13B        | ~26GB     | ~13GB     | 2× 24GB      | 1× 32GB          |
| 32B        | ~64GB     | ~32GB     | 2× 40GB      | 1× 80GB          |
| 70B        | ~140GB    | ~70GB     | 2× 80GB      | 4× 80GB          |

### VRAM Usage Breakdown

For a 32B model (FP16), VRAM is allocated roughly as follows:

```
Model weights:              64GB
KV cache (2048 context):    ~2GB
Optimizer state:            not needed for inference
Gradients:                  not needed for inference
Activations:                ~1GB
CUDA kernels:               ~0.5GB
--------------------------
Total:                      ~67.5GB
```

### Memory Swap and CPU Offload Sizing

When memory swapping is enabled, you can estimate the required resources with these formulas:

```
Required GPU VRAM   = model size × (1 - CPU offload ratio) × (1 - VRAM ratio/100)
Required system RAM = model size × CPU offload ratio + buffer (~2GB)
Recommended swap    = model size × 0.2 (about 20% headroom)
```

For example, loading a 70B model (FP16) on an RTX 4090 (24GB):

```
CPU offload:  ~100GB
GPU VRAM:     ~21GB (portion processed on the GPU)
System RAM:   ~120GB
Swap space:   ~28GB
```
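As a quick aid, here is a minimal sketch that simply evaluates the formulas above as written. The function name, defaults, and the example numbers are illustrative only, and the results should be treated as rough planning estimates rather than guarantees:

```python
# Rough resource estimator based on the sizing formulas above (illustrative only).
def estimate_resources(model_size_gb: float, cpu_offload_ratio: float, vram_ratio_pct: float) -> dict:
    """Return rough GPU VRAM / system RAM / swap estimates in GB."""
    required_vram = model_size_gb * (1 - cpu_offload_ratio) * (1 - vram_ratio_pct / 100)
    required_ram = model_size_gb * cpu_offload_ratio + 2.0   # ~2GB buffer
    recommended_swap = model_size_gb * 0.2                   # ~20% headroom
    return {
        "gpu_vram_gb": round(required_vram, 1),
        "system_ram_gb": round(required_ram, 1),
        "swap_gb": round(recommended_swap, 1),
    }

if __name__ == "__main__":
    # Example: a 70B FP16 model (~140GB) with 70% CPU offload and a 90% VRAM ratio.
    print(estimate_resources(140.0, 0.70, 90))
```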
## Quick Start

```bash
# Create a virtual environment
python -m venv vvvip

# Activate it (Windows)
vvvip\Scripts\activate

# Activate it (Linux)
source vvvip/bin/activate

# Install dependencies
pip install -r requirements.txt

# On Ubuntu, tkinter must be installed separately
sudo apt-get install python3-tk

# Start the program
python vllm-gui-server-r1-mem-good4.py
```

## Recommended Configurations

### Consumer GPU (RTX 4090)
- Largest model: 13B (full FP16)
- VRAM ratio: 0.85
- CPU offload: required for larger models
- Recommended settings: use the "推荐设置" (recommended settings) button in the GUI

### Professional GPU (A100-80GB)
- Largest model: 70B (single-card FP16)
- VRAM ratio: 0.9
- Memory swap: optional, for very long contexts
- KV cache: prefer float16

### Multi-GPU (RTX 4090 × 2)
- Largest model: 35B (tensor parallel)
- GPU count: 2
- VRAM ratio: 0.8
- KV cache block size: 16

## Advanced Tips

1. **Loading large models**
   - Enable "enforce eager" to avoid out-of-memory errors during CUDA graph capture
   - Use a lower VRAM ratio (0.75-0.85) to leave headroom for the system

2. **Memory optimization**
   - Enable "memory buffer preallocation" for large models to reduce fragmentation
   - Have at least 2× the model size in system RAM

3. **Performance trade-offs**
   - Increasing the block size reduces cache-management overhead
   - Lowering the max sequence length reduces per-request memory usage

## Troubleshooting

| Problem | Solution |
|---------|----------|
| CUDA OOM error | 1. Lower the VRAM ratio 2. Enable CPU offload 3. Use "recommended settings" |
| Model fails to load | Check that the model path contains the complete weight files |
| Server fails to start | Try the fallback startup method |
| KV cache overflow | Reduce "max tokens" or increase "max blocks" |
| Out of system memory | Enable disk swap space or reduce the CPU offload ratio |

## Support

If you run into problems, open an issue or contact:
- Email: 10267672@qq.com
- [Official documentation](https://api-docs.deepseek.com/)

---

*Note: this edition is specialized for memory optimization and focuses on loading very large models beyond VRAM limits. The parameters in the GUI directly affect model loading and inference performance; set them carefully for your hardware.*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# deepseek-on-vllm-VRAMandDRAM

A vLLM-based, GUI-driven toolkit for deploying large models in mixed VRAM + DRAM mode. The VRAMandDRAM mode is somewhat slower, but it solves the problem of deploying very large models on an ordinary home computer.

# Usage Agreement (Non-Commercial Use; Learning and Exploration Welcome)

The following uses of this software are prohibited; if you must use it in these ways, obtain authorization first:
- Direct or indirect commercial services (such as paid APIs, selling model weights, public cloud offerings).
- Integration into commercial products.
- Private-cloud deployment services sold as a whole.

# Author and Contact

Author: 老谭 (Lao Tan)
Email: 10267672@qq.com
Bilibili: https://space.bilibili.com/328484347

# Programs

vllm-gui-server-r1-loggood.py (VRAM-only mode; suited to users with multiple NVIDIA cards with large VRAM)
vllm-gui-server-r1-mem-good4.py (the version with "mem" in its name; supports mixed DRAM + VRAM mode)

# Development Notes: vLLM + VRAM + DRAM

Since DeepSeek appeared, demand for personal and private deployments of the full or distilled DeepSeek models has exploded. Many people do not yet know what they will use the deployment for (I build industry applications, so I do), but they are embracing AI enthusiastically, which is great.
Most people, however, only have consumer-grade GPUs with very limited VRAM, so getting the most out of limited VRAM becomes the real challenge.

vLLM is a solid platform, stronger and faster than Ollama, but stock vLLM does not support mixed-memory model deployment.

Besides adding more GPUs, the other option is to use VRAM and system RAM together. Some call this unified memory (Unified Memory Management), but NVIDIA's hardware requirements for true unified memory are high; an ordinary PC cannot meet them without buying a new machine with at least DDR5 memory.

This project was developed on DDR3 memory: first make sure the model runs at all, then worry about making it run fast.

There is existing software that runs large models in mixed DRAM/VRAM mode, each with its own strengths. This software's characteristics are:
1. The model is first loaded completely into system RAM and then moved from RAM into VRAM, which avoids the errors that can occur when loading directly into VRAM.
2. After loading into RAM, dynamic optimization is applied at runtime.
3. A graphical interface removes the need for complicated command-line operations.
4. Cross-platform: both Windows and Ubuntu are supported.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# Core dependencies
torch>=2.2.0
vllm>=0.3.3
transformers>=4.38.1
accelerate>=0.27.0
tiktoken>=0.6.0

# GUI dependency
# tkinter (bundled with Python on Windows; on Ubuntu install the system package below)

# System packages - run before installing:
# sudo apt-get update
# sudo apt-get install python3-tk

numpy>=1.26.4
sentencepiece>=0.1.99
tqdm>=4.66.1
fsspec>=2024.2.0
typing-extensions>=4.9.0
psutil>=5.9.8

requests>=2.31.0

# Build tools, if compilation is required:
# sudo apt-get update
# sudo apt-get install python3-dev build-essential
# or: sudo apt install python3.10-dev

# System monitoring
GPUtil>=1.4.0

# The GUI also imports pynvml for GPU stats (pip install nvidia-ml-py3)

# pip install bitsandbytes==0.41.1
--------------------------------------------------------------------------------
/vllm-gui-server-r1-loggood.py:
--------------------------------------------------------------------------------
1 | """
2 | VLLM GUI服务器 - 版本 R1
3 | 最近更新: 2025-03-06
4 
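Example client call (illustrative): once the server is running, it serves vLLM's
OpenAI-compatible HTTP API at http://<ip>:<port>/v1. The snippet below is a minimal
sketch using the `requests` package from requirements.txt; the host, port, and model
name are placeholders (the served model name normally matches the path passed to
`vllm serve` and can be listed via GET /v1/models):

    import requests

    api_base = "http://127.0.0.1:8000/v1"          # host/port as configured in the GUI
    model = requests.get(f"{api_base}/models").json()["data"][0]["id"]
    resp = requests.post(
        f"{api_base}/chat/completions",
        json={
            "model": model,
            "messages": [{"role": "user", "content": "Hello"}],
            "max_tokens": 128,
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])

Note that a plain GET on /v1 itself returns 404 by design; use the POST endpoints above.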
| 5 | 更新内容: 6 | 1. 修复了内存交换参数设置,确保与VLLM最新文档一致 7 | 2. 修正了--swap-space参数,使用正确的GiB单位 8 | 3. 更新了CPU卸载参数,使用--cpu-offload-gb替代旧版参数 9 | 4. 改进了参数检测逻辑,支持不同版本的VLLM 10 | 5. 增强了备用启动方法,提供更多参数组合选项 11 | 6. 修复了model_size变量未定义的问题 12 | 7. 添加了--enforce-eager参数,避免CUDA图捕获阶段的内存不足 13 | 8. 优化了update_config函数,确保配置参数正确应用 14 | 9. 改进了推荐设置功能,根据模型大小和硬件条件提供更合理的配置 15 | 16 | 注意: 此版本支持VLLM的内存交换功能,允许加载超出GPU显存大小的模型 17 | 18 | 预期效果 19 | 这些优化应该能够显著提高您的LLM推理速度,特别是: 20 | 多通道加载器优化应该能减轻DDR3内存的带宽限制 21 | KV缓存优化应该能提高缓存命中率,减少重复计算 22 | GPU利用率优化应该能让GPU发挥更大作用 23 | 启动参数优化应该能更好地配置vLLM服务器 24 | 如果您在使用过程中发现任何问题,或者需要进一步的优化,请随时告诉我 25 | """ 26 | 27 | import tkinter as tk 28 | from tkinter import ttk, filedialog, messagebox 29 | import socket 30 | import json 31 | import threading 32 | from vllm import AsyncLLMEngine, SamplingParams 33 | import subprocess 34 | import GPUtil 35 | import time 36 | import os 37 | import torch 38 | import psutil 39 | import mmap 40 | import sys 41 | from datetime import datetime 42 | import re 43 | import pynvml 44 | 45 | class VLLMServerGUI: 46 | def __init__(self, master): 47 | self.master = master 48 | master.title("VLLM-DRAM-VRAM Server Manager") 49 | 50 | # 配置参数存储 51 | self.config = { 52 | 'model_path': '', 53 | 'ip': self.get_local_ip(), 54 | 'port': 8000, 55 | 'gpu_count': 1, 56 | 'mem_ratio': 95, # 提高显存使用率 57 | 'max_tokens': 4096, # 增加最大token数 58 | 'kv_dtype': 'float16', 59 | 'block_size': 16, 60 | 'max_blocks': '', 61 | 'calculate_scales': True, 62 | 'max_model_len': 4096, # 减小max_model_len以节省内存 63 | # 内存交换相关配置 64 | 'enable_memory_offload': True, # 默认启用内存交换 65 | 'memory_offload_ratio': 70, # 增加内存交换比例 66 | 'memory_channels': 4, 67 | 'reserved_memory': 20 68 | } 69 | 70 | # 服务器进程 71 | self.server_process = None 72 | 73 | # API地址 74 | self.api_address = None 75 | 76 | # 主界面布局 77 | self.create_widgets() 78 | 79 | # 加载配置 80 | self.load_config() 81 | 82 | # 专业监控标志 83 | self.monitoring = True 84 | # 启动GPU监控线程 85 | threading.Thread(target=self.update_gpu_stats, daemon=True).start() 86 | 87 | self.api_server_started = False 88 | self.model_loaded = False 89 | self.model_path = "" 90 | self.performance_optimized = False 91 | self.memory_channel_info_displayed = False # 新增标志,用于跟踪内存交换通道信息是否已显示 92 | self.cache_hit_info_displayed = False # 新增标志,用于跟踪缓存命中率信息是否已显示 93 | self.kv_cache_info_displayed = False # 新增标志,用于跟踪KV缓存命中率信息是否已显示 94 | 95 | def get_local_ip(self): 96 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 97 | try: 98 | s.connect(('10.255.255.255', 1)) 99 | IP = s.getsockname()[0] 100 | except Exception: 101 | IP = '127.0.0.1' 102 | finally: 103 | s.close() 104 | return IP 105 | 106 | def create_widgets(self): 107 | # 基本配置区域 108 | self.config_frame = ttk.LabelFrame(self.master, text="基本配置") 109 | self.config_frame.pack(padx=10, pady=5, fill='x') 110 | 111 | # 模型路径 112 | ttk.Label(self.config_frame, text="模型路径:").grid(row=0, column=0) 113 | self.model_path_entry = ttk.Entry(self.config_frame, width=50) 114 | self.model_path_entry.grid(row=0, column=1) 115 | self.model_path_entry.insert(0, self.config['model_path']) 116 | ttk.Button(self.config_frame, text="浏览", command=self.select_model_path).grid(row=0, column=2) 117 | 118 | # 添加保存配置按钮和推荐设置按钮 119 | save_config_button = ttk.Button(self.config_frame, text="保存配置", command=self.save_config_with_message) 120 | save_config_button.grid(row=0, column=3, padx=5) 121 | recommend_button = ttk.Button(self.config_frame, text="推荐设置", command=self.recommend_settings) 122 | recommend_button.grid(row=0, column=4, padx=5) 123 | 124 | # IP地址 125 | ttk.Label(self.config_frame, 
text="IP地址:").grid(row=1, column=0) 126 | self.ip_entry = ttk.Entry(self.config_frame) 127 | self.ip_entry.grid(row=1, column=1, sticky='w') 128 | self.ip_entry.insert(0, self.config['ip']) 129 | 130 | # 端口 131 | ttk.Label(self.config_frame, text="端口:").grid(row=2, column=0) 132 | self.port_entry = ttk.Entry(self.config_frame) 133 | self.port_entry.grid(row=2, column=1, sticky='w') 134 | self.port_entry.insert(0, str(self.config['port'])) 135 | 136 | # GPU数量 137 | ttk.Label(self.config_frame, text="GPU数量:").grid(row=3, column=0) 138 | self.gpu_count_var = tk.StringVar(value=str(self.config['gpu_count'])) 139 | gpu_count_combo = ttk.Combobox(self.config_frame, textvariable=self.gpu_count_var, 140 | values=["1", "2", "3", "4"], width=5) 141 | gpu_count_combo.grid(row=3, column=1, sticky='w') 142 | 143 | # 显存比例 144 | ttk.Label(self.config_frame, text="显存比例(%):").grid(row=4, column=0) 145 | self.mem_ratio_entry = ttk.Entry(self.config_frame) 146 | self.mem_ratio_entry.grid(row=4, column=1, sticky='w') 147 | self.mem_ratio_entry.insert(0, str(self.config['mem_ratio'])) 148 | 149 | # 最大Token数 150 | ttk.Label(self.config_frame, text="最大Token数:").grid(row=5, column=0) 151 | self.max_tokens_var = tk.StringVar(value=str(self.config['max_tokens'])) 152 | ttk.Entry(self.config_frame, textvariable=self.max_tokens_var, width=8).grid(row=5, column=1) 153 | ttk.Label(self.config_frame, text="(回复token数应不小于整体序列长度)", foreground="gray").grid(row=6, column=0, columnspan=2, sticky='w') 154 | 155 | # 最大序列长度 156 | ttk.Label(self.config_frame, text="最大序列长度:").grid(row=5, column=2) 157 | self.max_model_len_var = tk.StringVar(value=str(self.config['max_model_len'])) 158 | max_model_len_combo = ttk.Combobox(self.config_frame, textvariable=self.max_model_len_var, 159 | values=["2048", "4096", "8192", "16384", "32768", "65536"], width=8) 160 | max_model_len_combo.grid(row=5, column=3) 161 | ttk.Label(self.config_frame, text="(请根据硬件条件选择合适参数)", foreground="gray").grid(row=6, column=2, columnspan=2, sticky='w') 162 | 163 | # KV缓存配置 164 | cache_frame = ttk.LabelFrame(self.config_frame, text="KV缓存配置") 165 | cache_frame.grid(row=7, column=0, columnspan=3, sticky="ew", pady=5) 166 | 167 | # 缓存精度 168 | ttk.Label(cache_frame, text="缓存精度:").grid(row=0, column=0) 169 | self.kv_dtype_var = tk.StringVar(value=self.config['kv_dtype']) 170 | ttk.Combobox(cache_frame, textvariable=self.kv_dtype_var, 171 | values=["float16", "float32"], width=8).grid(row=0, column=1) 172 | 173 | # 缓存块大小 174 | ttk.Label(cache_frame, text="块大小(tokens):").grid(row=0, column=2) 175 | self.block_size_var = tk.StringVar(value=str(self.config['block_size'])) 176 | ttk.Entry(cache_frame, textvariable=self.block_size_var, width=8).grid(row=0, column=3) 177 | 178 | # 最大缓存块数 179 | ttk.Label(cache_frame, text="最大块数:").grid(row=1, column=0) 180 | self.max_blocks_var = tk.StringVar(value=str(self.config['max_blocks'])) 181 | ttk.Entry(cache_frame, textvariable=self.max_blocks_var, width=8).grid(row=1, column=1) 182 | ttk.Label(cache_frame, text="(留空为自动)").grid(row=1, column=2) 183 | 184 | # 动态缩放选项 185 | self.calculate_scales_var = tk.BooleanVar(value=self.config['calculate_scales']) 186 | ttk.Checkbutton(cache_frame, text="启用动态缩放", 187 | variable=self.calculate_scales_var).grid(row=1, column=3) 188 | 189 | # 添加高级性能设置区域 190 | self.create_advanced_settings() 191 | 192 | # 监控面板 193 | monitor_frame = ttk.LabelFrame(self.master, text="GPU监控") 194 | monitor_frame.pack(padx=10, pady=5, fill='both', expand=True) 195 | 196 | # GPU状态显示 197 | columns = ('GPU', '显存使用率', 
'GPU使用率', '温度', '功耗', 'KV缓存命中率') 198 | self.gpu_tree = ttk.Treeview(monitor_frame, columns=columns, show='headings') 199 | for col in columns: 200 | self.gpu_tree.heading(col, text=col) 201 | self.gpu_tree.column(col, width=100) 202 | self.gpu_tree.pack(fill='both', expand=True) 203 | 204 | # 状态显示区域 205 | self.status_text = tk.Text(monitor_frame, height=10) 206 | self.status_text.pack(fill='both') 207 | 208 | # 服务器控制按钮 209 | button_frame = ttk.Frame(self.config_frame) 210 | button_frame.grid(row=8, column=0, columnspan=3, pady=5) 211 | ttk.Button(button_frame, text="启动服务器", command=self.start_server).grid(row=0, column=0, padx=5) 212 | ttk.Button(button_frame, text="停止服务器", command=self.stop_server).grid(row=0, column=1, padx=5) 213 | 214 | # API地址显示 215 | self.api_label = ttk.Label(self.config_frame, text="API地址:") 216 | self.api_label.grid(row=9, column=0, columnspan=3) 217 | 218 | # 添加内存交换配置框架 219 | offload_frame = ttk.LabelFrame(self.config_frame, text="内存交换配置") 220 | offload_frame.grid(row=10, column=0, columnspan=3, sticky="ew", pady=5) 221 | 222 | # 启用内存交换选项 223 | self.enable_offload_var = tk.BooleanVar(value=self.config['enable_memory_offload']) 224 | ttk.Checkbutton(offload_frame, text="启用内存交换", 225 | variable=self.enable_offload_var).grid(row=0, column=0) 226 | 227 | # 内存通道数量 228 | ttk.Label(offload_frame, text="内存通道数:").grid(row=0, column=1) 229 | self.memory_channels_var = tk.StringVar(value=str(self.config['memory_channels'])) 230 | ttk.Combobox(offload_frame, textvariable=self.memory_channels_var, 231 | values=["2", "4", "8", "16"], width=5).grid(row=0, column=2) 232 | 233 | # 内存交换比例 234 | ttk.Label(offload_frame, text="内存交换比例(%):").grid(row=1, column=0) 235 | self.memory_offload_ratio_var = tk.StringVar(value=str(self.config['memory_offload_ratio'])) 236 | ttk.Entry(offload_frame, textvariable=self.memory_offload_ratio_var, width=5).grid(row=1, column=1) 237 | 238 | # 预留系统内存比例 239 | ttk.Label(offload_frame, text="系统内存预留(%):").grid(row=1, column=2) 240 | self.reserved_memory_var = tk.StringVar(value=str(self.config['reserved_memory'])) 241 | ttk.Entry(offload_frame, textvariable=self.reserved_memory_var, width=5).grid(row=1, column=3) 242 | 243 | # 添加高级说明 244 | ttk.Label(offload_frame, text="(启用后可加载超出显存的大模型,但会降低推理速度)", 245 | foreground="gray").grid(row=2, column=0, columnspan=4, sticky='w') 246 | 247 | # 添加"检查兼容性"按钮 248 | self.check_compatibility_button = ttk.Button( 249 | self.config_frame, 250 | text="检查兼容性", 251 | command=self.check_model_compatibility 252 | ) 253 | self.check_compatibility_button.grid(row=1, column=3, padx=5, pady=5, sticky="w") 254 | 255 | # 添加性能监控面板 256 | self.add_performance_monitoring() 257 | 258 | def select_model_path(self): 259 | path = filedialog.askdirectory() 260 | if path: 261 | self.config['model_path'] = path 262 | self.model_path_entry.delete(0, tk.END) # 清除当前内容 263 | self.model_path_entry.insert(0, path) # 插入新路径 264 | 265 | def start_server(self): 266 | """启动VLLM服务器""" 267 | if not self.config['model_path']: 268 | messagebox.showerror("错误", "请先选择模型路径") 269 | return 270 | 271 | if hasattr(self, 'server_process') and self.server_process and self.server_process.poll() is None: 272 | messagebox.showinfo("提示", "服务器已经在运行") 273 | return 274 | 275 | # 检查模型兼容性 276 | if not self.check_model_compatibility(): 277 | if not messagebox.askokcancel("警告", "模型兼容性检查发现潜在问题,是否继续启动服务器?"): 278 | return 279 | 280 | # 清理GPU内存 281 | self.clean_gpu_memory() 282 | 283 | # 设置环境变量以避免内存碎片问题 284 | env = os.environ.copy() 285 | 286 | # 应用高级设置中的CUDA内存分块大小 287 | 
cuda_split_size = self.config.get('advanced_cuda_split_size', 128) # 默认128MB 288 | env['PYTORCH_CUDA_ALLOC_CONF'] = f'expandable_segments:True,max_split_size_mb:{cuda_split_size}' 289 | self.status_text.insert(tk.END, f"CUDA内存分块大小: {cuda_split_size}MB\n") 290 | 291 | env['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in range(self.config['gpu_count'])]) 292 | env['OMP_NUM_THREADS'] = '4' # 限制OpenMP线程数 293 | env['MKL_NUM_THREADS'] = '4' # 限制MKL线程数 294 | 295 | # 添加性能优化环境变量 296 | env['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' # 优化CUDA连接 297 | env['NCCL_P2P_DISABLE'] = '1' # 对于单GPU,禁用P2P可能提高性能 298 | env['CUDA_AUTO_BOOST'] = '1' # 启用GPU自动提升频率 299 | env['VLLM_USE_ASYNC_CUDA_MALLOC'] = '1' # 使用异步CUDA内存分配 300 | # 获取系统内存大小 301 | system_memory = psutil.virtual_memory().total / (1024 * 1024 * 1024) # GB 302 | # 根据硬件情况选择是否启用内存高效线性层 303 | if system_memory > 16: # 只有在系统内存足够时才启用 304 | env['VLLM_ENABLE_MEMORY_EFFICIENT_LINEAR'] = '1' # 启用内存高效线性层 305 | 306 | # 记录启动信息 307 | self.status_text.insert(tk.END, "\n===== 启动服务器 =====\n") 308 | self.status_text.insert(tk.END, f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") 309 | self.status_text.insert(tk.END, f"模型路径: {self.config['model_path']}\n") 310 | self.status_text.insert(tk.END, f"GPU数量: {self.config['gpu_count']}\n") 311 | self.status_text.insert(tk.END, f"显存比例: {self.config['mem_ratio']}%\n") 312 | 313 | # 检查GPU监控线程 314 | if not self.monitoring: 315 | self.monitoring = True 316 | threading.Thread(target=self.update_gpu_stats, daemon=True).start() 317 | 318 | # 保存配置 319 | self.save_config() 320 | 321 | # 预先分配内存空间,防止运行时内存不足 322 | self.preallocate_memory_buffer() 323 | 324 | # 初始化KV缓存监控 325 | self.kv_cache_hits = 0 326 | self.kv_cache_misses = 0 327 | 328 | # 检查是否需要内存交换 329 | if self.config['enable_memory_offload']: 330 | try: 331 | self.status_text.insert(tk.END, "正在设置内存交换...\n") 332 | 333 | # 计算模型大小 334 | model_size = self.estimate_model_size() 335 | 336 | # 获取可用显存 337 | available_vram = self.get_available_vram(use_ratio=self.config['mem_ratio'] / 100) 338 | 339 | self.status_text.insert(tk.END, f"模型大小: {model_size:.2f}GB, 可用显存: {available_vram:.2f}GB\n") 340 | 341 | # 计算需要卸载的内存大小 342 | offload_ratio = self.config['memory_offload_ratio'] / 100 343 | initial_offload_size = model_size * offload_ratio 344 | 345 | self.status_text.insert(tk.END, f"将卸载 {initial_offload_size:.2f}GB 到系统内存 (比例: {self.config['memory_offload_ratio']}%)\n") 346 | 347 | # 设置内存映射文件 348 | self.setup_memory_offload(model_size, offload_ratio) 349 | 350 | # 检查VLLM支持的参数 351 | self.status_text.insert(tk.END, "检查VLLM支持的参数...\n") 352 | 353 | # 计算可用系统内存(考虑预留比例) 354 | available_memory = self.get_available_system_memory() 355 | reserved_ratio = self.config['reserved_memory'] / 100 356 | safe_memory = available_memory * (1 - reserved_ratio) 357 | 358 | # 获取实际分配的内存大小 359 | actual_offload_size = 0 360 | if hasattr(self, 'mm') and self.mm: 361 | try: 362 | # 获取内存映射文件大小 363 | map_file = os.path.join(os.getcwd(), "model_offload", "model_offload.bin") 364 | if os.path.exists(map_file): 365 | actual_offload_size = os.path.getsize(map_file) / (1024 * 1024 * 1024) 366 | self.status_text.insert(tk.END, f"实际分配的内存映射大小: {actual_offload_size:.2f}GB\n") 367 | except Exception as e: 368 | self.status_text.insert(tk.END, f"获取内存映射大小失败: {str(e)}\n") 369 | 370 | # 动态调整所需的内存大小 371 | min_required_size = min(18, model_size * 0.8) # 至少需要模型大小的80% 372 | 373 | if actual_offload_size < min_required_size: 374 | self.status_text.insert(tk.END, f"警告: 实际分配的内存映射大小不足{min_required_size:.1f}GB,可能无法加载模型\n") 375 | if not 
messagebox.askokcancel("警告", 376 | f"实际分配的内存映射大小仅为{actual_offload_size:.2f}GB,建议至少{min_required_size:.1f}GB。\n是否继续?"): 377 | return False 378 | 379 | # 计算合理的交换空间大小 - 根据模型大小动态调整 380 | # 对于小模型(<10GB),使用较小的交换空间 381 | if model_size < 10: 382 | swap_size = max(2.0, model_size * 0.1) 383 | else: 384 | # 对于大模型,使用更大的交换空间 385 | swap_size = max(4.0, model_size * 0.15) 386 | 387 | # 确保不超过安全内存的20% 388 | swap_size = min(swap_size, safe_memory * 0.2) 389 | 390 | # 计算合理的CPU卸载大小 - 根据模型大小和可用显存动态调整 391 | available_vram = self.get_available_vram(use_ratio=self.config['mem_ratio'] / 100) 392 | 393 | # 如果模型大小超过可用显存,计算需要卸载的部分 394 | if model_size > available_vram: 395 | # 需要卸载的大小 = 模型大小 - 可用显存 + 额外缓冲区(1GB) 396 | min_offload_size = model_size - available_vram + 1.0 397 | # 确保至少卸载模型的60% 398 | offload_size = max(min_offload_size, model_size * 0.6) 399 | else: 400 | # 如果模型可以完全放入显存,仍然卸载一部分以提高稳定性 401 | offload_size = model_size * 0.3 402 | 403 | # 确保不超过安全内存的70% 404 | offload_size = min(offload_size, safe_memory * 0.7) 405 | 406 | # 计算总内存使用 407 | total_mem_usage = swap_size + offload_size 408 | mem_usage_ratio = total_mem_usage / safe_memory * 100 409 | 410 | self.status_text.insert(tk.END, f"可用系统内存: {available_memory:.2f}GB, 安全内存: {safe_memory:.2f}GB\n") 411 | self.status_text.insert(tk.END, f"计算交换空间: {swap_size:.2f}GB, CPU卸载: {offload_size:.2f}GB\n") 412 | self.status_text.insert(tk.END, f"总内存使用: {total_mem_usage:.2f}GB (安全内存的{mem_usage_ratio:.1f}%)\n") 413 | 414 | # 确保max_num_batched_tokens大于等于max_num_seqs 415 | max_tokens = max(self.config['max_tokens'], 256) # 确保至少为256 416 | 417 | # 构建命令 418 | cmd = [ 419 | 'vllm', 'serve', 420 | self.config['model_path'], 421 | '--host', self.config['ip'], 422 | '--port', str(self.config['port']), 423 | '--tensor-parallel-size', str(self.config['gpu_count']), 424 | '--gpu-memory-utilization', str(self.config['mem_ratio'] / 100), 425 | '--max-num-batched-tokens', str(max_tokens), 426 | '--block-size', str(self.config['block_size']), 427 | '--max-model-len', str(self.config['max_model_len']), 428 | '--dtype', 'half' # 强制使用half精度 429 | ] 430 | 431 | # 添加最大块数(如果指定) 432 | if self.config['max_blocks']: 433 | cmd.extend(['--num-gpu-blocks', self.config['max_blocks']]) 434 | 435 | # 添加交换空间参数 436 | swap_param = f"{swap_size:.2f}" # 移除GiB单位,只使用数字 437 | cmd.extend(['--swap-space', swap_param]) 438 | self.status_text.insert(tk.END, f"添加交换空间参数: --swap-space {swap_param} (GB)\n") 439 | 440 | # 添加CPU卸载参数 441 | offload_param = f"{offload_size:.2f}" # 移除GB单位,只使用数字 442 | cmd.extend(['--cpu-offload-gb', offload_param]) 443 | self.status_text.insert(tk.END, f"添加CPU卸载参数: --cpu-offload-gb {offload_param} (GB)\n") 444 | 445 | # 添加强制使用eager模式,避免CUDA图捕获阶段的内存不足 446 | cmd.append('--enforce-eager') 447 | self.status_text.insert(tk.END, "添加强制eager模式参数: --enforce-eager\n") 448 | 449 | self.status_text.insert(tk.END, f"已启用内存交换,可用CPU内存: {safe_memory:.2f}GB\n") 450 | 451 | # 记录完整命令 452 | cmd_str = ' '.join(cmd) 453 | self.status_text.insert(tk.END, f"完整命令: {cmd_str}\n") 454 | self.status_text.see(tk.END) 455 | 456 | except Exception as e: 457 | self.status_text.insert(tk.END, f"设置内存交换时出错: {str(e)}\n") 458 | import traceback 459 | self.status_text.insert(tk.END, traceback.format_exc()) 460 | if not messagebox.askokcancel("错误", 461 | f"设置内存交换时出错: {str(e)}\n是否继续启动服务器(不使用内存交换)?"): 462 | return 463 | 464 | # 如果内存交换设置失败,使用基本命令 465 | max_tokens = max(self.config['max_tokens'], 256) # 确保至少为256 466 | cmd = [ 467 | 'vllm', 'serve', 468 | self.config['model_path'], 469 | '--host', self.config['ip'], 470 | '--port', 
str(self.config['port']), 471 | '--tensor-parallel-size', str(self.config['gpu_count']), 472 | '--gpu-memory-utilization', str(self.config['mem_ratio'] / 100), 473 | '--max-num-batched-tokens', str(max_tokens), 474 | '--block-size', str(self.config['block_size']), 475 | '--max-model-len', str(self.config['max_model_len']), 476 | '--dtype', 'half', # 强制使用half精度 477 | '--enforce-eager' # 添加强制使用eager模式,避免CUDA图捕获阶段的内存不足 478 | ] 479 | else: 480 | # 如果不需要内存交换,使用基本命令 481 | max_tokens = max(self.config['max_tokens'], 256) # 确保至少为256 482 | cmd = [ 483 | 'vllm', 'serve', 484 | self.config['model_path'], 485 | '--host', self.config['ip'], 486 | '--port', str(self.config['port']), 487 | '--tensor-parallel-size', str(self.config['gpu_count']), 488 | '--gpu-memory-utilization', str(self.config['mem_ratio'] / 100), 489 | '--max-num-batched-tokens', str(max_tokens), 490 | '--block-size', str(self.config['block_size']), 491 | '--max-model-len', str(self.config['max_model_len']), 492 | '--dtype', 'half', # 强制使用half精度 493 | '--enforce-eager' # 添加强制使用eager模式,避免CUDA图捕获阶段的内存不足 494 | ] 495 | 496 | # 添加性能优化参数 497 | performance_args = [ 498 | '--max-num-seqs', '32', # 增加最大序列数 499 | '--disable-log-stats', # 禁用统计日志,减少开销 500 | '--kv-cache-dtype', 'auto', # 使用自动选择KV缓存精度 501 | '--trust-remote-code' # 信任远程代码,支持更多模型 502 | ] 503 | 504 | # 应用高级设置中的批处理大小 505 | batch_size = self.config.get('advanced_batch_size', 16) # 默认16 506 | performance_args.extend(['--max-num-batched-tokens', str(max(batch_size * 256, max_tokens))]) 507 | self.status_text.insert(tk.END, f"批处理大小: {batch_size}\n") 508 | 509 | # 添加内存带宽优化参数 510 | if int(self.block_size_var.get()) < 32: 511 | # 如果块大小小于32,建议增加到32以提高内存带宽利用率 512 | self.status_text.insert(tk.END, f"注意: 当前块大小({self.block_size_var.get()})较小,可能影响内存带宽利用率\n") 513 | self.status_text.insert(tk.END, "建议使用更大的块大小(32-64)以提高内存带宽利用率\n") 514 | 515 | # 检查是否支持Flash Attention 516 | if self.check_flash_attention_support(): 517 | performance_args.append('--enable-chunked-prefill') 518 | self.status_text.insert(tk.END, "启用分块预填充优化\n") 519 | 520 | # 添加性能参数到命令 521 | cmd.extend(performance_args) 522 | 523 | # 异步启动服务器 524 | try: 525 | self.status_text.insert(tk.END, "正在启动服务器进程...\n") 526 | 527 | self.server_process = subprocess.Popen( 528 | cmd, 529 | stdout=subprocess.PIPE, 530 | stderr=subprocess.STDOUT, 531 | env=env # 使用修改后的环境变量 532 | ) 533 | 534 | # 等待一小段时间,检查进程是否立即退出 535 | time.sleep(1) 536 | if self.server_process.poll() is not None: 537 | # 进程已退出,获取输出 538 | output, _ = self.server_process.communicate() 539 | error_msg = f"启动服务器失败: {output.decode()}" 540 | self.status_text.insert(tk.END, f"{error_msg}\n") 541 | 542 | # 尝试使用备用方法 543 | return self.fallback_start_server(error_msg) 544 | 545 | # 启动监控线程 546 | threading.Thread(target=self.monitor_server_output).start() 547 | 548 | # 更新API地址 549 | # 说明:GET /v1返回404是正常现象,请使用支持POST的具体API endpoint进行请求 550 | api_base = f"http://{self.config['ip']}:{self.config['port']}/v1" 551 | self.api_label.config(text=f"API地址: {api_base}") 552 | self.status_text.insert(tk.END, f"\n服务器启动中...\nAPI地址: {api_base}\n") 553 | self.status_text.see(tk.END) 554 | 555 | return True 556 | 557 | except Exception as e: 558 | error_msg = f"启动服务器失败: {str(e)}" 559 | self.status_text.insert(tk.END, f"{error_msg}\n") 560 | import traceback 561 | self.status_text.insert(tk.END, traceback.format_exc()) 562 | 563 | # 尝试使用备用方法 564 | return self.fallback_start_server(error_msg) 565 | 566 | def stop_server(self): 567 | try: 568 | # 先停止所有监控线程 569 | self.monitoring = False 570 | # 等待一小段时间让线程有机会退出 571 | 
time.sleep(0.5) 572 | 573 | if hasattr(self, 'server_process') and self.server_process and self.server_process.poll() is None: 574 | self.server_process.terminate() 575 | try: 576 | self.server_process.wait(timeout=5) 577 | self.status_text.insert(tk.END, "\n服务器已停止.\n") 578 | except subprocess.TimeoutExpired: 579 | self.status_text.insert(tk.END, "\n停止服务器超时,但服务器可能已停止.\n") 580 | else: 581 | self.status_text.insert(tk.END, "\n服务器未在运行.\n") 582 | 583 | # 清理内存映射资源 584 | self.cleanup_memory_offload() 585 | 586 | except Exception as e: 587 | messagebox.showerror("错误", f"停止服务器失败: {str(e)}") 588 | finally: 589 | # 确保监控标志被设置为False 590 | self.monitoring = False 591 | # 禁用自动调优 592 | if hasattr(self, 'auto_tune_var'): 593 | self.auto_tune_var.set(False) 594 | self.api_label.config(text="API地址: 服务器未启动") 595 | 596 | def cleanup_memory_offload(self): 597 | """清理内存映射资源""" 598 | try: 599 | # 清理内存缓冲区 600 | self.cleanup_memory_buffer() 601 | 602 | # 清理多通道加载器 603 | if hasattr(self, 'multi_channel_loader') and self.multi_channel_loader is not None: 604 | try: 605 | # 调用加载器的close方法 606 | if hasattr(self.multi_channel_loader, 'close'): 607 | self.multi_channel_loader.close() 608 | self.multi_channel_loader = None 609 | self.status_text.insert(tk.END, "多通道加载器已关闭\n") 610 | except Exception as e: 611 | self.status_text.insert(tk.END, f"关闭多通道加载器时出错: {str(e)}\n") 612 | elif hasattr(self, 'channel_loaders'): 613 | # 兼容旧版本的代码 614 | for loader in self.channel_loaders: 615 | if hasattr(loader, 'mm') and loader.mm: 616 | loader.mm.close() 617 | if hasattr(loader, 'mm_file') and loader.mm_file: 618 | loader.mm_file.close() 619 | self.channel_loaders = [] 620 | 621 | # 清理内存映射 622 | if hasattr(self, 'mm') and self.mm: 623 | self.mm.close() 624 | self.mm = None 625 | 626 | if hasattr(self, 'mm_file') and self.mm_file: 627 | self.mm_file.close() 628 | self.mm_file = None 629 | 630 | self.status_text.insert(tk.END, "内存映射资源已释放\n") 631 | except Exception as e: 632 | self.status_text.insert(tk.END, f"释放内存映射资源时出错: {str(e)}\n") 633 | 634 | def monitor_server_output(self): 635 | """监控服务器输出并检测错误""" 636 | error_patterns = [ 637 | # 内存不足错误 638 | (r"CUDA out of memory", "GPU内存不足"), 639 | (r"OutOfMemoryError", "内存不足"), 640 | (r"OOM", "内存不足"), 641 | # 模型加载错误 642 | (r"Error loading model", "模型加载错误"), 643 | (r"Failed to load", "模型加载失败"), 644 | # 参数错误 645 | (r"ValueError", "参数错误"), 646 | (r"TypeError", "类型错误"), 647 | # 权限错误 648 | (r"PermissionError", "权限错误"), 649 | # 网络错误 650 | (r"ConnectionError", "连接错误"), 651 | (r"Address already in use", "端口已被占用"), 652 | # 通用错误 653 | (r"Error:", "发生错误"), 654 | (r"Exception:", "发生异常"), 655 | (r"Traceback", "程序崩溃") 656 | ] 657 | 658 | # Token生成模式 659 | token_pattern = r"Processed (\d+) tokens" 660 | 661 | # 记录启动时间 662 | start_time = time.time() 663 | error_detected = False 664 | error_message = "" 665 | server_started = False 666 | show_process_indicator = False 667 | last_indicator_time = time.time() 668 | api_info_displayed = False 669 | 670 | # 显示基础信息 671 | self.status_text.insert(tk.END, "开始启动服务器...\n") 672 | 673 | while True: 674 | if not hasattr(self, 'server_process') or self.server_process is None: 675 | self.status_text.insert(tk.END, "服务器进程不存在\n") 676 | break 677 | 678 | if self.server_process.poll() is not None: 679 | self.status_text.insert(tk.END, f"服务器进程已退出,退出码: {self.server_process.poll()}\n") 680 | break 681 | 682 | # 如果API信息已显示,不再显示任何后续日志 683 | if api_info_displayed: 684 | # 只静默监控服务器进程,但不显示任何输出 685 | time.sleep(0.5) 686 | continue 687 | 688 | # 每2秒动态显示一个进度指示器(仅在API信息显示前) 689 | current_time = 
time.time() 690 | if current_time - last_indicator_time > 2 and not server_started and not api_info_displayed: 691 | self.status_text.insert(tk.END, "=====\n") 692 | last_indicator_time = current_time 693 | 694 | try: 695 | output = self.server_process.stdout.readline() 696 | if not output: 697 | time.sleep(0.1) 698 | continue 699 | 700 | output_text = output.decode(errors='replace') 701 | 702 | # 检查是否包含API服务器信息 703 | if ("API server" in output_text or "Uvicorn running on http://" in output_text) and not api_info_displayed: 704 | # 显示API信息 705 | self.status_text.insert(tk.END, output_text) 706 | self.status_text.insert(tk.END, "服务器已成功启动\n") 707 | self.status_text.see(tk.END) 708 | 709 | # 标记服务器已启动且API信息已显示 710 | server_started = True 711 | api_info_displayed = True 712 | 713 | # 服务器成功启动后,静默执行自动性能优化 714 | if not hasattr(self, 'performance_optimized') or not self.performance_optimized: 715 | threading.Thread(target=self.auto_optimize_performance, daemon=True).start() 716 | self.performance_optimized = True 717 | 718 | # 成功显示API信息后,不再显示任何后续日志 719 | continue 720 | 721 | # 如果API信息已显示,不再处理任何输出 722 | if api_info_displayed: 723 | continue 724 | 725 | # 仅显示最关键信息,不显示详细的中间过程 726 | critical_patterns = [ 727 | "API server", "http://", "Model loaded", "model loaded successfully" 728 | ] 729 | 730 | is_critical = any(pattern in output_text.lower() for pattern in critical_patterns) 731 | is_error = any(re.search(pattern, output_text, re.IGNORECASE) for pattern, _ in error_patterns) 732 | 733 | # 只显示关键信息和错误信息 734 | if is_critical or is_error: 735 | self.status_text.insert(tk.END, output_text) 736 | self.status_text.see(tk.END) 737 | 738 | # 检查是否有token生成信息 739 | token_match = re.search(token_pattern, output_text) 740 | if token_match: 741 | tokens = int(token_match.group(1)) 742 | self.update_token_count(tokens) 743 | 744 | # 检查是否包含错误信息 745 | for pattern, error_type in error_patterns: 746 | if re.search(pattern, output_text, re.IGNORECASE): 747 | error_detected = True 748 | error_message = f"{error_type}: {output_text.strip()}" 749 | self.status_text.insert(tk.END, f"检测到错误: {error_type}\n") 750 | break 751 | 752 | # 如果检测到错误,等待一段时间收集更多日志,然后尝试恢复 753 | if error_detected: 754 | # 继续读取一些输出以获取更多错误信息 755 | for _ in range(10): # 读取最多10行额外输出 756 | try: 757 | more_output = self.server_process.stdout.readline() 758 | if more_output: 759 | more_text = more_output.decode(errors='replace') 760 | self.status_text.insert(tk.END, more_text) 761 | error_message += "\n" + more_text.strip() 762 | except: 763 | break 764 | time.sleep(0.1) 765 | 766 | # 如果是内存不足错误,尝试使用备用启动方法 767 | if "内存不足" in error_message: 768 | self.status_text.insert(tk.END, "检测到内存不足错误,尝试使用备用启动方法...\n") 769 | # 停止当前进程 770 | try: 771 | self.server_process.terminate() 772 | self.server_process.wait(timeout=5) 773 | except: 774 | pass 775 | # 尝试使用备用方法启动 776 | self.fallback_start_server(error_message) 777 | return 778 | # 如果是端口被占用,尝试使用不同端口 779 | elif "端口已被占用" in error_message: 780 | self.status_text.insert(tk.END, "检测到端口被占用,尝试使用不同端口...\n") 781 | # 停止当前进程 782 | try: 783 | self.server_process.terminate() 784 | self.server_process.wait(timeout=5) 785 | except: 786 | pass 787 | # 尝试使用不同端口 788 | self.config['port'] += 1 789 | self.status_text.insert(tk.END, f"尝试使用新端口: {self.config['port']}\n") 790 | self.start_server() 791 | return 792 | else: 793 | # 其他错误,显示错误信息并询问用户是否尝试备用方法 794 | if messagebox.askokcancel("错误", f"服务器启动时发生错误:\n{error_message}\n\n是否尝试使用备用方法启动?"): 795 | # 停止当前进程 796 | try: 797 | self.server_process.terminate() 798 | self.server_process.wait(timeout=5) 
799 | except: 800 | pass 801 | # 尝试使用备用方法启动 802 | self.fallback_start_server(error_message) 803 | return 804 | 805 | except Exception as e: 806 | if not api_info_displayed: 807 | self.status_text.insert(tk.END, f"监控服务器输出时出错: {str(e)}\n") 808 | time.sleep(1) 809 | 810 | def update_gpu_stats(self): 811 | while self.monitoring: 812 | try: 813 | gpus = GPUtil.getGPUs() 814 | self.gpu_tree.delete(*self.gpu_tree.get_children()) 815 | for gpu in gpus: 816 | # 使用nvidia-smi获取功耗信息 817 | try: 818 | power_info = subprocess.run( 819 | ['nvidia-smi', f'--id={gpu.id}', '--query-gpu=power.draw', '--format=csv,noheader,nounits'], 820 | capture_output=True, 821 | text=True 822 | ) 823 | power_draw = power_info.stdout.strip() 824 | except: 825 | power_draw = "N/A" 826 | 827 | self.gpu_tree.insert('', 'end', values=( 828 | gpu.id, 829 | f"{gpu.memoryUsed}MB/{gpu.memoryTotal}MB", 830 | f"{gpu.load*100:.1f}%", 831 | f"{gpu.temperature}°C", 832 | f"{power_draw}W" if power_draw and power_draw != "N/A" else "N/A", 833 | "0.0%" # KV缓存命中率暂时不支持 834 | )) 835 | time.sleep(2) 836 | except Exception as e: 837 | self.status_text.insert(tk.END, f"GPU监控错误: {e}\n") 838 | self.status_text.see(tk.END) 839 | time.sleep(5) 840 | 841 | def get_gpu_stats(self): 842 | """获取GPU统计信息,返回字典列表""" 843 | try: 844 | # 使用pynvml库代替执行nvidia-smi命令 845 | pynvml.nvmlInit() 846 | 847 | gpu_count = pynvml.nvmlDeviceGetCount() 848 | gpu_stats = [] 849 | 850 | for i in range(gpu_count): 851 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 852 | 853 | # 获取GPU利用率 854 | utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) 855 | gpu_util = f"{utilization.gpu} %" 856 | mem_util = f"{utilization.memory} %" 857 | 858 | # 获取温度 859 | temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) 860 | 861 | # 获取功耗 862 | try: 863 | power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0 864 | power_draw = f"{power:.1f} W" 865 | except: 866 | power_draw = "N/A" 867 | 868 | gpu_stat = { 869 | 'utilization.gpu': gpu_util, 870 | 'utilization.memory': mem_util, 871 | 'temperature.gpu': f"{temp} C", 872 | 'power.draw': power_draw 873 | } 874 | gpu_stats.append(gpu_stat) 875 | 876 | pynvml.nvmlShutdown() 877 | return gpu_stats 878 | except ImportError: 879 | # 如果pynvml未安装,返回一个模拟的状态信息并记录警告 880 | self.status_text.insert(tk.END, "警告: pynvml未安装,无法获取GPU信息。请执行 pip install nvidia-ml-py3 安装。\n") 881 | # 返回一个包含默认值的字典,避免程序崩溃 882 | return [{'utilization.gpu': '0 %', 'utilization.memory': '0 %', 'temperature.gpu': '0 C', 'power.draw': 'N/A'}] 883 | except Exception as e: 884 | # 记录错误但返回一个空结果集而不是抛出异常 885 | self.status_text.insert(tk.END, f"获取GPU统计信息错误: {str(e)}\n") 886 | return [] 887 | 888 | def load_config(self): 889 | try: 890 | with open('server_config.json', 'r') as f: 891 | loaded_config = json.load(f) 892 | self.config.update(loaded_config) 893 | 894 | # 更新界面上的值 895 | self.model_path_entry.delete(0, tk.END) 896 | self.model_path_entry.insert(0, self.config['model_path']) 897 | 898 | self.ip_entry.delete(0, tk.END) 899 | self.ip_entry.insert(0, self.config['ip']) 900 | 901 | self.port_entry.delete(0, tk.END) 902 | self.port_entry.insert(0, str(self.config['port'])) 903 | 904 | self.gpu_count_var.set(str(self.config['gpu_count'])) 905 | 906 | self.mem_ratio_entry.delete(0, tk.END) 907 | self.mem_ratio_entry.insert(0, str(self.config['mem_ratio'])) 908 | 909 | self.max_tokens_var.set(str(self.config['max_tokens'])) 910 | 911 | self.max_model_len_var.set(str(self.config['max_model_len'])) # 加载max_model_len 912 | 913 | # 加载内存交换配置 914 | if 
'enable_memory_offload' in self.config: 915 | self.enable_offload_var.set(self.config['enable_memory_offload']) 916 | if 'memory_channels' in self.config: 917 | self.memory_channels_var.set(str(self.config['memory_channels'])) 918 | if 'memory_offload_ratio' in self.config: 919 | self.memory_offload_ratio_var.set(str(self.config['memory_offload_ratio'])) 920 | if 'reserved_memory' in self.config: 921 | self.reserved_memory_var.set(str(self.config['reserved_memory'])) 922 | 923 | except FileNotFoundError: 924 | pass 925 | 926 | def save_config(self): 927 | with open('server_config.json', 'w') as f: 928 | json.dump(self.config, f, indent=4) 929 | 930 | def save_config_with_message(self): 931 | # 先调用update_config确保配置已更新 932 | if self.update_config(): 933 | # 保存配置 934 | self.save_config() 935 | messagebox.showinfo("成功", "配置已保存到server_config.json") 936 | 937 | def select_calibrated_model(self): 938 | path = filedialog.askdirectory(title="选择校准模型目录") 939 | if path: 940 | self.calibrated_model_var.set(path) 941 | self.config['calibrated_model'] = path 942 | 943 | def check_fp8_support(self): 944 | try: 945 | if not torch.cuda.is_available(): 946 | return False 947 | capability = torch.cuda.get_device_capability() 948 | # 需要Ampere或更新架构(计算能力 >= 8.0) 949 | return capability[0] >= 8 950 | except Exception as e: 951 | print(f"检查FP8支持失败: {e}") 952 | return False 953 | 954 | def run_calibration(self): 955 | if not self.check_fp8_support(): 956 | messagebox.showerror("错误", "当前GPU不支持FP8量化") 957 | return 958 | 959 | if not self.config['model_path']: 960 | messagebox.showerror("错误", "请先选择模型路径") 961 | return 962 | 963 | # 生成校准脚本 964 | calibration_script = f""" 965 | from datasets import load_dataset 966 | from transformers import AutoModelForCausalLM, AutoTokenizer 967 | from llmcompressor.transformers import oneshot 968 | 969 | # 加载模型 970 | model = AutoModelForCausalLM.from_pretrained("{self.config['model_path']}", 971 | device_map="auto", 972 | torch_dtype="auto") 973 | tokenizer = AutoTokenizer.from_pretrained("{self.config['model_path']}") 974 | 975 | # 配置校准参数 976 | NUM_CALIBRATION_SAMPLES = 512 977 | MAX_SEQUENCE_LENGTH = 2048 978 | 979 | # 加载数据集 980 | ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") 981 | ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) 982 | 983 | def process_and_tokenize(example): 984 | text = tokenizer.apply_chat_template(example["messages"], tokenize=False) 985 | return tokenizer( 986 | text, 987 | padding=False, 988 | max_length=MAX_SEQUENCE_LENGTH, 989 | truncation=True, 990 | add_special_tokens=False, 991 | ) 992 | 993 | ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) 994 | 995 | # 量化配置 996 | recipe = ''' 997 | quant_stage: 998 | quant_modifiers: 999 | QuantizationModifier: 1000 | kv_cache_scheme: 1001 | num_bits: 8 1002 | type: float 1003 | strategy: tensor 1004 | dynamic: false 1005 | symmetric: true 1006 | ''' 1007 | 1008 | # 应用量化 1009 | oneshot( 1010 | model=model, 1011 | dataset=ds, 1012 | recipe=recipe, 1013 | max_seq_length=MAX_SEQUENCE_LENGTH, 1014 | num_calibration_samples=NUM_CALIBRATION_SAMPLES, 1015 | ) 1016 | 1017 | # 保存量化模型 1018 | SAVE_DIR = "{os.path.basename(self.config['model_path'])}-FP8-KV" 1019 | model.save_pretrained(SAVE_DIR, save_compressed=True) 1020 | tokenizer.save_pretrained(SAVE_DIR) 1021 | """ 1022 | 1023 | # 保存并运行校准脚本 1024 | with open("run_calibration.py", "w") as f: 1025 | f.write(calibration_script) 1026 | 1027 | # 检测操作系统,使用适当的方式启动进程 1028 | try: 1029 | if sys.platform == 'win32': 1030 | # 
Windows系统 1031 | subprocess.Popen(["python", "run_calibration.py"], 1032 | cwd=os.getcwd(), 1033 | creationflags=subprocess.CREATE_NEW_CONSOLE) 1034 | else: 1035 | # Linux/Mac系统 1036 | subprocess.Popen(["python", "run_calibration.py"], 1037 | cwd=os.getcwd()) 1038 | 1039 | messagebox.showinfo("校准", "校准进程已启动,请等待完成...") 1040 | except Exception as e: 1041 | self.status_text.insert(tk.END, f"启动校准进程失败: {str(e)}\n") 1042 | messagebox.showerror("错误", f"启动校准进程失败: {str(e)}") 1043 | 1044 | def get_available_system_memory(self): 1045 | """获取可用系统内存(GB)""" 1046 | mem = psutil.virtual_memory() 1047 | # 返回可用内存(GB) 1048 | return mem.available / (1024 * 1024 * 1024) 1049 | 1050 | def get_available_vram(self, use_ratio=None): 1051 | """获取可用显存(GB)""" 1052 | try: 1053 | gpus = GPUtil.getGPUs() 1054 | if not gpus: 1055 | return 0 1056 | 1057 | # 如果使用多GPU,计算总显存 1058 | if self.config['gpu_count'] > 1: 1059 | total_vram = sum([gpu.memoryTotal for gpu in gpus[:self.config['gpu_count']]]) 1060 | else: 1061 | total_vram = gpus[0].memoryTotal 1062 | 1063 | # 转换为GB并应用显存比例 1064 | ratio = use_ratio if use_ratio is not None else (self.config['mem_ratio'] / 100) 1065 | return total_vram * ratio / 1024 1066 | except Exception as e: 1067 | self.status_text.insert(tk.END, f"获取显存信息错误: {e}\n") 1068 | return 0 1069 | 1070 | def estimate_model_size(self): 1071 | """估算模型大小(GB)""" 1072 | try: 1073 | # 简单估算:检查模型目录中的.bin文件大小总和 1074 | model_path = self.config['model_path'] 1075 | total_size = 0 1076 | 1077 | # 检查是否有model.safetensors文件 1078 | safetensors_path = os.path.join(model_path, "model.safetensors") 1079 | if os.path.exists(safetensors_path): 1080 | total_size = os.path.getsize(safetensors_path) 1081 | self.status_text.insert(tk.END, f"找到model.safetensors文件,大小: {total_size/(1024*1024*1024):.2f}GB\n") 1082 | # 转换为GB 1083 | return total_size / (1024 * 1024 * 1024) 1084 | 1085 | # 检查是否有pytorch_model.bin文件 1086 | pytorch_model_path = os.path.join(model_path, "pytorch_model.bin") 1087 | if os.path.exists(pytorch_model_path): 1088 | total_size = os.path.getsize(pytorch_model_path) 1089 | self.status_text.insert(tk.END, f"找到pytorch_model.bin文件,大小: {total_size/(1024*1024*1024):.2f}GB\n") 1090 | # 转换为GB 1091 | return total_size / (1024 * 1024 * 1024) 1092 | 1093 | # 如果是分片模型,计算所有分片的大小 1094 | for root, dirs, files in os.walk(model_path): 1095 | for file in files: 1096 | if file.endswith('.bin') or file.endswith('.safetensors'): 1097 | file_path = os.path.join(root, file) 1098 | file_size = os.path.getsize(file_path) 1099 | total_size += file_size 1100 | self.status_text.insert(tk.END, f"找到模型文件: {file}, 大小: {file_size/(1024*1024*1024):.2f}GB\n") 1101 | 1102 | # 如果没有找到任何模型文件,使用默认值 1103 | if total_size == 0: 1104 | self.status_text.insert(tk.END, "未找到模型文件,使用默认值29.5GB\n") 1105 | return 29.5 # 默认值为29.5GB 1106 | 1107 | # 转换为GB 1108 | model_size_gb = total_size / (1024 * 1024 * 1024) 1109 | self.status_text.insert(tk.END, f"估算模型总大小: {model_size_gb:.2f}GB\n") 1110 | return model_size_gb 1111 | except Exception as e: 1112 | self.status_text.insert(tk.END, f"估算模型大小错误: {e}\n") 1113 | # 返回默认值 1114 | return 29.5 # 默认值为29.5GB 1115 | 1116 | def setup_memory_offload(self, model_size, offload_ratio): 1117 | """设置内存交换功能""" 1118 | if not self.config['enable_memory_offload']: 1119 | return False 1120 | 1121 | try: 1122 | # 计算需要卸载到内存的部分 1123 | offload_size = model_size * offload_ratio 1124 | 1125 | self.status_text.insert(tk.END, f"将卸载 {offload_size:.2f}GB 到系统内存 (比例: {offload_ratio*100:.0f}%)\n") 1126 | 1127 | # 创建内存映射文件目录 1128 | offload_dir = 
os.path.join(os.getcwd(), "model_offload") 1129 | os.makedirs(offload_dir, exist_ok=True) 1130 | 1131 | # 创建内存映射文件 1132 | map_file = os.path.join(offload_dir, "model_offload.bin") 1133 | 1134 | # 转换为字节 1135 | offload_size_bytes = int(offload_size * 1024 * 1024 * 1024) 1136 | 1137 | # 检查是否有足够的磁盘空间 1138 | disk_usage = psutil.disk_usage(os.getcwd()) 1139 | if disk_usage.free < offload_size_bytes: 1140 | self.status_text.insert(tk.END, f"警告: 磁盘空间不足,需要 {offload_size:.2f}GB,但只有 {disk_usage.free/(1024*1024*1024):.2f}GB 可用\n") 1141 | return False 1142 | 1143 | # 获取系统内存信息 1144 | mem = psutil.virtual_memory() 1145 | available_memory = mem.available / (1024 * 1024 * 1024) # 可用内存(GB) 1146 | 1147 | # 确保至少有2GB的系统内存预留 1148 | safe_memory = available_memory - 2.0 1149 | 1150 | # 检查是否有足够的内存 1151 | if safe_memory < offload_size: 1152 | # 调整大小到可用安全内存的90% 1153 | adjusted_size = safe_memory * 0.9 1154 | self.status_text.insert(tk.END, f"警告: 可用内存不足,需要 {offload_size:.2f}GB,但安全可用内存只有 {safe_memory:.2f}GB\n") 1155 | self.status_text.insert(tk.END, f"自动调整卸载大小到 {adjusted_size:.2f}GB (安全内存的90%)\n") 1156 | offload_size = adjusted_size 1157 | offload_size_bytes = int(offload_size * 1024 * 1024 * 1024) 1158 | 1159 | # 创建内存映射文件 1160 | self.status_text.insert(tk.END, f"正在创建内存映射文件,大小: {offload_size:.2f}GB...\n") 1161 | 1162 | # 记录内存使用情况 1163 | mem_before = psutil.virtual_memory() 1164 | self.status_text.insert(tk.END, f"创建前系统内存: 已用 {mem_before.percent}% ({mem_before.used/1024/1024/1024:.2f}GB/{mem_before.total/1024/1024/1024:.2f}GB)\n") 1165 | 1166 | # 使用fallocate预分配文件空间(如果可用) 1167 | try: 1168 | import subprocess 1169 | self.status_text.insert(tk.END, f"尝试使用fallocate快速分配 {offload_size:.2f}GB 空间...\n") 1170 | result = subprocess.run(['fallocate', '-l', f"{offload_size_bytes}", map_file], 1171 | check=True, capture_output=True) 1172 | self.status_text.insert(tk.END, "使用fallocate成功预分配空间\n") 1173 | 1174 | # 验证文件大小 1175 | actual_size = os.path.getsize(map_file) 1176 | self.status_text.insert(tk.END, f"验证文件大小: {actual_size/(1024*1024*1024):.2f}GB\n") 1177 | 1178 | if actual_size < offload_size_bytes * 0.99: # 允许1%的误差 1179 | self.status_text.insert(tk.END, f"警告: 文件大小不足,将使用传统方法分配\n") 1180 | os.remove(map_file) # 删除不完整的文件 1181 | raise Exception("文件大小不足") 1182 | 1183 | except Exception as e: 1184 | self.status_text.insert(tk.END, f"fallocate失败: {str(e)},将使用传统方法分配空间\n") 1185 | 1186 | # 传统方法: 分块写入 1187 | with open(map_file, "wb") as f: 1188 | # 写入全零数据以分配空间 1189 | chunk_size = 1024 * 1024 * 128 # 减小到128MB块,降低内存压力 1190 | remaining = offload_size_bytes 1191 | 1192 | try: 1193 | while remaining > 0: 1194 | # 每写入512MB检查一次内存状态,更频繁地检查 1195 | if (offload_size_bytes - remaining) % (512*1024*1024) < chunk_size: 1196 | mem_check = psutil.virtual_memory() 1197 | # 如果可用内存低于1.5GB,停止写入 1198 | if mem_check.available < 1.5 * 1024 * 1024 * 1024: 1199 | self.status_text.insert(tk.END, f"警告: 可用内存低于1.5GB,停止分配更多内存\n") 1200 | break 1201 | 1202 | write_size = min(chunk_size, remaining) 1203 | f.write(b'\0' * write_size) 1204 | remaining -= write_size 1205 | # 更新进度 1206 | progress = (offload_size_bytes - remaining) / offload_size_bytes * 100 1207 | self.status_text.delete("end-2l", "end-1l") # 删除上一行进度 1208 | self.status_text.insert(tk.END, f"创建内存映射文件: {progress:.1f}% ({(offload_size_bytes-remaining)/(1024*1024*1024):.2f}GB/{offload_size:.2f}GB)\n") 1209 | self.status_text.see(tk.END) 1210 | 1211 | # 添加小延迟,让系统有时间释放内存 1212 | time.sleep(0.01) 1213 | 1214 | except MemoryError: 1215 | self.status_text.insert(tk.END, f"内存不足,无法完成映射文件创建\n") 1216 | # 记录已分配的大小 1217 | 
actual_size = offload_size_bytes - remaining 1218 | self.status_text.insert(tk.END, f"已分配 {actual_size/(1024*1024*1024):.2f}GB\n") 1219 | # 截断文件到已写入的大小 1220 | f.flush() 1221 | f.truncate(actual_size) 1222 | 1223 | # 记录内存使用情况 1224 | mem_after = psutil.virtual_memory() 1225 | self.status_text.insert(tk.END, f"创建后系统内存: 已用 {mem_after.percent}% ({mem_after.used/1024/1024/1024:.2f}GB/{mem_after.total/1024/1024/1024:.2f}GB)\n") 1226 | 1227 | # 验证最终文件大小 1228 | final_size = os.path.getsize(map_file) 1229 | self.status_text.insert(tk.END, f"内存映射文件最终大小: {final_size/(1024*1024*1024):.2f}GB\n") 1230 | 1231 | # 不再强制要求18GB,而是根据模型大小动态调整 1232 | min_required_size = min(18, model_size * 0.8) # 至少需要模型大小的80% 1233 | 1234 | if final_size < min_required_size * 1024 * 1024 * 1024: 1235 | self.status_text.insert(tk.END, f"警告: 内存映射文件大小不足{min_required_size:.1f}GB,可能无法加载模型\n") 1236 | if not messagebox.askokcancel("警告", 1237 | f"内存映射文件大小仅为{final_size/(1024*1024*1024):.2f}GB,建议至少{min_required_size:.1f}GB。\n是否继续?"): 1238 | return False 1239 | 1240 | self.status_text.insert(tk.END, "内存映射文件创建完成\n") 1241 | 1242 | # 创建内存映射 1243 | self.mm_file = open(map_file, "r+b") 1244 | self.mm = mmap.mmap(self.mm_file.fileno(), 0) 1245 | 1246 | # 使用用户设置的内存通道数,不再自动增加 1247 | channels = self.config['memory_channels'] 1248 | self.status_text.insert(tk.END, f"使用用户设置的内存通道数: {channels}\n") 1249 | 1250 | self.setup_multi_channel_loader() 1251 | 1252 | # 创建配置文件 1253 | offload_config = { 1254 | 'enabled': True, 1255 | 'offload_dir': offload_dir, 1256 | 'offload_ratio': offload_ratio, 1257 | 'channels': channels, 1258 | 'reserved_memory': self.config['reserved_memory'] / 100, 1259 | 'actual_size_gb': final_size/(1024*1024*1024) 1260 | } 1261 | 1262 | offload_config_path = os.path.join(offload_dir, "offload_config.json") 1263 | with open(offload_config_path, 'w') as f: 1264 | json.dump(offload_config, f, indent=4) 1265 | 1266 | self.status_text.insert(tk.END, f"内存交换配置已保存到 {offload_config_path}\n") 1267 | 1268 | return True 1269 | except Exception as e: 1270 | self.status_text.insert(tk.END, f"设置内存交换错误: {str(e)}\n") 1271 | import traceback 1272 | self.status_text.insert(tk.END, traceback.format_exc()) 1273 | return False 1274 | 1275 | def setup_multi_channel_loader(self): 1276 | """设置多通道加载器""" 1277 | class MultiChannelLoader: 1278 | def __init__(self, memory_map, num_channels=4, cache_size=32): # 添加cache_size参数 1279 | self.memory_map = memory_map 1280 | self.num_channels = num_channels 1281 | self.channel_locks = [threading.Lock() for _ in range(num_channels)] 1282 | self.channel_positions = [0] * num_channels 1283 | self.channel_usage = [0] * num_channels # 记录每个通道的使用次数 1284 | self.channel_last_access = [time.time()] * num_channels # 记录每个通道的最后访问时间 1285 | self.cache = {} # 简单的内存缓存 1286 | self.cache_hits = 0 1287 | self.cache_misses = 0 1288 | self.max_cache_size = cache_size # 使用传入的缓存大小 1289 | self.prefetch_queue = [] # 预取队列 1290 | self.prefetch_lock = threading.Lock() 1291 | self.prefetch_thread_running = True 1292 | # 启动预取线程 1293 | threading.Thread(target=self._prefetch_worker, daemon=True).start() 1294 | 1295 | def read_chunk(self, offset, size, channel_id=None): 1296 | # 检查缓存 1297 | cache_key = (offset, size) 1298 | if cache_key in self.cache: 1299 | self.cache_hits += 1 1300 | # 更新缓存访问时间 1301 | self.cache[cache_key]['last_access'] = time.time() 1302 | return self.cache[cache_key]['data'] 1303 | 1304 | self.cache_misses += 1 1305 | 1306 | # 如果未指定通道,选择最佳通道 1307 | if channel_id is None: 1308 | channel_id = self._get_best_channel(offset) 1309 | 1310 
| with self.channel_locks[channel_id]: 1311 | # 记录访问时间 1312 | self.channel_last_access[channel_id] = time.time() 1313 | 1314 | # 如果当前位置接近请求的偏移量,可以减少寻址时间 1315 | if abs(self.channel_positions[channel_id] - offset) < 1024*1024: # 如果在1MB范围内 1316 | # 已经接近目标位置,直接读取 1317 | pass 1318 | else: 1319 | # 需要重新定位 1320 | self.memory_map.seek(offset) 1321 | 1322 | data = self.memory_map.read(size) 1323 | self.channel_positions[channel_id] = offset + size 1324 | self.channel_usage[channel_id] += 1 1325 | 1326 | # 更新缓存 1327 | if len(self.cache) >= self.max_cache_size: 1328 | # 删除最旧的缓存项 1329 | oldest_key = min(self.cache.keys(), key=lambda k: self.cache[k]['last_access']) 1330 | del self.cache[oldest_key] 1331 | 1332 | self.cache[cache_key] = { 1333 | 'data': data, 1334 | 'last_access': time.time() 1335 | } 1336 | 1337 | # 预测性预取 - 预取下一个可能的块 1338 | next_offset = offset + size 1339 | self.prefetch(next_offset, size) 1340 | 1341 | return data 1342 | 1343 | def _get_best_channel(self, target_offset): 1344 | # 优先选择位置接近的通道,其次考虑使用频率 1345 | best_channel = 0 1346 | best_score = float('inf') 1347 | 1348 | for i in range(self.num_channels): 1349 | # 计算位置接近度分数 1350 | position_score = abs(self.channel_positions[i] - target_offset) / (1024*1024) # MB为单位 1351 | 1352 | # 计算使用频率分数 1353 | usage_score = self.channel_usage[i] * 0.1 1354 | 1355 | # 计算时间分数(越久未使用越好) 1356 | time_score = -10 * (time.time() - self.channel_last_access[i]) 1357 | 1358 | # 综合评分(越低越好) 1359 | total_score = position_score + usage_score + time_score 1360 | 1361 | if total_score < best_score: 1362 | best_score = total_score 1363 | best_channel = i 1364 | 1365 | return best_channel 1366 | 1367 | def _get_least_busy_channel(self): 1368 | # 选择使用次数最少的通道 1369 | return self.channel_usage.index(min(self.channel_usage)) 1370 | 1371 | def get_stats(self): 1372 | return { 1373 | 'positions': self.channel_positions, 1374 | 'usage': self.channel_usage, 1375 | 'cache_hits': self.cache_hits, 1376 | 'cache_misses': self.cache_misses, 1377 | 'hit_ratio': self.cache_hits / (self.cache_hits + self.cache_misses + 0.001) * 100, 1378 | 'prefetch_queue_size': len(self.prefetch_queue) 1379 | } 1380 | 1381 | def prefetch(self, offset, size): 1382 | """预取数据到缓存""" 1383 | # 检查是否已经在缓存中 1384 | cache_key = (offset, size) 1385 | if cache_key in self.cache: 1386 | return 1387 | 1388 | # 检查是否已经在预取队列中 1389 | with self.prefetch_lock: 1390 | for item in self.prefetch_queue: 1391 | if item[0] == offset and item[1] == size: 1392 | return 1393 | 1394 | # 添加到预取队列,最多保留10个预取请求 1395 | self.prefetch_queue.append((offset, size)) 1396 | if len(self.prefetch_queue) > 10: 1397 | self.prefetch_queue.pop(0) 1398 | 1399 | def _prefetch_worker(self): 1400 | """预取线程""" 1401 | while self.prefetch_thread_running: 1402 | try: 1403 | # 检查预取队列 1404 | with self.prefetch_lock: 1405 | if self.prefetch_queue: 1406 | offset, size = self.prefetch_queue.pop(0) 1407 | else: 1408 | offset, size = None, None 1409 | 1410 | # 如果有预取请求,执行预取 1411 | if offset is not None and size is not None: 1412 | # 检查是否已经在缓存中 1413 | cache_key = (offset, size) 1414 | if cache_key not in self.cache: 1415 | # 选择最佳通道 1416 | channel_id = self._get_best_channel(offset) 1417 | # 执行预取 1418 | self.read_chunk(offset, size, channel_id) 1419 | except Exception as e: 1420 | print(f"预取错误: {e}") 1421 | 1422 | # 短暂休眠,避免占用过多CPU 1423 | time.sleep(0.01) 1424 | 1425 | def close(self): 1426 | """关闭加载器""" 1427 | self.prefetch_thread_running = False 1428 | self.cache.clear() 1429 | 1430 | # 创建多通道加载器 1431 | num_channels = max(4, int(self.config['memory_channels'])) # 
确保至少有4个通道 1432 | 1433 | # 应用高级设置中的缓存大小 1434 | cache_size = self.config.get('advanced_cache_size', 32) # 默认32 1435 | self.status_text.insert(tk.END, f"内存缓存大小: {cache_size}\n") 1436 | 1437 | self.multi_channel_loader = MultiChannelLoader( 1438 | self.mm, 1439 | num_channels=num_channels, 1440 | cache_size=cache_size # 传入缓存大小 1441 | ) 1442 | 1443 | self.status_text.insert(tk.END, f"已创建 {num_channels} 个内存通道加载器,带缓存和预取功能\n") 1444 | 1445 | # 启动内存监控线程 1446 | self.memory_monitor_thread_running = True 1447 | threading.Thread(target=self.memory_monitor_thread, daemon=True).start() 1448 | 1449 | def update_system_memory_stats(self): 1450 | """更新系统内存统计信息""" 1451 | try: 1452 | # 检查监控标志,如果已关闭则直接返回 1453 | if not self.monitoring: 1454 | return False 1455 | 1456 | # 获取系统内存信息 1457 | mem = psutil.virtual_memory() 1458 | 1459 | # 更新到界面 1460 | self.status_text.insert(tk.END, f"系统内存: 已用 {mem.percent}% ({mem.used/1024/1024/1024:.2f}GB/{mem.total/1024/1024/1024:.2f}GB)\n") 1461 | 1462 | # 如果启用了内存交换,监控交换性能 1463 | if hasattr(self, 'multi_channel_loader') and self.multi_channel_loader is not None: 1464 | try: 1465 | stats = self.multi_channel_loader.get_stats() 1466 | 1467 | # 只有在第一次时显示内存交换通道信息 1468 | if not self.memory_channel_info_displayed: 1469 | channel_stats = [f"通道{i}: {pos/1024/1024:.2f}MB" for i, pos in enumerate(stats['positions'])] 1470 | usage_stats = [f"通道{i}: {usage}次" for i, usage in enumerate(stats['usage'])] 1471 | 1472 | self.status_text.insert(tk.END, f"内存交换通道状态: {', '.join(channel_stats)}\n") 1473 | self.status_text.insert(tk.END, f"内存交换通道使用: {', '.join(usage_stats)}\n") 1474 | 1475 | # 设置标志,表示已显示过内存交换通道信息 1476 | self.memory_channel_info_displayed = True 1477 | 1478 | # 显示缓存命中率(也只显示一次) 1479 | if not self.cache_hit_info_displayed and 'cache_hits' in stats and 'cache_misses' in stats: 1480 | total_requests = stats['cache_hits'] + stats['cache_misses'] 1481 | if total_requests > 0: 1482 | hit_ratio = stats['cache_hits'] / total_requests * 100 1483 | self.status_text.insert(tk.END, f"内存缓存命中率: {hit_ratio:.2f}% (命中: {stats['cache_hits']}, 未命中: {stats['cache_misses']})\n") 1484 | self.cache_hit_info_displayed = True 1485 | except Exception as e: 1486 | # 捕获获取统计信息时的错误,但不中断监控 1487 | self.status_text.insert(tk.END, f"获取内存交换统计信息错误: {str(e)}\n") 1488 | 1489 | # 更新GPU KV缓存命中率(如果有,也只显示一次) 1490 | if not self.kv_cache_info_displayed and hasattr(self, 'kv_cache_hits') and hasattr(self, 'kv_cache_misses'): 1491 | total_kv_requests = self.kv_cache_hits + self.kv_cache_misses 1492 | if total_kv_requests > 0: 1493 | kv_hit_ratio = self.kv_cache_hits / total_kv_requests * 100 1494 | self.status_text.insert(tk.END, f"KV缓存命中率: {kv_hit_ratio:.2f}% (命中: {self.kv_cache_hits}, 未命中: {self.kv_cache_misses})\n") 1495 | self.kv_cache_info_displayed = True 1496 | 1497 | self.status_text.see(tk.END) 1498 | return True 1499 | except Exception as e: 1500 | self.status_text.insert(tk.END, f"内存监控错误: {e}\n") 1501 | return False 1502 | 1503 | def memory_monitor_thread(self): 1504 | """内存监控线程""" 1505 | try: 1506 | # 设置本地变量,避免频繁访问self属性 1507 | monitoring = True 1508 | 1509 | while monitoring and self.monitoring: 1510 | try: 1511 | if hasattr(self, 'server_process') and self.server_process is not None and self.server_process.poll() is None: 1512 | # 检查是否所有信息都已经显示过一次 1513 | all_info_displayed = (self.memory_channel_info_displayed and 1514 | self.cache_hit_info_displayed and 1515 | self.kv_cache_info_displayed) 1516 | 1517 | # 如果所有信息都已显示过,则降低更新频率,且不输出系统内存使用信息 1518 | if all_info_displayed: 1519 | # 只静默更新状态,不显示到界面 1520 | pass 1521 | else: 1522 
| # 仍有未显示的信息,正常更新并显示 1523 | self.update_system_memory_stats() 1524 | 1525 | # 增加更新间隔 1526 | time.sleep(15) # 每15秒更新一次 1527 | 1528 | # 检查监控标志是否已更改 1529 | monitoring = self.monitoring 1530 | except Exception as e: 1531 | # 出错时不显示错误信息,静默处理 1532 | time.sleep(5) # 出错时等待5秒再继续 1533 | except Exception as e: 1534 | # 捕获线程启动时的异常,静默处理 1535 | pass 1536 | 1537 | def check_vllm_supported_args(self): 1538 | """检查VLLM支持的命令行参数""" 1539 | supported_args = { 1540 | 'swap_space': '--swap-space', 1541 | 'cpu_offload': '--cpu-offload-gb', 1542 | 'max_cpu_memory': '--max-cpu-memory' 1543 | } 1544 | 1545 | try: 1546 | # 尝试运行vllm help命令,增加超时时间 1547 | help_output = subprocess.run( 1548 | ['vllm', 'serve', '--help'], 1549 | capture_output=True, 1550 | text=True, 1551 | timeout=15 # 增加超时时间到15秒 1552 | ) 1553 | 1554 | # 检查输出中是否包含特定参数 1555 | output = help_output.stdout + help_output.stderr 1556 | self.status_text.insert(tk.END, f"检查VLLM支持的参数...\n") 1557 | 1558 | # 检查每个参数 1559 | if '--swap-space' not in output: 1560 | if '--swap' in output: 1561 | supported_args['swap_space'] = '--swap' 1562 | self.status_text.insert(tk.END, "未找到--swap-space参数,将使用--swap\n") 1563 | else: 1564 | supported_args['swap_space'] = None 1565 | self.status_text.insert(tk.END, "未找到交换空间相关参数\n") 1566 | 1567 | # 检查CPU卸载参数 1568 | if '--cpu-offload-gb' not in output: 1569 | if '--cpu-offload' in output: 1570 | supported_args['cpu_offload'] = '--cpu-offload' 1571 | self.status_text.insert(tk.END, "未找到--cpu-offload-gb参数,将使用--cpu-offload\n") 1572 | elif '--offload-params' in output: 1573 | supported_args['cpu_offload'] = '--offload-params' 1574 | self.status_text.insert(tk.END, "未找到--cpu-offload-gb参数,将使用--offload-params\n") 1575 | else: 1576 | supported_args['cpu_offload'] = None 1577 | self.status_text.insert(tk.END, "未找到CPU卸载相关参数\n") 1578 | 1579 | if '--max-cpu-memory' not in output: 1580 | supported_args['max_cpu_memory'] = None 1581 | self.status_text.insert(tk.END, "未找到--max-cpu-memory参数\n") 1582 | 1583 | return supported_args 1584 | 1585 | except subprocess.TimeoutExpired: 1586 | self.status_text.insert(tk.END, "检查VLLM参数超时,使用默认参数\n") 1587 | # 使用最常见的参数组合 1588 | return { 1589 | 'swap_space': '--swap-space', 1590 | 'cpu_offload': '--cpu-offload', 1591 | 'max_cpu_memory': None 1592 | } 1593 | except Exception as e: 1594 | self.status_text.insert(tk.END, f"检查VLLM参数失败: {str(e)}\n") 1595 | # 返回默认值 1596 | return supported_args 1597 | 1598 | def fallback_start_server(self, error_msg): 1599 | """备用启动方法,尝试使用不同的参数启动服务器""" 1600 | if not messagebox.askokcancel("错误", 1601 | f"{error_msg}\n\n是否尝试使用备用方法启动服务器?"): 1602 | return False 1603 | 1604 | self.status_text.insert(tk.END, "\n尝试使用备用方法启动服务器...\n") 1605 | 1606 | # 清理GPU内存 1607 | self.clean_gpu_memory() 1608 | 1609 | # 设置环境变量以避免内存碎片问题 1610 | env = os.environ.copy() 1611 | env['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128' 1612 | env['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in range(self.config['gpu_count'])]) 1613 | env['OMP_NUM_THREADS'] = '4' # 限制OpenMP线程数 1614 | env['MKL_NUM_THREADS'] = '4' # 限制MKL线程数 1615 | 1616 | # 添加VLLM特定的环境变量,优化内存使用 1617 | env['VLLM_USE_ASYNC_CUDA_MALLOC'] = '1' # 使用异步CUDA内存分配 1618 | env['VLLM_CPU_OFFLOAD_PIPELINE'] = '1' # 启用CPU卸载流水线 1619 | env['VLLM_ENABLE_STAGED_INIT'] = '1' # 启用分阶段初始化 1620 | 1621 | self.status_text.insert(tk.END, "已设置优化环境变量\n") 1622 | 1623 | # 临时降低模型参数 1624 | original_max_model_len = self.config['max_model_len'] 1625 | original_max_tokens = self.config['max_tokens'] 1626 | 1627 | # 降低序列长度以减少内存使用 1628 | self.config['max_model_len'] = 
min(self.config['max_model_len'], 2048) # 调整到2048 1629 | self.config['max_tokens'] = min(self.config['max_tokens'], 2048) # 调整到2048,确保大于max_num_seqs 1630 | 1631 | self.status_text.insert(tk.END, f"临时降低序列长度: {self.config['max_model_len']}, 最大token数: {self.config['max_tokens']}\n") 1632 | 1633 | # 获取模型大小 1634 | model_size = self.estimate_model_size() 1635 | 1636 | # 尝试不同的启动选项 1637 | options = [ 1638 | { 1639 | "desc": "使用最小内存配置", 1640 | "cmd": [ 1641 | 'vllm', 'serve', 1642 | self.config['model_path'], 1643 | '--host', self.config['ip'], 1644 | '--port', str(self.config['port']), 1645 | '--tensor-parallel-size', str(self.config['gpu_count']), 1646 | '--gpu-memory-utilization', '0.7', # 降低显存使用率 1647 | '--max-num-batched-tokens', str(self.config['max_tokens']), 1648 | '--block-size', str(self.config['block_size']), 1649 | '--max-model-len', str(self.config['max_model_len']), 1650 | '--dtype', 'half', 1651 | '--enforce-eager' # 添加强制使用eager模式 1652 | ] 1653 | }, 1654 | { 1655 | "desc": "使用量化配置", 1656 | "cmd": [ 1657 | 'vllm', 'serve', 1658 | self.config['model_path'], 1659 | '--host', self.config['ip'], 1660 | '--port', str(self.config['port']), 1661 | '--tensor-parallel-size', str(self.config['gpu_count']), 1662 | '--gpu-memory-utilization', '0.8', 1663 | '--max-num-batched-tokens', str(self.config['max_tokens']), 1664 | '--block-size', str(self.config['block_size']), 1665 | '--max-model-len', str(self.config['max_model_len']), 1666 | '--dtype', 'half', 1667 | '--quantization', 'awq', # 尝试使用AWQ量化 1668 | '--enforce-eager' # 添加强制使用eager模式 1669 | ] 1670 | }, 1671 | { 1672 | "desc": "使用最小内存交换配置", 1673 | "cmd": [ 1674 | 'vllm', 'serve', 1675 | self.config['model_path'], 1676 | '--host', self.config['ip'], 1677 | '--port', str(self.config['port']), 1678 | '--tensor-parallel-size', str(self.config['gpu_count']), 1679 | '--gpu-memory-utilization', '0.6', # 进一步降低显存使用率 1680 | '--max-num-batched-tokens', str(self.config['max_tokens']), 1681 | '--block-size', str(self.config['block_size']), 1682 | '--max-model-len', str(self.config['max_model_len']), 1683 | '--dtype', 'half', 1684 | '--swap-space', '2', # 移除GiB单位,只使用数字 1685 | '--cpu-offload-gb', '10', 1686 | '--enforce-eager' # 添加强制使用eager模式 1687 | ] 1688 | } 1689 | ] 1690 | 1691 | # 针对大型模型(>10GB)添加特殊选项 1692 | if model_size > 10: 1693 | # 添加分阶段加载选项 1694 | staged_option = { 1695 | "desc": "使用分阶段加载(适合大模型)", 1696 | "cmd": [ 1697 | 'vllm', 'serve', 1698 | self.config['model_path'], 1699 | '--host', self.config['ip'], 1700 | '--port', str(self.config['port']), 1701 | '--tensor-parallel-size', str(self.config['gpu_count']), 1702 | '--gpu-memory-utilization', '0.5', # 显著降低显存使用率 1703 | '--max-num-batched-tokens', str(min(self.config['max_tokens'], 1024)), # 降低批处理大小 1704 | '--block-size', str(min(self.config['block_size'], 8)), # 降低块大小 1705 | '--max-model-len', str(min(self.config['max_model_len'], 1024)), # 降低最大长度 1706 | '--dtype', 'half', 1707 | '--swap-space', '4', 1708 | '--cpu-offload-gb', str(max(10, int(model_size * 0.7))), # 至少卸载70%的模型 1709 | '--enforce-eager' # 添加强制使用eager模式 1710 | ] 1711 | } 1712 | options.insert(0, staged_option) # 将此选项放在首位 1713 | 1714 | # 添加8位量化选项 1715 | int8_option = { 1716 | "desc": "使用8位量化(适合大模型)", 1717 | "cmd": [ 1718 | 'vllm', 'serve', 1719 | self.config['model_path'], 1720 | '--host', self.config['ip'], 1721 | '--port', str(self.config['port']), 1722 | '--tensor-parallel-size', str(self.config['gpu_count']), 1723 | '--gpu-memory-utilization', '0.7', 1724 | '--max-num-batched-tokens', str(self.config['max_tokens']), 1725 | 
'--block-size', str(self.config['block_size']), 1726 | '--max-model-len', str(self.config['max_model_len']), 1727 | '--dtype', 'half', 1728 | '--quantization', 'int8', # 使用int8量化 1729 | '--enforce-eager' # 添加强制使用eager模式 1730 | ] 1731 | } 1732 | options.insert(1, int8_option) 1733 | 1734 | # 尝试每个选项 1735 | for i, option in enumerate(options): 1736 | self.status_text.insert(tk.END, f"\n尝试选项 {i+1}: {option['desc']}\n") 1737 | cmd_str = ' '.join(option['cmd']) 1738 | self.status_text.insert(tk.END, f"命令: {cmd_str}\n") 1739 | 1740 | try: 1741 | # 启动服务器 1742 | self.server_process = subprocess.Popen( 1743 | option['cmd'], 1744 | stdout=subprocess.PIPE, 1745 | stderr=subprocess.STDOUT, 1746 | env=env 1747 | ) 1748 | 1749 | # 等待一小段时间,检查进程是否立即退出 1750 | time.sleep(5) # 增加等待时间 1751 | if self.server_process.poll() is None: 1752 | # 进程仍在运行,启动成功 1753 | self.status_text.insert(tk.END, "服务器启动成功!\n") 1754 | 1755 | # 启动监控线程 1756 | threading.Thread(target=self.monitor_server_output).start() 1757 | 1758 | # 更新API地址 1759 | api_base = f"http://{self.config['ip']}:{self.config['port']}/v1" 1760 | self.api_label.config(text=f"API地址: {api_base}") 1761 | 1762 | return True 1763 | else: 1764 | # 进程已退出,获取输出 1765 | output, _ = self.server_process.communicate() 1766 | error_output = output.decode() 1767 | self.status_text.insert(tk.END, f"启动失败: {error_output}\n") 1768 | 1769 | # 分析错误原因 1770 | if "CUDA out of memory" in error_output: 1771 | self.status_text.insert(tk.END, "检测到错误: GPU内存不足\n") 1772 | elif "RuntimeError" in error_output: 1773 | self.status_text.insert(tk.END, "检测到错误: 程序崩溃\n") 1774 | 1775 | # 在选项之间添加额外的清理步骤 1776 | self.clean_gpu_memory() 1777 | time.sleep(2) # 等待GPU内存释放 1778 | 1779 | except Exception as e: 1780 | self.status_text.insert(tk.END, f"尝试选项 {i+1} 失败: {str(e)}\n") 1781 | 1782 | # 所有选项都失败,提供建议 1783 | self.status_text.insert(tk.END, "所有备用选项都失败,建议:\n") 1784 | self.status_text.insert(tk.END, "1. 关闭其他内存密集型应用程序\n") 1785 | self.status_text.insert(tk.END, "2. 重启系统以清理内存碎片\n") 1786 | self.status_text.insert(tk.END, "3. 尝试使用量化版本的模型\n") 1787 | self.status_text.insert(tk.END, "4. 
尝试使用更小的模型,如7B或更小的版本\n") 1788 | 1789 | # 恢复原始设置 1790 | self.config['max_model_len'] = original_max_model_len 1791 | self.config['max_tokens'] = original_max_tokens 1792 | 1793 | return False 1794 | 1795 | def clean_gpu_memory(self): 1796 | """清理GPU内存""" 1797 | try: 1798 | self.status_text.insert(tk.END, "正在清理GPU内存...\n") 1799 | 1800 | # 尝试释放PyTorch缓存 1801 | if torch.cuda.is_available(): 1802 | torch.cuda.empty_cache() 1803 | self.status_text.insert(tk.END, "已清理PyTorch缓存\n") 1804 | 1805 | # 获取当前GPU内存使用情况 1806 | gpu = GPUtil.getGPUs()[0] 1807 | free_mem = gpu.memoryFree 1808 | total_mem = gpu.memoryTotal 1809 | used_mem = total_mem - free_mem 1810 | 1811 | self.status_text.insert(tk.END, f"当前GPU内存: 已用 {used_mem}MB / 总计 {total_mem}MB\n") 1812 | 1813 | # 如果内存使用率过高,建议用户重启系统 1814 | if used_mem / total_mem > 0.5: # 如果使用超过50% 1815 | self.status_text.insert(tk.END, "警告: GPU内存使用率较高,可能影响模型加载\n") 1816 | self.status_text.insert(tk.END, "建议关闭其他使用GPU的应用程序或重启系统\n") 1817 | 1818 | # 尝试运行系统命令释放内存 1819 | os.system("sync") # 同步文件系统缓存 1820 | 1821 | # 尝试释放系统缓存 1822 | try: 1823 | with open("/proc/sys/vm/drop_caches", "w") as f: 1824 | f.write("1") 1825 | self.status_text.insert(tk.END, "已释放系统缓存\n") 1826 | except: 1827 | pass # 可能没有权限,忽略错误 1828 | 1829 | self.status_text.insert(tk.END, "GPU内存清理完成\n") 1830 | 1831 | except Exception as e: 1832 | self.status_text.insert(tk.END, f"清理GPU内存时出错: {str(e)}\n") 1833 | 1834 | def preallocate_memory_buffer(self): 1835 | """预先分配内存缓冲区,防止运行时内存不足""" 1836 | try: 1837 | self.status_text.insert(tk.END, "正在预分配内存缓冲区...\n") 1838 | 1839 | # 获取模型大小 1840 | model_size = self.estimate_model_size() 1841 | 1842 | # 计算需要预分配的内存大小 - 根据模型大小动态调整 1843 | if model_size < 10: 1844 | # 小模型使用较小的缓冲区 1845 | buffer_size_gb = model_size * 0.2 1846 | buffer_size_gb = max(buffer_size_gb, 4.0) # 至少4GB 1847 | else: 1848 | # 大模型使用较大的缓冲区,但比例更小 1849 | buffer_size_gb = model_size * 0.15 1850 | buffer_size_gb = max(buffer_size_gb, 6.0) # 至少6GB 1851 | 1852 | # 检查可用内存 1853 | mem = psutil.virtual_memory() 1854 | available_gb = mem.available / (1024 * 1024 * 1024) 1855 | 1856 | # 确保缓冲区不超过可用内存的50% 1857 | max_buffer_size = available_gb * 0.5 1858 | if buffer_size_gb > max_buffer_size: 1859 | self.status_text.insert(tk.END, f"警告: 计算的缓冲区大小({buffer_size_gb:.2f}GB)超过可用内存的50%,调整大小\n") 1860 | buffer_size_gb = max_buffer_size 1861 | 1862 | # 保留至少5GB系统运行空间 1863 | if available_gb < buffer_size_gb + 5: 1864 | self.status_text.insert(tk.END, f"警告: 可用内存({available_gb:.2f}GB)不足,减小缓冲区大小\n") 1865 | buffer_size_gb = max(2.0, available_gb - 5) # 至少2GB,保留5GB系统运行空间 1866 | 1867 | self.status_text.insert(tk.END, f"预分配内存缓冲区大小: {buffer_size_gb:.2f}GB\n") 1868 | 1869 | # 创建内存缓冲区目录 1870 | buffer_dir = os.path.join(os.getcwd(), "memory_buffer") 1871 | os.makedirs(buffer_dir, exist_ok=True) 1872 | 1873 | # 创建内存缓冲区文件 1874 | buffer_file = os.path.join(buffer_dir, "memory_buffer.bin") 1875 | 1876 | # 如果文件已存在,检查大小是否足够 1877 | if os.path.exists(buffer_file): 1878 | current_size = os.path.getsize(buffer_file) / (1024 * 1024 * 1024) 1879 | if current_size >= buffer_size_gb: 1880 | self.status_text.insert(tk.END, f"使用现有内存缓冲区: {current_size:.2f}GB\n") 1881 | return 1882 | else: 1883 | self.status_text.insert(tk.END, f"现有内存缓冲区大小不足({current_size:.2f}GB),重新创建\n") 1884 | os.remove(buffer_file) 1885 | 1886 | # 创建新的内存缓冲区文件 1887 | self.status_text.insert(tk.END, f"创建内存缓冲区文件: {buffer_file}\n") 1888 | 1889 | # 计算缓冲区大小(字节) 1890 | buffer_size_bytes = int(buffer_size_gb * 1024 * 1024 * 1024) 1891 | 1892 | # 创建内存缓冲区文件 1893 | with open(buffer_file, "wb") as f: 1894 | # 
分块写入,避免一次性分配过多内存 1895 | chunk_size = 1024 * 1024 * 64 # 减小到64MB块,降低内存压力 1896 | remaining = buffer_size_bytes 1897 | 1898 | # 记录内存使用情况 1899 | mem_before = psutil.virtual_memory() 1900 | self.status_text.insert(tk.END, f"创建前系统内存: 已用 {mem_before.percent}% ({mem_before.used/1024/1024/1024:.2f}GB/{mem_before.total/1024/1024/1024:.2f}GB)\n") 1901 | 1902 | try: 1903 | while remaining > 0: 1904 | # 每写入256MB检查一次内存状态,更频繁地检查 1905 | if (buffer_size_bytes - remaining) % (256*1024*1024) < chunk_size: 1906 | mem_check = psutil.virtual_memory() 1907 | # 如果可用内存低于2.5GB,停止写入 1908 | if mem_check.available < 2.5 * 1024 * 1024 * 1024: 1909 | self.status_text.insert(tk.END, f"警告: 可用内存低于2.5GB,停止分配更多内存\n") 1910 | break 1911 | 1912 | write_size = min(chunk_size, remaining) 1913 | f.write(b'\0' * write_size) 1914 | remaining -= write_size 1915 | # 更新进度 1916 | progress = (buffer_size_bytes - remaining) / buffer_size_bytes * 100 1917 | self.status_text.delete("end-2l", "end-1l") # 删除上一行进度 1918 | self.status_text.insert(tk.END, f"创建内存缓冲区: {progress:.1f}% ({(buffer_size_bytes-remaining)/(1024*1024*1024):.2f}GB/{buffer_size_gb:.2f}GB)\n") 1919 | self.status_text.see(tk.END) 1920 | except MemoryError: 1921 | self.status_text.insert(tk.END, f"内存不足,无法完成缓冲区创建\n") 1922 | # 记录已分配的大小 1923 | actual_size = buffer_size_bytes - remaining 1924 | self.status_text.insert(tk.END, f"已分配 {actual_size/(1024*1024*1024):.2f}GB\n") 1925 | # 截断文件到已写入的大小 1926 | f.flush() 1927 | f.truncate(actual_size) 1928 | 1929 | # 记录内存使用情况 1930 | mem_after = psutil.virtual_memory() 1931 | self.status_text.insert(tk.END, f"创建后系统内存: 已用 {mem_after.percent}% ({mem_after.used/1024/1024/1024:.2f}GB/{mem_after.total/1024/1024/1024:.2f}GB)\n") 1932 | 1933 | # 验证最终文件大小 1934 | final_size = os.path.getsize(buffer_file) 1935 | self.status_text.insert(tk.END, f"内存缓冲区最终大小: {final_size/(1024*1024*1024):.2f}GB\n") 1936 | 1937 | # 打开文件并映射到内存 1938 | self.buffer_file = open(buffer_file, "r+b") 1939 | self.buffer_mm = mmap.mmap(self.buffer_file.fileno(), 0) 1940 | 1941 | self.status_text.insert(tk.END, f"内存缓冲区创建完成: {final_size/(1024*1024*1024):.2f}GB\n") 1942 | except Exception as e: 1943 | self.status_text.insert(tk.END, f"创建内存缓冲区时出错: {str(e)}\n") 1944 | import traceback 1945 | self.status_text.insert(tk.END, traceback.format_exc()) 1946 | 1947 | def cleanup_memory_buffer(self): 1948 | """清理内存缓冲区""" 1949 | try: 1950 | if hasattr(self, 'buffer_mm') and self.buffer_mm: 1951 | self.buffer_mm.close() 1952 | self.buffer_mm = None 1953 | 1954 | if hasattr(self, 'buffer_file') and self.buffer_file: 1955 | self.buffer_file.close() 1956 | self.buffer_file = None 1957 | 1958 | self.status_text.insert(tk.END, "内存缓冲区已释放\n") 1959 | except Exception as e: 1960 | self.status_text.insert(tk.END, f"释放内存缓冲区时出错: {str(e)}\n") 1961 | 1962 | def recommend_settings(self): 1963 | """根据模型大小和硬件条件推荐设置""" 1964 | try: 1965 | # 检查是否选择了模型 1966 | if not self.config['model_path']: 1967 | messagebox.showerror("错误", "请先选择模型路径") 1968 | return 1969 | 1970 | # 估算模型大小 1971 | model_size = self.estimate_model_size() 1972 | 1973 | # 获取GPU信息 1974 | gpus = GPUtil.getGPUs() 1975 | if not gpus: 1976 | messagebox.showerror("错误", "未检测到GPU") 1977 | return 1978 | 1979 | # 获取第一个GPU的显存大小(GB) 1980 | gpu_memory = gpus[0].memoryTotal / 1024 1981 | 1982 | # 获取系统内存大小(GB) 1983 | system_memory = psutil.virtual_memory().total / (1024 * 1024 * 1024) 1984 | 1985 | # 根据模型大小和硬件条件推荐设置 1986 | self.status_text.insert(tk.END, "\n===== 推荐设置 =====\n") 1987 | self.status_text.insert(tk.END, f"模型大小: {model_size:.2f}GB\n") 1988 | 
self.status_text.insert(tk.END, f"GPU显存: {gpu_memory:.2f}GB\n") 1989 | self.status_text.insert(tk.END, f"系统内存: {system_memory:.2f}GB\n") 1990 | 1991 | # 推荐显存比例 1992 | if model_size > gpu_memory * 0.9: 1993 | # 模型接近或超过显存大小,需要内存交换 1994 | mem_ratio = 85 # 降低到85%,给系统留出更多余量 1995 | self.status_text.insert(tk.END, f"推荐显存比例: {mem_ratio}% (模型较大,降低比例避免OOM)\n") 1996 | 1997 | # 启用内存交换 1998 | self.enable_offload_var.set(True) 1999 | 2000 | # 计算合理的内存交换比例 2001 | if model_size > gpu_memory * 1.5: 2002 | # 模型远大于显存,需要大量交换 2003 | offload_ratio = 70 # 降低到70%,避免系统内存压力过大 2004 | else: 2005 | # 模型略大于显存,适度交换 2006 | offload_ratio = 60 2007 | 2008 | self.memory_offload_ratio_var.set(str(offload_ratio)) 2009 | self.status_text.insert(tk.END, f"推荐内存交换比例: {offload_ratio}%\n") 2010 | 2011 | # 推荐内存通道数 - 根据系统内存大小调整 2012 | if system_memory > 64: # 只有大内存系统才推荐更多通道 2013 | channels = 8 2014 | else: 2015 | channels = 4 # 对于32GB内存系统,使用4个通道 2016 | 2017 | self.memory_channels_var.set(str(channels)) 2018 | self.status_text.insert(tk.END, f"推荐内存通道数: {channels}\n") 2019 | 2020 | # 推荐预留内存比例 2021 | reserved_memory = 20 2022 | self.reserved_memory_var.set(str(reserved_memory)) 2023 | self.status_text.insert(tk.END, f"推荐系统内存预留: {reserved_memory}%\n") 2024 | 2025 | # 推荐较小的序列长度 2026 | if model_size > 20: 2027 | max_model_len = 2048 2028 | else: 2029 | max_model_len = 4096 2030 | 2031 | self.max_model_len_var.set(str(max_model_len)) 2032 | self.status_text.insert(tk.END, f"推荐最大序列长度: {max_model_len}\n") 2033 | 2034 | # 推荐适中的块大小以提高内存带宽利用率 2035 | block_size = 32 # 对于普通硬件,32是较好的平衡点 2036 | self.block_size_var.set(str(block_size)) 2037 | self.status_text.insert(tk.END, f"推荐块大小: {block_size} (提高内存带宽利用率)\n") 2038 | 2039 | # 推荐使用--enforce-eager参数 2040 | self.status_text.insert(tk.END, "推荐使用强制eager模式,避免CUDA图捕获阶段的内存不足\n") 2041 | 2042 | else: 2043 | # 模型可以完全放入显存 2044 | mem_ratio = 90 2045 | self.status_text.insert(tk.END, f"推荐显存比例: {mem_ratio}% (模型可完全放入显存)\n") 2046 | 2047 | # 不需要内存交换 2048 | self.enable_offload_var.set(False) 2049 | self.status_text.insert(tk.END, "不需要启用内存交换\n") 2050 | 2051 | # 推荐较大的序列长度 2052 | max_model_len = 8192 2053 | self.max_model_len_var.set(str(max_model_len)) 2054 | self.status_text.insert(tk.END, f"推荐最大序列长度: {max_model_len}\n") 2055 | 2056 | # 推荐适中的块大小以提高内存带宽利用率 2057 | block_size = 32 # 对于普通硬件,32是较好的平衡点 2058 | self.block_size_var.set(str(block_size)) 2059 | self.status_text.insert(tk.END, f"推荐块大小: {block_size} (提高内存带宽利用率)\n") 2060 | 2061 | # 更新界面上的值 2062 | self.mem_ratio_entry.delete(0, tk.END) 2063 | self.mem_ratio_entry.insert(0, str(mem_ratio)) 2064 | 2065 | # 更新配置 2066 | self.update_config() 2067 | 2068 | self.status_text.insert(tk.END, "推荐设置已应用到界面\n") 2069 | self.status_text.see(tk.END) 2070 | 2071 | except Exception as e: 2072 | messagebox.showerror("错误", f"推荐设置失败: {str(e)}") 2073 | 2074 | def update_config(self): 2075 | """更新配置参数""" 2076 | try: 2077 | # 获取界面上的值 2078 | model_path = self.model_path_entry.get() 2079 | ip = self.ip_entry.get() 2080 | port = int(self.port_entry.get()) 2081 | gpu_count = int(self.gpu_count_var.get()) 2082 | mem_ratio = int(self.mem_ratio_entry.get()) 2083 | max_tokens = int(self.max_tokens_var.get()) 2084 | max_model_len = int(self.max_model_len_var.get()) 2085 | block_size = int(self.block_size_var.get()) 2086 | 2087 | # 获取内存交换配置 2088 | enable_memory_offload = self.enable_offload_var.get() 2089 | memory_channels = int(self.memory_channels_var.get()) 2090 | memory_offload_ratio = int(self.memory_offload_ratio_var.get()) 2091 | reserved_memory = int(self.reserved_memory_var.get()) 2092 | 2093 
| # 验证参数 2094 | if port < 1 or port > 65535: 2095 | messagebox.showerror("错误", "端口号必须在1-65535之间") 2096 | return False 2097 | 2098 | if gpu_count < 1: 2099 | messagebox.showerror("错误", "GPU数量必须大于0") 2100 | return False 2101 | 2102 | if mem_ratio < 10 or mem_ratio > 100: 2103 | messagebox.showerror("错误", "显存比例必须在10-100之间") 2104 | return False 2105 | 2106 | if max_tokens < 256: 2107 | messagebox.showerror("错误", "最大Token数不能小于256") 2108 | return False 2109 | 2110 | if max_model_len < 512: 2111 | messagebox.showerror("错误", "最大模型长度不能小于512") 2112 | return False 2113 | 2114 | if block_size < 1: 2115 | messagebox.showerror("错误", "块大小必须大于0") 2116 | return False 2117 | 2118 | # 验证内存交换配置 2119 | if enable_memory_offload: 2120 | if memory_channels < 1: 2121 | messagebox.showerror("错误", "内存通道数必须大于0") 2122 | return False 2123 | 2124 | if memory_offload_ratio < 10 or memory_offload_ratio > 100: 2125 | messagebox.showerror("错误", "内存交换比例必须在10-100之间") 2126 | return False 2127 | 2128 | if reserved_memory < 0 or reserved_memory > 50: 2129 | messagebox.showerror("错误", "预留内存比例必须在0-50之间") 2130 | return False 2131 | 2132 | # 更新配置 2133 | self.config['model_path'] = model_path 2134 | self.config['ip'] = ip 2135 | self.config['port'] = port 2136 | self.config['gpu_count'] = gpu_count 2137 | self.config['mem_ratio'] = mem_ratio 2138 | self.config['max_tokens'] = max_tokens 2139 | self.config['max_model_len'] = max_model_len 2140 | self.config['block_size'] = block_size 2141 | 2142 | # 更新内存交换配置 2143 | self.config['enable_memory_offload'] = enable_memory_offload 2144 | self.config['memory_channels'] = memory_channels 2145 | self.config['memory_offload_ratio'] = memory_offload_ratio 2146 | self.config['reserved_memory'] = reserved_memory 2147 | 2148 | # 保存配置到文件 2149 | self.save_config() 2150 | 2151 | # 在状态栏显示配置信息 2152 | self.status_text.insert(tk.END, "\n===== 配置已更新 =====\n") 2153 | self.status_text.insert(tk.END, f"模型路径: {model_path}\n") 2154 | self.status_text.insert(tk.END, f"IP地址: {ip}, 端口: {port}\n") 2155 | self.status_text.insert(tk.END, f"GPU数量: {gpu_count}, 显存比例: {mem_ratio}%\n") 2156 | self.status_text.insert(tk.END, f"最大Token数: {max_tokens}, 最大模型长度: {max_model_len}, 块大小: {block_size}\n") 2157 | 2158 | if enable_memory_offload: 2159 | self.status_text.insert(tk.END, f"已启用内存交换: 通道数={memory_channels}, 交换比例={memory_offload_ratio}%, 预留内存={reserved_memory}%\n") 2160 | else: 2161 | self.status_text.insert(tk.END, "未启用内存交换\n") 2162 | 2163 | self.status_text.see(tk.END) 2164 | 2165 | return True 2166 | 2167 | except Exception as e: 2168 | messagebox.showerror("错误", f"更新配置失败: {str(e)}") 2169 | return False 2170 | 2171 | def validate_config(self): 2172 | """验证配置参数""" 2173 | if self.config['max_tokens'] < self.config['max_model_len']: 2174 | if not messagebox.askokcancel("警告", 2175 | "最大回复token数小于整体序列长度,这可能会影响模型性能。\n建议将max_tokens设置为不小于max_model_len。\n是否继续?"): 2176 | return False 2177 | return True 2178 | 2179 | def check_model_compatibility(self): 2180 | """检查模型与VLLM的兼容性""" 2181 | if not self.config['model_path']: 2182 | self.status_text.insert(tk.END, "错误: 未选择模型路径\n") 2183 | return False 2184 | 2185 | self.status_text.insert(tk.END, "正在检查模型兼容性...\n") 2186 | 2187 | # 检查硬件配置 2188 | self.check_hardware_configuration() 2189 | 2190 | # 检查模型文件是否存在 2191 | model_path = self.config['model_path'] 2192 | if not os.path.exists(model_path): 2193 | self.status_text.insert(tk.END, f"错误: 模型路径不存在: {model_path}\n") 2194 | return False 2195 | 2196 | # 检查必要的模型文件 2197 | required_files = [] 2198 | safetensors_found = False 2199 | bin_files_found 
= False 2200 | 2201 | # 检查是否有.safetensors文件 2202 | for root, dirs, files in os.walk(model_path): 2203 | for file in files: 2204 | if file.endswith('.safetensors'): 2205 | safetensors_found = True 2206 | self.status_text.insert(tk.END, f"找到safetensors文件: {file}\n") 2207 | elif file.endswith('.bin'): 2208 | bin_files_found = True 2209 | self.status_text.insert(tk.END, f"找到bin文件: {file}\n") 2210 | 2211 | if not (safetensors_found or bin_files_found): 2212 | self.status_text.insert(tk.END, "错误: 未找到模型权重文件(.safetensors或.bin)\n") 2213 | return False 2214 | 2215 | # 检查config.json文件 2216 | config_path = os.path.join(model_path, "config.json") 2217 | if not os.path.exists(config_path): 2218 | self.status_text.insert(tk.END, "错误: 未找到config.json文件\n") 2219 | return False 2220 | 2221 | # 检查tokenizer文件 2222 | tokenizer_files = ["tokenizer.json", "tokenizer_config.json"] 2223 | tokenizer_found = False 2224 | for file in tokenizer_files: 2225 | if os.path.exists(os.path.join(model_path, file)): 2226 | tokenizer_found = True 2227 | break 2228 | 2229 | if not tokenizer_found: 2230 | self.status_text.insert(tk.END, "警告: 未找到标准tokenizer文件,VLLM可能无法正确加载\n") 2231 | 2232 | # 读取模型配置 2233 | try: 2234 | with open(config_path, 'r') as f: 2235 | config = json.load(f) 2236 | 2237 | # 检查模型类型 2238 | model_type = config.get('model_type', '') 2239 | self.status_text.insert(tk.END, f"模型类型: {model_type}\n") 2240 | 2241 | # 检查是否是支持的模型类型 2242 | supported_types = ["llama", "mistral", "falcon", "gpt_neox", "gpt2", "bloom", "qwen", "baichuan", "chatglm", "mpt"] 2243 | if model_type.lower() not in [t.lower() for t in supported_types]: 2244 | self.status_text.insert(tk.END, f"警告: 模型类型 '{model_type}' 可能不被VLLM完全支持\n") 2245 | 2246 | # 检查模型大小 2247 | hidden_size = config.get('hidden_size', 0) 2248 | num_layers = config.get('num_hidden_layers', 0) or config.get('num_layers', 0) 2249 | vocab_size = config.get('vocab_size', 0) 2250 | 2251 | if hidden_size and num_layers: 2252 | # 粗略估计模型参数量 2253 | params_billion = (hidden_size * hidden_size * 4 * num_layers + hidden_size * vocab_size) / 1e9 2254 | self.status_text.insert(tk.END, f"估计模型参数量: {params_billion:.2f}B\n") 2255 | 2256 | # 检查是否是大模型 2257 | if params_billion > 30: 2258 | self.status_text.insert(tk.END, "警告: 这是一个较大的模型,可能需要多GPU或内存交换\n") 2259 | 2260 | # 检查特殊注意力机制 2261 | attention_type = config.get('attention_type', '') 2262 | if attention_type and attention_type not in ['scaled_dot_product', 'eager']: 2263 | self.status_text.insert(tk.END, f"警告: 特殊注意力机制 '{attention_type}' 可能不被VLLM支持\n") 2264 | 2265 | # 检查激活函数 2266 | activation_function = config.get('hidden_act', '') 2267 | if activation_function and activation_function not in ['gelu', 'gelu_new', 'relu', 'silu', 'swish']: 2268 | self.status_text.insert(tk.END, f"警告: 激活函数 '{activation_function}' 可能不被VLLM完全支持\n") 2269 | 2270 | except Exception as e: 2271 | self.status_text.insert(tk.END, f"读取模型配置时出错: {str(e)}\n") 2272 | 2273 | # 检查VLLM版本 2274 | try: 2275 | vllm_version = subprocess.run(['vllm', '--version'], capture_output=True, text=True) 2276 | version_str = vllm_version.stdout.strip() or vllm_version.stderr.strip() 2277 | self.status_text.insert(tk.END, f"VLLM版本: {version_str}\n") 2278 | 2279 | # 检查CUDA版本 2280 | if torch.cuda.is_available(): 2281 | cuda_version = torch.version.cuda 2282 | self.status_text.insert(tk.END, f"CUDA版本: {cuda_version}\n") 2283 | 2284 | # 检查GPU计算能力 2285 | capability = torch.cuda.get_device_capability() 2286 | self.status_text.insert(tk.END, f"GPU计算能力: {capability[0]}.{capability[1]}\n") 2287 | 2288 | # 
检查是否支持当前GPU 2289 | if capability[0] < 7: 2290 | self.status_text.insert(tk.END, "警告: VLLM最佳支持计算能力7.0+的GPU (V100及更新)\n") 2291 | except Exception as e: 2292 | self.status_text.insert(tk.END, f"检查VLLM版本时出错: {str(e)}\n") 2293 | 2294 | # 检查GPU内存 2295 | try: 2296 | gpus = GPUtil.getGPUs() 2297 | if gpus: 2298 | gpu = gpus[0] 2299 | gpu_memory = gpu.memoryTotal / 1024 # GB 2300 | self.status_text.insert(tk.END, f"GPU显存: {gpu_memory:.2f}GB\n") 2301 | 2302 | # 估算模型大小 2303 | model_size = self.estimate_model_size() 2304 | self.status_text.insert(tk.END, f"估计模型大小: {model_size:.2f}GB\n") 2305 | 2306 | # 检查是否需要内存交换 2307 | if model_size > gpu_memory * 0.8: 2308 | self.status_text.insert(tk.END, f"警告: 模型大小({model_size:.2f}GB)接近或超过GPU显存({gpu_memory:.2f}GB)\n") 2309 | self.status_text.insert(tk.END, "建议启用内存交换或使用多GPU\n") 2310 | 2311 | # 检查系统内存 2312 | system_memory = psutil.virtual_memory().total / (1024 * 1024 * 1024) # GB 2313 | self.status_text.insert(tk.END, f"系统内存: {system_memory:.2f}GB\n") 2314 | 2315 | if system_memory < model_size * 1.5: 2316 | self.status_text.insert(tk.END, "警告: 系统内存可能不足以进行有效的内存交换\n") 2317 | 2318 | # 检查磁盘空间(用于内存映射文件) 2319 | disk_usage = psutil.disk_usage('/') 2320 | free_disk = disk_usage.free / (1024 * 1024 * 1024) # GB 2321 | self.status_text.insert(tk.END, f"可用磁盘空间: {free_disk:.2f}GB\n") 2322 | 2323 | if free_disk < model_size * 2: 2324 | self.status_text.insert(tk.END, "警告: 磁盘空间可能不足以创建内存映射文件\n") 2325 | except Exception as e: 2326 | self.status_text.insert(tk.END, f"检查GPU内存时出错: {str(e)}\n") 2327 | 2328 | self.status_text.insert(tk.END, "模型兼容性检查完成\n") 2329 | return True 2330 | 2331 | def check_hardware_configuration(self): 2332 | """检测用户硬件配置并提供相应的优化建议""" 2333 | self.status_text.insert(tk.END, "\n===== 硬件配置检测 =====\n") 2334 | 2335 | # 检测CPU 2336 | try: 2337 | cpu_count = psutil.cpu_count(logical=False) # 物理核心数 2338 | cpu_logical = psutil.cpu_count(logical=True) # 逻辑核心数 2339 | self.status_text.insert(tk.END, f"CPU: {cpu_count}核心/{cpu_logical}线程\n") 2340 | except Exception: 2341 | pass 2342 | 2343 | # 检测内存 2344 | try: 2345 | mem = psutil.virtual_memory() 2346 | total_memory = mem.total / (1024 * 1024 * 1024) # GB 2347 | self.status_text.insert(tk.END, f"系统内存: {total_memory:.2f}GB\n") 2348 | except Exception: 2349 | pass 2350 | 2351 | # 检测GPU 2352 | try: 2353 | if torch.cuda.is_available(): 2354 | gpu_count = torch.cuda.device_count() 2355 | self.status_text.insert(tk.END, f"检测到 {gpu_count} 个GPU\n") 2356 | 2357 | for i in range(gpu_count): 2358 | gpu_name = torch.cuda.get_device_name(i) 2359 | gpu_mem = torch.cuda.get_device_properties(i).total_memory / (1024 * 1024 * 1024) # GB 2360 | self.status_text.insert(tk.END, f"GPU {i}: {gpu_name}, 显存: {gpu_mem:.2f}GB\n") 2361 | else: 2362 | self.status_text.insert(tk.END, "未检测到支持CUDA的GPU\n") 2363 | except Exception: 2364 | pass 2365 | 2366 | self.status_text.insert(tk.END, "硬件配置检测完成\n") 2367 | self.status_text.see(tk.END) 2368 | 2369 | def check_flash_attention_support(self): 2370 | """检查是否支持Flash Attention""" 2371 | try: 2372 | import torch 2373 | has_support = hasattr(torch.nn.functional, 'scaled_dot_product_attention') 2374 | return False # 暂时禁用Flash Attention功能,避免兼容性问题 2375 | except Exception: 2376 | return False 2377 | 2378 | def add_performance_monitoring(self): 2379 | """添加性能监控与自动调优功能""" 2380 | # 创建性能监控面板 2381 | self.perf_frame = ttk.LabelFrame(self.master, text="性能监控") 2382 | self.perf_frame.pack(padx=10, pady=5, fill='both') 2383 | 2384 | # 添加性能指标显示 2385 | self.perf_labels = {} 2386 | metrics = ["GPU利用率", "内存带宽", "KV缓存命中率", 
"推理速度(token/s)"] 2387 | 2388 | for i, metric in enumerate(metrics): 2389 | ttk.Label(self.perf_frame, text=f"{metric}:").grid(row=i, column=0, sticky='w') 2390 | self.perf_labels[metric] = ttk.Label(self.perf_frame, text="N/A") 2391 | self.perf_labels[metric].grid(row=i, column=1, sticky='w') 2392 | 2393 | # 添加自动调优开关 2394 | self.auto_tune_var = tk.BooleanVar(value=True) 2395 | ttk.Checkbutton(self.perf_frame, text="启用自动性能调优", variable=self.auto_tune_var).grid(row=len(metrics), column=0, columnspan=2, sticky='w') 2396 | 2397 | # 初始化性能统计变量 2398 | self.total_tokens_generated = 0 2399 | self.kv_cache_hits = 0 2400 | self.kv_cache_misses = 0 2401 | 2402 | # 启动性能监控线程 2403 | self.start_performance_monitor() 2404 | 2405 | def start_performance_monitor(self): 2406 | """启动性能监控线程""" 2407 | def monitor_loop(): 2408 | last_tokens = 0 2409 | last_time = time.time() 2410 | 2411 | while hasattr(self, 'monitoring') and self.monitoring: 2412 | try: 2413 | if hasattr(self, 'server_process') and self.server_process is not None and self.server_process.poll() is None: 2414 | # 获取GPU统计信息 2415 | gpu_stats = self.get_gpu_stats() 2416 | if gpu_stats and len(gpu_stats) > 0: 2417 | # 安全获取GPU利用率和内存利用率 2418 | gpu_util_str = gpu_stats[0].get('utilization.gpu', '0 %').replace('%', '').strip() 2419 | mem_util_str = gpu_stats[0].get('utilization.memory', '0 %').replace('%', '').strip() 2420 | 2421 | # 转换为浮点数,处理可能的转换错误 2422 | try: 2423 | gpu_util = float(gpu_util_str) 2424 | except ValueError: 2425 | gpu_util = 0 2426 | 2427 | try: 2428 | mem_util = float(mem_util_str) 2429 | except ValueError: 2430 | mem_util = 0 2431 | 2432 | # 更新性能指标标签 2433 | if 'GPU利用率' in self.perf_labels: 2434 | self.perf_labels['GPU利用率'].config(text=f"{gpu_util:.1f}%") 2435 | if '内存带宽' in self.perf_labels: 2436 | self.perf_labels['内存带宽'].config(text=f"{mem_util:.1f}%") 2437 | 2438 | # 计算并更新推理速度 2439 | now = time.time() 2440 | if now - last_time >= 5: # 每5秒更新一次 2441 | tokens_per_sec = (self.total_tokens_generated - last_tokens) / (now - last_time) 2442 | last_tokens = self.total_tokens_generated 2443 | last_time = now 2444 | 2445 | if '推理速度(token/s)' in self.perf_labels: 2446 | self.perf_labels['推理速度(token/s)'].config(text=f"{tokens_per_sec:.2f}") 2447 | 2448 | # 无日志的自动调优逻辑 - 只在服务运行且启用自动调优时执行 2449 | if hasattr(self, 'monitoring') and self.monitoring and hasattr(self, 'auto_tune_var') and self.auto_tune_var.get() and tokens_per_sec < 5.0: 2450 | # 如果GPU利用率高但内存带宽低,说明存在内存瓶颈 2451 | if gpu_util > 90 and mem_util < 30: 2452 | # 静默优化内存访问 2453 | self.optimize_memory_access() 2454 | # 如果GPU利用率低,说明存在计算瓶颈 2455 | elif gpu_util < 30: 2456 | # 静默优化GPU利用率 2457 | self.optimize_for_low_gpu_utilization() 2458 | 2459 | # 更新KV缓存命中率 2460 | if hasattr(self, 'monitoring') and self.monitoring and hasattr(self, 'kv_cache_hits') and hasattr(self, 'kv_cache_misses'): 2461 | total_kv_requests = self.kv_cache_hits + self.kv_cache_misses 2462 | if total_kv_requests > 0: 2463 | kv_hit_ratio = self.kv_cache_hits / total_kv_requests * 100 2464 | if 'KV缓存命中率' in self.perf_labels: 2465 | self.perf_labels['KV缓存命中率'].config(text=f"{kv_hit_ratio:.2f}%") 2466 | except Exception: 2467 | # 静默处理错误,不显示错误信息 2468 | pass 2469 | 2470 | # 检查监控标志 2471 | if not hasattr(self, 'monitoring') or not self.monitoring: 2472 | break 2473 | 2474 | time.sleep(1) 2475 | 2476 | # 确保monitoring属性已设置 2477 | if not hasattr(self, 'monitoring'): 2478 | self.monitoring = True 2479 | 2480 | # 启动监控线程 2481 | self.perf_monitor_thread = threading.Thread(target=monitor_loop, daemon=True) 2482 | 
self.perf_monitor_thread.start() 2483 | 2484 | def optimize_for_low_gpu_utilization(self): 2485 | """针对低GPU利用率进行优化""" 2486 | # 这个方法会在GPU利用率低于30%时被调用 2487 | 2488 | # 1. 尝试增加批处理大小 2489 | if hasattr(self, 'batch_size'): 2490 | old_batch_size = self.batch_size 2491 | self.batch_size = min(self.batch_size * 2, 32) # 最大批大小32 2492 | 2493 | # 2. 尝试预热GPU 2494 | try: 2495 | # 创建一个小的张量并执行一些操作来预热GPU 2496 | import torch 2497 | if torch.cuda.is_available(): 2498 | device = torch.device("cuda") 2499 | # 创建一个大张量并执行一些操作 2500 | x = torch.randn(1000, 1000, device=device) 2501 | for _ in range(10): 2502 | x = torch.matmul(x, x) 2503 | # 强制同步 2504 | torch.cuda.synchronize() 2505 | except Exception: 2506 | pass 2507 | 2508 | # 3. 检查并优化内存访问模式 2509 | if hasattr(self, 'multi_channel_loader'): 2510 | # 增加缓存大小 2511 | if hasattr(self.multi_channel_loader, 'max_cache_size'): 2512 | old_cache_size = self.multi_channel_loader.max_cache_size 2513 | self.multi_channel_loader.max_cache_size = min(old_cache_size * 2, 128) 2514 | 2515 | def optimize_memory_access(self): 2516 | """优化内存访问模式""" 2517 | # 1. 尝试优化多通道加载器 2518 | if hasattr(self, 'multi_channel_loader') and self.multi_channel_loader is not None: 2519 | try: 2520 | # 获取当前通道数和缓存大小 2521 | old_channels = self.multi_channel_loader.num_channels 2522 | old_cache_size = self.multi_channel_loader.max_cache_size 2523 | 2524 | # 根据系统内存情况,适当增加通道数和缓存大小 2525 | # 对于普通硬件,最大增加到8个通道 2526 | self.multi_channel_loader.num_channels = min(old_channels * 2, 8) 2527 | # 对于普通硬件,最大增加到64 2528 | self.multi_channel_loader.max_cache_size = min(old_cache_size * 2, 64) 2529 | except Exception: 2530 | pass 2531 | 2532 | # 2. 尝试优化CUDA内存分配策略 2533 | try: 2534 | # 设置环境变量以优化CUDA内存分配,但使用较小的分块大小 2535 | os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128' 2536 | except Exception: 2537 | pass 2538 | 2539 | def update_token_count(self, new_tokens): 2540 | """更新生成的token计数""" 2541 | if not hasattr(self, 'total_tokens_generated'): 2542 | self.total_tokens_generated = 0 2543 | self.total_tokens_generated += new_tokens 2544 | 2545 | def auto_optimize_performance(self): 2546 | """自动性能优化""" 2547 | try: 2548 | # 等待一段时间,确保服务器已经稳定运行 2549 | time.sleep(10) 2550 | 2551 | if not self.monitoring or not hasattr(self, 'server_process') or self.server_process is None or self.server_process.poll() is not None: 2552 | return 2553 | 2554 | # 静默执行性能优化 2555 | self.warm_up_gpu() 2556 | self.optimize_memory_access() 2557 | 2558 | except Exception: 2559 | pass 2560 | 2561 | def warm_up_gpu(self): 2562 | """预热GPU,提高性能稳定性""" 2563 | try: 2564 | # 创建一个小的张量并执行一些操作来预热GPU 2565 | import torch 2566 | if torch.cuda.is_available(): 2567 | device = torch.device("cuda") 2568 | # 创建一个大张量并执行一些操作 2569 | x = torch.randn(2000, 2000, device=device) 2570 | for _ in range(20): 2571 | x = torch.matmul(x, x) 2572 | # 强制同步 2573 | torch.cuda.synchronize() 2574 | except Exception as e: 2575 | pass 2576 | 2577 | def create_advanced_settings(self): 2578 | """创建高级性能设置区域""" 2579 | # 创建高级设置框架 2580 | advanced_frame = ttk.LabelFrame(self.master, text="高级性能设置") 2581 | advanced_frame.pack(padx=10, pady=5, fill='x') 2582 | 2583 | # 添加说明 2584 | ttk.Label(advanced_frame, text="以下设置适用于高性能硬件,请根据您的实际硬件配置谨慎调整", 2585 | foreground="red").grid(row=0, column=0, columnspan=4, sticky='w') 2586 | 2587 | # 内存缓存大小 2588 | ttk.Label(advanced_frame, text="内存缓存大小:").grid(row=1, column=0) 2589 | self.cache_size_var = tk.StringVar(value="32") 2590 | cache_size_combo = ttk.Combobox(advanced_frame, textvariable=self.cache_size_var, 2591 | values=["16", "32", 
"64", "128", "256"], width=5) 2592 | cache_size_combo.grid(row=1, column=1) 2593 | ttk.Label(advanced_frame, text="(大内存系统可增大)").grid(row=1, column=2) 2594 | 2595 | # CUDA内存分配块大小 2596 | ttk.Label(advanced_frame, text="CUDA内存分块(MB):").grid(row=2, column=0) 2597 | self.cuda_split_size_var = tk.StringVar(value="128") 2598 | cuda_split_combo = ttk.Combobox(advanced_frame, textvariable=self.cuda_split_size_var, 2599 | values=["64", "128", "256", "512"], width=5) 2600 | cuda_split_combo.grid(row=2, column=1) 2601 | ttk.Label(advanced_frame, text="(大显存GPU可增大)").grid(row=2, column=2) 2602 | 2603 | # 批处理大小 2604 | ttk.Label(advanced_frame, text="批处理大小:").grid(row=3, column=0) 2605 | self.batch_size_var = tk.StringVar(value="16") 2606 | batch_size_combo = ttk.Combobox(advanced_frame, textvariable=self.batch_size_var, 2607 | values=["8", "16", "32", "64"], width=5) 2608 | batch_size_combo.grid(row=3, column=1) 2609 | ttk.Label(advanced_frame, text="(高性能GPU可增大)").grid(row=3, column=2) 2610 | 2611 | # 检测硬件按钮 2612 | detect_hardware_button = ttk.Button(advanced_frame, text="检测硬件配置", 2613 | command=self.check_hardware_configuration) 2614 | detect_hardware_button.grid(row=4, column=0, columnspan=2, pady=5) 2615 | 2616 | # 应用高级设置按钮 2617 | apply_advanced_button = ttk.Button(advanced_frame, text="应用高级设置", 2618 | command=self.apply_advanced_settings) 2619 | apply_advanced_button.grid(row=4, column=2, columnspan=2, pady=5) 2620 | 2621 | # 添加说明 2622 | ttk.Label(advanced_frame, text="注意: 高级设置将在下次启动服务器时生效", 2623 | foreground="blue").grid(row=5, column=0, columnspan=4, sticky='w') 2624 | 2625 | # 加载已保存的高级设置 2626 | self.load_advanced_settings() 2627 | 2628 | def load_advanced_settings(self): 2629 | """加载已保存的高级设置""" 2630 | try: 2631 | # 如果配置中有高级设置,则加载 2632 | if 'advanced_cache_size' in self.config: 2633 | self.cache_size_var.set(str(self.config['advanced_cache_size'])) 2634 | if 'advanced_cuda_split_size' in self.config: 2635 | self.cuda_split_size_var.set(str(self.config['advanced_cuda_split_size'])) 2636 | if 'advanced_batch_size' in self.config: 2637 | self.batch_size_var.set(str(self.config['advanced_batch_size'])) 2638 | except Exception as e: 2639 | self.status_text.insert(tk.END, f"加载高级设置失败: {str(e)}\n") 2640 | 2641 | def apply_advanced_settings(self): 2642 | """应用高级性能设置""" 2643 | try: 2644 | # 获取高级设置值 2645 | cache_size = int(self.cache_size_var.get()) 2646 | cuda_split_size = int(self.cuda_split_size_var.get()) 2647 | batch_size = int(self.batch_size_var.get()) 2648 | 2649 | # 保存到配置 2650 | self.config['advanced_cache_size'] = cache_size 2651 | self.config['advanced_cuda_split_size'] = cuda_split_size 2652 | self.config['advanced_batch_size'] = batch_size 2653 | 2654 | # 更新配置文件 2655 | self.save_config() 2656 | 2657 | # 显示确认信息 2658 | self.status_text.insert(tk.END, "\n===== 高级设置已应用 =====\n") 2659 | self.status_text.insert(tk.END, f"内存缓存大小: {cache_size}\n") 2660 | self.status_text.insert(tk.END, f"CUDA内存分块大小: {cuda_split_size}MB\n") 2661 | self.status_text.insert(tk.END, f"批处理大小: {batch_size}\n") 2662 | self.status_text.insert(tk.END, "这些设置将在下次启动服务器时生效\n") 2663 | self.status_text.see(tk.END) 2664 | 2665 | messagebox.showinfo("成功", "高级设置已应用,将在下次启动服务器时生效") 2666 | except Exception as e: 2667 | messagebox.showerror("错误", f"应用高级设置失败: {str(e)}") 2668 | 2669 | if __name__ == "__main__": 2670 | root = tk.Tk() 2671 | app = VLLMServerGUI(root) 2672 | root.mainloop() 2673 | --------------------------------------------------------------------------------