├── requirements.txt ├── __init__.py ├── .gitignore ├── LICENSE ├── README.md ├── workflows └── sample.json └── upscale_cudaspeed.py /requirements.txt: -------------------------------------------------------------------------------- 1 | spandrel>=0.3.0 2 | torch>=2.0.0 -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .upscale_cudaspeed import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 2 | 3 | print("✅ ComfyUI-Upscale-CUDAspeed 已加载") 4 | print("🔧 推荐使用节点: '🚀 Upscale Image CUDAspeed'") 5 | 6 | __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS'] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | pip-wheel-metadata/ 20 | share/python-wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # PyInstaller 27 | *.manifest 28 | *.spec 29 | 30 | # Installer logs 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | 34 | # Unit test / coverage reports 35 | htmlcov/ 36 | .tox/ 37 | .nox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *.cover 44 | *.py,cover 45 | .hypothesis/ 46 | .pytest_cache/ 47 | 48 | # Environments 49 | .env 50 | .venv 51 | env/ 52 | venv/ 53 | ENV/ 54 | env.bak/ 55 | venv.bak/ 56 | 57 | # IDE 58 | .vscode/ 59 | .idea/ 60 | *.swp 61 | *.swo 62 | 63 | # OS 64 | .DS_Store 65 | Thumbs.db 66 | 67 | # ComfyUI specific 68 | models/ 69 | output/ 70 | temp/ 71 | input/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ComfyUI-Upscale-CUDAspeed 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-Upscale-CUDAspeed 2 | 3 | 一个高性能的ComfyUI图像放大插件，通过CUDA加速和模型编译优化提供极致的放大速度。 4 |

5 | 6 | ## 特性 7 | 8 | - 🚀 **高性能CUDA加速**：利用PyTorch编译技术优化模型推理速度 9 | - 🔧 **智能模型编译**：自动编译模型并缓存，避免重复编译 10 | - 🎯 **尺寸感知缓存**：为不同输入尺寸分别缓存编译结果，避免尺寸变化导致的重新编译 11 | - 💾 **智能内存管理**：动态调整瓦片大小，优化显存使用 12 | - 🛡️ **数值稳定性**：增强的后处理确保编译模型输出质量 13 | - ⚡ **异步处理**：使用多CUDA流并行处理，最大化GPU利用率 14 | 15 | ## 安装 16 | 17 | ### 前提条件 18 | 19 | - ComfyUI 已安装并运行 20 | - NVIDIA GPU 支持 CUDA 21 | - PyTorch 2.0+ (支持 `torch.compile`) 22 | 23 | ### 安装步骤 24 | 25 | 1. 将本仓库克隆到ComfyUI的 `custom_nodes` 目录： 26 | 27 | ```bash 28 | cd ComfyUI/custom_nodes 29 | git clone https://github.com/piscesbody/ComfyUI-Upscale-CUDAspeed.git 30 | ``` 31 | 32 | 2. 安装依赖： 33 | 34 | ```bash 35 | pip install -r requirements.txt 36 | ``` 37 | 38 | 3. 重启ComfyUI 39 | 40 | ## 使用说明 41 | 42 | ### 节点介绍 43 | 44 | #### 🚀 Upscale Image CUDAspeed 45 | 46 | 主要放大节点，提供高性能的图像放大功能。 47 | 48 | **输入参数：** 49 | 50 | - `upscale_model`: 放大模型（通过UpscaleModelLoader加载）推荐模型：RealESRGAN_x2plus.pth 51 | - `image`: 输入图像 52 | - `use_autocast`: 自动混合精度（启用/禁用） 53 | - `precision`: 精度模式（自动/fp16/fp32/bf16）推荐：fp16 54 | - `tile_size`: 瓦片大小（0表示自动计算）推荐：视频最长边如1280x720，则输入1280 55 | - `overlap`: 瓦片重叠大小（0表示自动计算）推荐：8（视频最小值即可） 56 | - `enable_compile`: 模型编译（启用/禁用）如果批量处理同尺寸，建议打开。第一次编译有点长，但是加速效果明显。 57 | - `optimization_level`: 优化级别（平衡/速度/内存）推荐：speed 58 | - `batch_size` (可选): 批处理大小推荐：1 59 | 60 | #### UpscaleModelLoader 61 | 62 | 放大模型加载器，用于加载各种放大模型。 63 | 64 | **输入参数：** 65 | 66 | - `model_name`: 模型文件名（从 `upscale_models` 目录选择） 67 | 68 | ### 工作流示例 69 | 70 | 1. 使用 `UpscaleModelLoader` 节点加载放大模型 71 | 2. 连接图像到 `🚀 Upscale Image CUDAspeed` 节点 72 | 3. 调整参数以获得最佳性能和质量 73 | 4. 执行工作流 74 | 75 | ### 性能优化建议 76 | 77 | #### 优化级别选择 78 | 79 | - **平衡模式**：适合大多数场景，在速度和内存间取得平衡 80 | - **速度模式**：追求最高速度，适合大显存显卡 81 | - **内存模式**：节省显存，适合小显存显卡 82 | 83 | #### 模型编译 84 | 85 | 启用 `enable_compile` 可以显著提升推理速度，但首次运行需要编译时间。编译后的模型会自动缓存，后续运行无需重新编译。 86 | 87 | #### 自动混合精度 88 | 89 | 启用 `use_autocast` 可以利用Tensor Core加速计算，在支持的GPU上提供更好的性能。 90 | 91 | ## 支持的模型 92 | 93 | 本插件支持所有与Spandrel兼容的放大模型，包括： 94 | 95 | - ESRGAN 96 | - Real-ESRGAN 97 | - Real-CUGAN 98 | - SwinIR 99 | - HAT 100 | - 以及其他单图像放大模型 101 | 102 | ## 技术细节 103 | 104 | ### 模型编译优化 105 | 106 | 插件使用PyTorch的 `torch.compile` 功能对模型进行即时编译优化： 107 | 108 | - **尺寸感知缓存**：为不同输入尺寸分别缓存编译结果 109 | - **运行时缓存**：在内存中缓存编译模型，避免重复编译 110 | - **持久化记录**：记录模型编译状态，重启后仍有效 111 | 112 | ### 内存管理 113 | 114 | - **动态瓦片调整**：根据可用显存自动调整瓦片大小 115 | - **智能内存评估**：基于实际张量计算显存需求 116 | - **流式处理**：使用多CUDA流并行处理数据 117 | 118 | ### 数值稳定性 119 | 120 | 针对编译模型可能出现的数值范围问题，提供了增强的后处理： 121 | 122 | - 异常值检测和裁剪 123 | - 分位数归一化 124 | - 统计驱动的范围调整 125 | 126 | ## 故障排除 127 | 128 | ### 常见问题 129 | 130 | **Q: 编译失败或出现错误** 131 | A: 尝试禁用 `enable_compile` 使用普通模式，或检查PyTorch版本是否支持编译。 132 | 133 | **Q: 显存不足** 134 | A: 尝试使用内存优化模式，减小 `tile_size`，或启用 `use_autocast`。 135 | 136 | **Q: 输出图像发白或颜色异常** 137 | A: 这是编译模型的常见问题，插件已内置增强后处理。如果问题持续，尝试禁用模型编译。 138 | 139 | **Q: 性能没有提升** 140 | A: 确保使用支持的GPU和PyTorch 2.0+版本，首次运行需要编译时间。 141 | 142 | ### 日志调试 143 | 144 | 插件会输出详细的调试信息，包括： 145 | 146 | - 模型编译状态 147 | - 内存使用情况 148 | - 处理时间统计 149 | - 设备跟踪信息 150 | 151 | 查看ComfyUI控制台输出可以了解详细的运行状态。 152 | 153 | ## 许可证 154 | 155 | 本项目基于 MIT 许可证开源 - 查看 [LICENSE](LICENSE) 文件了解详情。 156 | 157 | ## 贡献 158 | 159 | 欢迎提交Issue和Pull Request来改进这个项目。 160 | -------------------------------------------------------------------------------- /workflows/sample.json: -------------------------------------------------------------------------------- 1 | {"id":"771c167f-3b82-420e-b867-457e7a1843ee","revision":0,"last_node_id":5,"last_link_id":3,"nodes":[{"id":2,"type":"UpscaleModelLoader","pos":[871.7640380859375,-4999.3056640625],"size":[270,58],"flags":{},"order":0,"mode":0,"inputs":[{"localized_name":"模型名称","name":"model_name","type":"COMBO","widget":{"name":"model_name"},"link":null}],"outputs":[{"localized_name":"放大模型","name":"UPSCALE_MODEL","type":"UPSCALE_MODEL","links":[1]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.65","Node name for S&R":"UpscaleModelLoader","ue_properties":{"widget_ue_connectable":{},"input_ue_unconnectable":{},"version":"7.2.2"}},"widgets_values":["RealESRGAN_x2plus.pth"]},{"id":3,"type":"VHS_VideoCombine","pos":[1682.199462890625,-4868.9638671875],"size":[490.8441467285156,334],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"images","name":"images","type":"IMAGE","link":2},{"localized_name":"audio","name":"audio","shape":7,"type":"AUDIO","link":null},{"localized_name":"meta_batch","name":"meta_batch","shape":7,"type":"VHS_BatchManager","link":null},{"localized_name":"vae","name":"vae","shape":7,"type":"VAE","link":null},{"localized_name":"frame_rate","name":"frame_rate","type":"FLOAT","widget":{"name":"frame_rate"},"link":null},{"localized_name":"loop_count","name":"loop_count","type":"INT","widget":{"name":"loop_count"},"link":null},{"localized_name":"filename_prefix","name":"filename_prefix","type":"STRING","widget":{"name":"filename_prefix"},"link":null},{"localized_name":"format","name":"format","type":"COMBO","widget":{"name":"format"},"link":null},{"localized_name":"pingpong","name":"pingpong","type":"BOOLEAN","widget":{"name":"pingpong"},"link":null},{"localized_name":"save_output","name":"save_output","type":"BOOLEAN","widget":{"name":"save_output"},"link":null},{"name":"pix_fmt","type":["yuv420p","yuv420p10le"],"widget":{"name":"pix_fmt"},"link":null},{"name":"crf","type":"INT","widget":{"name":"crf"},"link":null},{"name":"save_metadata","type":"BOOLEAN","widget":{"name":"save_metadata"},"link":null},{"name":"trim_to_audio","type":"BOOLEAN","widget":{"name":"trim_to_audio"},"link":null}],"outputs":[{"localized_name":"Filenames","name":"Filenames","type":"VHS_FILENAMES","links":null}],"properties":{"cnr_id":"comfyui-videohelpersuite","ver":"08e8df15db24da292d4b7f943c460dc2ab442b24","Node name for S&R":"VHS_VideoCombine","ue_properties":{"widget_ue_connectable":{},"input_ue_unconnectable":{},"version":"7.2.2"}},"widgets_values":{"frame_rate":24,"loop_count":0,"filename_prefix":"AnimateDiff","format":"video/h264-mp4","pix_fmt":"yuv420p","crf":19,"save_metadata":true,"trim_to_audio":false,"pingpong":false,"save_output":false,"videopreview":{"hidden":false,"paused":false,"params":{"filename":"AnimateDiff_00004.mp4","subfolder":"","type":"temp","format":"video/h264-mp4","frame_rate":24,"workflow":"AnimateDiff_00004.png","fullpath":"C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\latentsync_81337309\\AnimateDiff_00004.mp4"}}}},{"id":4,"type":"VHS_LoadVideo","pos":[871.8960571289062,-4805.31884765625],"size":[253.279296875,738.4962768554688],"flags":{},"order":1,"mode":0,"inputs":[{"localized_name":"meta_batch","name":"meta_batch","shape":7,"type":"VHS_BatchManager","link":null},{"localized_name":"vae","name":"vae","shape":7,"type":"VAE","link":null},{"localized_name":"video","name":"video","type":"COMBO","widget":{"name":"video"},"link":null},{"localized_name":"force_rate","name":"force_rate","type":"FLOAT","widget":{"name":"force_rate"},"link":null},{"localized_name":"custom_width","name":"custom_width","type":"INT","widget":{"name":"custom_width"},"link":null},{"localized_name":"custom_height","name":"custom_height","type":"INT","widget":{"name":"custom_height"},"link":null},{"localized_name":"frame_load_cap","name":"frame_load_cap","type":"INT","widget":{"name":"frame_load_cap"},"link":null},{"localized_name":"skip_first_frames","name":"skip_first_frames","type":"INT","widget":{"name":"skip_first_frames"},"link":null},{"localized_name":"select_every_nth","name":"select_every_nth","type":"INT","widget":{"name":"select_every_nth"},"link":null},{"localized_name":"format","name":"format","shape":7,"type":"COMBO","widget":{"name":"format"},"link":null}],"outputs":[{"localized_name":"图像","name":"IMAGE","type":"IMAGE","links":[3]},{"localized_name":"frame_count","name":"frame_count","type":"INT","links":null},{"localized_name":"audio","name":"audio","type":"AUDIO","links":null},{"localized_name":"video_info","name":"video_info","type":"VHS_VIDEOINFO","links":null}],"properties":{"cnr_id":"comfyui-videohelpersuite","ver":"08e8df15db24da292d4b7f943c460dc2ab442b24","Node name for S&R":"VHS_LoadVideo","ue_properties":{"widget_ue_connectable":{},"input_ue_unconnectable":{},"version":"7.2.2"}},"widgets_values":{"video":"WanVideoWrapper_Pusa22_00033.mp4","force_rate":0,"custom_width":0,"custom_height":0,"frame_load_cap":0,"skip_first_frames":0,"select_every_nth":1,"format":"AnimateDiff","choose video to upload":"image","videopreview":{"hidden":false,"paused":false,"params":{"filename":"WanVideoWrapper_Pusa22_00033.mp4","type":"input","format":"video/mp4","force_rate":0,"custom_width":0,"custom_height":512,"frame_load_cap":0,"skip_first_frames":0,"select_every_nth":1}}}},{"id":1,"type":"ImageUpscaleWithModelCUDAspeedFixed","pos":[1234.9473876953125,-4857.76611328125],"size":[381.3081970214844,222],"flags":{},"order":2,"mode":0,"inputs":[{"localized_name":"upscale_model","name":"upscale_model","type":"UPSCALE_MODEL","link":1},{"localized_name":"image","name":"image","type":"IMAGE","link":3},{"localized_name":"use_autocast","name":"use_autocast","type":"COMBO","widget":{"name":"use_autocast"},"link":null},{"localized_name":"precision","name":"precision","type":"COMBO","widget":{"name":"precision"},"link":null},{"localized_name":"tile_size","name":"tile_size","type":"INT","widget":{"name":"tile_size"},"link":null},{"localized_name":"overlap","name":"overlap","type":"INT","widget":{"name":"overlap"},"link":null},{"localized_name":"enable_compile","name":"enable_compile","type":"COMBO","widget":{"name":"enable_compile"},"link":null},{"localized_name":"optimization_level","name":"optimization_level","type":"COMBO","widget":{"name":"optimization_level"},"link":null},{"localized_name":"batch_size","name":"batch_size","shape":7,"type":"INT","widget":{"name":"batch_size"},"link":null}],"outputs":[{"localized_name":"图像","name":"IMAGE","type":"IMAGE","links":[2]}],"properties":{"aux_id":"piscesbody/ComfyUI-Upscale-CUDAspeed","ver":"3b36bc441f07905d7b4f258feb789b52ef3be57e","Node name for S&R":"ImageUpscaleWithModelCUDAspeedFixed","ue_properties":{"widget_ue_connectable":{},"input_ue_unconnectable":{},"version":"7.2.2"}},"widgets_values":["enable","fp16",1280,8,"enable","speed",1]}],"links":[[1,2,0,1,0,"UPSCALE_MODEL"],[2,1,0,3,0,"IMAGE"],[3,4,0,1,1,"IMAGE"]],"groups":[],"config":{},"extra":{"ue_links":[],"links_added_by_ue":[],"ds":{"scale":1.3109994191500645,"offset":[-876.3402639284977,5042.450908931591]}},"version":0.4} -------------------------------------------------------------------------------- /upscale_cudaspeed.py: -------------------------------------------------------------------------------- 1 | """ 2 | ComfyUI Upscale CUDAspeed 3 | 优化编译后长时间处理和尺寸变化重新编译的问题 4 | """ 5 | 6 | import logging 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from spandrel import ModelLoader, ImageModelDescriptor 11 | from comfy import model_management 12 | import comfy.utils 13 | import folder_paths 14 | from typing import Optional, Tuple, List 15 | import math 16 | import time 17 | import gc 18 | import os 19 | import pickle 20 | import hashlib 21 | 22 | try: 23 | from tqdm import tqdm 24 | tqdm_available = True 25 | except ImportError: 26 | tqdm_available = False 27 | print("tqdm not available, using basic progress indicators") 28 | 29 | try: 30 | from spandrel_extra_arches import EXTRA_REGISTRY 31 | from spandrel import MAIN_REGISTRY 32 | MAIN_REGISTRY.add(*EXTRA_REGISTRY) 33 | logging.info("成功导入spandrel_extra_arches：支持非商业放大模型。") 34 | except: 35 | pass 36 | 37 | class UpscaleModelLoader: 38 | @classmethod 39 | def INPUT_TYPES(s): 40 | return {"required": { "model_name": (folder_paths.get_filename_list("upscale_models"), ), 41 | }} 42 | RETURN_TYPES = ("UPSCALE_MODEL",) 43 | FUNCTION = "load_model" 44 | CATEGORY = "loaders" 45 | 46 | def load_model(self, model_name): 47 | model_path = folder_paths.get_full_path_or_raise("upscale_models", model_name) 48 | sd = comfy.utils.load_torch_file(model_path, safe_load=True) 49 | if "module.layers.0.residual_group.blocks.0.norm1.weight" in sd: 50 | sd = comfy.utils.state_dict_prefix_replace(sd, {"module.":""}) 51 | out = ModelLoader().load_from_state_dict(sd).eval() 52 | 53 | if not isinstance(out, ImageModelDescriptor): 54 | raise Exception("放大模型必须是单图像模型。") 55 | 56 | return (out, ) 57 | 58 | 59 | class ImageUpscaleWithModelCUDAspeedFixed: 60 | """高性能放大节点 61 | 优化编译后长时间处理和尺寸变化重新编译的问题 62 | """ 63 | 64 | @classmethod 65 | def INPUT_TYPES(s): 66 | return { 67 | "required": { 68 | "upscale_model": ("UPSCALE_MODEL",), 69 | "image": ("IMAGE",), 70 | "use_autocast": (["enable", "disable"], {"default": "enable"}), 71 | "precision": (["auto", "fp16", "fp32", "bf16"], {"default": "auto"}), 72 | "tile_size": ("INT", {"default": 0, "min": 0, "max": 2048, "step": 64}), 73 | "overlap": ("INT", {"default": 0, "min": 0, "max": 128, "step": 8}), 74 | "enable_compile": (["enable", "disable"], {"default": "enable"}), 75 | "optimization_level": (["balanced", "speed", "memory"], {"default": "balanced"}), 76 | }, 77 | "optional": { 78 | "batch_size": ("INT", {"default": 1, "min": 1, "max": 16, "step": 1}), 79 | } 80 | } 81 | 82 | RETURN_TYPES = ("IMAGE",) 83 | FUNCTION = "upscale" 84 | CATEGORY = "image/upscaling" 85 | 86 | # 编译后的模型缓存（类变量，在实例间共享） 87 | _compiled_models = {} 88 | 89 | # 编译模型存储目录（用于记录编译状态） 90 | _compiled_models_dir = None 91 | 92 | # 运行时编译缓存（类变量，在实例间共享） 93 | _runtime_compiled_models = {} 94 | 95 | # 尺寸编译缓存 - 关键修复：为不同尺寸缓存编译结果 96 | _size_compiled_models = {} 97 | 98 | def __init__(self): 99 | """初始化模型存储目录""" 100 | # 设置编译模型存储目录 101 | current_dir = os.path.dirname(os.path.abspath(__file__)) 102 | self._compiled_models_dir = os.path.join(current_dir, "compiled_models") 103 | os.makedirs(self._compiled_models_dir, exist_ok=True) 104 | print(f"📁 编译模型存储目录: {self._compiled_models_dir}") 105 | 106 | # 初始化运行时缓存（如果是第一次实例化） 107 | if not hasattr(ImageUpscaleWithModelCUDAspeedFixed, '_runtime_compiled_models'): 108 | ImageUpscaleWithModelCUDAspeedFixed._runtime_compiled_models = {} 109 | 110 | # 初始化尺寸编译缓存 111 | if not hasattr(ImageUpscaleWithModelCUDAspeedFixed, '_size_compiled_models'): 112 | ImageUpscaleWithModelCUDAspeedFixed._size_compiled_models = {} 113 | 114 | # 加载已编译模型记录 115 | self._load_compiled_models_info() 116 | 117 | # 调试：显示运行时缓存状态 118 | print(f"🔍 运行时缓存状态: {len(ImageUpscaleWithModelCUDAspeedFixed._runtime_compiled_models)} 个编译模型") 119 | print(f"🔍 尺寸缓存状态: {len(ImageUpscaleWithModelCUDAspeedFixed._size_compiled_models)} 个尺寸缓存") 120 | 121 | def _get_model_hash(self, model_state_dict): 122 | """生成模型状态字典的哈希值""" 123 | # 创建一个简化的模型状态用于哈希计算 124 | simplified_state = {} 125 | for key, value in model_state_dict.items(): 126 | # 只取部分关键参数计算哈希，避免计算量过大 127 | if 'weight' in key or 'bias' in key: 128 | # 取前100个元素计算哈希 129 | flat_value = value.flatten() 130 | sample_size = min(100, len(flat_value)) 131 | simplified_state[key] = flat_value[:sample_size].cpu().numpy().tobytes() 132 | 133 | # 计算哈希 134 | hash_obj = hashlib.md5() 135 | for key in sorted(simplified_state.keys()): 136 | hash_obj.update(simplified_state[key]) 137 | 138 | return hash_obj.hexdigest() 139 | 140 | def _get_compiled_model_path(self, model_hash): 141 | """获取编译模型信息文件路径""" 142 | return os.path.join(self._compiled_models_dir, f"compiled_{model_hash}.pkl") 143 | 144 | def _load_compiled_models_info(self): 145 | """加载已编译模型信息（仅记录，不加载编译函数）""" 146 | print("🔍 检查已编译的模型信息...") 147 | loaded_count = 0 148 | 149 | for filename in os.listdir(self._compiled_models_dir): 150 | if filename.startswith("compiled_") and filename.endswith(".pkl"): 151 | file_path = os.path.join(self._compiled_models_dir, filename) 152 | try: 153 | # 只检查文件是否存在，不实际加载编译函数 154 | if os.path.getsize(file_path) > 0: 155 | # 只记录模型哈希，不加载编译函数 156 | model_hash = filename.replace("compiled_", "").replace(".pkl", "") 157 | self._compiled_models[model_hash] = True # 标记为已编译 158 | loaded_count += 1 159 | print(f" ✅ 发现编译模型记录: {filename}") 160 | 161 | except Exception as e: 162 | print(f" ❌ 检查编译模型失败 {filename}: {e}") 163 | 164 | print(f"📊 发现 {loaded_count} 个编译模型记录") 165 | 166 | def _save_compiled_model_info(self, model_hash): 167 | """保存编译模型信息到文件（不保存实际的编译函数）""" 168 | try: 169 | # 不保存编译函数本身，只保存编译记录 170 | compiled_data = { 171 | 'model_hash': model_hash, 172 | 'save_time': time.time(), 173 | 'compile_info': '模型已编译，编译函数无法序列化保存' 174 | } 175 | 176 | file_path = self._get_compiled_model_path(model_hash) 177 | with open(file_path, 'wb') as f: 178 | pickle.dump(compiled_data, f) 179 | 180 | print(f"💾 编译模型信息已保存: {os.path.basename(file_path)}") 181 | return True 182 | 183 | except Exception as e: 184 | print(f"❌ 保存编译模型信息失败: {e}") 185 | return False 186 | 187 | def upscale(self, upscale_model, image, use_autocast="enable", precision="auto", 188 | tile_size=0, overlap=0, enable_compile="enable", optimization_level="balanced", 189 | batch_size=1): 190 | 191 | print(f"🚀 开始图像放大处理") 192 | print(f"📊 输入图像尺寸: {image.shape}") 193 | 194 | # 获取模型信息 195 | model_name = self._get_model_name(upscale_model) 196 | print(f"🔧 使用放大模型: {model_name}, 模型缩放比例: {upscale_model.scale}") 197 | print(f"⚙️ 使用参数 - 自动混合精度: {use_autocast}, 精度: {precision}") 198 | print(f"🔧 优化级别: {optimization_level}, 模型编译: {enable_compile}") 199 | 200 | # 详细性能监控 201 | total_start_time = time.time() 202 | phase_start_time = total_start_time 203 | 204 | # 确定精度和优化设置 205 | dtype, autocast_enabled = self._determine_precision(precision, use_autocast) 206 | phase_end_time = time.time() 207 | print(f"⏱️ 精度设置完成 - 耗时: {phase_end_time - phase_start_time:.3f}秒") 208 | phase_start_time = phase_end_time 209 | 210 | # 智能参数计算 211 | tile_size, overlap = self._calculate_optimal_tile_size( 212 | image.shape, upscale_model.scale, tile_size, overlap, optimization_level 213 | ) 214 | phase_end_time = time.time() 215 | print(f"⏱️ 参数计算完成 - 耗时: {phase_end_time - phase_start_time:.3f}秒") 216 | phase_start_time = phase_end_time 217 | 218 | print(f"📐 优化参数 - 瓦片大小: {tile_size}, 重叠: {overlap}") 219 | 220 | # 执行放大处理 221 | result = self._upscale_fixed( 222 | upscale_model, image, dtype, autocast_enabled, 223 | tile_size, overlap, enable_compile, batch_size 224 | ) 225 | 226 | # 性能统计 227 | total_end_time = time.time() 228 | processing_time = total_end_time - total_start_time 229 | print(f"✅ 图像放大处理完成 - 总耗时: {processing_time:.2f}秒") 230 | print(f"📊 输出图像尺寸: {result[0].shape}") 231 | 232 | return result 233 | 234 | def _get_model_name(self, upscale_model): 235 | """获取模型名称信息""" 236 | model_name = getattr(upscale_model, 'name', None) 237 | if model_name is None: 238 | model_name = getattr(upscale_model, '__class__', type(upscale_model)).__name__ 239 | if hasattr(upscale_model, 'model'): 240 | underlying_model = getattr(upscale_model.model, '__class__', None) 241 | if underlying_model: 242 | model_name = f"{model_name}({underlying_model.__name__})" 243 | else: 244 | model_name = type(upscale_model).__name__ 245 | return model_name 246 | 247 | def _determine_precision(self, precision, use_autocast): 248 | """确定精度设置""" 249 | if precision == "auto": 250 | if model_management.should_use_fp16(): 251 | precision = "fp16" 252 | else: 253 | precision = "fp32" 254 | 255 | dtype = torch.float32 256 | autocast_enabled = False 257 | 258 | if use_autocast == "enable": 259 | if precision == "fp16": 260 | dtype = torch.float16 261 | autocast_enabled = True 262 | elif precision == "bf16": 263 | dtype = torch.bfloat16 264 | autocast_enabled = True 265 | 266 | return dtype, autocast_enabled 267 | 268 | def _calculate_optimal_tile_size(self, image_shape, scale_factor, tile_size, overlap, optimization_level): 269 | """智能计算最优瓦片大小和重叠""" 270 | _, _, height, width = image_shape if len(image_shape) == 4 else (1, *image_shape[1:]) 271 | 272 | # 如果用户指定了参数，使用用户指定的值 273 | if tile_size > 0 and overlap > 0: 274 | return tile_size, overlap 275 | 276 | # 根据优化级别计算默认值 277 | if optimization_level == "speed": 278 | base_tile = 512 # 优化：减小默认瓦片大小，避免过大瓦片导致性能下降 279 | base_overlap = 16 280 | elif optimization_level == "memory": 281 | base_tile = 256 # 小瓦片，节省内存 282 | base_overlap = 24 283 | else: # balanced 284 | base_tile = 384 285 | base_overlap = 32 286 | 287 | # 根据图像尺寸智能调整瓦片大小 288 | max_dim = max(height, width) 289 | 290 | # 优化：更智能的瓦片大小计算 291 | if max_dim <= 512: 292 | tile_size = min(512, base_tile) 293 | elif max_dim <= 1024: 294 | tile_size = min(512, base_tile) # 对于1080p以下图像，使用512瓦片 295 | elif max_dim <= 1920: 296 | tile_size = min(640, base_tile) # 对于2K图像，使用640瓦片 297 | else: 298 | tile_size = base_tile 299 | 300 | # 优化：根据实际图像尺寸进一步调整 301 | # 如果图像尺寸小于瓦片大小，直接使用图像尺寸 302 | if height < tile_size and width < tile_size: 303 | tile_size = max(height, width) 304 | 305 | # 根据缩放比例调整重叠 306 | overlap = max(8, base_overlap // max(1, int(scale_factor))) 307 | 308 | print(f"🔧 智能瓦片计算 - 图像尺寸: {width}x{height}, 计算瓦片: {tile_size}x{tile_size}, 重叠: {overlap}") 309 | 310 | return tile_size, overlap 311 | 312 | def _upscale_fixed(self, upscale_model, image, dtype, autocast_enabled, 313 | tile_size, overlap, enable_compile, batch_size): 314 | """修复性能问题的单GPU放大实现""" 315 | device = model_management.get_torch_device() 316 | print(f"💻 使用设备: {device}") 317 | print(f"🔍 设备跟踪 - _upscale_fixed入口: 输入图像设备={image.device}") 318 | 319 | # 先将原始模型移到设备 320 | upscale_model.to(device) 321 | 322 | # 准备编译模型 - 关键修复：使用尺寸感知的编译缓存 323 | use_compiled_model = False 324 | compiled_forward = None 325 | 326 | # 生成尺寸键用于缓存 327 | size_key = f"{image.shape[2]}x{image.shape[3]}" # 高度x宽度 328 | print(f"📐 当前输入尺寸: {size_key}") 329 | 330 | if enable_compile == "enable" and hasattr(torch, 'compile'): 331 | # 获取模型哈希作为唯一标识 332 | model_hash = None 333 | try: 334 | if hasattr(upscale_model, 'model') and hasattr(upscale_model.model, 'state_dict'): 335 | model_state_dict = upscale_model.model.state_dict() 336 | model_hash = self._get_model_hash(model_state_dict) 337 | print(f"🔑 模型哈希: {model_hash}") 338 | except Exception as e: 339 | print(f"⚠️ 获取模型哈希失败: {e}") 340 | model_hash = None 341 | 342 | # 检查是否有编译记录 343 | has_compile_record = model_hash and model_hash in self._compiled_models 344 | 345 | # 使用尺寸感知的缓存键 346 | model_key = f"{model_hash}_{size_key}" if model_hash else f"{id(upscale_model)}_{size_key}" 347 | 348 | # 调试：显示缓存查找状态 349 | print(f"🔍 缓存查找 - 模型键: {model_key}") 350 | print(f"🔍 运行时缓存中存在: {model_key in ImageUpscaleWithModelCUDAspeedFixed._runtime_compiled_models}") 351 | print(f"🔍 尺寸缓存中存在: {model_key in ImageUpscaleWithModelCUDAspeedFixed._size_compiled_models}") 352 | print(f"🔍 编译记录 - 模型哈希: {model_hash}, 记录存在: {has_compile_record}") 353 | 354 | # 关键修复：优先检查尺寸缓存 355 | if model_key in ImageUpscaleWithModelCUDAspeedFixed._size_compiled_models: 356 | # 使用尺寸缓存的编译模型 357 | compiled_forward = ImageUpscaleWithModelCUDAspeedFixed._size_compiled_models[model_key] 358 | use_compiled_model = True 359 | print(f"✅ 使用已编译模型 (尺寸缓存: {size_key})") 360 | elif model_key in ImageUpscaleWithModelCUDAspeedFixed._runtime_compiled_models: 361 | # 使用运行时缓存的编译模型 362 | compiled_forward = ImageUpscaleWithModelCUDAspeedFixed._runtime_compiled_models[model_key] 363 | use_compiled_model = True 364 | print(f"✅ 使用已编译模型 (运行时缓存: {size_key})") 365 | else: 366 | # 需要重新编译 367 | if has_compile_record: 368 | print(f"🔧 重新编译模型 (已有记录，但尺寸 {size_key} 未缓存)...") 369 | else: 370 | print(f"🔧 编译模型以优化性能 (尺寸: {size_key})...") 371 | 372 | try: 373 | # 尝试编译模型的forward方法 374 | if hasattr(upscale_model, 'model') and hasattr(upscale_model.model, 'forward'): 375 | # 使用最安全的编译配置，完全避免CUDA图问题 376 | import os 377 | os.environ["TORCHINDUCTOR_CUDAGRAPHS"] = "0" 378 | torch._inductor.config.triton.cudagraphs = False 379 | torch._inductor.config.triton.cudagraph_trees = False 380 | 381 | # 简化的编译过程 - 移除复杂的进度条 382 | print("🔄 开始模型编译... (这可能需要几秒钟)") 383 | compile_start_time = time.time() 384 | 385 | # 使用最简单的编译模式 386 | compiled_forward = torch.compile( 387 | upscale_model.model.forward, 388 | mode="default", 389 | fullgraph=False, 390 | dynamic=False # 固定尺寸编译，性能更好 391 | ) 392 | 393 | compile_end_time = time.time() 394 | compile_time = compile_end_time - compile_start_time 395 | 396 | print(f"✅ 编译完成 - 耗时: {compile_time:.2f}秒") 397 | 398 | # 关键修复：同时保存到运行时缓存和尺寸缓存 399 | ImageUpscaleWithModelCUDAspeedFixed._runtime_compiled_models[model_key] = compiled_forward 400 | ImageUpscaleWithModelCUDAspeedFixed._size_compiled_models[model_key] = compiled_forward 401 | 402 | # 保存编译记录（不保存编译函数本身） 403 | if model_hash and not has_compile_record: 404 | self._compiled_models[model_hash] = True 405 | self._save_compiled_model_info(model_hash) 406 | print("✅ 模型编译成功并已记录") 407 | else: 408 | print("✅ 模型编译成功") 409 | 410 | use_compiled_model = True 411 | 412 | else: 413 | print("⚠️ 模型结构不支持编译，使用普通模式") 414 | use_compiled_model = False 415 | except Exception as e: 416 | print(f"⚠️ 模型编译失败，使用普通模式: {e}") 417 | use_compiled_model = False 418 | 419 | # 启用Tensor Core优化 420 | torch.backends.cudnn.allow_tf32 = True 421 | torch.backends.cuda.matmul.allow_tf32 = True 422 | 423 | # 创建优化的CUDA流 424 | compute_stream = torch.cuda.Stream(device) 425 | data_stream = torch.cuda.Stream(device) 426 | 427 | # 异步数据预处理：在编译模型的同时准备输入数据 428 | print("🔄 开始异步数据预处理...") 429 | data_prep_start = time.time() 430 | 431 | # 准备输入图像（异步） 432 | with torch.cuda.stream(data_stream): 433 | in_img = image.movedim(-1, -3).to(device, non_blocking=True) 434 | 435 | data_prep_end = time.time() 436 | print(f"⏱️ 数据预处理完成 - 耗时: {data_prep_end - data_prep_start:.2f}秒") 437 | 438 | # 内存管理 439 | print("🔄 开始内存优化...") 440 | memory_start = time.time() 441 | self._optimize_memory_usage(upscale_model, in_img, tile_size, device) 442 | memory_end = time.time() 443 | print(f"⏱️ 内存优化完成 - 耗时: {memory_end - memory_start:.2f}秒") 444 | 445 | # 等待数据预处理完成 446 | print("🔄 等待数据预处理完成...") 447 | data_stream.synchronize() 448 | 449 | # 执行放大处理 450 | try: 451 | result = self._process_tiles_fixed( 452 | upscale_model, compiled_forward, use_compiled_model, in_img, 453 | autocast_enabled, dtype, tile_size, overlap, compute_stream, 454 | data_stream, batch_size, device 455 | ) 456 | 457 | # 智能显存管理：根据显存情况决定输出设备 458 | result = self._smart_memory_management(result, upscale_model, device) 459 | 460 | finally: 461 | # 清理内存 462 | upscale_model.to("cpu") 463 | if hasattr(torch.cuda, 'empty_cache'): 464 | torch.cuda.empty_cache() 465 | 466 | return result 467 | 468 | def _optimize_memory_usage(self, upscale_model, image, tile_size, device): 469 | """优化内存使用""" 470 | # 计算内存需求 471 | memory_required = model_management.module_size(upscale_model.model) 472 | memory_required += (tile_size * tile_size * 3) * image.element_size() * 384.0 473 | memory_required += image.nelement() * image.element_size() 474 | 475 | # 释放内存 476 | model_management.free_memory(memory_required, device) 477 | 478 | # 预分配GPU内存池（如果可用） 479 | if hasattr(torch.cuda, 'memory_allocated'): 480 | current_allocated = torch.cuda.memory_allocated(device) 481 | print(f"💾 GPU内存使用: {current_allocated / 1024**3:.2f} GB") 482 | 483 | def _process_tiles_fixed(self, upscale_model, compiled_forward, use_compiled_model, in_img, 484 | autocast_enabled, dtype, tile_size, overlap, compute_stream, 485 | data_stream, batch_size, device): 486 | """修复的瓦片处理 - 简化流程，移除不必要的预热""" 487 | print(f"🔍 设备跟踪 - _process_tiles_fixed入口: 输入图像设备={in_img.device}") 488 | oom = True 489 | current_tile_size = tile_size 490 | max_retries = 3 491 | retry_count = 0 492 | 493 | while oom and retry_count < max_retries: 494 | try: 495 | # 计算处理步骤 496 | steps = in_img.shape[0] * comfy.utils.get_tiled_scale_steps( 497 | in_img.shape[3], in_img.shape[2], 498 | tile_x=current_tile_size, tile_y=current_tile_size, 499 | overlap=overlap 500 | ) 501 | print(f"📈 预计处理步骤数: {steps}, 当前瓦片大小: {current_tile_size}x{current_tile_size}") 502 | 503 | # 创建进度条 504 | pbar = self._create_progress_bar(steps) 505 | 506 | # 优化的放大函数 - 支持编译和普通模式 507 | def upscale_fn(x): 508 | with torch.cuda.stream(compute_stream): 509 | if use_compiled_model and compiled_forward is not None: 510 | # 使用编译后的forward函数 511 | if autocast_enabled: 512 | with torch.autocast(device_type="cuda", dtype=dtype): 513 | # 编译后的函数已经绑定了模型实例 514 | result = compiled_forward(x) 515 | else: 516 | result = compiled_forward(x) 517 | else: 518 | # 使用原始模型 519 | if autocast_enabled: 520 | with torch.autocast(device_type="cuda", dtype=dtype): 521 | result = upscale_model(x) 522 | else: 523 | result = upscale_model(x) 524 | 525 | # 确保输出数据类型正确 526 | if autocast_enabled and result.dtype != torch.float32: 527 | result = result.float() 528 | 529 | compute_stream.synchronize() 530 | return result 531 | 532 | # 使用优化的瓦片缩放 533 | print("🔄 开始tiled_scale处理...") 534 | print(f"🔍 设备跟踪 - tiled_scale调用前: 输入设备={in_img.device}") 535 | tiled_scale_start_time = time.time() 536 | 537 | # 执行实际的tiled_scale处理 538 | with torch.no_grad(): 539 | s = comfy.utils.tiled_scale( 540 | in_img, 541 | upscale_fn, 542 | tile_x=current_tile_size, 543 | tile_y=current_tile_size, 544 | overlap=overlap, 545 | upscale_amount=upscale_model.scale, 546 | output_device=device, # 关键优化：直接输出到GPU，避免不必要的CPU传输 547 | pbar=pbar 548 | ) 549 | 550 | tiled_scale_end_time = time.time() 551 | print(f"✅ tiled_scale处理完成 - 耗时: {tiled_scale_end_time - tiled_scale_start_time:.3f}秒") 552 | print(f"🔍 设备跟踪 - tiled_scale调用后: 输出设备={s.device}") 553 | 554 | oom = False 555 | 556 | # 关闭进度条 557 | if hasattr(pbar, 'close'): 558 | pbar.close() 559 | 560 | except model_management.OOM_EXCEPTION as e: 561 | retry_count += 1 562 | current_tile_size = max(128, current_tile_size // 2) 563 | print(f"⚠️ 内存不足，减小瓦片大小到 {current_tile_size}x{current_tile_size} (重试 {retry_count}/{max_retries})") 564 | 565 | if current_tile_size < 128: 566 | raise e 567 | 568 | if oom: 569 | raise model_management.OOM_EXCEPTION("无法在可用内存内处理图像") 570 | 571 | # 优化：由于tiled_scale已直接输出到GPU，直接使用GPU后处理 572 | print("🔍 检查输出设备状态...") 573 | print(f"📊 输出张量设备: {s.device}, 形状: {s.shape}") 574 | 575 | # 确保在GPU上进行后处理 576 | if s.device.type != 'cuda': 577 | print(f"🔄 将结果移动到GPU进行后处理 (当前设备: {s.device})") 578 | s = s.to(device, non_blocking=True) 579 | print(f"✅ 结果已移动到GPU: {s.device}") 580 | 581 | # 使用GPU后处理 582 | s = self._gpu_post_process(s, device) 583 | 584 | return (s,) 585 | 586 | def _create_progress_bar(self, steps): 587 | """创建进度条""" 588 | if tqdm_available: 589 | return tqdm(total=steps, desc="单GPU放大处理", unit="tile", leave=False) 590 | else: 591 | return comfy.utils.ProgressBar(steps) 592 | 593 | def _post_process_output(self, output_tensor): 594 | """修复编译模型输出发白问题的后处理""" 595 | print(f"🔧 开始增强后处理，输入设备: {output_tensor.device}") 596 | print(f"🔍 设备跟踪 - _post_process_output: 输入设备={output_tensor.device}") 597 | 598 | # 调整维度顺序 599 | s = output_tensor.movedim(-3, -1) 600 | print(f"🔍 设备跟踪 - movedim后: 设备={s.device}") 601 | 602 | # 处理非数值 603 | s = torch.nan_to_num(s, nan=0.0, posinf=1.0, neginf=0.0) 604 | print(f"🔍 设备跟踪 - nan_to_num后: 设备={s.device}") 605 | 606 | # 详细的数值统计分析 607 | s_min = torch.min(s) 608 | s_max = torch.max(s) 609 | s_mean = torch.mean(s) 610 | s_std = torch.std(s) 611 | 612 | print(f"📊 原始输出统计 - 最小值: {s_min:.4f}, 最大值: {s_max:.4f}, 平均值: {s_mean:.4f}, 标准差: {s_std:.4f}") 613 | 614 | # 检测编译模型特有的数值范围问题 615 | if s_max > 10.0 or s_min < -5.0: 616 | # 严重范围偏移 - 编译模型常见问题 617 | print("⚠️ 检测到严重数值范围偏移，进行深度归一化") 618 | 619 | # 方法1: 基于统计的归一化 620 | if s_std > 0.01: # 有合理的分布 621 | # 使用3-sigma规则裁剪异常值 622 | lower_bound = s_mean - 3 * s_std 623 | upper_bound = s_mean + 3 * s_std 624 | s = torch.clamp(s, min=lower_bound, max=upper_bound) 625 | 626 | # 重新计算统计量 627 | s_min = torch.min(s) 628 | s_max = torch.max(s) 629 | 630 | # 方法2: 分位数归一化（更鲁棒） 631 | try: 632 | # 使用分位数避免极端值影响 633 | q_low = torch.quantile(s, 0.01) 634 | q_high = torch.quantile(s, 0.99) 635 | s = torch.clamp(s, min=q_low, max=q_high) 636 | 637 | # 重新计算统计量 638 | s_min = torch.min(s) 639 | s_max = torch.max(s) 640 | except: 641 | pass # 分位数计算失败时使用原有方法 642 | 643 | # 最终归一化到[0,1] 644 | if s_max - s_min > 1e-6: 645 | s = (s - s_min) / (s_max - s_min) 646 | else: 647 | s = torch.zeros_like(s) # 全零情况 648 | 649 | elif s_max > 1.0 or s_min < 0.0: 650 | # 轻微范围偏移 651 | print("⚠️ 检测到轻微数值偏移，进行裁剪归一化") 652 | 653 | # 限制到合理范围 654 | s = torch.clamp(s, min=0.0, max=s_max) 655 | 656 | # 如果最大值仍然大于1，进行缩放 657 | if s_max > 1.0: 658 | s = s / s_max 659 | 660 | else: 661 | # 正常范围，直接限制 662 | s = torch.clamp(s, min=0.0, max=1.0) 663 | 664 | # 最终确保在[0,1]范围内 665 | s = torch.clamp(s, min=0.0, max=1.0) 666 | 667 | # 最终统计验证 668 | final_min = torch.min(s) 669 | final_max = torch.max(s) 670 | final_mean = torch.mean(s) 671 | 672 | print(f"✅ 处理后统计 - 最小值: {final_min:.4f}, 最大值: {final_max:.4f}, 平均值: {final_mean:.4f}") 673 | print(f"🔧 增强后处理完成，输出设备: {s.device}") 674 | 675 | return s 676 | 677 | def _accurate_memory_assessment(self, output_tensor, device): 678 | """优化的显存评估 - 基于实际张量，使用更宽松的阈值""" 679 | # 使用实际张量计算显存需求 680 | output_memory = output_tensor.nelement() * output_tensor.element_size() 681 | 682 | # 获取当前显存状态 683 | if hasattr(torch.cuda, 'get_device_properties'): 684 | total_memory = torch.cuda.get_device_properties(device).total_memory 685 | allocated = torch.cuda.memory_allocated(device) 686 | 687 | # 计算真正的可用显存：总显存 - 已分配显存 688 | actual_available_memory = total_memory - allocated 689 | 690 | # 优化：根据总显存大小动态调整安全余量 691 | if total_memory >= 20 * 1024**3: # 20GB以上大显存显卡 692 | safety_margin = 2 * 1024**3 # 2GB 693 | else: 694 | safety_margin = 4 * 1024**3 # 4GB 695 | 696 | available_memory = actual_available_memory - safety_margin 697 | 698 | print(f"💾 优化显存评估 - 输出张量形状: {output_tensor.shape}") 699 | print(f"💾 优化显存评估 - 元素数量: {output_tensor.nelement()}") 700 | print(f"💾 优化显存评估 - 元素大小: {output_tensor.element_size()} 字节") 701 | print(f"💾 优化显存评估 - 总显存: {total_memory/1024**3:.2f}GB") 702 | print(f"💾 优化显存评估 - 已分配: {allocated/1024**3:.2f}GB") 703 | print(f"💾 优化显存评估 - 实际可用: {actual_available_memory/1024**3:.2f}GB") 704 | print(f"💾 优化显存评估 - 安全余量后可用: {available_memory/1024**3:.2f}GB") 705 | print(f"💾 优化显存评估 - 输出需求: {output_memory/1024**3:.2f}GB") 706 | 707 | # 优化：使用更宽松的检查条件 708 | # 条件1：可用显存足够容纳输出张量 709 | # 条件2：输出张量不超过总显存的60% 710 | memory_condition = available_memory >= output_memory 711 | threshold_condition = output_memory <= total_memory * 0.6 712 | 713 | result = memory_condition and threshold_condition 714 | 715 | if result: 716 | print("✅ 显存评估通过，可以使用GPU处理") 717 | else: 718 | print("❌ 显存评估未通过，使用CPU处理") 719 | 720 | return result 721 | 722 | return False 723 | 724 | def _ensure_gpu_processing(self, tensor, device): 725 | """确保张量在GPU上处理""" 726 | if tensor.device.type != 'cuda': 727 | print(f"🔄 将张量从 {tensor.device} 移动到 GPU") 728 | return tensor.to(device, non_blocking=True) 729 | return tensor 730 | 731 | def _gpu_post_process(self, output_tensor, device): 732 | """GPU上的后处理""" 733 | print(f"🔧 开始GPU增强后处理，输入设备: {output_tensor.device}") 734 | 735 | # 确保输入在GPU上 736 | output_tensor = self._ensure_gpu_processing(output_tensor, device) 737 | 738 | # 调整维度顺序 739 | s = output_tensor.movedim(-3, -1) 740 | print(f"🔍 设备跟踪 - GPU movedim后: 设备={s.device}") 741 | 742 | # 处理非数值 743 | s = torch.nan_to_num(s, nan=0.0, posinf=1.0, neginf=0.0) 744 | print(f"🔍 设备跟踪 - GPU nan_to_num后: 设备={s.device}") 745 | 746 | # 详细的数值统计分析 747 | s_min = torch.min(s) 748 | s_max = torch.max(s) 749 | s_mean = torch.mean(s) 750 | s_std = torch.std(s) 751 | 752 | print(f"📊 GPU原始输出统计 - 最小值: {s_min:.4f}, 最大值: {s_max:.4f}, 平均值: {s_mean:.4f}, 标准差: {s_std:.4f}") 753 | 754 | # 检测编译模型特有的数值范围问题 755 | if s_max > 10.0 or s_min < -5.0: 756 | # 严重范围偏移 - 编译模型常见问题 757 | print("⚠️ GPU检测到严重数值范围偏移，进行深度归一化") 758 | 759 | # 方法1: 基于统计的归一化 760 | if s_std > 0.01: # 有合理的分布 761 | # 使用3-sigma规则裁剪异常值 762 | lower_bound = s_mean - 3 * s_std 763 | upper_bound = s_mean + 3 * s_std 764 | s = torch.clamp(s, min=lower_bound, max=upper_bound) 765 | 766 | # 重新计算统计量 767 | s_min = torch.min(s) 768 | s_max = torch.max(s) 769 | 770 | # 方法2: 分位数归一化（更鲁棒） 771 | try: 772 | # 使用分位数避免极端值影响 773 | q_low = torch.quantile(s, 0.01) 774 | q_high = torch.quantile(s, 0.99) 775 | s = torch.clamp(s, min=q_low, max=q_high) 776 | 777 | # 重新计算统计量 778 | s_min = torch.min(s) 779 | s_max = torch.max(s) 780 | except: 781 | pass # 分位数计算失败时使用原有方法 782 | 783 | # 最终归一化到[0,1] 784 | if s_max - s_min > 1e-6: 785 | s = (s - s_min) / (s_max - s_min) 786 | else: 787 | s = torch.zeros_like(s) # 全零情况 788 | 789 | elif s_max > 1.0 or s_min < 0.0: 790 | # 轻微范围偏移 791 | print("⚠️ GPU检测到轻微数值偏移，进行裁剪归一化") 792 | 793 | # 限制到合理范围 794 | s = torch.clamp(s, min=0.0, max=s_max) 795 | 796 | # 如果最大值仍然大于1，进行缩放 797 | if s_max > 1.0: 798 | s = s / s_max 799 | 800 | else: 801 | # 正常范围，直接限制 802 | s = torch.clamp(s, min=0.0, max=1.0) 803 | 804 | # 最终确保在[0,1]范围内 805 | s = torch.clamp(s, min=0.0, max=1.0) 806 | 807 | # 最终统计验证 808 | final_min = torch.min(s) 809 | final_max = torch.max(s) 810 | final_mean = torch.mean(s) 811 | 812 | print(f"✅ GPU处理后统计 - 最小值: {final_min:.4f}, 最大值: {final_max:.4f}, 平均值: {final_mean:.4f}") 813 | print(f"🔧 GPU增强后处理完成，输出设备: {s.device}") 814 | 815 | return s 816 | 817 | def _smart_memory_management(self, result, upscale_model, device): 818 | """智能显存管理：根据显存情况决定输出设备""" 819 | print("🔍 开始智能显存管理检查...") 820 | print(f"🔍 设备跟踪 - _smart_memory_management入口: 输入设备={result[0].device if result else 'None'}") 821 | 822 | if result is None or len(result) == 0: 823 | print("❓ 结果为空，跳过显存管理") 824 | return result 825 | 826 | output_tensor = result[0] 827 | print(f"📊 输出张量设备: {output_tensor.device}, 形状: {output_tensor.shape}") 828 | 829 | if output_tensor.device.type != 'cuda': 830 | print(f"📋 输出张量已在 {output_tensor.device}，跳过显存管理") 831 | return result 832 | 833 | try: 834 | # 计算输出张量的显存需求 835 | output_memory = output_tensor.nelement() * output_tensor.element_size() 836 | print(f"📊 输出张量显存需求: {output_memory/1024**3:.2f}GB") 837 | 838 | # 获取当前GPU显存状态 839 | if hasattr(torch.cuda, 'memory_reserved'): 840 | reserved = torch.cuda.memory_reserved(device) 841 | allocated = torch.cuda.memory_allocated(device) 842 | 843 | # 获取总显存和可用显存 844 | if hasattr(torch.cuda, 'get_device_properties'): 845 | total_memory = torch.cuda.get_device_properties(device).total_memory 846 | # 计算真正的可用显存：总显存 - 已分配显存 847 | actual_available_memory = total_memory - allocated 848 | 849 | # 安全余量：保留2GB的显存用于后续操作 850 | safety_margin = 2 * 1024**3 # 2GB 851 | available_memory = actual_available_memory - safety_margin 852 | 853 | print(f"💾 显存状态 - 总显存: {total_memory/1024**3:.2f}GB, 已分配: {allocated/1024**3:.2f}GB") 854 | print(f"💾 可用显存计算 - 实际可用: {actual_available_memory/1024**3:.2f}GB, 安全余量后: {available_memory/1024**3:.2f}GB") 855 | print(f"📊 输出张量需求: {output_memory/1024**3:.2f}GB") 856 | 857 | # 如果可用显存足够，直接保留在GPU上 858 | if available_memory >= output_memory: 859 | print("🚀 显存充足，结果保留在GPU直接导出") 860 | return result 861 | else: 862 | print("💾 显存不足，结果移动到CPU导出") 863 | # 异步移动到CPU，减少阻塞时间 864 | with torch.cuda.stream(torch.cuda.Stream(device)): 865 | cpu_tensor = output_tensor.cpu() 866 | print("✅ 结果已移动到CPU") 867 | return (cpu_tensor,) 868 | else: 869 | # 如果没有获取总显存功能，使用旧的逻辑 870 | free_memory = reserved - allocated 871 | safety_margin = reserved * 0.2 872 | available_memory = free_memory - safety_margin 873 | 874 | print(f"💾 显存状态 (旧方法) - 已分配: {allocated/1024**3:.2f}GB, 保留: {reserved/1024**3:.2f}GB, 可用: {available_memory/1024**3:.2f}GB") 875 | 876 | if available_memory >= output_memory: 877 | print("🚀 显存充足，结果保留在GPU直接导出") 878 | return result 879 | else: 880 | print("💾 显存不足，结果移动到CPU导出") 881 | with torch.cuda.stream(torch.cuda.Stream(device)): 882 | cpu_tensor = output_tensor.cpu() 883 | print("✅ 结果已移动到CPU") 884 | return (cpu_tensor,) 885 | else: 886 | # 如果没有显存查询功能，保守策略：移动到CPU 887 | print("💾 无法获取显存信息，结果移动到CPU导出") 888 | with torch.cuda.stream(torch.cuda.Stream(device)): 889 | cpu_tensor = output_tensor.cpu() 890 | return (cpu_tensor,) 891 | 892 | except Exception as e: 893 | print(f"⚠️ 显存管理异常，使用保守策略: {e}") 894 | # 异常情况下使用保守策略 895 | with torch.cuda.stream(torch.cuda.Stream(device)): 896 | cpu_tensor = output_tensor.cpu() 897 | return (cpu_tensor,) 898 | 899 | @classmethod 900 | def IS_CHANGED(s, **kwargs): 901 | return float("NaN") 902 | 903 | 904 | # 节点映射 905 | NODE_CLASS_MAPPINGS = { 906 | "UpscaleModelLoader": UpscaleModelLoader, 907 | "ImageUpscaleWithModelCUDAspeedFixed": ImageUpscaleWithModelCUDAspeedFixed 908 | } 909 | 910 | NODE_DISPLAY_NAME_MAPPINGS = { 911 | "ImageUpscaleWithModelCUDAspeedFixed": "🚀 Upscale Image CUDAspeed", 912 | } --------------------------------------------------------------------------------