├── requirements.txt ├── .gitignore ├── flashsr_min.py ├── LICENSE ├── __init__.py ├── Example │   └── Audio Super Resolution.json ├── egregora_fat_llama_cpu.py ├── README.md ├── install.py ├── egregora_fat_llama_gpu.py ├── egregora_audio_super_resolution.py ├── egregora_audio_eval_pack.py ├── egregora_null_test_suite.py └── egregora_audio_enhance_extras.py /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core 2 | numpy>=1.26 3 | scipy>=1.11 4 | soundfile>=0.12 5 | tqdm>=4.66 6 | requests>=2.31 7 | huggingface_hub>=0.24 8 | 9 | # Models / processors used by your nodes 10 | fat-llama>=1.1.0 11 | fat-llama-fftw>=1.0.4.4 12 | pyrnnoise>=0.3.8 13 | nara-wpe>=0.0.9 # import name: nara_wpe 14 | deepfilternet>=0.5.6 # import name: df 15 | descript-audio-codec>=1.0.0 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python cache 2 | __pycache__/ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | *.pdb 7 | *.egg-info/ 8 | *.egg 9 | *.log 10 | 11 | # Virtual environments 12 | venv/ 13 | .env/ 14 | .venv/ 15 | env/ 16 | 17 | # OS-specific 18 | .DS_Store 19 | Thumbs.db 20 | 21 | # Editor configs 22 | .vscode/ 23 | .idea/ 24 | 25 | # ComfyUI outputs 26 | output/ 27 | outputs/ 28 | *.png 29 | *.jpg 30 | *.jpeg 31 | *.wav 32 | *.flac 33 | 34 | # Model + dependency folders (downloaded automatically via install.py) 35 | models/ 36 | deps/ 37 | checkpoints/ 38 | 39 | # Hugging Face cache 40 | ~/.cache/huggingface/ 41 | hf_cache/ 42 | 43 | # Temporary files 44 | *.tmp 45 | *.bak 46 | *.swp 47 | *.swo 48 | -------------------------------------------------------------------------------- /flashsr_min.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, torch, numpy as np, soundfile as sf 3 | from pathlib import Path 4 | 5 | def main(): 6 | ap = argparse.ArgumentParser() 7 | ap.add_argument("--ckpt-dir", required=True) 8 | ap.add_argument("--in", dest="inp", required=True) 9 | ap.add_argument("--out", required=True) 10 | ap.add_argument("--target-sr", type=int, default=48000) 11 | ap.add_argument("--device", default="auto") 12 | args = ap.parse_args() 13 | 14 | dev = "cuda" if args.device in ("auto","cuda") and torch.cuda.is_available() else "cpu" 15 | wav, sr = sf.read(args.inp, dtype="float32", always_2d=False) 16 | if wav.ndim == 2: 17 | if wav.shape[0] > wav.shape[1]: # soundfile returns [S,C]; flip to [C,S] 18 | wav = wav.T 19 | wav = wav.mean(axis=0) # downmix channels -> mono [S] 20 | x = torch.from_numpy(wav).float().to(dev) 21 | out = x.detach().cpu().numpy() 22 | # Placeholder passthrough: FlashSR inference itself lives in the node (--ckpt-dir 23 | # is accepted for CLI compatibility). Resample so the file actually plays at 24 | # target_sr instead of just being relabeled: 25 | if sr != args.target_sr: 26 | n_out = int(round(out.shape[0] * args.target_sr / sr)) 27 | t_in = np.linspace(0.0, 1.0, out.shape[0], endpoint=False) 28 | t_out = np.linspace(0.0, 1.0, n_out, endpoint=False) 29 | out = np.interp(t_out, t_in, out).astype(np.float32) 30 | sf.write(args.out, out, args.target_sr) 31 | print("OK") 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 mrgattax 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Core nodes you already had 2 | from .egregora_audio_super_resolution import EgregoraAudioSuperResolution 3 | from .egregora_fat_llama_gpu import EgregoraFatLlamaGPU 4 | from .egregora_fat_llama_cpu import EgregoraFatLlamaCPU 5 | 6 | # Import and merge the new modules’ mappings 7 | # (each of these files defines NODE_CLASS_MAPPINGS / NODE_DISPLAY_NAME_MAPPINGS) 8 | try: 9 | from .egregora_audio_enhance_extras import ( 10 | NODE_CLASS_MAPPINGS as ENHANCE_MAP, 11 | NODE_DISPLAY_NAME_MAPPINGS as ENHANCE_NAMES, 12 | ) 13 | except Exception: 14 | ENHANCE_MAP, ENHANCE_NAMES = {}, {} 15 | 16 | try: 17 | from .egregora_audio_eval_pack import ( 18 | NODE_CLASS_MAPPINGS as EVAL_MAP, 19 | NODE_DISPLAY_NAME_MAPPINGS as EVAL_NAMES, 20 | ) 21 | except Exception: 22 | EVAL_MAP, EVAL_NAMES = {}, {} 23 | 24 | try: 25 | from .egregora_null_test_suite import ( 26 | NODE_CLASS_MAPPINGS as NULL_MAP, 27 | NODE_DISPLAY_NAME_MAPPINGS as NULL_NAMES, 28 | ) 29 | except Exception: 30 | NULL_MAP, NULL_NAMES = {}, {} 31 | 32 | # Base mappings (FlashSR + Fat Llama) just like before 33 | NODE_CLASS_MAPPINGS = { 34 | "EgregoraAudioUpscaler": EgregoraAudioSuperResolution, # FlashSR 35 | "EgregoraFatLlamaGPU": EgregoraFatLlamaGPU, # GPU (CuPy) 36 | "EgregoraFatLlamaCPU": EgregoraFatLlamaCPU, # CPU (FFTW) 37 | } 38 | 39 | NODE_DISPLAY_NAME_MAPPINGS = { 40 | "EgregoraAudioUpscaler": "🎧 Audio Super Resolution (FlashSR)", 41 | "EgregoraFatLlamaGPU": "🎛️ Spectral Enhance (Fat Llama — GPU)", 42 | "EgregoraFatLlamaCPU": "🎛️ Spectral Enhance (Fat Llama — CPU/FFTW)", 43 | } 44 | 45 | # Merge in the rest (Enhance Extras + Eval Pack + Null Test Suite) 46 | NODE_CLASS_MAPPINGS.update(ENHANCE_MAP) 47 | NODE_CLASS_MAPPINGS.update(EVAL_MAP) 48 | NODE_CLASS_MAPPINGS.update(NULL_MAP) 49 | 50 | NODE_DISPLAY_NAME_MAPPINGS.update(ENHANCE_NAMES) 51 | NODE_DISPLAY_NAME_MAPPINGS.update(EVAL_NAMES) 52 | NODE_DISPLAY_NAME_MAPPINGS.update(NULL_NAMES) 53 | -------------------------------------------------------------------------------- /Example/Audio Super Resolution.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "00000000-0000-0000-0000-000000000000", 3 | "revision": 0, 4 | "last_node_id": 33, 5 | "last_link_id": 70, 6 | "nodes": [ 7 | { 8 | "id": 27, 9 | "type": "EgregoraAudioUpscaler", 10 | "pos": [ 11 | 1610.0098876953125, 12 | 226.31141662597656 13 | ], 14 | "size": [ 15 | 360.177001953125, 16 | 226 17 | ], 18 | "flags": {}, 19 | "order": 1, 20 | "mode": 0, 21 | "inputs": [ 22 | { 23 | "name": "AUDIO", 24 | "shape": 7, 25 | "type": "AUDIO", 26 | "link": 66 27 | } 28 | ], 29 | "outputs": [ 30 | { 31 | "name": "AUDIO", 32 | "type": "AUDIO", 33 | "links": [ 34 | 58, 35 | 67 36 | ] 37 | } 38 | ], 39 | "properties": { 40 | 
"Node name for S&R": "EgregoraAudioUpscaler" 41 | }, 42 | "widgets_values": [ 43 | 5.12, 44 | 1, 45 | "cuda", 46 | "48000", 47 | "wav", 48 | "", 49 | "", 50 | false 51 | ] 52 | }, 53 | { 54 | "id": 26, 55 | "type": "EgregoraFatLlamaGPU", 56 | "pos": [ 57 | 1610.525634765625, 58 | 499.9607238769531 59 | ], 60 | "size": [ 61 | 361.1080017089844, 62 | 202 63 | ], 64 | "flags": {}, 65 | "order": 3, 66 | "mode": 0, 67 | "inputs": [ 68 | { 69 | "name": "AUDIO", 70 | "shape": 7, 71 | "type": "AUDIO", 72 | "link": 67 73 | } 74 | ], 75 | "outputs": [ 76 | { 77 | "name": "AUDIO", 78 | "type": "AUDIO", 79 | "links": [ 80 | 50 81 | ] 82 | } 83 | ], 84 | "properties": { 85 | "Node name for S&R": "EgregoraFatLlamaGPU" 86 | }, 87 | "widgets_values": [ 88 | "wav", 89 | 350, 90 | 0.4, 91 | 1536, 92 | true, 93 | "", 94 | "" 95 | ] 96 | }, 97 | { 98 | "id": 25, 99 | "type": "PreviewAudio", 100 | "pos": [ 101 | 1991.269775390625, 102 | 228.5155792236328 103 | ], 104 | "size": [ 105 | 333.01190185546875, 106 | 88 107 | ], 108 | "flags": {}, 109 | "order": 2, 110 | "mode": 0, 111 | "inputs": [ 112 | { 113 | "name": "audio", 114 | "type": "AUDIO", 115 | "link": 58 116 | } 117 | ], 118 | "outputs": [], 119 | "properties": { 120 | "cnr_id": "comfy-core", 121 | "ver": "0.3.57", 122 | "Node name for S&R": "PreviewAudio" 123 | }, 124 | "widgets_values": [] 125 | }, 126 | { 127 | "id": 7, 128 | "type": "PreviewAudio", 129 | "pos": [ 130 | 1991.6297607421875, 131 | 500.707275390625 132 | ], 133 | "size": [ 134 | 336.7494812011719, 135 | 88.74748229980469 136 | ], 137 | "flags": {}, 138 | "order": 4, 139 | "mode": 0, 140 | "inputs": [ 141 | { 142 | "name": "audio", 143 | "type": "AUDIO", 144 | "link": 50 145 | } 146 | ], 147 | "outputs": [], 148 | "properties": { 149 | "cnr_id": "comfy-core", 150 | "ver": "0.3.57", 151 | "Node name for S&R": "PreviewAudio" 152 | }, 153 | "widgets_values": [] 154 | }, 155 | { 156 | "id": 31, 157 | "type": "LoadAudio", 158 | "pos": [ 159 | 1213.31591796875, 160 | 389.5353698730469 161 | ], 162 | "size": [ 163 | 358.77178955078125, 164 | 154.08946228027344 165 | ], 166 | "flags": {}, 167 | "order": 0, 168 | "mode": 0, 169 | "inputs": [], 170 | "outputs": [ 171 | { 172 | "name": "AUDIO", 173 | "type": "AUDIO", 174 | "links": [ 175 | 66 176 | ] 177 | } 178 | ], 179 | "properties": { 180 | "cnr_id": "comfy-core", 181 | "ver": "0.3.57", 182 | "Node name for S&R": "LoadAudio" 183 | }, 184 | "widgets_values": [ 185 | "Untitled4.wav", 186 | null, 187 | null 188 | ] 189 | } 190 | ], 191 | "links": [ 192 | [ 193 | 50, 194 | 26, 195 | 0, 196 | 7, 197 | 0, 198 | "AUDIO" 199 | ], 200 | [ 201 | 58, 202 | 27, 203 | 0, 204 | 25, 205 | 0, 206 | "AUDIO" 207 | ], 208 | [ 209 | 66, 210 | 31, 211 | 0, 212 | 27, 213 | 0, 214 | "AUDIO" 215 | ], 216 | [ 217 | 67, 218 | 27, 219 | 0, 220 | 26, 221 | 0, 222 | "AUDIO" 223 | ] 224 | ], 225 | "groups": [], 226 | "config": {}, 227 | "extra": { 228 | "ds": { 229 | "scale": 1.3513057093104381, 230 | "offset": [ 231 | -1138.5131324372892, 232 | -97.89917011041159 233 | ] 234 | }, 235 | "frontendVersion": "1.25.11" 236 | }, 237 | "version": 0.4 238 | } 239 | -------------------------------------------------------------------------------- /egregora_fat_llama_cpu.py: -------------------------------------------------------------------------------- 1 | import time, tempfile 2 | from pathlib import Path 3 | from typing import Tuple 4 | import numpy as np 5 | import soundfile as sf 6 | import torch 7 | 8 | RETURN_TYPES = ("AUDIO",) 9 | FUNCTION = "run" 10 | CATEGORY = 
"Egregora/Audio" 11 | 12 | def _to_cs(x: np.ndarray) -> np.ndarray: 13 | a = np.asarray(x, dtype=np.float32) 14 | if a.ndim == 1: 15 | a = a[None, :] 16 | elif a.ndim == 2: 17 | h, w = a.shape 18 | if w <= 8 and h > w: # [S,C] -> [C,S] 19 | a = a.T 20 | else: 21 | a = a.reshape(-1)[None, :] 22 | m = np.max(np.abs(a)) if a.size else 0.0 23 | if m > 1.0: 24 | a = a / (m + 1e-8) 25 | return a.astype(np.float32) 26 | 27 | def _save_temp_wav(cs: np.ndarray, sr: int) -> Path: 28 | p = Path(tempfile.gettempdir()) / f"eg_in_{int(time.time()*1000)}.wav" 29 | sf.write(str(p), cs.T, int(sr)) 30 | return p 31 | 32 | def _normalize_audio_input(AUDIO=None, audio_path: str="", audio_url: str="") -> Tuple[np.ndarray, int, Path]: 33 | if isinstance(AUDIO, dict) and "waveform" in AUDIO and "sample_rate" in AUDIO: 34 | wf: torch.Tensor = AUDIO["waveform"] 35 | sr = int(AUDIO["sample_rate"]) 36 | if wf.dim() == 3: 37 | wf = wf[0] 38 | if wf.dim() != 2: 39 | raise RuntimeError(f"Unexpected AUDIO tensor shape: {tuple(wf.shape)} (want [C,T])") 40 | cs = wf.detach().cpu().float().numpy() 41 | return cs, sr, _save_temp_wav(cs, sr) 42 | 43 | if isinstance(AUDIO, (list, tuple)) and len(AUDIO) == 2: 44 | arr, sr = AUDIO 45 | cs = _to_cs(np.asarray(arr)) 46 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 47 | 48 | if audio_path: 49 | p = Path(audio_path) 50 | if not p.exists(): 51 | raise RuntimeError(f"audio_path not found: {audio_path}") 52 | y, sr = sf.read(str(p), dtype="float32", always_2d=False) 53 | cs = _to_cs(y) 54 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 55 | 56 | if audio_url: 57 | import requests 58 | r = requests.get(audio_url, timeout=60); r.raise_for_status() 59 | p = Path(tempfile.gettempdir()) / f"eg_url_{int(time.time()*1000)}.wav" 60 | p.write_bytes(r.content) 61 | y, sr = sf.read(str(p), dtype="float32", always_2d=False) 62 | cs = _to_cs(y) 63 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 64 | 65 | raise RuntimeError("No AUDIO provided.") 66 | 67 | def _ensure_cpu_pkg(): 68 | try: 69 | import fat_llama_fftw # noqa: F401 70 | except Exception as e: 71 | raise RuntimeError( 72 | "Missing dependency: fat-llama-fftw. " 73 | "Install into ComfyUI's Python: `python -m pip install fat-llama-fftw`." 74 | ) from e 75 | 76 | def _fat_llama_fftw_upscale( 77 | in_wav: Path, 78 | out_path: Path, 79 | target_format: str, 80 | max_iterations: int, 81 | threshold_value: float, 82 | target_bitrate_kbps: int, 83 | ): 84 | # Public API (CPU): from fat_llama_fftw.audio_fattener.feed import upscale 85 | # Example call & params documented in README/example.py. :contentReference[oaicite:1]{index=1} 86 | from fat_llama_fftw.audio_fattener.feed import upscale # type: ignore 87 | upscale( 88 | input_file_path=str(in_wav), 89 | output_file_path=str(out_path), 90 | source_format="wav", 91 | target_format=target_format, 92 | max_iterations=int(max_iterations), 93 | threshold_value=float(threshold_value), 94 | target_bitrate_kbps=int(target_bitrate_kbps), 95 | ) 96 | 97 | class EgregoraFatLlamaCPU: 98 | """ 99 | Spectral Enhance (Fat Llama — CPU/FFTW) 100 | — Pure CPU path using pyFFTW backend; no CUDA/CuPy required. 101 | — If you feed non-WAV inputs via path/URL, ffmpeg on PATH may be required by the package. 
102 | """ 103 | @classmethod 104 | def INPUT_TYPES(cls): 105 | return { 106 | "required": { 107 | "target_format": (["wav", "flac"],), 108 | "max_iterations": ("INT", {"default": 800, "min": 1, "max": 10000}), 109 | "threshold_value": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 1.0, "step": 0.01}), 110 | "target_bitrate_kbps": ("INT", {"default": 1411, "min": 64, "max": 5000}), 111 | }, 112 | "optional": { 113 | "AUDIO": ("AUDIO",), 114 | "audio_path": ("STRING", {"default": ""}), 115 | "audio_url": ("STRING", {"default": ""}), 116 | }, 117 | } 118 | 119 | RETURN_TYPES = RETURN_TYPES 120 | FUNCTION = FUNCTION 121 | CATEGORY = CATEGORY 122 | OUTPUT_NODE = False 123 | 124 | def run( 125 | self, 126 | target_format, 127 | max_iterations, 128 | threshold_value, 129 | target_bitrate_kbps, 130 | AUDIO=None, 131 | audio_path="", 132 | audio_url="", 133 | ): 134 | _ensure_cpu_pkg() 135 | 136 | cs, in_sr, in_wav = _normalize_audio_input(AUDIO, audio_path, audio_url) 137 | suffix = ".wav" if target_format == "wav" else ".flac" 138 | out_path = Path(tempfile.gettempdir()) / f"eg_fatllama_cpu_{int(time.time()*1000)}{suffix}" 139 | 140 | _fat_llama_fftw_upscale( 141 | in_wav=in_wav, 142 | out_path=out_path, 143 | target_format=target_format, 144 | max_iterations=max_iterations, 145 | threshold_value=threshold_value, 146 | target_bitrate_kbps=target_bitrate_kbps, 147 | ) 148 | 149 | y, sr = sf.read(str(out_path), dtype="float32", always_2d=False) 150 | cs_out = _to_cs(y) 151 | wf = torch.from_numpy(cs_out).unsqueeze(0).contiguous() # [1,C,T] 152 | return ({"waveform": wf, "sample_rate": int(sr)},) 153 | 154 | NODE_CLASS_MAPPINGS = {"EgregoraFatLlamaCPU": EgregoraFatLlamaCPU} 155 | NODE_DISPLAY_NAME_MAPPINGS = {"EgregoraFatLlamaCPU": "🎛️ Spectral Enhance (Fat Llama — CPU/FFTW)"} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🎧 ComfyUI — Egregora Audio Super‑Resolution 2 | 3 | Bring music up to studio‑grade sample rates right inside ComfyUI. 4 | 5 | This repo ships **three production‑oriented upscaling/enhancement nodes** and bundles a set of **integrated utility toolsets** (enhance, evaluation, null‑testing) so you can denoise → upscale → measure without wiring a huge graph. 
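All nodes exchange ComfyUI's standard **AUDIO** type — a dict carrying a float32 waveform tensor shaped `[B, C, T]` plus a sample rate. A minimal sketch of the convention (the variable names here are illustrative):

```python
import torch

# ComfyUI AUDIO convention used throughout this pack:
# {"waveform": torch.Tensor[B, C, T] in [-1, 1], "sample_rate": int}
audio = {"waveform": torch.zeros(1, 2, 48000), "sample_rate": 48000}  # 1 s silent stereo @ 48 kHz

wf, sr = audio["waveform"][0], audio["sample_rate"]  # -> [C, T] for processing
```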
6 | 7 | --- 8 | 9 | ## ✨ What’s inside 10 | 11 | ``` 12 | custom_nodes/ 13 | ComfyUI-Egregora-Audio-Super-Resolution/ 14 | __init__.py 15 | egregora_audio_super_resolution.py # FlashSR node 16 | egregora_fat_llama_gpu.py # Fat Llama (CUDA/CuPy) 17 | egregora_fat_llama_cpu.py # Fat Llama (CPU/FFTW) 18 | egregora_audio_enhance_extras.py # RNNoise / DeepFilterNet / WPE / DAC 19 | egregora_audio_eval_pack.py # ABX, Loudness/Match, Metrics, HQ Resample 20 | egregora_null_test_suite.py # Align, Gain‑Match, Null, Plots 21 | flashsr_min.py # Light wrapper for FlashSR 22 | install.py # Repo + weights/deps bootstrapper 23 | requirements.txt 24 | deps/ 25 | FlashSR_Inference/ # pulled automatically on install 26 | ``` 27 | 28 | ### Core nodes 29 | 30 | * **Audio Super Resolution (FlashSR)** — one‑step diffusion upsampler (music‑friendly) ⚡ 31 | * **Spectral Enhance (Fat Llama — GPU)** — CUDA/CuPy accelerated iterative spectral enhancer 🐍🧪 32 | * **Spectral Enhance (Fat Llama — CPU/FFTW)** — portable CPU fallback using pyFFTW 🧠 33 | 34 | ### Integrated utility toolsets (used inside the SR nodes) 35 | 36 | * **Enhance — Extras** 37 | 38 | * RNNoise Denoise (48 kHz, adaptive mix, strength, post‑gain) 39 | * DeepFilterNet 2/3 Denoise (48 kHz native) 40 | * WPE Dereverb (nara‑wpe) 41 | * DAC Encode/Decode (Descript Audio Codec) 42 | * **Eval Pack** 43 | 44 | * ABX prepare/judge clips 45 | * Loudness meter (BS.1770), Gain‑Match (LUFS/RMS) 46 | * Metrics: SI‑SDR, Log‑Spectral Distance (LSD) 47 | * High‑quality resampler (SciPy/torch fallbacks) 48 | * **Null Test Suite** 49 | 50 | * Align (XCorr GCC‑PHAT), Gain‑Match, Null, difference plots 51 | 52 | > These helpers are wired so you can ABX / null‑test right from the SR node panel. 53 | 54 | --- 55 | 56 | ## 🧩 Install (ComfyUI portable or venv) 57 | 58 | 1. **Copy the folder** to `ComfyUI/custom_nodes/` and restart ComfyUI once. 59 | 60 | 2. **Install Python deps** using ComfyUI’s Python: 61 | 62 | ```bash 63 | # From ComfyUI root 64 | python -m pip install -r custom_nodes/ComfyUI-Egregora-Audio-Super-Resolution/requirements.txt 65 | python custom_nodes/ComfyUI-Egregora-Audio-Super-Resolution/install.py 66 | ``` 67 | 68 | * We **do not** install `torch/torchaudio` here to avoid breaking ComfyUI’s CUDA build. 69 | * First run will: 70 | 71 | * clone `deps/FlashSR_Inference/` 72 | * check for FlashSR weights 73 | * warm up DeepFilterNet / DAC / RNNoise caches for smoother first use 74 | 75 | 3. **FlashSR repo & weights** 76 | 77 | * The node pulls the upstream inference code automatically into `deps/FlashSR_Inference/`. 78 | * This node does not include FlashSR code or weights. The commonly referenced FlashSR_Inference repo currently lacks a license. Unless you have explicit permission from the rights holder(s), do not use FlashSR code/weights for commercial purposes. Proceed at your own risk. 79 | * Place weights in `ComfyUI/models/audio/flashsr/` with **exact** filenames: 80 | 81 | * `student_ldm.pth`, `sr_vocoder.pth`, `vae.pth` 82 | * Or set an env var to auto‑download from your HF repo: 83 | 84 | ```bash 85 | # point to a HF repo containing those three files 86 | # Windows (cmd) 87 | set EGREGORA_FLASHSR_HF_REPO=yourname/flashsr-weights 88 | # macOS/Linux 89 | export EGREGORA_FLASHSR_HF_REPO=yourname/flashsr-weights 90 | ``` 91 | 92 | 4. 
**GPU extras (for the Fat‑Llama GPU node)** 93 | 94 | Install a CuPy wheel matching your CUDA (example for CUDA 12): 95 | 96 | ```bash 97 | python -m pip install "cupy-cuda12x>=13.0" 98 | ``` 99 | 100 | If Windows shows NVRTC / `vector_types.h` errors, install the CUDA runtime DLL wheels: 101 | 102 | ```bash 103 | python -m pip install -U nvidia-cuda-runtime-cu12 nvidia-cuda-nvrtc-cu12 \ 104 | nvidia-cublas-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 \ 105 | nvidia-cusolver-cu12 nvidia-cusparse-cu12 106 | ``` 107 | 108 | 5. **FFmpeg** 109 | 110 | Ensure FFmpeg is on your PATH for reading/encoding audio. 111 | 112 | --- 113 | 114 | ## 📦 Requirements 115 | 116 | `requirements.txt` keeps things lean: 117 | 118 | * Core: `soundfile`, `numpy`, `tqdm`, `requests`, `huggingface_hub` 119 | * SR/enhance: `fat-llama`, `fat-llama-fftw`, `pyrnnoise`, `deepfilternet` (import as `df`), `nara-wpe` (import as `nara_wpe`), `descript-audio-codec` 120 | * `scipy` for the HQ resampler/metrics (included in `requirements.txt`; SciPy‑free fallbacks exist in code) 121 | 122 | > Booleans in node UIs use the `BOOLEAN` datatype in `INPUT_TYPES` (proper toggle). 123 | 124 | --- 125 | 126 | ## 🛠️ Nodes & key settings 127 | 128 | ### 1) **Audio Super Resolution (FlashSR)** 129 | 130 | * Chunks → overlap‑add → stitches at 48 kHz, then resamples to the chosen target. 131 | * **Inputs**: `audio` (AUDIO), `lowpass_input` (BOOLEAN — gentle LPF on the input), `output_sr` (48000 / 44100 / 96000). Chunking is fixed internally: 5.12 s windows with 0.50 s Hann overlap‑add. 132 | * **Outputs**: **AUDIO**. 133 | 134 | ### 2) **Spectral Enhance (Fat Llama — GPU/CPU)** 135 | 136 | * Iterative soft‑thresholding with spectral post. 137 | * **Inputs**: `max_iterations`, `threshold_value`, `target_bitrate_kbps`, `toggle_autoscale` (GPU node only), `target_format`, `audio_path` / `audio_url`. 138 | * **Outputs**: **AUDIO** (the processed file is written to a temp path and read back). 139 | 140 | ### Utility toolsets (used inside SR nodes) 141 | 142 | * **Denoise/Dereverb**: RNNoise, DeepFilterNet 2/3, WPE 143 | * **Codec**: DAC encode/decode 144 | * **Eval**: ABX clips + judge, BS.1770 loudness, gain‑match, SI‑SDR, LSD 145 | * **Null**: Align → match → null + difference plots 146 | 147 | --- 148 | 149 | ## 🎚️ Quality tips (music) 150 | 151 | * **FlashSR first, Llama second**: upscale to 48k, then a *light* Llama pass (`iterations≈200`, `threshold≈0.5`) if you want a touch of sparkle. 152 | * **Overlap**: chunk overlap is fixed at 0.50 s (Hann WOLA), so seams are stitched automatically — no user knob needed. 153 | * **Don’t over‑iterate**: very high iterations/threshold can sound brittle. 154 | 155 | --- 156 | 157 | ## 🔍 Licenses (upstream projects) 158 | 159 | * **Fat‑Llama / fat‑llama‑fftw**: BSD‑3‑Clause (see PyPI). 160 | * **FlashSR_Inference**: check upstream repo for license status. 161 | * This ComfyUI integration is licensed as per this repository’s LICENSE. 162 | 163 | --- 164 | 165 | ## 🧪 Troubleshooting 166 | 167 | * **FlashSR import error**: delete `deps/FlashSR_Inference/` and restart to re‑bootstrap. 168 | * **Missing FlashSR weights**: place the 3 files in `models/audio/flashsr/` or set `EGREGORA_FLASHSR_HF_REPO`. 169 | * **CUDA/CuPy NVRTC errors (Windows)**: install the `nvidia-*-cu12` runtime wheels listed above and ensure your CuPy wheel matches CUDA. 170 | * **FFmpeg not found**: install FFmpeg and ensure it’s on PATH. 171 | 172 | --- 173 | 174 | ## 🙌 Credits 175 | 176 | * FlashSR research & inference code by the original authors. 177 | * Fat Llama packages by RaAd (PyPI maintainer). 178 | * ComfyUI integration & node UX by Egregora. 179 | 180 | Happy upsampling! 
🎶 181 | 182 | --- 183 | 184 | ## 📜 Changelog 185 | 186 | * **v0.2.0** — Added Enhance/Eval/Null toolsets; new installer + warmups. 187 | * **v0.1.0** — Initial release: FlashSR SR node, Fat Llama GPU/CPU. 188 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import sys, subprocess, importlib, os, hashlib, zipfile 3 | from pathlib import Path 4 | import requests 5 | from huggingface_hub import hf_hub_download 6 | 7 | # ---------- Paths (kept exactly as you had) ---------- 8 | THIS = Path(__file__).resolve() 9 | PKG = THIS.parent 10 | COMFY_ROOT = (PKG.parent.parent if PKG.parent.name == "custom_nodes" else PKG.parent) 11 | DEPS = PKG / "deps" 12 | FLASH_REPO_DIR = DEPS / "FlashSR_Inference" 13 | WEIGHTS_DIR = COMFY_ROOT / "models" / "audio" / "flashsr" 14 | 15 | DEPS.mkdir(parents=True, exist_ok=True) 16 | WEIGHTS_DIR.mkdir(parents=True, exist_ok=True) 17 | 18 | # ---------- Small helpers ---------- 19 | def _download(url: str, dst: Path, sha256: str | None = None): 20 | r = requests.get(url, timeout=180) 21 | r.raise_for_status() 22 | data = r.content 23 | if sha256 and hashlib.sha256(data).hexdigest().lower() != sha256.lower(): 24 | raise RuntimeError(f"SHA256 mismatch for {url}") 25 | dst.write_bytes(data) 26 | 27 | def _pip_install(args: list[str]): 28 | print("[Egregora] pip", " ".join(args)) 29 | cmd = [sys.executable, "-m", "pip", "install", *args] 30 | try: 31 | subprocess.check_call(cmd) 32 | except subprocess.CalledProcessError as e: 33 | print("[Egregora] pip failed:", e) 34 | 35 | def _ensure(import_name: str, pip_name: str | None = None, extra_args: list[str] | None = None, try_no_deps: bool = False): 36 | """ 37 | Import a module, installing it if missing. When try_no_deps is True, 38 | we first attempt '--no-deps' to avoid pulling CPU torch into ComfyUI. 
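    Example (mirrors the calls in ensure_runtime_deps below):
        _ensure("nara_wpe", pip_name="nara-wpe")
        _ensure("df", pip_name="deepfilternet", try_no_deps=True)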
39 | """ 40 | try: 41 | importlib.import_module(import_name) 42 | return True 43 | except Exception: 44 | pass 45 | 46 | target = pip_name or import_name 47 | if try_no_deps: 48 | _pip_install(["--no-deps", target, *(extra_args or [])]) 49 | try: 50 | importlib.import_module(import_name) 51 | return True 52 | except Exception: 53 | print(f"[Egregora] '{target}' import still failing; retrying with full deps…") 54 | 55 | _pip_install([target, *(extra_args or [])]) 56 | try: 57 | importlib.import_module(import_name) 58 | return True 59 | except Exception as e: 60 | print(f"[Egregora] Could not import {import_name}: {e}") 61 | return False 62 | 63 | # ---------- Your existing FlashSR bootstrap ---------- 64 | def grab_repo_zip(): 65 | if FLASH_REPO_DIR.exists(): 66 | return 67 | print("[Egregora] Fetching FlashSR_Inference repository…") 68 | url = "https://github.com/jakeoneijk/FlashSR_Inference/archive/refs/heads/main.zip" 69 | zpath = DEPS / "FlashSR_Inference.zip" 70 | _download(url, zpath) 71 | with zipfile.ZipFile(zpath, "r") as zf: 72 | zf.extractall(DEPS) 73 | inner = next(p for p in DEPS.glob("FlashSR_Inference-*") if p.is_dir()) 74 | inner.rename(FLASH_REPO_DIR) 75 | zpath.unlink(missing_ok=True) 76 | print("[Egregora] FlashSR_Inference ready at:", FLASH_REPO_DIR) 77 | 78 | def try_fetch_weights(): 79 | # If you host the three weights on HF, set EGREGORA_FLASHSR_HF_REPO 80 | # (filenames must be: student_ldm.pth, sr_vocoder.pth, vae.pth) 81 | hf_repo = os.environ.get("EGREGORA_FLASHSR_HF_REPO", "") 82 | need = ["student_ldm.pth", "sr_vocoder.pth", "vae.pth"] 83 | if hf_repo: 84 | for fname in need: 85 | dst = WEIGHTS_DIR / fname 86 | if dst.exists(): 87 | continue 88 | try: 89 | print(f"[Egregora] Downloading {fname} from HF repo {hf_repo} …") 90 | hf_hub_download(repo_id=hf_repo, filename=fname, local_dir=WEIGHTS_DIR) 91 | except Exception as e: 92 | print(f"[Egregora] HF download failed for {fname}: {e}") 93 | 94 | missing = [n for n in need if not (WEIGHTS_DIR / n).exists()] 95 | if missing: 96 | print("\n[Egregora] FlashSR weights missing:", ", ".join(missing)) 97 | print("Place them here:", WEIGHTS_DIR) 98 | print("Filenames are exactly: student_ldm.pth, sr_vocoder.pth, vae.pth") 99 | print("See repo for context: https://github.com/jakeoneijk/FlashSR_Inference") 100 | else: 101 | print("[Egregora] FlashSR weights present:", WEIGHTS_DIR) 102 | 103 | # ---------- New: model/runtime deps + warmups ---------- 104 | def ensure_runtime_deps(): 105 | # keep your requirements light; install optional bits here if missing 106 | _ensure("numpy") 107 | _ensure("soundfile") 108 | _ensure("tqdm") 109 | _ensure("requests") 110 | _ensure("huggingface_hub") 111 | 112 | # Models / processors used by your integrated nodes 113 | _ensure("pyrnnoise") # RNNoise bindings 114 | _ensure("nara_wpe", pip_name="nara-wpe") # dereverb 115 | _ensure("dac", pip_name="descript-audio-codec") # Descript Audio Codec 116 | 117 | # DeepFilterNet (df). Try --no-deps first to avoid pulling a CPU torch. 118 | # ComfyUI already has torch/torchaudio. 
119 | _ensure("df", pip_name="deepfilternet", try_no_deps=True) 120 | 121 | # Fat Llama (already in requirements, but double-check) 122 | _ensure("fat_llama", pip_name="fat-llama") 123 | _ensure("fat_llama_fftw", pip_name="fat-llama-fftw") 124 | 125 | # Optional: SciPy for HQ resampler/metrics in the Eval Pack 126 | _ensure("scipy") 127 | 128 | def warmup_deepfilternet(): 129 | try: 130 | import torch 131 | from df.enhance import init_df, enhance # type: ignore 132 | # This triggers model settings + checkpoint discovery and caches them 133 | model, df_state, _ = init_df(); sr = df_state.sr() # init_df returns (model, df_state, suffix) 134 | x = torch.zeros(1, int(sr * 0.1)) # 100 ms of silence 135 | with torch.no_grad(): 136 | _y = enhance(model, df_state, x) # enhance returns the enhanced tensor 137 | print("[Egregora] DeepFilterNet warmup OK") 138 | except Exception as e: 139 | print("[Egregora] DeepFilterNet warmup skipped:", e) 140 | 141 | def warmup_dac(): 142 | try: 143 | import dac 144 | # Downloads default weights to local cache (~first use) 145 | _ = dac.utils.download(model_type="44khz") 146 | print("[Egregora] DAC warmup OK") 147 | except Exception as e: 148 | print("[Egregora] DAC warmup skipped:", e) 149 | 150 | def warmup_rnnoise(): 151 | # Nothing to download, but a tiny call verifies the backend 152 | try: 153 | import numpy as np 154 | from pyrnnoise import RNNoise 155 | rn = RNNoise(sample_rate=48000) 156 | if getattr(rn, "channels", None) in (None, 0): 157 | setattr(rn, "channels", 1) 158 | test = np.zeros((1, 4800), dtype=np.int16) # 100 ms 159 | _ = list(rn.denoise_chunk(test)) # iterate a few frames 160 | print("[Egregora] RNNoise warmup OK") 161 | except Exception as e: 162 | print("[Egregora] RNNoise warmup skipped:", e) 163 | 164 | # ---------- Entry ---------- 165 | if __name__ == "__main__": 166 | ensure_runtime_deps() 167 | # Keep your original FlashSR bootstrap 168 | grab_repo_zip() 169 | try_fetch_weights() 170 | 171 | # Friendly first-run warmups 172 | warmup_deepfilternet() 173 | warmup_dac() 174 | warmup_rnnoise() 175 | 176 | print("[Egregora] Install complete.") 177 | -------------------------------------------------------------------------------- /egregora_fat_llama_gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import tempfile 5 | import platform 6 | from pathlib import Path 7 | from typing import Tuple 8 | import numpy as np 9 | import soundfile as sf 10 | import torch 11 | 12 | RETURN_TYPES = ("AUDIO",) 13 | FUNCTION = "run" 14 | CATEGORY = "Egregora/Audio" 15 | 16 | # ---------------- I/O helpers ---------------- 17 | 18 | def _to_cs(x: np.ndarray) -> np.ndarray: 19 | """Return channels-first float32 [C,S]; accepts [S], [S,C], [C,S].""" 20 | a = np.asarray(x, dtype=np.float32) 21 | if a.ndim == 1: 22 | a = a[None, :] 23 | elif a.ndim == 2: 24 | h, w = a.shape 25 | if w <= 8 and h > w: # soundfile often returns [S,C] 26 | a = a.T 27 | else: 28 | a = a.reshape(-1)[None, :] 29 | m = float(np.max(np.abs(a))) if a.size else 0.0 30 | if m > 1.0: # safety clamp if upstream sent > 1.0 31 | a = a / (m + 1e-8) 32 | return a.astype(np.float32) 33 | 34 | def _save_temp_wav(cs: np.ndarray, sr: int) -> Path: 35 | p = Path(tempfile.gettempdir()) / f"eg_in_{int(time.time()*1000)}.wav" 36 | sf.write(str(p), cs.T, int(sr)) 37 | return p 38 | 39 | def _normalize_audio_input(AUDIO=None, audio_path: str = "", audio_url: str = "") -> Tuple[np.ndarray, int, Path]: 40 | """ 41 | Accept ComfyUI AUDIO dict, or a file path/url; return ([C,S], sr, temp_wav_path). 
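    Accepted forms, in the order checked below: a ComfyUI AUDIO dict
    ({"waveform": [B,C,T] or [C,T] tensor, "sample_rate": int}), an (array, sr)
    tuple, an existing audio_path, or an http(s) audio_url.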
42 | """ 43 | # ComfyUI's AUDIO: {"waveform": [B,C,T], "sample_rate": sr} 44 | if isinstance(AUDIO, dict) and "waveform" in AUDIO and "sample_rate" in AUDIO: 45 | wf: torch.Tensor = AUDIO["waveform"] 46 | sr = int(AUDIO["sample_rate"]) 47 | if wf.dim() == 3: 48 | wf = wf[0] # [C,T] 49 | if wf.dim() != 2: 50 | raise RuntimeError(f"Unexpected AUDIO tensor shape: {tuple(wf.shape)} (want [C,T])") 51 | cs = wf.detach().cpu().float().numpy() 52 | return cs, sr, _save_temp_wav(cs, sr) 53 | 54 | # (arr, sr) tuple 55 | if isinstance(AUDIO, (list, tuple)) and len(AUDIO) == 2: 56 | arr, sr = AUDIO 57 | cs = _to_cs(np.asarray(arr)) 58 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 59 | 60 | # explicit file path 61 | if audio_path: 62 | p = Path(audio_path) 63 | if not p.exists(): 64 | raise RuntimeError(f"audio_path not found: {audio_path}") 65 | y, sr = sf.read(str(p), dtype="float32", always_2d=False) 66 | cs = _to_cs(y) 67 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 68 | 69 | # URL fetch 70 | if audio_url: 71 | import requests 72 | r = requests.get(audio_url, timeout=60); r.raise_for_status() 73 | p = Path(tempfile.gettempdir()) / f"eg_url_{int(time.time()*1000)}.wav" 74 | p.write_bytes(r.content) 75 | y, sr = sf.read(str(p), dtype="float32", always_2d=False) 76 | cs = _to_cs(y) 77 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 78 | 79 | raise RuntimeError("No AUDIO provided.") 80 | 81 | # ---------------- CUDA/CuPy wiring (Windows) ---------------- 82 | 83 | def _wire_cuda_for_cupy_windows(): 84 | """ 85 | On Windows portable installs, make NVIDIA pip-wheel DLLs & headers discoverable: 86 | • Add ...\site-packages\nvidia\\bin to the DLL search path 87 | • Point CUDA_PATH to ...\site-packages\nvidia\cuda_runtime (has include/) 88 | Must run BEFORE importing cupy. 89 | """ 90 | if platform.system() != "Windows": 91 | return 92 | 93 | sp = Path(sys.executable).parent / "Lib" / "site-packages" / "nvidia" 94 | rt = sp / "cuda_runtime" # contains include/ and bin/ 95 | nvrtc = sp / "cuda_nvrtc" # contains bin/ 96 | 97 | # Let CuPy find headers at runtime (NVRTC needs CUDA runtime headers >= CUDA 12.2) 98 | if rt.exists(): 99 | os.environ.setdefault("CUDA_PATH", str(rt)) 100 | 101 | # Make DLLs loadable for this process (Python 3.8+) 102 | for p in (rt / "bin", nvrtc / "bin"): 103 | if p.exists(): 104 | try: 105 | os.add_dll_directory(str(p)) 106 | except Exception: 107 | os.environ["PATH"] = f"{str(p)};{os.environ.get('PATH','')}" 108 | 109 | # ---------------- Fat Llama wrapper ---------------- 110 | 111 | def _ensure_gpu_stack(): 112 | """ 113 | Validate CUDA/CuPy presence early and give a friendly error if not available. 114 | Also ensure DLL search paths & headers are wired so CuPy can load cudart/nvrtc 115 | and find CUDA runtime headers like vector_types.h. 116 | """ 117 | _wire_cuda_for_cupy_windows() 118 | 119 | if not torch.cuda.is_available(): 120 | raise RuntimeError( 121 | "CUDA GPU not detected. Fat Llama (GPU) requires an NVIDIA GPU. " 122 | "If you need CPU, use the separate Fat Llama — CPU/FFTW node." 123 | ) 124 | 125 | try: 126 | import cupy # noqa: F401 (import after wiring) 127 | except Exception as e: 128 | raise RuntimeError( 129 | "CuPy failed to import. Ensure you've installed a CUDA-12 build " 130 | "(`pip install cupy-cuda12x`) and matching NVIDIA runtime headers & NVRTC " 131 | "(`pip install \"nvidia-cuda-runtime-cu12==12.X.*\" \"nvidia-cuda-nvrtc-cu12==12.X.*\"`)." 
132 | ) from e 133 | 134 | def _fat_llama_upscale( 135 | in_wav: Path, 136 | out_path: Path, 137 | target_format: str, 138 | max_iterations: int, 139 | threshold_value: float, 140 | target_bitrate_kbps: int, 141 | toggle_autoscale: bool, 142 | ): 143 | """Call the public API: fat_llama.audio_fattener.feed.upscale(...)""" 144 | from fat_llama.audio_fattener.feed import upscale # late import 145 | 146 | # Normalize ALWAYS on; Adaptive filter disabled for perf/stability 147 | upscale( 148 | input_file_path=str(in_wav), 149 | output_file_path=str(out_path), 150 | source_format="wav", 151 | target_format=target_format, 152 | max_iterations=int(max_iterations), 153 | threshold_value=float(threshold_value), 154 | target_bitrate_kbps=int(target_bitrate_kbps), 155 | toggle_normalize=True, 156 | toggle_autoscale=bool(toggle_autoscale), 157 | toggle_adaptive_filter=False, 158 | ) 159 | 160 | # ---------------- ComfyUI Node ---------------- 161 | 162 | class EgregoraFatLlamaGPU: 163 | """ 164 | Spectral Enhance (Fat Llama — GPU only) 165 | - Normalize is always ON (clamps final amplitude and prevents clipping). 166 | - Adaptive filter disabled for speed (still available in library if you want a "slow" node). 167 | """ 168 | @classmethod 169 | def INPUT_TYPES(cls): 170 | return { 171 | "required": { 172 | "target_format": (["wav", "flac"],), 173 | "max_iterations": ("INT", {"default": 300, "min": 1, "max": 5000}), 174 | "threshold_value": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 1.0, "step": 0.01}), 175 | "target_bitrate_kbps": ("INT", {"default": 1411, "min": 64, "max": 5000}), 176 | "toggle_autoscale": ("BOOLEAN", {"default": True}), 177 | }, 178 | "optional": { 179 | "AUDIO": ("AUDIO",), 180 | "audio_path": ("STRING", {"default": ""}), 181 | "audio_url": ("STRING", {"default": ""}), 182 | }, 183 | } 184 | 185 | RETURN_TYPES = RETURN_TYPES 186 | FUNCTION = FUNCTION 187 | CATEGORY = CATEGORY 188 | OUTPUT_NODE = False 189 | 190 | def run( 191 | self, 192 | target_format, 193 | max_iterations, 194 | threshold_value, 195 | target_bitrate_kbps, 196 | toggle_autoscale, 197 | AUDIO=None, 198 | audio_path="", 199 | audio_url="", 200 | ): 201 | _ensure_gpu_stack() 202 | 203 | # Normalize inbound audio to a temp WAV we can hand to fat_llama 204 | cs, in_sr, in_wav = _normalize_audio_input(AUDIO, audio_path, audio_url) 205 | 206 | # Choose an output temp path with chosen container 207 | suffix = ".wav" if target_format == "wav" else ".flac" 208 | out_path = Path(tempfile.gettempdir()) / f"eg_fatllama_{int(time.time()*1000)}{suffix}" 209 | 210 | # Run Fat Llama with always-on normalization and no adaptive filter 211 | _fat_llama_upscale( 212 | in_wav=in_wav, 213 | out_path=out_path, 214 | target_format=target_format, 215 | max_iterations=max_iterations, 216 | threshold_value=threshold_value, 217 | target_bitrate_kbps=target_bitrate_kbps, 218 | toggle_autoscale=toggle_autoscale, 219 | ) 220 | 221 | # Read result back into Comfy 222 | y, sr = sf.read(str(out_path), dtype="float32", always_2d=False) 223 | cs_out = _to_cs(y) 224 | wf = torch.from_numpy(cs_out).unsqueeze(0).contiguous() # [1,C,T] 225 | return ({"waveform": wf, "sample_rate": int(sr)},) 226 | 227 | # Register node 228 | NODE_CLASS_MAPPINGS = { 229 | "EgregoraFatLlamaGPU": EgregoraFatLlamaGPU, 230 | } 231 | 232 | NODE_DISPLAY_NAME_MAPPINGS = { 233 | "EgregoraFatLlamaGPU": "🎛️ Spectral Enhance (Fat Llama — GPU)", 234 | } -------------------------------------------------------------------------------- /egregora_audio_super_resolution.py: 
-------------------------------------------------------------------------------- 1 | # 🎧 ComfyUI — Audio Super Resolution (FlashSR) 2 | # Minimal, single-output node with robust shapes and HQ resampling. 3 | # Inputs: audio (AUDIO), lowpass_input (BOOL), output_sr (enum) 4 | # Output: audio (AUDIO) 5 | # 6 | # Internals: 7 | # - Normalize to [C, S] consistently (soundfile returns [S, C] -> transpose) 8 | # - Fixed chunking: 5.12 s, overlap: 0.50 s, Hann WOLA stitching 9 | # - Inference at 48 kHz (FlashSR’s design target), optional post-resample 10 | # - HQ SRC cascade: soxr -> scipy.signal.resample_poly -> torchaudio -> linear 11 | # 12 | # SPDX: MIT 13 | 14 | import os, sys, time 15 | from pathlib import Path 16 | from typing import Optional, Tuple, List, Dict, Any 17 | 18 | import numpy as np 19 | import torch 20 | 21 | FUNCTION = "run" 22 | CATEGORY = "Egregora/Audio" 23 | 24 | # ---------- paths ---------- 25 | def _custom_root() -> Path: 26 | return Path(__file__).resolve().parent 27 | 28 | def _models_dir() -> Path: 29 | # .../ComfyUI/models — this package sits in ComfyUI/custom_nodes/<pkg>, so ComfyUI is parents[1] 30 | return _custom_root().parents[1] / "models" 31 | 32 | def _audio_models_subdir(name: str) -> Path: 33 | d = _models_dir() / "audio" / name 34 | d.mkdir(parents=True, exist_ok=True) 35 | return d 36 | 37 | # ---------- AUDIO helpers ---------- 38 | def _make_audio(sr: int, samples_cs: np.ndarray) -> Dict[str, Any]: 39 | """Build a ComfyUI AUDIO dict from [C, S] float32.""" 40 | s = np.asarray(samples_cs, dtype=np.float32) 41 | if s.ndim == 1: 42 | s = s[None, :] 43 | C, T = s.shape 44 | wf = torch.from_numpy(s).unsqueeze(0).contiguous() # [1, C, T] 45 | return {"waveform": wf, "sample_rate": int(sr)} 46 | 47 | def _from_audio_dict(AUDIO: Any) -> Tuple[np.ndarray, int]: 48 | """ 49 | Accept Comfy AUDIO dict or (ndarray, sr). Return [C, S] float32 and sr. 50 | """ 51 | # Comfy AUDIO dict 52 | if isinstance(AUDIO, dict) and "waveform" in AUDIO and "sample_rate" in AUDIO: 53 | wf: torch.Tensor = AUDIO["waveform"] 54 | sr = int(AUDIO["sample_rate"]) 55 | if wf.dim() == 3: 56 | wf = wf[0] # [C, T] 57 | if wf.dim() != 2: 58 | raise RuntimeError(f"Unexpected AUDIO tensor shape {tuple(wf.shape)}; expected [C, T].") 59 | cs = wf.detach().cpu().float().numpy() # [C, T] 60 | return cs, sr 61 | # (array, sr) 62 | if isinstance(AUDIO, (list, tuple)) and len(AUDIO) == 2: 63 | arr, sr = AUDIO 64 | arr = np.asarray(arr, dtype=np.float32) 65 | if arr.ndim == 1: 66 | # mono [S] -> [1, S] 67 | cs = arr[None, :] 68 | elif arr.ndim == 2: 69 | # could be [S, C] or [C, S]; treat 1st dim as frames if it's much larger 70 | if arr.shape[0] >= arr.shape[1] and arr.shape[1] <= 8: 71 | # soundfile/frames-first -> transpose to [C, S] 72 | cs = arr.T 73 | else: 74 | cs = arr # already [C, S] 75 | else: 76 | cs = arr.reshape(1, -1) 77 | return cs.astype(np.float32), int(sr) 78 | raise RuntimeError("No valid AUDIO provided.") 79 | 80 | # ---------- HQ resampling ---------- 81 | def _resample_hq(x_cs: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray: 82 | """ 83 | Prefer soxr -> scipy.signal.resample_poly -> torchaudio -> linear. 84 | Operates on [C, S] along the sample axis. 
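    Example: _resample_hq(np.zeros((2, 44100), np.float32), 44100, 48000)
    returns an array of shape (2, 48000).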
85 | """ 86 | if src_sr == dst_sr: 87 | return x_cs.astype(np.float32) 88 | 89 | # soxr 90 | try: 91 | import soxr # type: ignore 92 | out = [soxr.resample(x_cs[c], src_sr, dst_sr) for c in range(x_cs.shape[0])] 93 | # equalize length (guard) 94 | L = min(map(len, out)) 95 | out = np.stack([ch[:L] for ch in out], axis=0) 96 | return out.astype(np.float32) 97 | except Exception: 98 | pass 99 | 100 | # SciPy polyphase 101 | try: 102 | from math import gcd 103 | from scipy.signal import resample_poly # type: ignore 104 | g = gcd(src_sr, dst_sr) 105 | up, down = dst_sr // g, src_sr // g 106 | out = [resample_poly(x_cs[c], up=up, down=down).astype(np.float32) for c in range(x_cs.shape[0])] 107 | L = min(map(len, out)) 108 | out = np.stack([ch[:L] for ch in out], axis=0) 109 | return out 110 | except Exception: 111 | pass 112 | 113 | # torchaudio windowed-sinc 114 | try: 115 | import torchaudio # type: ignore 116 | t = torch.from_numpy(x_cs).float() # [C, S] 117 | rs = torchaudio.transforms.Resample(orig_freq=src_sr, new_freq=dst_sr) 118 | y = rs(t) # [C, S'] 119 | return y.numpy().astype(np.float32) 120 | except Exception: 121 | pass 122 | 123 | # linear interp fallback (lowest quality) 124 | ratio = dst_sr / float(src_sr) 125 | n_out = int(round(x_cs.shape[1] * ratio)) 126 | t_in = np.linspace(0.0, 1.0, x_cs.shape[1], endpoint=False, dtype=np.float64) 127 | t_out = np.linspace(0.0, 1.0, n_out, endpoint=False, dtype=np.float64) 128 | out = np.stack([np.interp(t_out, t_in, ch) for ch in x_cs], axis=0).astype(np.float32) 129 | return out 130 | 131 | # ---------- chunking & WOLA ---------- 132 | def _hann(L: int) -> np.ndarray: 133 | return np.hanning(L).astype(np.float32) 134 | 135 | def _iter_chunks(total_samples: int, win: int, hop: int) -> List[Tuple[int, int]]: 136 | """ 137 | Yield (start, length) for each chunk to cover [0, total_samples). 138 | """ 139 | spans: List[Tuple[int, int]] = [] 140 | i = 0 141 | while i < total_samples: 142 | L = min(win, total_samples - i) 143 | spans.append((i, L)) 144 | if i + L >= total_samples: 145 | break 146 | i += hop 147 | return spans 148 | 149 | def _wola_stitch(chunks_pred: List[Tuple[np.ndarray, int, int]], total_len: int, win: int) -> np.ndarray: 150 | """ 151 | Overlap-add predicted chunks with Hann window. 152 | chunks_pred: list of (pred_cs [C, L_pred], start, L_in) 153 | L_in = original (unpadded) input length for that chunk 154 | Returns [C, total_len]. 
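    A handy property for testing: stitching *unmodified* chunks reproduces the
    input exactly, since acc/wsum cancels the per-sample Hann weights wherever
    any chunk contributes nonzero weight (see the sanity-check sketch at the
    end of this file).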
155 | """ 156 | if not chunks_pred: 157 | return np.zeros((1, max(1, total_len)), np.float32) 158 | 159 | C = chunks_pred[0][0].shape[0] 160 | acc = np.zeros((C, total_len), np.float32) 161 | wsum = np.zeros(total_len, np.float32) 162 | w_full = _hann(win) 163 | 164 | for y_cs, start, L_in in chunks_pred: 165 | L_pred = y_cs.shape[1] 166 | L = min(L_in, L_pred) # only weight the valid (unpadded) part 167 | w = w_full[:L] if L <= win else np.ones(L, np.float32) 168 | acc[:, start:start+L] += y_cs[:, :L] * w[None, :] 169 | wsum[start:start+L] += w 170 | 171 | wsum[wsum == 0] = 1.0 172 | out = acc / wsum[None, :] 173 | return out.astype(np.float32) 174 | 175 | # ---------- FlashSR loader ---------- 176 | class _FlashSRRunner: 177 | REQ_SR = 48000 178 | CHUNK_S = 5.12 179 | OVERLAP_S = 0.50 180 | CHUNK_SAMPLES = int(REQ_SR * CHUNK_S) # 245760 181 | 182 | HF_DATASET = "jakeoneijk/FlashSR_weights" 183 | HF_FILES = ("student_ldm.pth", "sr_vocoder.pth", "vae.pth") 184 | 185 | def __init__(self, lowpass: bool = False): 186 | self.lowpass = bool(lowpass) 187 | self.ckpt_dir = _audio_models_subdir("flashsr") 188 | self.repo_path = self._resolve_repo_path() 189 | self._dev = torch.device("cuda" if torch.cuda.is_available() else "cpu") 190 | self._FlashSRClass = None 191 | self._model = None 192 | self._ensure_weights() 193 | self._import() 194 | self._ensure_model() 195 | 196 | def _resolve_repo_path(self) -> Path: 197 | env_repo = os.environ.get("EGREGORA_FLASHSR_REPO") 198 | if env_repo: 199 | return Path(env_repo) 200 | # default: custom_nodes/ComfyUI-Egregora-Audio-Super-Resolution/deps/FlashSR_Inference 201 | return _custom_root() / "deps" / "FlashSR_Inference" # the package dir itself holds deps/ (matches install.py) 202 | 203 | def _ensure_weights(self): 204 | missing = [f for f in self.HF_FILES if not (self.ckpt_dir / f).exists()] 205 | if not missing: 206 | return 207 | # Try huggingface_hub first 208 | try: 209 | from huggingface_hub import hf_hub_download # type: ignore 210 | for fname in missing: 211 | hf_hub_download( 212 | repo_id=self.HF_DATASET, 213 | filename=fname, 214 | repo_type="dataset", 215 | local_dir=str(self.ckpt_dir), 216 | ) 217 | print(f"[FlashSR] Downloaded via huggingface_hub: {', '.join(missing)}") 218 | return 219 | except Exception as e: 220 | print(f"[FlashSR] huggingface_hub unavailable or failed ({e}); falling back to direct HTTP…") 221 | # Fallback: direct HTTP 222 | try: 223 | import requests # type: ignore 224 | for fname in missing: 225 | url = f"https://huggingface.co/datasets/{self.HF_DATASET}/resolve/main/{fname}?download=true" 226 | dst = self.ckpt_dir / fname 227 | with requests.get(url, stream=True, timeout=1800) as r: 228 | r.raise_for_status() 229 | with open(dst, "wb") as f: 230 | for chunk in r.iter_content(chunk_size=1024 * 1024): 231 | if chunk: 232 | f.write(chunk) 233 | print(f"[FlashSR] Downloaded: {dst}") 234 | except Exception as ee: 235 | raise RuntimeError( 236 | "FlashSR weights missing and auto-download failed. 
" 237 | "Place these in models/audio/flashsr: student_ldm.pth, sr_vocoder.pth, vae.pth" 238 | ) from ee 239 | 240 | def _import(self): 241 | if self._FlashSRClass is not None: 242 | return 243 | try: 244 | from FlashSR.FlashSR import FlashSR # type: ignore 245 | self._FlashSRClass = FlashSR 246 | return 247 | except Exception: 248 | cand = self.repo_path 249 | if (cand / "FlashSR").exists(): 250 | sys.path.insert(0, str(cand)) 251 | from FlashSR.FlashSR import FlashSR # type: ignore 252 | self._FlashSRClass = FlashSR 253 | return 254 | raise RuntimeError("FlashSR module not found. Install/clone and set EGREGORA_FLASHSR_REPO if needed.") 255 | 256 | def _ensure_model(self): 257 | if self._model is not None: 258 | return 259 | FlashSR = self._FlashSRClass 260 | s = str(self.ckpt_dir / "student_ldm.pth") 261 | v = str(self.ckpt_dir / "sr_vocoder.pth") 262 | vae = str(self.ckpt_dir / "vae.pth") 263 | model = FlashSR(s, v, vae) 264 | model.eval() 265 | try: 266 | model.to(self._dev) 267 | except Exception: 268 | pass 269 | self._model = model 270 | 271 | def infer(self, x_cs_48k: np.ndarray) -> np.ndarray: 272 | """ 273 | x_cs_48k: [C, S] float32 at 48 kHz. 274 | Returns [C, S] float32 at 48 kHz (same length as input slice passed in). 275 | """ 276 | x = torch.from_numpy(x_cs_48k).to(self._dev).float() # [C, S] 277 | with torch.inference_mode(): 278 | y = self._model(x, lowpass_input=self.lowpass) # [C, S] 279 | return y.detach().to("cpu").float().numpy() 280 | 281 | # ---------- Node ---------- 282 | class EgregoraAudioSuperResolution: 283 | @classmethod 284 | def INPUT_TYPES(cls): 285 | return { 286 | "required": { 287 | "audio": ("AUDIO",), 288 | "lowpass_input": ("BOOLEAN", {"default": False}), 289 | "output_sr": (["48000", "44100", "96000"], {"default": "48000"}), 290 | } 291 | } 292 | 293 | RETURN_TYPES = ("AUDIO",) 294 | FUNCTION = FUNCTION 295 | CATEGORY = CATEGORY 296 | OUTPUT_NODE = False 297 | 298 | def run(self, audio=None, lowpass_input=False, output_sr="48000"): 299 | # 1) Normalize input to [C, S] 300 | in_cs, in_sr = _from_audio_dict(audio) 301 | 302 | # 2) Resample to model SR if needed 303 | runner = _FlashSRRunner(lowpass=bool(lowpass_input)) 304 | req_sr = runner.REQ_SR 305 | if in_sr != req_sr: 306 | in_cs = _resample_hq(in_cs, in_sr, req_sr) 307 | in_sr = req_sr 308 | 309 | # 3) Chunking params (internal, non-user) 310 | win = runner.CHUNK_SAMPLES # 5.12 s @ 48k 311 | hop = int((runner.CHUNK_S - runner.OVERLAP_S) * req_sr) 312 | if hop <= 0 or hop >= win: 313 | # guard-rail: keep a sane overlap in pathological cases 314 | hop = win // 2 315 | 316 | total = in_cs.shape[1] 317 | spans = _iter_chunks(total, win=win, hop=hop) 318 | 319 | # 4) Process chunks in-memory and stitch with Hann WOLA 320 | preds: List[Tuple[np.ndarray, int, int]] = [] 321 | for start, L in spans: 322 | # slice and pad up to win 323 | chunk = in_cs[:, start:start+L] 324 | if L < win: 325 | pad = np.zeros((in_cs.shape[0], win - L), np.float32) 326 | chunk = np.concatenate([chunk, pad], axis=1) 327 | y_pred = runner.infer(chunk) # [C, win] @ 48k 328 | preds.append((y_pred, start, L)) # keep original L for proper weighting 329 | 330 | out_48k = _wola_stitch(preds, total_len=total, win=win) # [C, total] 331 | 332 | # 5) Optional post-resample for delivery 333 | tgt_sr = int(output_sr) 334 | if tgt_sr != in_sr: 335 | out = _resample_hq(out_48k, in_sr, tgt_sr) 336 | out_sr = tgt_sr 337 | else: 338 | out, out_sr = out_48k, in_sr 339 | 340 | # 6) Return single AUDIO 341 | return (_make_audio(out_sr, out),) 342 
| 343 | # ComfyUI registration 344 | NODE_CLASS_MAPPINGS = { 345 | "EgregoraAudioUpscaler": EgregoraAudioSuperResolution, 346 | } 347 | NODE_DISPLAY_NAME_MAPPINGS = { 348 | "EgregoraAudioUpscaler": "🎧 Audio Super Resolution (FlashSR)", 349 | } 350 | -------------------------------------------------------------------------------- /egregora_audio_eval_pack.py: -------------------------------------------------------------------------------- 1 | """ 2 | Egregora · Audio Eval Pack (v1) 3 | =============================== 4 | 5 | Permissive, model-friendly utilities to complement the Null Test Suite: 6 | - ABX Prepare / ABX Judge (double‑blind listening helper) 7 | - Loudness Meter (BS.1770-style*) + Gain Match (LUFS‑I / RMS) 8 | - Metrics: SI‑SDR and LSD (log‑spectral distance) 9 | - Resample Audio (HQ) with optional SciPy/torchaudio backends 10 | 11 | *Note: The 1770 implementation here is a practical approximation for 12 | integrated loudness, momentary/short‑term, LRA, and true‑peak. For 13 | certification-grade measurement, validate against a reference meter. 14 | 15 | All nodes follow ComfyUI conventions: 16 | - AUDIO is a dict with {"waveform": torch.Tensor[B,C,T], "sample_rate": int} 17 | - IMAGE is torch.Tensor[B,H,W,3] in [0,1] 18 | 19 | License: MIT 20 | """ 21 | from __future__ import annotations 22 | 23 | import io 24 | import math 25 | import random 26 | from dataclasses import dataclass 27 | from typing import Any, Dict, Optional, Tuple 28 | 29 | import numpy as np 30 | import torch 31 | from PIL import Image 32 | 33 | # Optional deps 34 | try: 35 | import scipy.signal as sps # resample_poly, firwin 36 | _HAVE_SCIPY = True 37 | except Exception: 38 | _HAVE_SCIPY = False 39 | 40 | try: 41 | import torchaudio 42 | import torchaudio.functional as AF 43 | _HAVE_TA = True 44 | except Exception: 45 | _HAVE_TA = False 46 | 47 | 48 | # ----------------------------- 49 | # Utilities 50 | # ----------------------------- 51 | 52 | def _to_numpy(x: Any) -> np.ndarray: 53 | if isinstance(x, np.ndarray): 54 | return x 55 | if hasattr(x, "detach") and hasattr(x, "cpu"): 56 | return x.detach().cpu().numpy() 57 | return np.asarray(x) 58 | 59 | 60 | def _normalize_CN(arr: np.ndarray) -> np.ndarray: 61 | a = np.asarray(arr) 62 | a = np.squeeze(a) 63 | if a.ndim == 1: 64 | a = a[None, :] 65 | elif a.ndim == 2: 66 | if a.shape[0] > a.shape[1]: 67 | a = a.T 68 | else: 69 | t_axis = int(np.argmax(a.shape)) 70 | a = np.moveaxis(a, t_axis, -1) 71 | C = int(np.prod(a.shape[:-1])) 72 | N = a.shape[-1] 73 | a = a.reshape(C, N) 74 | return a.astype(np.float32) 75 | 76 | 77 | def make_audio(sr: int, samples_CN: np.ndarray, meta: Optional[dict] = None) -> Dict[str, Any]: 78 | s = _normalize_CN(samples_CN) 79 | wf = torch.from_numpy(s).unsqueeze(0) # [1,C,N] 80 | return { 81 | "sr": int(sr), 82 | "sample_rate": int(sr), 83 | "samples": s, 84 | "waveform": wf, 85 | "meta": dict(meta or {}), 86 | } 87 | 88 | 89 | def to_internal_audio(x: Any) -> Dict[str, Any]: 90 | if isinstance(x, dict) and "waveform" in x and ("sample_rate" in x or "sr" in x or "rate" in x): 91 | sr = int(x.get("sample_rate") or x.get("sr") or x.get("rate")) 92 | wf = _to_numpy(x["waveform"]) # [B,C,T] or [C,T] 93 | if wf.ndim == 3: 94 | wf = wf[0] 95 | s = _normalize_CN(wf) 96 | return make_audio(sr, s, x.get("meta", {})) 97 | if isinstance(x, dict) and ("sr" in x or "sample_rate" in x): 98 | sr = int(x.get("sr") or x.get("sample_rate")) 99 | buf = next((x[k] for k in ("samples", "audio", "array") if x.get(k) is not None), None) # an or-chain on ndarrays raises (ambiguous truth value) 100 | if buf is None: 101 | raise 
ValueError("Audio dict missing samples/waveform") 102 | return make_audio(sr, _to_numpy(buf), x.get("meta", {})) 103 | raise ValueError("Unsupported AUDIO object for this node") 104 | 105 | 106 | def _image_from_figure(fig) -> torch.Tensor: 107 | import matplotlib 108 | matplotlib.use("Agg") 109 | import matplotlib.pyplot as plt # noqa: F401 110 | 111 | buf = io.BytesIO() 112 | fig.savefig(buf, format="png", bbox_inches="tight", dpi=110) 113 | try: 114 | fig.clf() 115 | except Exception: 116 | pass 117 | buf.seek(0) 118 | im = Image.open(buf).convert("RGB") 119 | arr = np.array(im).astype(np.float32) / 255.0 120 | return torch.from_numpy(arr).unsqueeze(0) 121 | 122 | 123 | def _rms_db(x: np.ndarray) -> float: 124 | x = x.astype(np.float64) 125 | return 10.0 * math.log10(float(np.mean(x * x) + 1e-20)) 126 | 127 | 128 | # ----------------------------- 129 | # 1770 Loudness helpers (practical approximations) 130 | # ----------------------------- 131 | 132 | def _k_weight(sr: int, x_CN: np.ndarray) -> np.ndarray: 133 | """Very small K-weight approx: 1st-order HPF ~60 Hz + slight HF tilt. 134 | This is sufficient for relative matching; not certification-grade. 135 | """ 136 | x = x_CN 137 | fc = 60.0 / (sr * 0.5) 138 | k = math.exp(-2 * math.pi * fc) 139 | y = np.zeros_like(x, dtype=np.float32) 140 | for c in range(x.shape[0]): 141 | xn = x[c].astype(np.float32) 142 | yc = np.zeros_like(xn) 143 | z = 0.0 144 | for n in range(xn.shape[0]): 145 | z = (1 - k) * xn[n] + k * z 146 | yc[n] = xn[n] - z 147 | y[c] = yc 148 | # tiny HF shelf via first difference 149 | y[:, 1:] += 0.02 * (y[:, 1:] - y[:, :-1]) 150 | return y 151 | 152 | 153 | def integrated_lufs(audio: Dict[str, Any]) -> float: 154 | sr = audio["sample_rate"] 155 | y = _k_weight(sr, audio["samples"]) # [C,N] 156 | mono = y.mean(axis=0) 157 | blk = max(1, int(round(0.400 * sr))) 158 | hop = max(1, int(round(0.100 * sr))) 159 | frames = 1 + max(0, (mono.shape[0] - blk) // hop) 160 | if frames <= 0: 161 | return _rms_db(mono) 162 | ms = [] 163 | for i in range(frames): 164 | s = i * hop 165 | e = s + blk 166 | seg = mono[s:e].astype(np.float64) 167 | ms.append(float(np.mean(seg * seg))) 168 | ms = np.asarray(ms) + 1e-20 169 | lufs_ungated = -0.691 + 10.0 * np.log10(np.mean(ms)) 170 | gate = lufs_ungated - 10.0 171 | mask = (-0.691 + 10.0 * np.log10(ms)) >= gate 172 | if np.any(mask): 173 | ms = ms[mask] 174 | return float(-0.691 + 10.0 * np.log10(np.mean(ms))) 175 | 176 | 177 | def lufs_series(audio: Dict[str, Any], window_s: float, hop_s: float) -> np.ndarray: 178 | sr = audio["sample_rate"] 179 | y = _k_weight(sr, audio["samples"]).mean(axis=0) 180 | w = max(1, int(round(window_s * sr))) 181 | h = max(1, int(round(hop_s * sr))) 182 | frames = 1 + max(0, (y.shape[0] - w) // h) 183 | out = np.empty((frames,), dtype=np.float32) 184 | for i in range(frames): 185 | s = i * h 186 | seg = y[s : s + w].astype(np.float64) 187 | out[i] = -0.691 + 10.0 * np.log10(float(np.mean(seg * seg)) + 1e-20) 188 | return out 189 | 190 | 191 | def lra_short_term(audio: Dict[str, Any]) -> float: 192 | st = lufs_series(audio, 3.0, 1.0) # 3s window, 1s hop (EBU R128) 193 | if st.size == 0: 194 | return 0.0 195 | # Simple gating: remove values near silence 196 | gate = np.percentile(st, 10.0) - 20.0 197 | pool = st[st > gate] 198 | if pool.size == 0: 199 | pool = st 200 | return float(np.percentile(pool, 95.0) - np.percentile(pool, 10.0)) 201 | 202 | 203 | def true_peak_dbfs(audio: Dict[str, Any], oversample: int = 4) -> float: 204 | x = 
audio["samples"].mean(axis=0) 205 | sr = audio["sample_rate"] 206 | if _HAVE_SCIPY: 207 | y = sps.resample_poly(x, oversample, 1) 208 | else: 209 | N = x.shape[0] 210 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 211 | t_new = np.linspace(0.0, 1.0, N * oversample, endpoint=False) 212 | y = np.interp(t_new, t_old, x).astype(np.float32) 213 | peak = float(np.max(np.abs(y))) 214 | return 20.0 * math.log10(peak + 1e-20) 215 | 216 | 217 | # ----------------------------- 218 | # ABX helper 219 | # ----------------------------- 220 | @dataclass 221 | class ABXMeta: 222 | x_is: str # 'A' or 'B' 223 | seed: int 224 | 225 | def to_dict(self) -> Dict[str, Any]: 226 | return {"x_is": self.x_is, "seed": int(self.seed)} 227 | 228 | 229 | # ----------------------------- 230 | # Node: ABX Prepare 231 | # ----------------------------- 232 | class ABX_Prepare: 233 | CATEGORY = "Egregora/Listening" 234 | RETURN_TYPES = ("AUDIO", "AUDIO", "AUDIO", "DICT") 235 | RETURN_NAMES = ("audio_A", "audio_B", "audio_X", "abx_meta") 236 | FUNCTION = "execute" 237 | 238 | @classmethod 239 | def INPUT_TYPES(cls): 240 | return { 241 | "required": { 242 | "audio_A": ("AUDIO", {}), 243 | "audio_B": ("AUDIO", {}), 244 | }, 245 | "optional": { 246 | "clip_seconds": ("FLOAT", {"default": 10.0, "min": 1.0, "max": 60.0, "step": 0.1}), 247 | "random_seed": ("INT", {"default": 0, "min": 0, "max": 2**31 - 1, "step": 1}), 248 | "start_seconds": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10_000.0, "step": 0.1}), 249 | }, 250 | } 251 | 252 | def _clip(self, a: Dict[str, Any], start_s: float, dur_s: float) -> Dict[str, Any]: 253 | sr = a["sample_rate"] 254 | s = int(round(start_s * sr)) 255 | n = int(round(dur_s * sr)) 256 | x = a["samples"] 257 | if s + n > x.shape[1]: 258 | n = max(0, x.shape[1] - s) 259 | y = x[:, s : s + n] 260 | return make_audio(sr, y, a.get("meta", {})) 261 | 262 | def execute(self, audio_A, audio_B, clip_seconds=10.0, random_seed=0, start_seconds=0.0): 263 | A = to_internal_audio(audio_A) 264 | B = to_internal_audio(audio_B) 265 | n = min(A["samples"].shape[1], B["samples"].shape[1]) 266 | A["samples"] = A["samples"][:, :n] 267 | B["samples"] = B["samples"][:, :n] 268 | 269 | A_c = self._clip(A, start_seconds, clip_seconds) 270 | B_c = self._clip(B, start_seconds, clip_seconds) 271 | 272 | rng = random.Random(int(random_seed)) 273 | x_is = rng.choice(["A", "B"]) 274 | X = A_c if x_is == "A" else B_c 275 | meta = ABXMeta(x_is=x_is, seed=int(random_seed)).to_dict() 276 | return A_c, B_c, X, meta 277 | 278 | 279 | # ----------------------------- 280 | # Node: ABX Judge 281 | # ----------------------------- 282 | class ABX_Judge: 283 | CATEGORY = "Egregora/Listening" 284 | RETURN_TYPES = ("DICT",) 285 | RETURN_NAMES = ("abx_result",) 286 | FUNCTION = "execute" 287 | 288 | @classmethod 289 | def INPUT_TYPES(cls): 290 | return { 291 | "required": { 292 | "abx_meta": ("DICT", {}), 293 | "guess": (["A", "B"], {}), 294 | }, 295 | } 296 | 297 | def execute(self, abx_meta, guess): 298 | x_is = str(abx_meta.get("x_is", "?")).upper() 299 | correct = (guess.upper() == x_is) 300 | return ({"x_is": x_is, "guess": guess.upper(), "correct": bool(correct)},) 301 | 302 | 303 | # ----------------------------- 304 | # Node: Loudness Meter (1770) 305 | # ----------------------------- 306 | class Loudness_Meter_1770: 307 | CATEGORY = "Egregora/Analysis" 308 | RETURN_TYPES = ("DICT",) 309 | RETURN_NAMES = ("metrics",) 310 | FUNCTION = "execute" 311 | 312 | @classmethod 313 | def INPUT_TYPES(cls): 314 | return { 315 | "required": 
{ 316 | "audio": ("AUDIO", {}), 317 | }, 318 | "optional": { 319 | "compute_true_peak": ("BOOLEAN", {"default": True}), 320 | "oversample": ("INT", {"default": 4, "min": 1, "max": 8, "step": 1}), 321 | }, 322 | } 323 | 324 | def execute(self, audio, compute_true_peak=True, oversample=4): 325 | a = to_internal_audio(audio) 326 | metrics: Dict[str, Any] = {} 327 | metrics["lufs_integrated"] = float(integrated_lufs(a)) 328 | metrics["lufs_momentary"] = float(lufs_series(a, 0.400, 0.100).mean() if a["samples"].size else 0.0) 329 | metrics["lufs_short_term"] = float(lufs_series(a, 3.0, 1.0).mean() if a["samples"].size else 0.0) 330 | metrics["lra"] = float(lra_short_term(a)) 331 | if compute_true_peak: 332 | metrics["true_peak_dbfs"] = float(true_peak_dbfs(a, oversample=int(oversample))) 333 | return (metrics,) 334 | 335 | 336 | # ----------------------------- 337 | # Node: Audio Gain Match (1770 / RMS) 338 | # ----------------------------- 339 | class Audio_Gain_Match_1770: 340 | CATEGORY = "Egregora/Analysis" 341 | RETURN_TYPES = ("AUDIO", "FLOAT", "FLOAT", "FLOAT") 342 | RETURN_NAMES = ("audio_matched", "gain_db", "ref_level", "in_level") 343 | FUNCTION = "execute" 344 | 345 | @classmethod 346 | def INPUT_TYPES(cls): 347 | return { 348 | "required": { 349 | "audio_ref": ("AUDIO", {}), 350 | "audio_in": ("AUDIO", {}), 351 | }, 352 | "optional": { 353 | "mode": (["LUFS-I", "RMS"], {}), 354 | "max_gain_db": ("FLOAT", {"default": 12.0, "min": -60.0, "max": 60.0, "step": 0.1}), 355 | }, 356 | } 357 | 358 | def execute(self, audio_ref, audio_in, mode="LUFS-I", max_gain_db=12.0): 359 | ref = to_internal_audio(audio_ref) 360 | inn = to_internal_audio(audio_in) 361 | # resample if SR mismatch 362 | if inn["sample_rate"] != ref["sample_rate"]: 363 | sr_old = inn["sample_rate"] 364 | x = inn["samples"] 365 | C, N = x.shape 366 | new_N = int(round(N * ref["sample_rate"] / sr_old)) 367 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 368 | t_new = np.linspace(0.0, 1.0, new_N, endpoint=False) 369 | y = np.stack([np.interp(t_new, t_old, x[c]) for c in range(C)], axis=0).astype(np.float32) 370 | inn = make_audio(ref["sample_rate"], y, inn.get("meta", {})) 371 | 372 | if str(mode).upper().startswith("LUFS"): 373 | ref_level = integrated_lufs(ref) 374 | in_level = integrated_lufs(inn) 375 | else: 376 | ref_level = _rms_db(ref["samples"].mean(axis=0)) 377 | in_level = _rms_db(inn["samples"].mean(axis=0)) 378 | gain_db = float(np.clip(ref_level - in_level, -abs(max_gain_db), abs(max_gain_db))) 379 | gain = 10 ** (gain_db / 20.0) 380 | y = (inn["samples"] * gain).astype(np.float32) 381 | out = make_audio(inn["sample_rate"], y, inn.get("meta", {})) 382 | return (out, float(gain_db), float(ref_level), float(in_level)) 383 | 384 | 385 | # ----------------------------- 386 | # Metrics: SI‑SDR & LSD 387 | # ----------------------------- 388 | 389 | def _stft_mag(x: np.ndarray, n_fft: int = 2048, hop: int = 512) -> np.ndarray: 390 | mono = x if x.ndim == 1 else x.mean(axis=0) 391 | N = mono.shape[0] 392 | win = np.hanning(n_fft).astype(np.float32) 393 | frames = 1 + max(0, (N - n_fft) // hop) 394 | S = np.empty((n_fft // 2 + 1, frames), dtype=np.float32) 395 | for i in range(frames): 396 | s = i * hop 397 | frame = mono[s : s + n_fft] 398 | if frame.shape[0] < n_fft: 399 | frame = np.pad(frame, (0, n_fft - frame.shape[0])) 400 | X = np.fft.rfft(frame * win) 401 | S[:, i] = np.abs(X).astype(np.float32) 402 | return S 403 | 404 | 405 | def _lsd(SA: np.ndarray, SB: np.ndarray) -> Tuple[float, float]: 406 | eps = 1e-12 
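    # Descriptive note: LSD here is the per-frame RMS (across frequency bins) of the
    # dB-magnitude difference; the mean below summarizes typical frames, while the
    # 95th percentile (returned second) captures worst-case frames.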
407 | LA = 20 * np.log10(SA + eps) 408 | LB = 20 * np.log10(SB + eps) 409 | D = (LA - LB) ** 2 410 | per = np.sqrt(np.mean(D, axis=0) + 1e-12) 411 | return float(np.mean(per)), float(np.percentile(per, 95)) 412 | 413 | 414 | def _si_sdr(s: np.ndarray, s_hat: np.ndarray) -> float: 415 | # operate on mono 416 | s = s.astype(np.float64) 417 | s_hat = s_hat.astype(np.float64) 418 | if s.ndim > 1: 419 | s = s.mean(axis=0) 420 | if s_hat.ndim > 1: 421 | s_hat = s_hat.mean(axis=0) 422 | # match length 423 | n = min(s.shape[-1], s_hat.shape[-1]) 424 | s = s[:n] 425 | s_hat = s_hat[:n] 426 | alpha = np.dot(s_hat, s) / (np.dot(s, s) + 1e-20) 427 | s_target = alpha * s 428 | e_noise = s_hat - s_target 429 | return 10.0 * np.log10((np.dot(s_target, s_target) + 1e-20) / (np.dot(e_noise, e_noise) + 1e-20)) 430 | 431 | 432 | class Metrics_LSD_SISDR: 433 | CATEGORY = "Egregora/Analysis" 434 | RETURN_TYPES = ("DICT",) 435 | RETURN_NAMES = ("metrics",) 436 | FUNCTION = "execute" 437 | 438 | @classmethod 439 | def INPUT_TYPES(cls): 440 | return { 441 | "required": { 442 | "audio_ref": ("AUDIO", {}), 443 | "audio_proc": ("AUDIO", {}), 444 | }, 445 | "optional": { 446 | "n_fft": ("INT", {"default": 2048, "min": 512, "max": 8192, "step": 128}), 447 | "hop": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 64}), 448 | "compute_lsd": ("BOOLEAN", {"default": True}), 449 | "compute_si_sdr": ("BOOLEAN", {"default": True}), 450 | }, 451 | } 452 | 453 | def execute(self, audio_ref, audio_proc, n_fft=2048, hop=512, compute_lsd=True, compute_si_sdr=True): 454 | A = to_internal_audio(audio_ref) 455 | B = to_internal_audio(audio_proc) 456 | a = A["samples"].mean(axis=0) 457 | b = B["samples"].mean(axis=0) 458 | n = min(a.size, b.size) 459 | a = a[:n] 460 | b = b[:n] 461 | out: Dict[str, Any] = {} 462 | if compute_lsd: 463 | SA = _stft_mag(a, n_fft=n_fft, hop=hop) 464 | SB = _stft_mag(b, n_fft=n_fft, hop=hop) 465 | lsd_mean, lsd_p95 = _lsd(SA, SB) 466 | out["lsd_mean_db"] = float(lsd_mean) 467 | out["lsd_p95_db"] = float(lsd_p95) 468 | if compute_si_sdr: 469 | out["si_sdr_db"] = float(_si_sdr(a, b)) 470 | return (out,) 471 | 472 | 473 | # ----------------------------- 474 | # Resample Audio (HQ) 475 | # ----------------------------- 476 | class Resample_Audio_HQ: 477 | CATEGORY = "Egregora/Utils" 478 | RETURN_TYPES = ("AUDIO",) 479 | RETURN_NAMES = ("audio_out",) 480 | FUNCTION = "execute" 481 | 482 | @classmethod 483 | def INPUT_TYPES(cls): 484 | modes = ["auto", "scipy_polyphase", "torchaudio", "linear"] 485 | return { 486 | "required": { 487 | "audio": ("AUDIO", {}), 488 | "target_sr": ("INT", {"default": 48000, "min": 4000, "max": 384000, "step": 1}), 489 | }, 490 | "optional": { 491 | "mode": (modes, {}), 492 | "kaiser_beta": ("FLOAT", {"default": 14.769, "min": 5.0, "max": 20.0, "step": 0.1}), 493 | }, 494 | } 495 | 496 | def execute(self, audio, target_sr=48000, mode="auto", kaiser_beta=14.769): 497 | a = to_internal_audio(audio) 498 | src_sr = int(a["sample_rate"]) 499 | if src_sr == int(target_sr): 500 | return (a,) 501 | x = a["samples"] # [C,N] 502 | C, N = x.shape 503 | if mode == "auto": 504 | mode = "scipy_polyphase" if _HAVE_SCIPY else ("torchaudio" if _HAVE_TA else "linear") 505 | if mode == "scipy_polyphase" and _HAVE_SCIPY: 506 | # rational ratio 507 | from math import gcd 508 | g = gcd(src_sr, int(target_sr)) 509 | up = int(target_sr) // g 510 | down = src_sr // g 511 | y = np.stack([sps.resample_poly(x[c], up, down) for c in range(C)], axis=0).astype(np.float32) 512 | elif mode == "torchaudio" 
and _HAVE_TA: 513 | wf = torch.from_numpy(x).unsqueeze(0) # [1,C,N] 514 | y = AF.resample(wf, src_sr, int(target_sr), lowpass_filter_width=64, rolloff=0.945, resampling_method="kaiser_window", beta=kaiser_beta) 515 | y = y.squeeze(0).detach().cpu().numpy().astype(np.float32) 516 | else: 517 | # fallback: linear interp 518 | new_N = int(round(N * (int(target_sr) / src_sr))) 519 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 520 | t_new = np.linspace(0.0, 1.0, new_N, endpoint=False) 521 | y = np.stack([np.interp(t_new, t_old, x[c]) for c in range(C)], axis=0).astype(np.float32) 522 | return (make_audio(int(target_sr), y, a.get("meta", {})),) 523 | 524 | 525 | # ----------------------------- 526 | # Registration 527 | # ----------------------------- 528 | NODE_CLASS_MAPPINGS = { 529 | "ABX Prepare": ABX_Prepare, 530 | "ABX Judge": ABX_Judge, 531 | "Loudness Meter (BS1770)": Loudness_Meter_1770, 532 | "Audio Gain Match (1770)": Audio_Gain_Match_1770, 533 | "Metrics (LSD + SI-SDR)": Metrics_LSD_SISDR, 534 | "Resample Audio (HQ)": Resample_Audio_HQ, 535 | } 536 | 537 | NODE_DISPLAY_NAME_MAPPINGS = { 538 | "ABX Prepare": "Egregora ABX Prepare", 539 | "ABX Judge": "Egregora ABX Judge", 540 | "Loudness Meter (BS1770)": "Egregora Loudness Meter (BS1770)", 541 | "Audio Gain Match (1770)": "Egregora Audio Gain Match (1770)", 542 | "Metrics (LSD + SI-SDR)": "Egregora Metrics (LSD + SI-SDR)", 543 | "Resample Audio (HQ)": "Egregora Resample Audio (HQ)", 544 | } 545 | -------------------------------------------------------------------------------- /egregora_null_test_suite.py: -------------------------------------------------------------------------------- 1 | """ 2 | Egregora · Null Test Suite for ComfyUI (v5) 3 | =========================================== 4 | 5 | This version fixes the UI toggles by using ComfyUI's BOOLEAN widget type 6 | instead of a non-existent BOOL type, and converts some strings to COMBOs. 7 | 8 | Added/changed since v4: 9 | - All on/off controls now use ("BOOLEAN", {"default": ...}) so they render as 10 | real checkboxes in the UI. 11 | - `align_method` is a COMBO (for now just ["gcc-phat"], extensible later). 12 | - `match_mode` is a COMBO: ["LUFS-I", "RMS"]. 13 | - Keeps the v4 compute/plot toggles to save FFT/LUFS work when unneeded. 
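
For reference, a contract-compliant AUDIO payload (see Contracts below) can be
built by hand like this — a minimal sketch, variable names illustrative only:

    import torch
    sr = 48000
    audio = {"waveform": torch.zeros(1, 2, sr), "sample_rate": sr}  # [B,C,T]: 1 batch, stereo, 1 s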
14 | 15 | Contracts (per Comfy docs): 16 | - IMAGE: torch.Tensor [B,H,W,3] in 0..1 17 | - AUDIO: dict with keys {"waveform": torch.Tensor [B,C,T], "sample_rate": int} 18 | """ 19 | from __future__ import annotations 20 | 21 | import io 22 | import math 23 | from typing import Any, Dict, Optional, Tuple 24 | 25 | import numpy as np 26 | import torch 27 | from PIL import Image 28 | 29 | # ----------------------------- 30 | # Array/Tensor helpers 31 | # ----------------------------- 32 | 33 | def _to_numpy(x: Any) -> np.ndarray: 34 | if isinstance(x, np.ndarray): 35 | return x 36 | if hasattr(x, "detach") and hasattr(x, "cpu"): 37 | return x.detach().cpu().numpy() 38 | return np.asarray(x) 39 | 40 | 41 | def _normalize_CN(arr: np.ndarray) -> np.ndarray: 42 | """Coerce arbitrary shapes to channels-first [C, N] float32.""" 43 | a = np.asarray(arr) 44 | a = np.squeeze(a) 45 | if a.ndim == 1: 46 | a = a[None, :] 47 | elif a.ndim == 2: 48 | if a.shape[0] > a.shape[1]: 49 | a = a.T 50 | else: 51 | t_axis = int(np.argmax(a.shape)) 52 | a = np.moveaxis(a, t_axis, -1) 53 | C = int(np.prod(a.shape[:-1])) 54 | N = a.shape[-1] 55 | a = a.reshape(C, N) 56 | return a.astype(np.float32) 57 | 58 | 59 | def _blank_image(h: int = 8, w: int = 8) -> torch.Tensor: 60 | return torch.zeros((1, h, w, 3), dtype=torch.float32) 61 | 62 | 63 | # ----------------------------- 64 | # Comfy interop: AUDIO / IMAGE 65 | # ----------------------------- 66 | 67 | def make_audio(sr: int, samples_CN: np.ndarray, meta: Optional[dict] = None) -> Dict[str, Any]: 68 | s = _normalize_CN(samples_CN) 69 | wf = torch.from_numpy(s).unsqueeze(0) # [1,C,N] 70 | return { 71 | "sr": int(sr), 72 | "sample_rate": int(sr), 73 | "samples": s, # convenience 74 | "waveform": wf, # Comfy contract 75 | "meta": dict(meta or {}), 76 | } 77 | 78 | 79 | def to_internal_audio(x: Any) -> Dict[str, Any]: 80 | """Accept a ComfyUI AUDIO or similar → {sr, samples[C,N], waveform[1,C,N]}""" 81 | if isinstance(x, dict) and "waveform" in x and ("sample_rate" in x or "sr" in x or "rate" in x): 82 | sr = int(x.get("sample_rate") or x.get("sr") or x.get("rate")) 83 | wf = _to_numpy(x["waveform"]) # [B,C,T] or [C,T] 84 | if wf.ndim == 3: 85 | wf = wf[0] 86 | s = _normalize_CN(wf) 87 | return make_audio(sr, s, x.get("meta", {})) 88 | if isinstance(x, dict) and ("sr" in x or "sample_rate" in x): 89 | sr = int(x.get("sr") or x.get("sample_rate")) 90 | buf = x.get("samples") or x.get("audio") or x.get("array") 91 | if buf is None: 92 | raise ValueError("Audio dict missing samples/waveform") 93 | return make_audio(sr, _to_numpy(buf), x.get("meta", {})) 94 | raise ValueError("Unsupported AUDIO object for this node") 95 | 96 | 97 | def image_from_figure(fig) -> torch.Tensor: 98 | """Matplotlib figure → IMAGE torch [1,H,W,3] in 0..1.""" 99 | import matplotlib 100 | matplotlib.use("Agg") 101 | import matplotlib.pyplot as plt # noqa: F401 102 | 103 | buf = io.BytesIO() 104 | fig.savefig(buf, format="png", bbox_inches="tight", dpi=110) 105 | try: 106 | fig.clf() 107 | except Exception: 108 | pass 109 | buf.seek(0) 110 | im = Image.open(buf).convert("RGB") 111 | arr = np.array(im).astype(np.float32) / 255.0 112 | return torch.from_numpy(arr).unsqueeze(0) # [1,H,W,3] 113 | 114 | 115 | # ----------------------------- 116 | # DSP helpers 117 | # ----------------------------- 118 | 119 | def _rms_db(x: np.ndarray) -> float: 120 | x = x.astype(np.float64) 121 | e = float(np.mean(x * x) + 1e-20) 122 | return 10.0 * math.log10(e) 123 | 124 | 125 | def _k_weight(sr: int, x_CN: 
np.ndarray) -> np.ndarray: 126 | # very small K-weight approx: 1st-order HPF @60 Hz + mild HF tilt 127 | x = x_CN 128 | fc = 60.0 / (sr * 0.5) 129 | k = math.exp(-2 * math.pi * fc) 130 | y = np.zeros_like(x, dtype=np.float32) 131 | for c in range(x.shape[0]): 132 | xn = x[c].astype(np.float32) 133 | yc = np.zeros_like(xn) 134 | z = 0.0 135 | for n in range(xn.shape[0]): 136 | z = (1 - k) * xn[n] + k * z 137 | yc[n] = xn[n] - z 138 | y[c] = yc 139 | y[:, 1:] += 0.02 * (y[:, 1:] - y[:, :-1]) 140 | return y 141 | 142 | 143 | def integrated_lufs(audio: Dict[str, Any]) -> float: 144 | sr = audio["sample_rate"] 145 | y = _k_weight(sr, audio["samples"]) # [C,N] 146 | mono = y.mean(axis=0) 147 | blk = max(1, int(round(0.400 * sr))) 148 | hop = max(1, int(round(0.100 * sr))) 149 | frames = 1 + max(0, (mono.shape[0] - blk) // hop) 150 | if frames <= 0: 151 | return _rms_db(mono) 152 | ms = [] 153 | for i in range(frames): 154 | s = i * hop 155 | e = s + blk 156 | seg = mono[s:e].astype(np.float64) 157 | ms.append(float(np.mean(seg * seg))) 158 | ms = np.asarray(ms) + 1e-20 159 | lufs_ungated = -0.691 + 10.0 * np.log10(np.mean(ms)) 160 | gate = lufs_ungated - 10.0 161 | mask = (-0.691 + 10.0 * np.log10(ms)) >= gate 162 | if np.any(mask): 163 | ms = ms[mask] 164 | return float(-0.691 + 10.0 * np.log10(np.mean(ms))) 165 | 166 | 167 | def _stft_mag(x: np.ndarray, n_fft: int = 2048, hop: int = 512) -> np.ndarray: 168 | mono = x if x.ndim == 1 else x.mean(axis=0) 169 | N = mono.shape[0] 170 | win = np.hanning(n_fft).astype(np.float32) 171 | frames = 1 + max(0, (N - n_fft) // hop) 172 | S = np.empty((n_fft // 2 + 1, frames), dtype=np.float32) 173 | for i in range(frames): 174 | s = i * hop 175 | frame = mono[s : s + n_fft] 176 | if frame.shape[0] < n_fft: 177 | frame = np.pad(frame, (0, n_fft - frame.shape[0])) 178 | X = np.fft.rfft(frame * win) 179 | S[:, i] = np.abs(X).astype(np.float32) 180 | return S 181 | 182 | 183 | def _lsd(A: np.ndarray, B: np.ndarray) -> Tuple[float, float]: 184 | eps = 1e-12 185 | LA = 20 * np.log10(A + eps) 186 | LB = 20 * np.log10(B + eps) 187 | D = (LA - LB) ** 2 188 | per = np.sqrt(np.mean(D, axis=0) + 1e-12) 189 | return float(np.mean(per)), float(np.percentile(per, 95)) 190 | 191 | 192 | def _band_energy_hi_db(x_CN: np.ndarray, sr: int, lo_hz: float) -> float: 193 | mono = x_CN.mean(axis=0) 194 | X = np.fft.rfft(mono) 195 | freqs = np.fft.rfftfreq(mono.shape[0], d=1.0 / sr) 196 | mask = freqs >= lo_hz 197 | e_hi = float(np.sum(np.abs(X[mask]) ** 2)) 198 | e_all = float(np.sum(np.abs(X) ** 2) + 1e-20) 199 | return 10.0 * math.log10(e_hi / e_all + 1e-20) 200 | 201 | 202 | def _pad_or_crop_CN(x: np.ndarray, N: int) -> np.ndarray: 203 | C, M = x.shape 204 | if M == N: 205 | return x 206 | if M > N: 207 | return x[:, :N] 208 | y = np.zeros((C, N), dtype=x.dtype) 209 | y[:, :M] = x 210 | return y 211 | 212 | 213 | def _xcorr_delay(a: np.ndarray, b: np.ndarray, sr: int, max_shift_smp: int) -> float: 214 | # GCC-PHAT-ish coarse delay + parabolic refine. 
Returns samples (b lags a > 0) 215 | n = 1 216 | total = a.size + b.size 217 | while n < total: 218 | n <<= 1 219 | A = np.fft.rfft(a, n=n) 220 | B = np.fft.rfft(b, n=n) 221 | R = B * np.conj(A) 222 | R /= (np.abs(R) + 1e-12) 223 | cc = np.fft.irfft(R, n=n) 224 | cc = np.concatenate((cc[-(n // 2 - 1) :], cc[: n // 2 + 1])) 225 | center = len(cc) // 2 226 | sl = center - max_shift_smp 227 | sh = center + max_shift_smp + 1 228 | w = cc[sl:sh] 229 | k = int(np.argmax(w)) 230 | idx = sl + k 231 | if 1 <= idx < len(cc) - 1: 232 | y0, y1, y2 = cc[idx - 1], cc[idx], cc[idx + 1] 233 | denom = 2 * (y0 - 2 * y1 + y2) 234 | frac = 0.0 if abs(denom) < 1e-12 else (y0 - y2) / denom 235 | else: 236 | frac = 0.0 237 | return float((idx - center) + frac) 238 | 239 | 240 | def _apply_frac_delay_CN(x: np.ndarray, delay_samples: float, taps: int = 64) -> np.ndarray: 241 | if abs(delay_samples) < 1e-6: 242 | return x.copy() 243 | C, N = x.shape 244 | int_d = int(math.floor(abs(delay_samples))) 245 | frac = abs(delay_samples) - int_d 246 | sign = 1 if delay_samples >= 0 else -1 247 | y = np.zeros((C, N), dtype=np.float32) 248 | if sign > 0: 249 | if int_d < N: 250 | y[:, int_d:] = x[:, : N - int_d] 251 | else: 252 | if int_d < N: 253 | y[:, : N - int_d] = x[:, int_d:] 254 | if frac > 1e-6: 255 | M = max(16, int(taps)) 256 | n = np.arange(M) 257 | m = (M - 1) / 2.0 258 | h = np.sinc(n - m - frac) 259 | w = np.hanning(M) 260 | h = (h * w).astype(np.float32) 261 | h /= np.sum(h) 262 | for c in range(C): 263 | yc = np.convolve(y[c], h, mode="same") 264 | y[c] = yc.astype(np.float32) 265 | return y 266 | 267 | 268 | # ----------------------------- 269 | # Node 1: Audio Align (XCorr) 270 | # ----------------------------- 271 | class Audio_Align_XCorr: 272 | CATEGORY = "Egregora/Analysis" 273 | RETURN_TYPES = ("AUDIO", "FLOAT", "FLOAT", "FLOAT", "IMAGE") 274 | RETURN_NAMES = ("audio_proc_aligned", "delay_samples", "delay_ms", "peak_corr", "debug_image") 275 | FUNCTION = "execute" 276 | 277 | @classmethod 278 | def INPUT_TYPES(cls): 279 | return { 280 | "required": { 281 | "audio_ref": ("AUDIO", {}), 282 | "audio_proc": ("AUDIO", {}), 283 | }, 284 | "optional": { 285 | "max_shift_ms": ("INT", {"default": 200, "min": 0, "max": 5000, "step": 1}), 286 | # COMBO: list[str] => dropdown 287 | "align_method": (["gcc-phat"], {}), 288 | "fractional": ("BOOLEAN", {"default": True}), 289 | "fir_len": ("INT", {"default": 64, "min": 16, "max": 256, "step": 1}), 290 | }, 291 | } 292 | 293 | def execute(self, audio_ref, audio_proc, max_shift_ms=200, align_method="gcc-phat", fractional=True, fir_len=64): 294 | ref = to_internal_audio(audio_ref) 295 | proc = to_internal_audio(audio_proc) 296 | # resample proc to ref.sr if needed (linear interp is fine for alignment) 297 | if proc["sample_rate"] != ref["sample_rate"]: 298 | sr_old = proc["sample_rate"] 299 | x = proc["samples"] 300 | C, N = x.shape 301 | new_N = int(round(N * ref["sample_rate"] / sr_old)) 302 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 303 | t_new = np.linspace(0.0, 1.0, new_N, endpoint=False) 304 | y = np.stack([np.interp(t_new, t_old, x[c]) for c in range(C)], axis=0).astype(np.float32) 305 | proc = make_audio(ref["sample_rate"], y, proc.get("meta", {})) 306 | 307 | a = ref["samples"].mean(axis=0) 308 | b = proc["samples"].mean(axis=0) 309 | n = min(a.size, b.size) 310 | a = a[:n] 311 | b = b[:n] 312 | 313 | max_shift = int(ref["sample_rate"] * (max_shift_ms / 1000.0)) 314 | lag = _xcorr_delay(a, b, ref["sample_rate"], max_shift) # +ve => proc lags 315 | 
delay_samples = float(lag) 316 | delay_ms = 1000.0 * delay_samples / ref["sample_rate"] 317 | 318 | aligned = _apply_frac_delay_CN(proc["samples"], -delay_samples if fractional else -round(delay_samples), taps=fir_len) 319 | aligned = _pad_or_crop_CN(aligned, ref["samples"].shape[1]) 320 | out = make_audio(ref["sample_rate"], aligned, proc.get("meta", {})) 321 | 322 | # minimal debug plot 323 | try: 324 | import matplotlib 325 | matplotlib.use("Agg") 326 | import matplotlib.pyplot as plt 327 | t = np.arange(n) 328 | fig, ax = plt.subplots(1, 1, figsize=(6, 2.2)) 329 | ax.plot(t, a, linewidth=0.5, label="A") 330 | ax.plot(t, b, linewidth=0.5, label="B") 331 | ax.legend(); ax.grid(alpha=.2); ax.set_title("Align preview") 332 | debug_img = image_from_figure(fig) 333 | except Exception: 334 | debug_img = _blank_image() 335 | 336 | return (out, float(delay_samples), float(delay_ms), 0.0, debug_img) 337 | 338 | 339 | # ----------------------------- 340 | # Node 2: Audio Gain Match 341 | # ----------------------------- 342 | class Audio_Gain_Match: 343 | CATEGORY = "Egregora/Analysis" 344 | RETURN_TYPES = ("AUDIO", "FLOAT", "FLOAT", "FLOAT") 345 | RETURN_NAMES = ("audio_matched", "gain_db", "ref_level", "in_level") 346 | FUNCTION = "execute" 347 | 348 | @classmethod 349 | def INPUT_TYPES(cls): 350 | return { 351 | "required": { 352 | "audio_ref": ("AUDIO", {}), 353 | "audio_in": ("AUDIO", {}), 354 | }, 355 | "optional": { 356 | # COMBO for mode 357 | "mode": (["LUFS-I", "RMS"], {}), 358 | "max_gain_db": ("FLOAT", {"default": 12.0, "min": -48.0, "max": 48.0, "step": 0.1}), 359 | }, 360 | } 361 | 362 | def execute(self, audio_ref, audio_in, mode="LUFS-I", max_gain_db=12.0): 363 | ref = to_internal_audio(audio_ref) 364 | inn = to_internal_audio(audio_in) 365 | if inn["sample_rate"] != ref["sample_rate"]: 366 | sr_old = inn["sample_rate"] 367 | x = inn["samples"] 368 | C, N = x.shape 369 | new_N = int(round(N * ref["sample_rate"] / sr_old)) 370 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 371 | t_new = np.linspace(0.0, 1.0, new_N, endpoint=False) 372 | y = np.stack([np.interp(t_new, t_old, x[c]) for c in range(C)], axis=0).astype(np.float32) 373 | inn = make_audio(ref["sample_rate"], y, inn.get("meta", {})) 374 | 375 | if str(mode).upper().startswith("LUFS"): 376 | ref_level = integrated_lufs(ref) 377 | in_level = integrated_lufs(inn) 378 | else: 379 | ref_level = _rms_db(ref["samples"].mean(axis=0)) 380 | in_level = _rms_db(inn["samples"].mean(axis=0)) 381 | gain_db = float(np.clip(ref_level - in_level, -abs(max_gain_db), abs(max_gain_db))) 382 | gain = 10 ** (gain_db / 20.0) 383 | y = (inn["samples"] * gain).astype(np.float32) 384 | out = make_audio(inn["sample_rate"], y, inn.get("meta", {})) 385 | return (out, float(gain_db), float(ref_level), float(in_level)) 386 | 387 | 388 | # ----------------------------- 389 | # Node 3: Audio Null Test (with metric toggles) 390 | # ----------------------------- 391 | class Audio_Null_Test: 392 | CATEGORY = "Egregora/Analysis" 393 | RETURN_TYPES = ("AUDIO", "DICT") 394 | RETURN_NAMES = ("audio_null", "metrics") 395 | FUNCTION = "execute" 396 | 397 | @classmethod 398 | def INPUT_TYPES(cls): 399 | return { 400 | "required": { 401 | "audio_ref": ("AUDIO", {}), 402 | "audio_proc_aligned_matched": ("AUDIO", {}), 403 | }, 404 | "optional": { 405 | "invert_b": ("BOOLEAN", {"default": True}), 406 | "least_squares_scale": ("BOOLEAN", {"default": False}), 407 | # Metric toggles 408 | "compute_corr": ("BOOLEAN", {"default": True}), 409 | "compute_null_rms": 
("BOOLEAN", {"default": True}), 410 | "compute_null_lufs": ("BOOLEAN", {"default": True}), 411 | "compute_lsd": ("BOOLEAN", {"default": True}), 412 | "compute_hf_residual": ("BOOLEAN", {"default": False}), 413 | # STFT controls (used only if LSD requested) 414 | "n_fft": ("INT", {"default": 2048, "min": 512, "max": 8192, "step": 128}), 415 | "hop": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 64}), 416 | "hf_band_hz": ("INT", {"default": 8000, "min": 1000, "max": 20000, "step": 100}), 417 | }, 418 | } 419 | 420 | def execute(self, audio_ref, audio_proc_aligned_matched, invert_b=True, least_squares_scale=False, 421 | compute_corr=True, compute_null_rms=True, compute_null_lufs=True, 422 | compute_lsd=True, compute_hf_residual=False, n_fft=2048, hop=512, hf_band_hz=8000): 423 | ref = to_internal_audio(audio_ref) 424 | pro = to_internal_audio(audio_proc_aligned_matched) 425 | if pro["sample_rate"] != ref["sample_rate"]: 426 | raise ValueError("Sample rate mismatch after alignment stage") 427 | A = ref["samples"] 428 | B = pro["samples"] 429 | N = min(A.shape[1], B.shape[1]) 430 | A = A[:, :N] 431 | B = B[:, :N] 432 | k = 1.0 433 | if least_squares_scale: 434 | a = A.mean(axis=0).astype(np.float64) 435 | b = B.mean(axis=0).astype(np.float64) 436 | denom = float(np.dot(b, b) + 1e-20) 437 | k = float(np.dot(a, b) / denom) 438 | B = (B * k).astype(np.float32) 439 | if invert_b: 440 | B = -B 441 | null = (A + B).astype(np.float32) 442 | 443 | metrics: Dict[str, Any] = {} 444 | a_m = A.mean(axis=0) 445 | b_m = (-B).mean(axis=0) 446 | 447 | if compute_corr: 448 | am = a_m - np.mean(a_m) 449 | bm = b_m - np.mean(b_m) 450 | corr = float(np.dot(am, bm) / (np.linalg.norm(am) * np.linalg.norm(bm) + 1e-20)) 451 | metrics["corr_coef"] = corr 452 | if compute_null_rms: 453 | metrics["null_rms_dbfs"] = float(_rms_db(null.mean(axis=0))) 454 | if compute_null_lufs: 455 | metrics["null_lufs"] = float(integrated_lufs(make_audio(ref["sample_rate"], null))) 456 | if compute_lsd: 457 | SA = _stft_mag(a_m, n_fft=n_fft, hop=hop) 458 | SB = _stft_mag(b_m, n_fft=n_fft, hop=hop) 459 | lsd_mean, lsd_p95 = _lsd(SA, SB) 460 | metrics["lsd_mean_db"] = float(lsd_mean) 461 | metrics["lsd_p95_db"] = float(lsd_p95) 462 | if compute_hf_residual: 463 | metrics["hf_residual_db"] = float(_band_energy_hi_db(null, ref["sample_rate"], hf_band_hz)) 464 | # Always include safety stats 465 | overs = int(np.sum(np.abs(null) > 1.0)) 466 | metrics["overshoot_count"] = int(overs) 467 | metrics["clipped_pct"] = float(100.0 * overs / null.size) 468 | metrics["scale_k"] = float(k) 469 | 470 | return make_audio(ref["sample_rate"], null, {}), metrics 471 | 472 | 473 | # ----------------------------- 474 | # Node 4: Audio Plotter (with draw toggles) 475 | # ----------------------------- 476 | class Audio_Plotter: 477 | CATEGORY = "Egregora/Visualization" 478 | RETURN_TYPES = ("IMAGE", "IMAGE", "IMAGE") 479 | RETURN_NAMES = ("image_waveforms", "image_spectrograms", "image_diffspec") 480 | FUNCTION = "execute" 481 | 482 | @classmethod 483 | def INPUT_TYPES(cls): 484 | return { 485 | "required": { 486 | "audio_ref": ("AUDIO", {}), 487 | "audio_proc": ("AUDIO", {}), 488 | "audio_null": ("AUDIO", {}), 489 | }, 490 | "optional": { 491 | "draw_waveforms": ("BOOLEAN", {"default": True}), 492 | "draw_spectrograms": ("BOOLEAN", {"default": True}), 493 | "draw_diffspec": ("BOOLEAN", {"default": True}), 494 | "n_fft": ("INT", {"default": 2048, "min": 512, "max": 8192, "step": 128}), 495 | "hop": ("INT", {"default": 512, "min": 64, "max": 4096, 
"step": 64}), 496 | }, 497 | } 498 | 499 | def execute(self, audio_ref, audio_proc, audio_null, draw_waveforms=True, draw_spectrograms=True, draw_diffspec=True, n_fft=2048, hop=512): 500 | import matplotlib 501 | matplotlib.use("Agg") 502 | import matplotlib.pyplot as plt 503 | 504 | ref = to_internal_audio(audio_ref) 505 | pro = to_internal_audio(audio_proc) 506 | nul = to_internal_audio(audio_null) 507 | 508 | a = ref["samples"].mean(axis=0) 509 | b = pro["samples"].mean(axis=0) 510 | n = min(a.size, b.size, nul["samples"].shape[1]) 511 | a = a[:n] 512 | b = b[:n] 513 | null = nul["samples"].mean(axis=0)[:n] 514 | 515 | # Waveforms 516 | if draw_waveforms: 517 | t = np.arange(n) 518 | fig1, axes = plt.subplots(3, 1, figsize=(10, 6), sharex=True) 519 | for ax, y, ttl in zip(axes, [a, b, null], ["A: original", "B: processed", "Null: A−B"]): 520 | ax.plot(t, y, linewidth=0.7) 521 | ax.set_ylim(-1.05, 1.05) 522 | ax.set_title(ttl) 523 | ax.grid(alpha=0.25) 524 | axes[-1].set_xlabel("samples") 525 | fig1.tight_layout() 526 | img_wave = image_from_figure(fig1) 527 | else: 528 | img_wave = _blank_image(1, 1) 529 | 530 | # Spectrograms (A, B, Null) 531 | if draw_spectrograms: 532 | def _spec(y): 533 | S = _stft_mag(y, n_fft=n_fft, hop=hop) 534 | return 20 * np.log10(S + 1e-9) 535 | SA = _spec(a) 536 | SB = _spec(b) 537 | SN = _spec(null) 538 | fig2, axes2 = plt.subplots(3, 1, figsize=(10, 7)) 539 | for ax, S, ttl in zip(axes2, [SA, SB, SN], ["A: spec", "B: spec", "Null: spec"]): 540 | ax.imshow(S, origin="lower", aspect="auto") 541 | ax.set_title(ttl) 542 | fig2.tight_layout() 543 | img_spec = image_from_figure(fig2) 544 | else: 545 | img_spec = _blank_image(1, 1) 546 | 547 | # Diff-spec |A-B| 548 | if draw_diffspec: 549 | def _spec(y): 550 | S = _stft_mag(y, n_fft=n_fft, hop=hop) 551 | return 20 * np.log10(S + 1e-9) 552 | SA = _spec(a) 553 | SB = _spec(b) 554 | D = np.abs(10 ** (SA / 20.0) - 10 ** (SB / 20.0)) 555 | fig3 = plt.figure(figsize=(10, 3)) 556 | import matplotlib.pyplot as plt # noqa 557 | plt.imshow(20 * np.log10(D + 1e-9), origin="lower", aspect="auto") 558 | plt.title("|Spec(A) − Spec(B)| (dB)") 559 | plt.tight_layout() 560 | img_diff = image_from_figure(fig3) 561 | else: 562 | img_diff = _blank_image(1, 1) 563 | 564 | return (img_wave, img_spec, img_diff) 565 | 566 | 567 | # ----------------------------- 568 | # Node 5: Null Test (Full) – with toggles exposed 569 | # ----------------------------- 570 | class Null_Test_Full: 571 | CATEGORY = "Egregora/Analysis" 572 | RETURN_TYPES = ("AUDIO", "AUDIO", "FLOAT", "FLOAT", "DICT", "IMAGE", "IMAGE", "IMAGE") 573 | RETURN_NAMES = ( 574 | "audio_proc_aligned_matched", 575 | "audio_null", 576 | "delay_ms", 577 | "gain_db", 578 | "metrics", 579 | "image_waveforms", 580 | "image_spectrograms", 581 | "image_diffspec", 582 | ) 583 | FUNCTION = "execute" 584 | 585 | @classmethod 586 | def INPUT_TYPES(cls): 587 | return { 588 | "required": { 589 | "audio_ref": ("AUDIO", {}), 590 | "audio_proc": ("AUDIO", {}), 591 | }, 592 | "optional": { 593 | # Align/Gain params 594 | "align_max_shift_ms": ("INT", {"default": 200, "min": 0, "max": 5000, "step": 1}), 595 | "align_method": (["gcc-phat"], {}), 596 | "fractional": ("BOOLEAN", {"default": True}), 597 | "fir_len": ("INT", {"default": 64, "min": 16, "max": 256, "step": 1}), 598 | "match_mode": (["LUFS-I", "RMS"], {}), 599 | "least_squares_scale": ("BOOLEAN", {"default": False}), 600 | # Metric toggles 601 | "compute_corr": ("BOOLEAN", {"default": True}), 602 | "compute_null_rms": ("BOOLEAN", 
{"default": True}), 603 | "compute_null_lufs": ("BOOLEAN", {"default": True}), 604 | "compute_lsd": ("BOOLEAN", {"default": True}), 605 | "compute_hf_residual": ("BOOLEAN", {"default": False}), 606 | # Plot toggles 607 | "draw_waveforms": ("BOOLEAN", {"default": True}), 608 | "draw_spectrograms": ("BOOLEAN", {"default": True}), 609 | "draw_diffspec": ("BOOLEAN", {"default": True}), 610 | # STFT controls 611 | "n_fft": ("INT", {"default": 2048, "min": 512, "max": 8192, "step": 128}), 612 | "hop": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 64}), 613 | }, 614 | } 615 | 616 | def execute(self, audio_ref, audio_proc, align_max_shift_ms=200, align_method="gcc-phat", fractional=True, 617 | fir_len=64, match_mode="LUFS-I", least_squares_scale=False, 618 | compute_corr=True, compute_null_rms=True, compute_null_lufs=True, 619 | compute_lsd=True, compute_hf_residual=False, 620 | draw_waveforms=True, draw_spectrograms=True, draw_diffspec=True, 621 | n_fft=2048, hop=512): 622 | # 1) Align 623 | align = Audio_Align_XCorr() 624 | ap_aligned, delay_samples, delay_ms, _pc, _dbg = align.execute( 625 | audio_ref, audio_proc, 626 | max_shift_ms=align_max_shift_ms, 627 | align_method=align_method, 628 | fractional=fractional, 629 | fir_len=fir_len, 630 | ) 631 | # 2) Gain-match 632 | gm = Audio_Gain_Match() 633 | ap_matched, gain_db, _ref_lvl, _in_lvl = gm.execute(audio_ref, ap_aligned, mode=match_mode) 634 | # 3) Null (+ metrics) 635 | nt = Audio_Null_Test() 636 | audio_null, metrics = nt.execute( 637 | audio_ref, ap_matched, 638 | invert_b=True, 639 | least_squares_scale=least_squares_scale, 640 | compute_corr=compute_corr, 641 | compute_null_rms=compute_null_rms, 642 | compute_null_lufs=compute_null_lufs, 643 | compute_lsd=compute_lsd, 644 | compute_hf_residual=compute_hf_residual, 645 | n_fft=n_fft, hop=hop, 646 | ) 647 | # 4) Plots (respect draw toggles) 648 | pl = Audio_Plotter() 649 | img_waves, img_spec, img_diff = pl.execute( 650 | audio_ref, ap_matched, audio_null, 651 | draw_waveforms=draw_waveforms, 652 | draw_spectrograms=draw_spectrograms, 653 | draw_diffspec=draw_diffspec, 654 | n_fft=n_fft, hop=hop, 655 | ) 656 | 657 | return ap_matched, audio_null, float(delay_ms), float(gain_db), metrics, img_waves, img_spec, img_diff 658 | 659 | 660 | # ----------------------------- 661 | # Registration (original names maintained for retro-compatibility) 662 | # ----------------------------- 663 | NODE_CLASS_MAPPINGS = { 664 | "Audio Align (XCorr)": Audio_Align_XCorr, 665 | "Audio Gain Match": Audio_Gain_Match, 666 | "Audio Null Test": Audio_Null_Test, 667 | "Audio Plotter": Audio_Plotter, 668 | "Null Test (Full)": Null_Test_Full, 669 | } 670 | 671 | NODE_DISPLAY_NAME_MAPPINGS = { 672 | "Audio Align (XCorr)": "Audio Align (XCorr)", 673 | "Audio Gain Match": "Audio Gain Match", 674 | "Audio Null Test": "Audio Null Test", 675 | "Audio Plotter": "Audio Plotter", 676 | "Null Test (Full)": "Null Test (Full)", 677 | } 678 | -------------------------------------------------------------------------------- /egregora_audio_enhance_extras.py: -------------------------------------------------------------------------------- 1 | # Egregora Enhance Extras - Fixed Version 2 | # Adds: RNNoise Denoise, WPE Dereverb, DeepFilterNet Denoise, DAC Encode/Decode, ViSQOL Meter 3 | # Licenses: 4 | # - RNNoise wrappers (pyrnnoise): Apache-2.0 5 | # - NARA-WPE: MIT 6 | # - DeepFilterNet: MIT/Apache-2.0 (dual) 7 | # - DAC: MIT 8 | # - ViSQOL (binary) + Audiocraft wrapper docs: Apache-2.0 (wrapper), ViSQOL itself under 
Apache-2.0 9 | 10 | import os 11 | import io 12 | import json 13 | import math 14 | import subprocess 15 | from pathlib import Path 16 | from typing import Dict, Tuple, Optional 17 | 18 | import torch 19 | import torchaudio 20 | import numpy as np 21 | 22 | # ---------------------------- 23 | # Small audio helpers (Comfy-style) 24 | # ---------------------------- 25 | 26 | def _is_audio(x): 27 | return isinstance(x, dict) and "waveform" in x and "sample_rate" in x 28 | 29 | def _coerce_audio(x): 30 | # Returns (wave[B,C,T], sr:int, meta:dict) 31 | if _is_audio(x): 32 | wav = x["waveform"] 33 | sr = int(x["sample_rate"]) 34 | meta = x.get("meta", {}) 35 | if wav.dim() == 2: 36 | # [C,T] -> [1,C,T] 37 | wav = wav.unsqueeze(0) 38 | elif wav.dim() == 1: 39 | # [T] -> [1,1,T] 40 | wav = wav.unsqueeze(0).unsqueeze(0) 41 | elif wav.dim() != 3: 42 | raise ValueError("Audio waveform must be 1D, 2D or 3D [B,C,T].") 43 | return wav.float(), sr, meta 44 | # Torch tensor passthrough (assume [C,T] or [B,C,T] with default sr=48000) 45 | if isinstance(x, torch.Tensor): 46 | wav = x 47 | if wav.dim() == 2: # [C,T] -> [1,C,T] 48 | wav = wav.unsqueeze(0) 49 | elif wav.dim() != 3: 50 | raise ValueError("Tensor audio must be [C,T] or [B,C,T].") 51 | return wav.float(), 48000, {} 52 | raise TypeError("Unsupported audio input type.") 53 | 54 | def _make_audio(sr: int, wav: torch.Tensor, meta: Optional[dict] = None): 55 | # Ensure [B,C,T] 56 | if wav.dim() == 2: 57 | wav = wav.unsqueeze(0) 58 | if wav.dim() != 3: 59 | raise ValueError("samples must be 1D/2D/3D; got shape %r" % (wav.shape,)) 60 | return { 61 | "waveform": wav.contiguous(), 62 | "sample_rate": int(sr), 63 | "meta": meta or {}, 64 | } 65 | 66 | def _resample(wav: torch.Tensor, sr_in: int, sr_out: int): 67 | if sr_in == sr_out: 68 | return wav, sr_in 69 | B, C, T = wav.shape 70 | res = [] 71 | for b in range(B): 72 | # torchaudio expects [C,T] 73 | res.append(torchaudio.functional.resample(wav[b], sr_in, sr_out)) 74 | wav_out = torch.stack(res, dim=0) 75 | return wav_out, sr_out 76 | 77 | def _to_mono(wav: torch.Tensor): 78 | # [B,C,T] -> [B,1,T] 79 | if wav.size(1) == 1: 80 | return wav 81 | return wav.mean(dim=1, keepdim=True) 82 | 83 | def _device_for(wav: torch.Tensor): 84 | return "cuda" if wav.is_cuda else ("cuda" if torch.cuda.is_available() else "cpu") 85 | 86 | # ---------------------------- 87 | # RNNoise (pyrnnoise) 88 | # ---------------------------- 89 | 90 | class Egregora_RNNoise_Denoise: 91 | """ 92 | RNNoise denoiser (speech-focused), ComfyUI node. 93 | • Runs at 48 kHz (10 ms = 480 samples). 94 | • Mono/stereo: per-channel or downmix to mono. 95 | • Uses pyrnnoise>=0.3.x 'denoise_chunk' API. 96 | • Adds static strength + adaptive mix (driven by per-frame VAD) + post-gain with ceiling. 
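    • Call sketch (parameter values illustrative; `audio` is any ComfyUI AUDIO
      dict, and the node returns a one-tuple):
          (denoised,) = Egregora_RNNoise_Denoise().execute(
              audio, strength=0.8, adaptive_mode="more_on_noise")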
97 |     """
98 |     @classmethod
99 |     def INPUT_TYPES(cls):
100 |         return {
101 |             "required": {
102 |                 "audio": ("AUDIO",),
103 |                 "frame_ms": ("INT", {"default": 20, "min": 5, "max": 60, "step": 5}),
104 |                 "stereo_mode": (["per_channel", "downmix_mono"], {"default": "per_channel"}),
105 | 
106 |                 # mix controls
107 |                 "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
108 |                 "mix_curve": (["equal_power", "linear"], {"default": "equal_power"}),
109 | 
110 |                 # adaptive controls
111 |                 "adaptive_mode": (["off", "more_on_noise", "more_on_speech", "gate_on_noise"], {"default": "more_on_noise"}),
112 |                 "adaptive_amount": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01}),
113 |                 "vad_threshold": ("FLOAT", {"default": 0.90, "min": 0.0, "max": 1.0, "step": 0.01}),
114 |                 "vad_smooth_ms": ("INT", {"default": 50, "min": 0, "max": 500, "step": 5}),
115 | 
116 |                 # post gain
117 |                 "post_gain_db": ("FLOAT", {"default": 0.0, "min": -24.0, "max": 24.0, "step": 0.1}),
118 |                 "limit_ceiling": ("BOOLEAN", {"default": True}),  # BOOLEAN renders a real checkbox; "BOOL" is not a ComfyUI widget type
119 |                 "ceiling": ("FLOAT", {"default": 0.999, "min": 0.1, "max": 1.0, "step": 0.001}),
120 |             }
121 |         }
122 | 
123 |     RETURN_TYPES = ("AUDIO",)
124 |     FUNCTION = "execute"
125 |     CATEGORY = "Egregora/Enhance"
126 | 
127 |     # ---------- helpers ----------
128 |     def _silence_destructor(self, rn):
129 |         try:
130 |             type(rn).__del__ = lambda self: None
131 |         except Exception:
132 |             pass
133 | 
134 |     def _init_rn(self, channels: int):
135 |         from pyrnnoise import RNNoise
136 |         rn = RNNoise(sample_rate=48000)
137 |         try:
138 |             if getattr(rn, "channels", None) in (None, 0):
139 |                 setattr(rn, "channels", channels)
140 |         except Exception:
141 |             pass
142 |         return rn
143 | 
144 |     def _denoise_chunk_with_probs(self, rn, x_i16):
145 |         """
146 |         Preferred path on pyrnnoise>=0.3.x: returns (wet_i16, vad_probs_per_frame)
147 |         where each frame is 480 samples at 48 kHz.
148 |         """
149 |         import numpy as np
150 |         pad = (-len(x_i16)) % 480
151 |         x_pad = np.pad(x_i16, (0, pad), mode="constant") if pad else x_i16
152 | 
153 |         out_frames, probs = [], []
154 |         x2 = x_pad[np.newaxis, :]  # (1, N)
155 |         for p, den in rn.denoise_chunk(x2):
156 |             # p may be scalar or array-like; we're per-channel, so take float(p)
157 |             try:
158 |                 p_val = float(p[0]) if hasattr(p, "__len__") else float(p)
159 |             except Exception:
160 |                 p_val = float(p)
161 |             probs.append(p_val)
162 | 
163 |             den = np.asarray(den, dtype=np.int16)
164 |             if den.ndim == 2 and den.shape[0] == 1:
165 |                 den = den[0]
166 |             out_frames.append(den)
167 | 
168 |         wet = np.concatenate(out_frames, axis=0)
169 |         return wet[:len(x_i16)], np.asarray(probs, dtype=np.float32)
170 | 
171 |     def _fallback_frame_loop(self, rn, x_i16, frame_len):
172 |         """
173 |         Very old wheels only: try process_frame/filter; else passthrough.
174 |         (No VAD probs here, so adaptive becomes effectively 'off' on fallback.)
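        Framing arithmetic, for reference: RNNoise frames are 10 ms = 480 samples
        at 48 kHz, so frame_ms=20 gives frame_len = int(48000 * 20 / 1000) = 960,
        which the loop below keeps as-is since it is already a multiple of 480.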
175 | """ 176 | import numpy as np 177 | call = None 178 | if hasattr(rn, "process_frame"): 179 | call = lambda fr: np.asarray(rn.process_frame(fr), dtype=np.int16) 180 | elif hasattr(rn, "filter"): 181 | call = lambda fr: np.asarray(rn.filter(fr), dtype=np.int16) 182 | if call is None: 183 | return x_i16, None 184 | 185 | frame_len = max(1, frame_len // 480) * 480 186 | pad = (-len(x_i16)) % frame_len 187 | x_work = np.pad(x_i16, (0, pad), mode="constant") if pad else x_i16 188 | 189 | outs = [] 190 | for start in range(0, len(x_work), frame_len): 191 | chunk = x_work[start:start + frame_len] 192 | pos, sub = 0, [] 193 | while pos < frame_len: 194 | fr = chunk[pos:pos + 480] 195 | if fr.shape[0] < 480: 196 | fr = np.pad(fr, (0, 480 - fr.shape[0]), mode="constant") 197 | try: 198 | y = call(fr) 199 | except Exception: 200 | y = fr 201 | sub.append(y) 202 | pos += 480 203 | outs.append(np.concatenate(sub, axis=0)) 204 | out = np.concatenate(outs, axis=0) 205 | return out[:len(x_i16)], None 206 | 207 | def _smooth_vad_probs(self, probs, smooth_ms: int): 208 | import numpy as np, math 209 | if probs is None or probs.size == 0 or smooth_ms <= 0: 210 | return probs 211 | hop_ms = 10.0 # RNNoise frame = 10 ms @ 48 kHz 212 | tau = max(1e-3, float(smooth_ms)) 213 | alpha = math.exp(-hop_ms / tau) 214 | y = np.empty_like(probs) 215 | acc = probs[0] 216 | for i, p in enumerate(probs): 217 | acc = alpha * acc + (1.0 - alpha) * p 218 | y[i] = acc 219 | return y 220 | 221 | def _strength_per_frame(self, base_s, vad_smooth, adaptive_mode, adaptive_amount, vad_threshold): 222 | import numpy as np 223 | if vad_smooth is None: 224 | return np.array([base_s], dtype=np.float32) # will be broadcast 225 | s0 = float(base_s) 226 | a = float(adaptive_amount) 227 | v = np.clip(vad_smooth, 0.0, 1.0) 228 | if adaptive_mode == "off": 229 | s_eff = np.full_like(v, s0, dtype=np.float32) 230 | elif adaptive_mode == "more_on_noise": 231 | # more denoise when speech-prob low 232 | s_eff = s0 + a * (1.0 - v) * (1.0 - s0) 233 | elif adaptive_mode == "more_on_speech": 234 | # more denoise when speech-prob high 235 | s_eff = s0 + a * v * (1.0 - s0) 236 | elif adaptive_mode == "gate_on_noise": 237 | # if below threshold => denoise-heavy; else denoise-light 238 | s_noise = s0 + a * (1.0 - s0) # push toward 1 239 | s_speech = s0 * (1.0 - a) # pull toward 0 240 | s_eff = np.where(v < vad_threshold, s_noise, s_speech).astype(np.float32) 241 | else: 242 | s_eff = np.full_like(v, s0, dtype=np.float32) 243 | return np.clip(s_eff.astype(np.float32), 0.0, 1.0) 244 | 245 | def _gains_from_strength(self, s_eff, curve): 246 | import numpy as np, math 247 | s = np.clip(s_eff, 0.0, 1.0).astype(np.float32) 248 | if curve == "equal_power": 249 | # equal-power crossfade: keep power ~constant 250 | g_wet = np.sin(0.5 * math.pi * s, dtype=np.float32) 251 | g_dry = np.cos(0.5 * math.pi * s, dtype=np.float32) 252 | else: 253 | # linear 254 | g_wet = s 255 | g_dry = 1.0 - s 256 | return g_dry.astype(np.float32), g_wet.astype(np.float32) 257 | 258 | # ---------- main ---------- 259 | def execute( 260 | self, 261 | audio, 262 | frame_ms=20, 263 | stereo_mode="per_channel", 264 | strength=1.0, 265 | mix_curve="equal_power", 266 | adaptive_mode="more_on_noise", 267 | adaptive_amount=0.5, 268 | vad_threshold=0.90, 269 | vad_smooth_ms=50, 270 | post_gain_db=0.0, 271 | limit_ceiling=True, 272 | ceiling=0.999, 273 | ): 274 | import numpy as np 275 | import torch 276 | import math 277 | 278 | # Coerce to [B,C,T], resample to 48k (RNNoise domain) 279 | wav, 
sr, meta = _coerce_audio(audio)
280 |         wav48, _ = _resample(wav, sr, 48000)
281 | 
282 |         if stereo_mode == "downmix_mono":
283 |             wav48 = _to_mono(wav48)
284 | 
285 |         B, C, T = wav48.shape
286 |         frame_len = int(48000 * max(5, min(60, frame_ms)) / 1000)
287 | 
288 |         out_batches = []
289 |         for b in range(B):
290 |             ch_out = []
291 |             for c in range(C):
292 |                 dry = wav48[b, c].detach()  # float32 [-1,1] at 48k
293 |                 x = dry.cpu().numpy().astype(np.float32)
294 |                 x_i16 = (np.clip(x, -1.0, 1.0) * 32767.0).astype(np.int16)
295 | 
296 |                 rn = self._init_rn(channels=1)
297 | 
298 |                 if hasattr(rn, "denoise_chunk"):
299 |                     try:
300 |                         wet_i16, probs = self._denoise_chunk_with_probs(rn, x_i16)
301 |                     except Exception:
302 |                         self._silence_destructor(rn)
303 |                         rn = self._init_rn(channels=1)
304 |                         wet_i16, probs = self._fallback_frame_loop(rn, x_i16, frame_len)
305 |                 else:
306 |                     wet_i16, probs = self._fallback_frame_loop(rn, x_i16, frame_len)
307 | 
308 |                 wet = torch.from_numpy(wet_i16.astype(np.float32) / 32768.0).to(dry.device)
309 | 
310 |                 # ----- Adaptive mixing -----
311 |                 vad_s = self._smooth_vad_probs(probs, vad_smooth_ms)
312 |                 s_eff = self._strength_per_frame(strength, vad_s, adaptive_mode, adaptive_amount, vad_threshold)
313 |                 # expand per-frame strengths (10 ms = 480 samples) to per-sample gains
314 |                 if s_eff.size == 1:  # covers the no-VAD fallback, which yields a single strength for the whole clip
315 |                     s_per_sample = np.full(T, float(s_eff.ravel()[0]), dtype=np.float32)
316 |                 else:
317 |                     s_per_sample = np.repeat(s_eff, 480)[:T].astype(np.float32)
318 | 
319 |                 g_dry_np, g_wet_np = self._gains_from_strength(s_per_sample, mix_curve)
320 |                 g_dry = torch.from_numpy(g_dry_np).to(dry.device)
321 |                 g_wet = torch.from_numpy(g_wet_np).to(dry.device)
322 | 
323 |                 y = g_dry * dry + g_wet * wet
324 |                 y = torch.clamp(y, -1.0, 1.0)
325 | 
326 |                 ch_out.append(y)
327 | 
328 |             y_st = torch.stack(ch_out, dim=0).unsqueeze(0)  # [1,C,T]
329 |             out_batches.append(y_st)
330 | 
331 |         y48 = torch.cat(out_batches, dim=0)  # [B,C,T]
332 | 
333 |         # Back to original sample rate
334 |         y, _ = _resample(y48, 48000, sr)
335 | 
336 |         # ----- Post-gain + optional ceiling limiter -----
337 |         if post_gain_db != 0.0:
338 |             gain = float(10.0 ** (post_gain_db / 20.0))
339 |             y = y * gain
340 | 
341 |         if limit_ceiling:
342 |             peak = torch.max(torch.abs(y)).item()
343 |             if peak > ceiling and peak > 0:
344 |                 y = y * (ceiling / peak)
345 | 
346 |         y = torch.clamp(y, -1.0, 1.0)
347 | 
348 |         meta2 = dict(meta)
349 |         meta2["rnnoise"] = {
350 |             "frame_ms": frame_ms,
351 |             "stereo_mode": stereo_mode,
352 |             "strength": strength,
353 |             "mix_curve": mix_curve,
354 |             "adaptive_mode": adaptive_mode,
355 |             "adaptive_amount": adaptive_amount,
356 |             "vad_threshold": vad_threshold,
357 |             "vad_smooth_ms": vad_smooth_ms,
358 |             "post_gain_db": post_gain_db,
359 |             "limit_ceiling": bool(limit_ceiling),
360 |             "ceiling": ceiling,
361 |         }
362 |         return (_make_audio(sr, y, meta2),)
363 | 
364 | # ----------------------------
365 | # WPE Dereverb (nara_wpe)
366 | # ----------------------------
367 | 
368 | class Egregora_WPE_Dereverb:
369 |     """
370 |     Weighted Prediction Error dereverberation.
371 |     Works mono or multi-channel. Uses STFT -> WPE -> iSTFT.
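    The shape convention follows the nara_wpe README; a condensed sketch of the
    calls made below (assuming numpy as np and a (channels, samples) array y):

        from nara_wpe import wpe as np_wpe
        from nara_wpe.utils import stft, istft
        Y = stft(y, size=1024, shift=256)                 # (channels, frames, freqs)
        Z = np_wpe.wpe(np.transpose(Y, (2, 0, 1)),
                       taps=10, delay=3, iterations=3)    # wpe() wants (freqs, channels, frames)
        z = istft(np.transpose(Z, (1, 2, 0)), size=1024, shift=256)  # back to (channels, samples)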
372 |     """
373 |     @classmethod
374 |     def INPUT_TYPES(cls):
375 |         return {
376 |             "required": {
377 |                 "audio": ("AUDIO",),
378 |                 "taps": ("INT", {"default": 10, "min": 3, "max": 32}),
379 |                 "delay": ("INT", {"default": 3, "min": 1, "max": 16}),
380 |                 "iterations": ("INT", {"default": 3, "min": 1, "max": 10}),
381 |                 "n_fft": ("INT", {"default": 1024, "min": 256, "max": 4096, "step": 256}),
382 |                 "hop": ("INT", {"default": 256, "min": 64, "max": 1024, "step": 64}),
383 |                 "use_float32": ("BOOLEAN", {"default": True}),
384 |             }
385 |         }
386 | 
387 |     RETURN_TYPES = ("AUDIO",)
388 |     FUNCTION = "execute"
389 |     CATEGORY = "Egregora/Enhance"
390 | 
391 |     def execute(self, audio, taps=10, delay=3, iterations=3, n_fft=1024, hop=256, use_float32=True):
392 |         try:
393 |             import numpy as np
394 |             from nara_wpe import wpe as np_wpe
395 |             from nara_wpe.utils import stft, istft
396 |         except Exception as e:
397 |             raise RuntimeError("nara-wpe not installed. pip install nara-wpe") from e
398 | 
399 |         wav, sr, meta = _coerce_audio(audio)  # [B,C,T]
400 |         B, C, T = wav.shape
401 | 
402 |         out_list = []
403 |         for b in range(B):
404 |             # nara_wpe expects numpy with shape (channels, samples)
405 |             y = wav[b].cpu().numpy()  # [C,T]
406 | 
407 |             # Keep memory in check on long files by working in float32
408 |             if use_float32:
409 |                 y = y.astype(np.float32)
410 | 
411 |             try:
412 |                 # STFT over the last axis: returns (channels, frames, freqs)
413 |                 Y = stft(y, size=n_fft, shift=hop)
414 | 
415 |                 # Downcast the complex spectra too when float32 is requested
416 |                 if Y.dtype == np.complex128 and use_float32:
417 |                     Y = Y.astype(np.complex64)
418 | 
419 |                 # Rearrange to (freqs, channels, frames) as expected by wpe()
420 |                 Y = np.transpose(Y, (2, 0, 1))
421 | 
422 |                 # Apply WPE with memory-conscious settings
423 |                 Z = np_wpe.wpe(Y, taps=taps, delay=delay, iterations=iterations)
424 | 
425 |                 # Back to (channels, frames, freqs) for istft
426 |                 Z = np.transpose(Z, (1, 2, 0))
427 |                 z = istft(Z, size=n_fft, shift=hop)  # (channels, samples)
428 | 
429 |             except MemoryError:
430 |                 # Fallback: pass the batch through untouched rather than crash
431 |                 print(f"Warning: WPE processing failed due to memory constraints for batch {b}")
432 |                 z = y  # Pass through original audio
433 |             except Exception as e:
434 |                 print(f"Warning: WPE processing failed: {e}")
435 |                 z = y  # Pass through original audio
436 | 
437 |             z_t = torch.from_numpy(z).to(wav.device).float()  # [C,T]
438 |             out_list.append(z_t.unsqueeze(0))  # [1,C,T]
439 | 
440 |         out = torch.cat(out_list, dim=0)
441 |         meta2 = dict(meta)
442 |         meta2["wpe"] = {"taps": taps, "delay": delay, "iterations": iterations, "n_fft": n_fft, "hop": hop}
443 |         return (_make_audio(sr, out, meta2),)
444 | 
445 | 
446 | # ----------------------------
447 | # DeepFilterNet (DFN/DFN2/DFN3)
448 | # ----------------------------
449 | 
450 | class Egregora_DeepFilterNet_Denoise:
451 |     """
452 |     DeepFilterNet denoiser (speech enhancement) for ComfyUI.
453 | 
454 |     • Runs DeepFilterNet at 48 kHz (its native rate), using tensor I/O.
455 |     • Mono or stereo (per-channel or downmix to mono before DFN).
456 |     • Adds 'strength' wet/dry mix with equal-power or linear curve.
457 |     • Adaptive mix driven by VAD (RNNoise if available, else energy/RMS proxy).
458 |     • Post-gain (dB) and a simple peak ceiling limiter.
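    • The equal-power curve keeps g_dry² + g_wet² = 1 for every strength s,
      since g_wet = sin(πs/2) and g_dry = cos(πs/2); a quick self-check:
          s = np.linspace(0.0, 1.0, 5)
          assert np.allclose(np.sin(0.5*np.pi*s)**2 + np.cos(0.5*np.pi*s)**2, 1.0)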
459 | """ 460 | 461 | @classmethod 462 | def INPUT_TYPES(cls): 463 | return { 464 | "required": { 465 | "audio": ("AUDIO",), 466 | 467 | # DFN options 468 | "dfn_model": (["DeepFilterNet2", "DeepFilterNet3"], {"default": "DeepFilterNet2"}), 469 | "device": (["auto", "cuda:0", "cpu"], {"default": "auto"}), 470 | 471 | # proper BOOLEAN toggles (not sockets) 472 | "use_postfilter": ("BOOLEAN", {"default": False, "label_on": "postfilter on", "label_off": "postfilter off"}), 473 | "limit_ceiling": ("BOOLEAN", {"default": True, "label_on": "limit on", "label_off": "limit off"}), 474 | 475 | # channel / framing 476 | "stereo_mode": (["per_channel", "downmix_mono"], {"default": "per_channel"}), 477 | "frame_ms": ("INT", {"default": 20, "min": 5, "max": 60, "step": 5}), 478 | 479 | # mixing 480 | "strength": ("FLOAT", {"default": 0.65, "min": 0.0, "max": 1.0, "step": 0.01}), 481 | "mix_curve": (["equal_power", "linear"], {"default": "equal_power"}), 482 | 483 | # adaptive controls 484 | "adaptive_vad_source": (["rms", "rnnoise", "none"], {"default": "rms"}), 485 | "adaptive_mode": (["off", "more_on_noise", "more_on_speech", "gate_on_noise"], {"default": "more_on_noise"}), 486 | "adaptive_amount": ("FLOAT", {"default": 0.45, "min": 0.0, "max": 1.0, "step": 0.01}), 487 | "vad_threshold": ("FLOAT", {"default": 0.90, "min": 0.0, "max": 1.0, "step": 0.01}), 488 | "vad_smooth_ms": ("INT", {"default": 60, "min": 0, "max": 500, "step": 5}), 489 | 490 | # post 491 | "post_gain_db": ("FLOAT", {"default": 0.5, "min": -24.0, "max": 24.0, "step": 0.1}), 492 | "ceiling": ("FLOAT", {"default": 0.98, "min": 0.1, "max": 1.0, "step": 0.001}), 493 | } 494 | } 495 | 496 | RETURN_TYPES = ("AUDIO",) 497 | FUNCTION = "execute" 498 | CATEGORY = "Egregora/Enhance" 499 | 500 | # ------------------------- DFN backend & cache ------------------------- 501 | _DF_CACHE = {} # (model_name, device) -> (model, df_state) 502 | 503 | def _pick_device(self, choice: str): 504 | import torch 505 | if choice == "auto": 506 | return "cuda:0" if torch.cuda.is_available() else "cpu" 507 | return choice 508 | 509 | def _df_get(self, model_name: str, device: str): 510 | from df.enhance import init_df 511 | key = (model_name, device) 512 | if key in self._DF_CACHE: 513 | return self._DF_CACHE[key] 514 | model, df_state, _ = init_df(model_name, config_allow_defaults=True) 515 | model = model.to(device).eval() 516 | self._DF_CACHE[key] = (model, df_state) 517 | return model, df_state 518 | 519 | # ----------------------------- VAD helpers ----------------------------- 520 | def _vad_probs_rnnoise_48k(self, x48_np): 521 | import numpy as np 522 | try: 523 | from pyrnnoise import RNNoise 524 | except Exception: 525 | return None # RNNoise not installed 526 | 527 | x_i16 = (np.clip(x48_np, -1.0, 1.0) * 32767.0).astype(np.int16) 528 | rn = RNNoise(sample_rate=48000) 529 | try: 530 | if getattr(rn, "channels", None) in (None, 0): 531 | setattr(rn, "channels", 1) 532 | except Exception: 533 | pass 534 | 535 | probs = [] 536 | if hasattr(rn, "denoise_chunk"): 537 | pad = (-len(x_i16)) % 480 538 | x_pad = np.pad(x_i16, (0, pad), mode="constant") if pad else x_i16 539 | X = x_pad[np.newaxis, :] 540 | for p, _ in rn.denoise_chunk(X): 541 | try: 542 | probs.append(float(p[0]) if hasattr(p, "__len__") else float(p)) 543 | except Exception: 544 | probs.append(float(p)) 545 | return np.asarray(probs, dtype=np.float32) 546 | return None # fallback APIs don't expose p 547 | 548 | def _vad_probs_rms_48k(self, x48_np): 549 | import numpy as np 550 | hop = 480 # 
10 ms at 48 kHz 551 | n = (len(x48_np) + hop - 1) // hop 552 | rms = [] 553 | for i in range(n): 554 | fr = x48_np[i*hop:(i+1)*hop] 555 | rms.append(float(np.sqrt(np.mean(fr*fr))) if len(fr) else 0.0) 556 | rms = np.asarray(rms, dtype=np.float32) 557 | p95 = float(np.percentile(rms, 95)) or 1e-6 558 | return np.clip(rms / p95, 0.0, 1.0).astype(np.float32) 559 | 560 | def _smooth_probs(self, probs, smooth_ms: int): 561 | import numpy as np, math 562 | if probs is None or probs.size == 0 or smooth_ms <= 0: 563 | return probs 564 | hop_ms = 10.0 565 | tau = max(1e-3, float(smooth_ms)) 566 | alpha = math.exp(-hop_ms / tau) 567 | y = np.empty_like(probs) 568 | acc = probs[0] 569 | for i, p in enumerate(probs): 570 | acc = alpha * acc + (1.0 - alpha) * p 571 | y[i] = acc 572 | return y 573 | 574 | def _strength_per_frame(self, base_s, vad_smooth, adaptive_mode, adaptive_amount, vad_threshold): 575 | import numpy as np 576 | if vad_smooth is None: 577 | return np.array([float(base_s)], dtype=np.float32) 578 | s0 = float(base_s) 579 | a = float(adaptive_amount) 580 | v = np.clip(vad_smooth, 0.0, 1.0) 581 | if adaptive_mode == "off": 582 | s_eff = np.full_like(v, s0, dtype=np.float32) 583 | elif adaptive_mode == "more_on_noise": 584 | s_eff = s0 + a * (1.0 - v) * (1.0 - s0) 585 | elif adaptive_mode == "more_on_speech": 586 | s_eff = s0 + a * v * (1.0 - s0) 587 | elif adaptive_mode == "gate_on_noise": 588 | s_noise = s0 + a * (1.0 - s0) 589 | s_speech = s0 * (1.0 - a) 590 | s_eff = (s_noise * (v < vad_threshold) + s_speech * (v >= vad_threshold)).astype(np.float32) 591 | else: 592 | s_eff = np.full_like(v, s0, dtype=np.float32) 593 | return np.clip(s_eff, 0.0, 1.0).astype(np.float32) 594 | 595 | def _gains_from_strength(self, s_eff, curve): 596 | import numpy as np, math 597 | s = np.clip(s_eff, 0.0, 1.0).astype(np.float32) 598 | if curve == "equal_power": 599 | g_wet = np.sin(0.5 * math.pi * s, dtype=np.float32) 600 | g_dry = np.cos(0.5 * math.pi * s, dtype=np.float32) 601 | else: 602 | g_wet = s 603 | g_dry = 1.0 - s 604 | return g_dry.astype(np.float32), g_wet.astype(np.float32) 605 | 606 | # ------------------------------ main op ------------------------------ 607 | def execute( 608 | self, 609 | audio, 610 | dfn_model="DeepFilterNet2", 611 | device="auto", 612 | use_postfilter=False, 613 | limit_ceiling=True, 614 | stereo_mode="per_channel", 615 | frame_ms=20, 616 | strength=0.65, 617 | mix_curve="equal_power", 618 | adaptive_vad_source="rms", 619 | adaptive_mode="more_on_noise", 620 | adaptive_amount=0.45, 621 | vad_threshold=0.90, 622 | vad_smooth_ms=60, 623 | post_gain_db=0.5, 624 | ceiling=0.98, 625 | ): 626 | import torch, numpy as np 627 | from df.enhance import enhance 628 | from df.io import resample # DFN tensor resampler (48k native) 629 | 630 | # 1) Coerce to [B,C,T], then tensorize & resample to 48 kHz (DFN native) 631 | wav, sr, meta = _coerce_audio(audio) 632 | if stereo_mode == "downmix_mono": 633 | wav = _to_mono(wav) 634 | B, C, T = wav.shape 635 | 636 | x_ct = wav.reshape(-1, T).to(torch.float32) # (C,T) 637 | x48 = resample(x_ct, sr, 48000) if sr != 48000 else x_ct 638 | 639 | # 2) Load DFN once 640 | dev = self._pick_device(device) 641 | model, df_state = self._df_get(dfn_model, dev) 642 | 643 | # 3) Run DFN per channel (tensors-in/out) 644 | wet_ch = [] 645 | with torch.no_grad(): 646 | for ch in range(x48.shape[0]): 647 | xin = x48[ch:ch+1] # (1,T) 648 | y = enhance(model, df_state, xin) # (1,T) 649 | # Some DFN builds expose post_filter kwarg; keep flag for future wheels 
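                # A hedged sketch of a version-safe probe for that kwarg (uses only
                # stdlib inspect; untested against every DFN wheel):
                #   import inspect
                #   if "post_filter" in inspect.signature(enhance).parameters:
                #       y = enhance(model, df_state, xin, post_filter=use_postfilter)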
656 |                 # if use_postfilter:
657 |                 #     y = enhance(model, df_state, xin, post_filter=True)
658 |                 wet_ch.append(y)
659 |         wet48 = torch.cat(wet_ch, dim=0)  # (C,T)
660 | 
661 |         # 4) Back to original sample rate (tensors)
662 |         wet = resample(wet48, 48000, sr) if sr != 48000 else wet48
663 |         # x_ct is already at sr; round-trip x48 instead, so dry takes the same
664 |         # resampling path as wet and the two stay sample-aligned
665 |         dry = x_ct if sr == 48000 else resample(x48, 48000, sr)
666 | 
667 |         # 5) Adaptive mix (10 ms frame gains expanded to per-sample)
668 |         hop = int(sr * 0.010)  # 10 ms at current sr for expansion
669 |         out_ch = []
670 |         for ch in range(dry.shape[0]):
671 |             dry_np = dry[ch].detach().cpu().numpy()
672 |             wet_np = wet[ch].detach().cpu().numpy()
673 | 
674 |             # VAD at 48k domain, then expand
675 |             if adaptive_vad_source == "rnnoise":
676 |                 x48_np = (resample(dry[ch:ch+1], sr, 48000)[0].cpu().numpy()
677 |                           if sr != 48000 else dry_np)
678 |                 probs = self._vad_probs_rnnoise_48k(x48_np)
679 |             elif adaptive_vad_source == "rms":
680 |                 x48_np = (resample(dry[ch:ch+1], sr, 48000)[0].cpu().numpy()
681 |                           if sr != 48000 else dry_np)
682 |                 probs = self._vad_probs_rms_48k(x48_np)
683 |             else:
684 |                 probs = None
685 | 
686 |             vad_s = self._smooth_probs(probs, vad_smooth_ms)
687 |             s_eff = self._strength_per_frame(strength, vad_s, adaptive_mode, adaptive_amount, vad_threshold)
688 | 
689 |             if s_eff.size == 1:  # no VAD probs: constant strength everywhere
690 |                 s_per = np.full(dry_np.shape[0], float(s_eff.reshape(-1)[0]), dtype=np.float32)
691 |             else:
692 |                 # expand 10 ms frame strengths to samples; pad the tail with the last frame
693 |                 s_per = np.repeat(s_eff, max(1, hop))
694 |                 s_per = np.pad(s_per, (0, max(0, dry_np.shape[0] - s_per.shape[0])), mode="edge")
695 |                 s_per = s_per[:dry_np.shape[0]].astype(np.float32)
696 |             g_dry_np, g_wet_np = self._gains_from_strength(s_per, mix_curve)
697 |             y_np = g_dry_np * dry_np + g_wet_np * wet_np
698 |             y_np = np.clip(y_np, -1.0, 1.0)
699 |             out_ch.append(torch.from_numpy(y_np))
700 | 
701 |         y = torch.stack(out_ch, dim=0)  # (C,T)
702 |         y = y.reshape(B, C, -1)
703 | 
704 |         # 6) Post-gain + limiter
705 |         if post_gain_db != 0.0:
706 |             gain = float(10.0 ** (post_gain_db / 20.0))
707 |             y = y * gain
708 | 
709 |         if limit_ceiling:
710 |             peak = torch.max(torch.abs(y)).item()
711 |             if peak > ceiling and peak > 0:
712 |                 y = y * (ceiling / peak)
713 | 
714 |         y = torch.clamp(y, -1.0, 1.0)
715 | 
716 |         meta2 = dict(meta)
717 |         meta2["deepfilternet"] = {
718 |             "model": dfn_model,
719 |             "device": dev,
720 |             "use_postfilter": bool(use_postfilter),
721 |             "stereo_mode": stereo_mode,
722 |             "frame_ms": frame_ms,
723 |             "strength": strength,
724 |             "mix_curve": mix_curve,
725 |             "adaptive_vad_source": adaptive_vad_source,
726 |             "adaptive_mode": adaptive_mode,
727 |             "adaptive_amount": adaptive_amount,
728 |             "vad_threshold": vad_threshold,
729 |             "vad_smooth_ms": vad_smooth_ms,
730 |             "post_gain_db": post_gain_db,
731 |             "limit_ceiling": bool(limit_ceiling),
732 |             "ceiling": ceiling,
733 |         }
734 |         return (_make_audio(sr, y, meta2),)
735 | 
736 | # ----------------------------
737 | # Descript Audio Codec (DAC) encode/decode
738 | # ----------------------------
739 | 
740 | class Egregora_DAC_Encode:
741 |     """
742 |     Encodes audio with DAC and returns the latent 'z' & metadata in a DICT.
743 |     Auto-downloads weights for the chosen model_type on first use.
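744 | 
745 |     Returned DICT layout (descriptive note; it mirrors `codes_dict` built below):
746 |         model_type        : "44khz" | "24khz" | "16khz"
747 |         sample_rate       : original input rate (Hz)
748 |         model_sample_rate : DAC's native rate (Hz)
749 |         latents           : list over batch of list[tensor] (the encoder 'z')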
734 | """ 735 | @classmethod 736 | def INPUT_TYPES(cls): 737 | return { 738 | "required": { 739 | "audio": ("AUDIO",), 740 | "model_type": (["44khz", "24khz", "16khz"], {"default": "44khz"}), 741 | "device": (["auto", "cpu", "cuda"], {"default": "auto"}), 742 | } 743 | } 744 | 745 | RETURN_TYPES = ("DICT", "STRING") 746 | RETURN_NAMES = ("codes", "log") 747 | FUNCTION = "execute" 748 | CATEGORY = "Egregora/Codecs" 749 | 750 | def execute(self, audio, model_type="44khz", device="auto"): 751 | try: 752 | import dac 753 | except Exception as e: 754 | raise RuntimeError("descript-audio-codec not installed. pip install descript-audio-codec") from e 755 | 756 | wav, sr, meta = _coerce_audio(audio) # [B,C,T] float 757 | B, C, T = wav.shape 758 | 759 | # Auto-download 760 | ckpt = dac.utils.download(model_type=model_type) 761 | model = dac.DAC.load(ckpt) 762 | 763 | dev = _device_for(wav) if device == "auto" else device 764 | model = model.to(dev) 765 | 766 | # FIX: Get model's expected sample rate 767 | model_sr = model.sample_rate 768 | 769 | # Compress each batch separately, concat codes 770 | with torch.no_grad(): 771 | z_all = [] 772 | for b in range(B): 773 | x = wav[b].to(dev) # [C,T] 774 | 775 | # FIX: Resample to model's expected sample rate before preprocessing 776 | if sr != model_sr: 777 | x_resampled = torchaudio.functional.resample(x, sr, model_sr) 778 | else: 779 | x_resampled = x 780 | 781 | # preprocess expects the correct sample rate 782 | x_prep = model.preprocess(x_resampled, model_sr) 783 | z, codes, latents, _, _ = model.encode(x_prep) 784 | 785 | # Store z (list of tensors) into CPU tensors for DICT 786 | if isinstance(z, (list, tuple)): 787 | z_cpu = [t.detach().cpu() for t in z] 788 | else: 789 | z_cpu = [z.detach().cpu()] 790 | z_all.append(z_cpu) 791 | 792 | codes_dict = { 793 | "model_type": model_type, 794 | "sample_rate": sr, # Store original sample rate 795 | "model_sample_rate": model_sr, # Store model's sample rate 796 | "latents": z_all, # list over batch of list[tensor] 797 | } 798 | log = f"DAC encode ok: model={model_type}, B={B}, C={C}, sr={sr}->{model_sr}" 799 | return (codes_dict, log) 800 | 801 | 802 | class Egregora_DAC_Decode: 803 | """ 804 | Decodes DICT produced by Egregora_DAC_Encode back to AUDIO. 805 | """ 806 | @classmethod 807 | def INPUT_TYPES(cls): 808 | return { 809 | "required": { 810 | "codes": ("DICT",), 811 | "device": (["auto", "cpu", "cuda"], {"default": "auto"}), 812 | } 813 | } 814 | 815 | RETURN_TYPES = ("AUDIO", "STRING") 816 | RETURN_NAMES = ("audio", "log") 817 | FUNCTION = "execute" 818 | CATEGORY = "Egregora/Codecs" 819 | 820 | def execute(self, codes, device="auto"): 821 | try: 822 | import dac 823 | except Exception as e: 824 | raise RuntimeError("descript-audio-codec not installed. 
pip install descript-audio-codec") from e 825 | 826 | model_type = codes.get("model_type", "44khz") 827 | sr = int(codes.get("sample_rate", 48000)) 828 | model_sr = int(codes.get("model_sample_rate", sr)) 829 | latents_b = codes.get("latents", []) 830 | if not latents_b: 831 | raise ValueError("codes.latents empty") 832 | 833 | ckpt = dac.utils.download(model_type=model_type) 834 | model = dac.DAC.load(ckpt) 835 | 836 | dev = "cuda" if torch.cuda.is_available() and device in ("auto", "cuda") else "cpu" 837 | model = model.to(dev) 838 | 839 | outs = [] 840 | with torch.no_grad(): 841 | for z_list in latents_b: 842 | # z_list: list[tensor] shaped as model expects 843 | z_dev = [t.to(dev).float() for t in z_list] 844 | y = model.decode(z_dev) # [C,T] at model's native sr 845 | outs.append(y.unsqueeze(0).cpu()) 846 | 847 | y_cat = torch.cat(outs, dim=0) # [B,C,T] 848 | 849 | # FIX: Resample back to original sample rate if needed 850 | if model_sr != sr: 851 | y_resampled, _ = _resample(y_cat, model_sr, sr) 852 | else: 853 | y_resampled = y_cat 854 | 855 | audio = _make_audio(sr=sr, wav=y_resampled) 856 | log = f"DAC decode ok: model={model_type}, B={y_cat.size(0)}, C={y_cat.size(1)}, {model_sr}->{sr}" 857 | return (audio, log) 858 | 859 | # ---------------------------- 860 | # Node registration 861 | # ---------------------------- 862 | 863 | NODE_CLASS_MAPPINGS = { 864 | "Egregora_RNNoise_Denoise": Egregora_RNNoise_Denoise, 865 | "Egregora_WPE_Dereverb": Egregora_WPE_Dereverb, 866 | "Egregora_DeepFilterNet_Denoise": Egregora_DeepFilterNet_Denoise, 867 | "Egregora_DAC_Encode": Egregora_DAC_Encode, 868 | "Egregora_DAC_Decode": Egregora_DAC_Decode, 869 | 870 | } 871 | 872 | NODE_DISPLAY_NAME_MAPPINGS = { 873 | "Egregora_RNNoise_Denoise": "Egregora RNNoise Denoise", 874 | "Egregora_WPE_Dereverb": "Egregora WPE Dereverb", 875 | "Egregora_DeepFilterNet_Denoise": "Egregora DeepFilterNet Denoise", 876 | "Egregora_DAC_Encode": "Egregora DAC Encode", 877 | "Egregora_DAC_Decode": "Egregora DAC Decode", 878 | 879 | } --------------------------------------------------------------------------------