├── requirements.txt ├── .gitignore ├── flashsr_min.py ├── LICENSE ├── __init__.py ├── Example │   └── Audio Super Resolution.json ├── egregora_fat_llama_cpu.py ├── README.md ├── install.py ├── egregora_fat_llama_gpu.py ├── egregora_audio_super_resolution.py ├── egregora_audio_eval_pack.py ├── egregora_null_test_suite.py └── egregora_audio_enhance_extras.py /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core 2 | numpy>=1.26 3 | scipy>=1.11 4 | soundfile>=0.12 5 | tqdm>=4.66 6 | requests>=2.31 7 | huggingface_hub>=0.24 8 | 9 | # Models / processors used by your nodes 10 | fat-llama>=1.1.0 11 | fat-llama-fftw>=1.0.4.4 12 | pyrnnoise>=0.3.8 13 | nara-wpe>=0.0.9 # import name: nara_wpe 14 | deepfilternet>=0.5.6 # import name: df 15 | descript-audio-codec>=1.0.0 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python cache 2 | __pycache__/ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | *.pdb 7 | *.egg-info/ 8 | *.egg 9 | *.log 10 | 11 | # Virtual environments 12 | venv/ 13 | .env/ 14 | .venv/ 15 | env/ 16 | 17 | # OS-specific 18 | .DS_Store 19 | Thumbs.db 20 | 21 | # Editor configs 22 | .vscode/ 23 | .idea/ 24 | 25 | # ComfyUI outputs 26 | output/ 27 | outputs/ 28 | *.png 29 | *.jpg 30 | *.jpeg 31 | *.wav 32 | *.flac 33 | 34 | # Model + dependency folders (downloaded automatically via install.py) 35 | models/ 36 | deps/ 37 | checkpoints/ 38 | 39 | # Hugging Face cache 40 | ~/.cache/huggingface/ 41 | hf_cache/ 42 | 43 | # Temporary files 44 | *.tmp 45 | *.bak 46 | *.swp 47 | *.swo 48 | -------------------------------------------------------------------------------- /flashsr_min.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, torch, numpy as np, soundfile as sf 3 | from pathlib import Path 4 | 5 | def main(): 6 | ap = argparse.ArgumentParser() 7 | ap.add_argument("--ckpt-dir", required=True) 8 | ap.add_argument("--in", dest="inp", required=True) 9 | ap.add_argument("--out", required=True) 10 | ap.add_argument("--target-sr", type=int, default=48000) 11 | ap.add_argument("--device", default="auto") 12 | args = ap.parse_args() 13 | 14 | dev = "cuda" if args.device in ("auto","cuda") and torch.cuda.is_available() else "cpu" 15 | wav, sr = sf.read(args.inp, dtype="float32", always_2d=False) 16 | if wav.ndim == 2: 17 | if wav.shape[0] > wav.shape[1]: # soundfile returns [S,C]; flip to [C,S] 18 | wav = wav.T 19 | wav = wav.mean(axis=0) # downmix channels -> mono [S] 20 | x = torch.from_numpy(wav).float().to(dev) 21 | out = x.detach().cpu().numpy() 22 | # Placeholder passthrough: FlashSR inference itself lives in the node (--ckpt-dir 23 | # is accepted for CLI compatibility). Resample so the file actually plays at 24 | # target_sr instead of just being relabeled: 25 | if sr != args.target_sr: 26 | n_out = int(round(out.shape[0] * args.target_sr / sr)) 27 | t_in = np.linspace(0.0, 1.0, out.shape[0], endpoint=False) 28 | t_out = np.linspace(0.0, 1.0, n_out, endpoint=False) 29 | out = np.interp(t_out, t_in, out).astype(np.float32) 30 | sf.write(args.out, out, args.target_sr) 31 | print("OK") 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 mrgattax 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Core nodes you already had 2 | from .egregora_audio_super_resolution import EgregoraAudioSuperResolution 3 | from .egregora_fat_llama_gpu import EgregoraFatLlamaGPU 4 | from .egregora_fat_llama_cpu import EgregoraFatLlamaCPU 5 | 6 | # Import and merge the new modules’ mappings 7 | # (each of these files defines NODE_CLASS_MAPPINGS / NODE_DISPLAY_NAME_MAPPINGS) 8 | try: 9 | from .egregora_audio_enhance_extras import ( 10 | NODE_CLASS_MAPPINGS as ENHANCE_MAP, 11 | NODE_DISPLAY_NAME_MAPPINGS as ENHANCE_NAMES, 12 | ) 13 | except Exception: 14 | ENHANCE_MAP, ENHANCE_NAMES = {}, {} 15 | 16 | try: 17 | from .egregora_audio_eval_pack import ( 18 | NODE_CLASS_MAPPINGS as EVAL_MAP, 19 | NODE_DISPLAY_NAME_MAPPINGS as EVAL_NAMES, 20 | ) 21 | except Exception: 22 | EVAL_MAP, EVAL_NAMES = {}, {} 23 | 24 | try: 25 | from .egregora_null_test_suite import ( 26 | NODE_CLASS_MAPPINGS as NULL_MAP, 27 | NODE_DISPLAY_NAME_MAPPINGS as NULL_NAMES, 28 | ) 29 | except Exception: 30 | NULL_MAP, NULL_NAMES = {}, {} 31 | 32 | # Base mappings (FlashSR + Fat Llama) just like before 33 | NODE_CLASS_MAPPINGS = { 34 | "EgregoraAudioUpscaler": EgregoraAudioSuperResolution, # FlashSR 35 | "EgregoraFatLlamaGPU": EgregoraFatLlamaGPU, # GPU (CuPy) 36 | "EgregoraFatLlamaCPU": EgregoraFatLlamaCPU, # CPU (FFTW) 37 | } 38 | 39 | NODE_DISPLAY_NAME_MAPPINGS = { 40 | "EgregoraAudioUpscaler": "🎧 Audio Super Resolution (FlashSR)", 41 | "EgregoraFatLlamaGPU": "🎛️ Spectral Enhance (Fat Llama — GPU)", 42 | "EgregoraFatLlamaCPU": "🎛️ Spectral Enhance (Fat Llama — CPU/FFTW)", 43 | } 44 | 45 | # Merge in the rest (Enhance Extras + Eval Pack + Null Test Suite) 46 | NODE_CLASS_MAPPINGS.update(ENHANCE_MAP) 47 | NODE_CLASS_MAPPINGS.update(EVAL_MAP) 48 | NODE_CLASS_MAPPINGS.update(NULL_MAP) 49 | 50 | NODE_DISPLAY_NAME_MAPPINGS.update(ENHANCE_NAMES) 51 | NODE_DISPLAY_NAME_MAPPINGS.update(EVAL_NAMES) 52 | NODE_DISPLAY_NAME_MAPPINGS.update(NULL_NAMES) 53 | -------------------------------------------------------------------------------- /Example/Audio Super Resolution.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "00000000-0000-0000-0000-000000000000", 3 | "revision": 0, 4 | "last_node_id": 33, 5 | "last_link_id": 70, 6 | "nodes": [ 7 | { 8 | "id": 27, 9 | "type": "EgregoraAudioUpscaler", 10 | "pos": [ 11 | 1610.0098876953125, 12 | 226.31141662597656 13 | ], 14 | "size": [ 15 | 360.177001953125, 16 | 226 17 | ], 18 | "flags": {}, 19 | "order": 1, 20 | "mode": 0, 21 | "inputs": [ 22 | { 23 | "name": "AUDIO", 24 | "shape": 7, 25 | "type": "AUDIO", 26 | "link": 66 27 | } 28 | ], 29 | "outputs": [ 30 | { 31 | "name": "AUDIO", 32 | "type": "AUDIO", 33 | "links": [ 34 | 58, 35 | 67 36 | ] 37 | } 38 | ], 39 | "properties": { 40 | 
"Node name for S&R": "EgregoraAudioUpscaler" 41 | }, 42 | "widgets_values": [ 43 | 5.12, 44 | 1, 45 | "cuda", 46 | "48000", 47 | "wav", 48 | "", 49 | "", 50 | false 51 | ] 52 | }, 53 | { 54 | "id": 26, 55 | "type": "EgregoraFatLlamaGPU", 56 | "pos": [ 57 | 1610.525634765625, 58 | 499.9607238769531 59 | ], 60 | "size": [ 61 | 361.1080017089844, 62 | 202 63 | ], 64 | "flags": {}, 65 | "order": 3, 66 | "mode": 0, 67 | "inputs": [ 68 | { 69 | "name": "AUDIO", 70 | "shape": 7, 71 | "type": "AUDIO", 72 | "link": 67 73 | } 74 | ], 75 | "outputs": [ 76 | { 77 | "name": "AUDIO", 78 | "type": "AUDIO", 79 | "links": [ 80 | 50 81 | ] 82 | } 83 | ], 84 | "properties": { 85 | "Node name for S&R": "EgregoraFatLlamaGPU" 86 | }, 87 | "widgets_values": [ 88 | "wav", 89 | 350, 90 | 0.4, 91 | 1536, 92 | true, 93 | "", 94 | "" 95 | ] 96 | }, 97 | { 98 | "id": 25, 99 | "type": "PreviewAudio", 100 | "pos": [ 101 | 1991.269775390625, 102 | 228.5155792236328 103 | ], 104 | "size": [ 105 | 333.01190185546875, 106 | 88 107 | ], 108 | "flags": {}, 109 | "order": 2, 110 | "mode": 0, 111 | "inputs": [ 112 | { 113 | "name": "audio", 114 | "type": "AUDIO", 115 | "link": 58 116 | } 117 | ], 118 | "outputs": [], 119 | "properties": { 120 | "cnr_id": "comfy-core", 121 | "ver": "0.3.57", 122 | "Node name for S&R": "PreviewAudio" 123 | }, 124 | "widgets_values": [] 125 | }, 126 | { 127 | "id": 7, 128 | "type": "PreviewAudio", 129 | "pos": [ 130 | 1991.6297607421875, 131 | 500.707275390625 132 | ], 133 | "size": [ 134 | 336.7494812011719, 135 | 88.74748229980469 136 | ], 137 | "flags": {}, 138 | "order": 4, 139 | "mode": 0, 140 | "inputs": [ 141 | { 142 | "name": "audio", 143 | "type": "AUDIO", 144 | "link": 50 145 | } 146 | ], 147 | "outputs": [], 148 | "properties": { 149 | "cnr_id": "comfy-core", 150 | "ver": "0.3.57", 151 | "Node name for S&R": "PreviewAudio" 152 | }, 153 | "widgets_values": [] 154 | }, 155 | { 156 | "id": 31, 157 | "type": "LoadAudio", 158 | "pos": [ 159 | 1213.31591796875, 160 | 389.5353698730469 161 | ], 162 | "size": [ 163 | 358.77178955078125, 164 | 154.08946228027344 165 | ], 166 | "flags": {}, 167 | "order": 0, 168 | "mode": 0, 169 | "inputs": [], 170 | "outputs": [ 171 | { 172 | "name": "AUDIO", 173 | "type": "AUDIO", 174 | "links": [ 175 | 66 176 | ] 177 | } 178 | ], 179 | "properties": { 180 | "cnr_id": "comfy-core", 181 | "ver": "0.3.57", 182 | "Node name for S&R": "LoadAudio" 183 | }, 184 | "widgets_values": [ 185 | "Untitled4.wav", 186 | null, 187 | null 188 | ] 189 | } 190 | ], 191 | "links": [ 192 | [ 193 | 50, 194 | 26, 195 | 0, 196 | 7, 197 | 0, 198 | "AUDIO" 199 | ], 200 | [ 201 | 58, 202 | 27, 203 | 0, 204 | 25, 205 | 0, 206 | "AUDIO" 207 | ], 208 | [ 209 | 66, 210 | 31, 211 | 0, 212 | 27, 213 | 0, 214 | "AUDIO" 215 | ], 216 | [ 217 | 67, 218 | 27, 219 | 0, 220 | 26, 221 | 0, 222 | "AUDIO" 223 | ] 224 | ], 225 | "groups": [], 226 | "config": {}, 227 | "extra": { 228 | "ds": { 229 | "scale": 1.3513057093104381, 230 | "offset": [ 231 | -1138.5131324372892, 232 | -97.89917011041159 233 | ] 234 | }, 235 | "frontendVersion": "1.25.11" 236 | }, 237 | "version": 0.4 238 | } 239 | -------------------------------------------------------------------------------- /egregora_fat_llama_cpu.py: -------------------------------------------------------------------------------- 1 | import time, tempfile 2 | from pathlib import Path 3 | from typing import Tuple 4 | import numpy as np 5 | import soundfile as sf 6 | import torch 7 | 8 | RETURN_TYPES = ("AUDIO",) 9 | FUNCTION = "run" 10 | CATEGORY = 
"Egregora/Audio" 11 | 12 | def _to_cs(x: np.ndarray) -> np.ndarray: 13 | a = np.asarray(x, dtype=np.float32) 14 | if a.ndim == 1: 15 | a = a[None, :] 16 | elif a.ndim == 2: 17 | h, w = a.shape 18 | if w <= 8 and h > w: # [S,C] -> [C,S] 19 | a = a.T 20 | else: 21 | a = a.reshape(-1)[None, :] 22 | m = np.max(np.abs(a)) if a.size else 0.0 23 | if m > 1.0: 24 | a = a / (m + 1e-8) 25 | return a.astype(np.float32) 26 | 27 | def _save_temp_wav(cs: np.ndarray, sr: int) -> Path: 28 | p = Path(tempfile.gettempdir()) / f"eg_in_{int(time.time()*1000)}.wav" 29 | sf.write(str(p), cs.T, int(sr)) 30 | return p 31 | 32 | def _normalize_audio_input(AUDIO=None, audio_path: str="", audio_url: str="") -> Tuple[np.ndarray, int, Path]: 33 | if isinstance(AUDIO, dict) and "waveform" in AUDIO and "sample_rate" in AUDIO: 34 | wf: torch.Tensor = AUDIO["waveform"] 35 | sr = int(AUDIO["sample_rate"]) 36 | if wf.dim() == 3: 37 | wf = wf[0] 38 | if wf.dim() != 2: 39 | raise RuntimeError(f"Unexpected AUDIO tensor shape: {tuple(wf.shape)} (want [C,T])") 40 | cs = wf.detach().cpu().float().numpy() 41 | return cs, sr, _save_temp_wav(cs, sr) 42 | 43 | if isinstance(AUDIO, (list, tuple)) and len(AUDIO) == 2: 44 | arr, sr = AUDIO 45 | cs = _to_cs(np.asarray(arr)) 46 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 47 | 48 | if audio_path: 49 | p = Path(audio_path) 50 | if not p.exists(): 51 | raise RuntimeError(f"audio_path not found: {audio_path}") 52 | y, sr = sf.read(str(p), dtype="float32", always_2d=False) 53 | cs = _to_cs(y) 54 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 55 | 56 | if audio_url: 57 | import requests 58 | r = requests.get(audio_url, timeout=60); r.raise_for_status() 59 | p = Path(tempfile.gettempdir()) / f"eg_url_{int(time.time()*1000)}.wav" 60 | p.write_bytes(r.content) 61 | y, sr = sf.read(str(p), dtype="float32", always_2d=False) 62 | cs = _to_cs(y) 63 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 64 | 65 | raise RuntimeError("No AUDIO provided.") 66 | 67 | def _ensure_cpu_pkg(): 68 | try: 69 | import fat_llama_fftw # noqa: F401 70 | except Exception as e: 71 | raise RuntimeError( 72 | "Missing dependency: fat-llama-fftw. " 73 | "Install into ComfyUI's Python: `python -m pip install fat-llama-fftw`." 74 | ) from e 75 | 76 | def _fat_llama_fftw_upscale( 77 | in_wav: Path, 78 | out_path: Path, 79 | target_format: str, 80 | max_iterations: int, 81 | threshold_value: float, 82 | target_bitrate_kbps: int, 83 | ): 84 | # Public API (CPU): from fat_llama_fftw.audio_fattener.feed import upscale 85 | # Example call & params documented in README/example.py. :contentReference[oaicite:1]{index=1} 86 | from fat_llama_fftw.audio_fattener.feed import upscale # type: ignore 87 | upscale( 88 | input_file_path=str(in_wav), 89 | output_file_path=str(out_path), 90 | source_format="wav", 91 | target_format=target_format, 92 | max_iterations=int(max_iterations), 93 | threshold_value=float(threshold_value), 94 | target_bitrate_kbps=int(target_bitrate_kbps), 95 | ) 96 | 97 | class EgregoraFatLlamaCPU: 98 | """ 99 | Spectral Enhance (Fat Llama — CPU/FFTW) 100 | — Pure CPU path using pyFFTW backend; no CUDA/CuPy required. 101 | — If you feed non-WAV inputs via path/URL, ffmpeg on PATH may be required by the package. 
102 | """ 103 | @classmethod 104 | def INPUT_TYPES(cls): 105 | return { 106 | "required": { 107 | "target_format": (["wav", "flac"],), 108 | "max_iterations": ("INT", {"default": 800, "min": 1, "max": 10000}), 109 | "threshold_value": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 1.0, "step": 0.01}), 110 | "target_bitrate_kbps": ("INT", {"default": 1411, "min": 64, "max": 5000}), 111 | }, 112 | "optional": { 113 | "AUDIO": ("AUDIO",), 114 | "audio_path": ("STRING", {"default": ""}), 115 | "audio_url": ("STRING", {"default": ""}), 116 | }, 117 | } 118 | 119 | RETURN_TYPES = RETURN_TYPES 120 | FUNCTION = FUNCTION 121 | CATEGORY = CATEGORY 122 | OUTPUT_NODE = False 123 | 124 | def run( 125 | self, 126 | target_format, 127 | max_iterations, 128 | threshold_value, 129 | target_bitrate_kbps, 130 | AUDIO=None, 131 | audio_path="", 132 | audio_url="", 133 | ): 134 | _ensure_cpu_pkg() 135 | 136 | cs, in_sr, in_wav = _normalize_audio_input(AUDIO, audio_path, audio_url) 137 | suffix = ".wav" if target_format == "wav" else ".flac" 138 | out_path = Path(tempfile.gettempdir()) / f"eg_fatllama_cpu_{int(time.time()*1000)}{suffix}" 139 | 140 | _fat_llama_fftw_upscale( 141 | in_wav=in_wav, 142 | out_path=out_path, 143 | target_format=target_format, 144 | max_iterations=max_iterations, 145 | threshold_value=threshold_value, 146 | target_bitrate_kbps=target_bitrate_kbps, 147 | ) 148 | 149 | y, sr = sf.read(str(out_path), dtype="float32", always_2d=False) 150 | cs_out = _to_cs(y) 151 | wf = torch.from_numpy(cs_out).unsqueeze(0).contiguous() # [1,C,T] 152 | return ({"waveform": wf, "sample_rate": int(sr)},) 153 | 154 | NODE_CLASS_MAPPINGS = {"EgregoraFatLlamaCPU": EgregoraFatLlamaCPU} 155 | NODE_DISPLAY_NAME_MAPPINGS = {"EgregoraFatLlamaCPU": "🎛️ Spectral Enhance (Fat Llama — CPU/FFTW)"} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🎧 ComfyUI — Egregora Audio Super‑Resolution 2 | 3 | Bring music up to studio‑grade sample rates right inside ComfyUI. 4 | 5 | This repo ships **three production‑oriented upscaling/enhancement nodes** and bundles a set of **integrated utility toolsets** (enhance, evaluation, null‑testing) so you can denoise → upscale → measure without wiring a huge graph. 
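All nodes exchange ComfyUI's standard **AUDIO** type — a dict carrying a float32 waveform tensor shaped `[B, C, T]` plus a sample rate. A minimal sketch of the convention (the variable names here are illustrative):

```python
import torch

# ComfyUI AUDIO convention used throughout this pack:
# {"waveform": torch.Tensor[B, C, T] in [-1, 1], "sample_rate": int}
audio = {"waveform": torch.zeros(1, 2, 48000), "sample_rate": 48000}  # 1 s silent stereo @ 48 kHz

wf, sr = audio["waveform"][0], audio["sample_rate"]  # -> [C, T] for processing
```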
6 | 7 | --- 8 | 9 | ## ✨ What’s inside 10 | 11 | ``` 12 | custom_nodes/ 13 | ComfyUI-Egregora-Audio-Super-Resolution/ 14 | __init__.py 15 | egregora_audio_super_resolution.py # FlashSR node 16 | egregora_fat_llama_gpu.py # Fat Llama (CUDA/CuPy) 17 | egregora_fat_llama_cpu.py # Fat Llama (CPU/FFTW) 18 | egregora_audio_enhance_extras.py # RNNoise / DeepFilterNet / WPE / DAC 19 | egregora_audio_eval_pack.py # ABX, Loudness/Match, Metrics, HQ Resample 20 | egregora_null_test_suite.py # Align, Gain‑Match, Null, Plots 21 | flashsr_min.py # Light wrapper for FlashSR 22 | install.py # Repo + weights/deps bootstrapper 23 | requirements.txt 24 | deps/ 25 | FlashSR_Inference/ # pulled automatically on install 26 | ``` 27 | 28 | ### Core nodes 29 | 30 | * **Audio Super Resolution (FlashSR)** — one‑step diffusion upsampler (music‑friendly) ⚡ 31 | * **Spectral Enhance (Fat Llama — GPU)** — CUDA/CuPy accelerated iterative spectral enhancer 🐍🧪 32 | * **Spectral Enhance (Fat Llama — CPU/FFTW)** — portable CPU fallback using pyFFTW 🧠 33 | 34 | ### Integrated utility toolsets (used inside the SR nodes) 35 | 36 | * **Enhance — Extras** 37 | 38 | * RNNoise Denoise (48 kHz, adaptive mix, strength, post‑gain) 39 | * DeepFilterNet 2/3 Denoise (48 kHz native) 40 | * WPE Dereverb (nara‑wpe) 41 | * DAC Encode/Decode (Descript Audio Codec) 42 | * **Eval Pack** 43 | 44 | * ABX prepare/judge clips 45 | * Loudness meter (BS.1770), Gain‑Match (LUFS/RMS) 46 | * Metrics: SI‑SDR, Log‑Spectral Distance (LSD) 47 | * High‑quality resampler (SciPy/torch fallbacks) 48 | * **Null Test Suite** 49 | 50 | * Align (XCorr GCC‑PHAT), Gain‑Match, Null, difference plots 51 | 52 | > These helpers are wired so you can ABX / null‑test right from the SR node panel. 53 | 54 | --- 55 | 56 | ## 🧩 Install (ComfyUI portable or venv) 57 | 58 | 1. **Copy the folder** to `ComfyUI/custom_nodes/` and restart ComfyUI once. 59 | 60 | 2. **Install Python deps** using ComfyUI’s Python: 61 | 62 | ```bash 63 | # From ComfyUI root 64 | python -m pip install -r custom_nodes/ComfyUI-Egregora-Audio-Super-Resolution/requirements.txt 65 | python custom_nodes/ComfyUI-Egregora-Audio-Super-Resolution/install.py 66 | ``` 67 | 68 | * We **do not** install `torch/torchaudio` here to avoid breaking ComfyUI’s CUDA build. 69 | * First run will: 70 | 71 | * clone `deps/FlashSR_Inference/` 72 | * check for FlashSR weights 73 | * warm up DeepFilterNet / DAC / RNNoise caches for smoother first use 74 | 75 | 3. **FlashSR repo & weights** 76 | 77 | * The node pulls the upstream inference code automatically into `deps/FlashSR_Inference/`. 78 | * This node does not include FlashSR code or weights. The commonly referenced FlashSR_Inference repo currently lacks a license. Unless you have explicit permission from the rights holder(s), do not use FlashSR code/weights for commercial purposes. Proceed at your own risk. 79 | * Place weights in `ComfyUI/models/audio/flashsr/` with **exact** filenames: 80 | 81 | * `student_ldm.pth`, `sr_vocoder.pth`, `vae.pth` 82 | * Or set an env var to auto‑download from your HF repo: 83 | 84 | ```bash 85 | # point to a HF repo containing those three files 86 | # Windows (cmd) 87 | set EGREGORA_FLASHSR_HF_REPO=yourname/flashsr-weights 88 | # macOS/Linux 89 | export EGREGORA_FLASHSR_HF_REPO=yourname/flashsr-weights 90 | ``` 91 | 92 | 4. 
**GPU extras (for the Fat‑Llama GPU node)** 93 | 94 | Install a CuPy wheel matching your CUDA (example for CUDA 12): 95 | 96 | ```bash 97 | python -m pip install "cupy-cuda12x>=13.0" 98 | ``` 99 | 100 | If Windows shows NVRTC / `vector_types.h` errors, install the CUDA runtime DLL wheels: 101 | 102 | ```bash 103 | python -m pip install -U nvidia-cuda-runtime-cu12 nvidia-cuda-nvrtc-cu12 \ 104 | nvidia-cublas-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 \ 105 | nvidia-cusolver-cu12 nvidia-cusparse-cu12 106 | ``` 107 | 108 | 5. **FFmpeg** 109 | 110 | Ensure FFmpeg is on your PATH for reading/encoding audio. 111 | 112 | --- 113 | 114 | ## 📦 Requirements 115 | 116 | `requirements.txt` keeps things lean: 117 | 118 | * Core: `soundfile`, `numpy`, `tqdm`, `requests`, `huggingface_hub` 119 | * SR/enhance: `fat-llama`, `fat-llama-fftw`, `pyrnnoise`, `deepfilternet` (import as `df`), `nara-wpe` (import as `nara_wpe`), `descript-audio-codec` 120 | * `scipy` for the HQ resampler/metrics (included in `requirements.txt`; SciPy‑free fallbacks exist in code) 121 | 122 | > Booleans in node UIs use the `BOOLEAN` datatype in `INPUT_TYPES` (proper toggle). 123 | 124 | --- 125 | 126 | ## 🛠️ Nodes & key settings 127 | 128 | ### 1) **Audio Super Resolution (FlashSR)** 129 | 130 | * Chunks → overlap‑add → stitches at 48 kHz, then resamples to the chosen target. 131 | * **Inputs**: `audio` (AUDIO), `lowpass_input` (BOOLEAN — gentle LPF on the input), `output_sr` (48000 / 44100 / 96000). Chunking is fixed internally: 5.12 s windows with 0.50 s Hann overlap‑add. 132 | * **Outputs**: **AUDIO**. 133 | 134 | ### 2) **Spectral Enhance (Fat Llama — GPU/CPU)** 135 | 136 | * Iterative soft‑thresholding with spectral post. 137 | * **Inputs**: `max_iterations`, `threshold_value`, `target_bitrate_kbps`, `toggle_autoscale` (GPU node only), `target_format`, `audio_path` / `audio_url`. 138 | * **Outputs**: **AUDIO** (the processed file is written to a temp path and read back). 139 | 140 | ### Utility toolsets (used inside SR nodes) 141 | 142 | * **Denoise/Dereverb**: RNNoise, DeepFilterNet 2/3, WPE 143 | * **Codec**: DAC encode/decode 144 | * **Eval**: ABX clips + judge, BS.1770 loudness, gain‑match, SI‑SDR, LSD 145 | * **Null**: Align → match → null + difference plots 146 | 147 | --- 148 | 149 | ## 🎚️ Quality tips (music) 150 | 151 | * **FlashSR first, Llama second**: upscale to 48k, then a *light* Llama pass (`iterations≈200`, `threshold≈0.5`) if you want a touch of sparkle. 152 | * **Overlap**: chunk overlap is fixed at 0.50 s (Hann WOLA), so seams are stitched automatically — no user knob needed. 153 | * **Don’t over‑iterate**: very high iterations/threshold can sound brittle. 154 | 155 | --- 156 | 157 | ## 🔍 Licenses (upstream projects) 158 | 159 | * **Fat‑Llama / fat‑llama‑fftw**: BSD‑3‑Clause (see PyPI). 160 | * **FlashSR_Inference**: check upstream repo for license status. 161 | * This ComfyUI integration is licensed as per this repository’s LICENSE. 162 | 163 | --- 164 | 165 | ## 🧪 Troubleshooting 166 | 167 | * **FlashSR import error**: delete `deps/FlashSR_Inference/` and restart to re‑bootstrap. 168 | * **Missing FlashSR weights**: place the 3 files in `models/audio/flashsr/` or set `EGREGORA_FLASHSR_HF_REPO`. 169 | * **CUDA/CuPy NVRTC errors (Windows)**: install the `nvidia-*-cu12` runtime wheels listed above and ensure your CuPy wheel matches CUDA. 170 | * **FFmpeg not found**: install FFmpeg and ensure it’s on PATH. 171 | 172 | --- 173 | 174 | ## 🙌 Credits 175 | 176 | * FlashSR research & inference code by the original authors. 177 | * Fat Llama packages by RaAd (PyPI maintainer). 178 | * ComfyUI integration & node UX by Egregora. 179 | 180 | Happy upsampling! 
🎶 181 | 182 | --- 183 | 184 | ## 📜 Changelog 185 | 186 | * **v0.2.0** — Added Enhance/Eval/Null toolsets; new installer + warmups. 187 | * **v0.1.0** — Initial release: FlashSR SR node, Fat Llama GPU/CPU. 188 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import sys, subprocess, importlib, os, hashlib, zipfile 3 | from pathlib import Path 4 | import requests 5 | from huggingface_hub import hf_hub_download 6 | 7 | # ---------- Paths (kept exactly as you had) ---------- 8 | THIS = Path(__file__).resolve() 9 | PKG = THIS.parent 10 | COMFY_ROOT = (PKG.parent.parent if PKG.parent.name == "custom_nodes" else PKG.parent) 11 | DEPS = PKG / "deps" 12 | FLASH_REPO_DIR = DEPS / "FlashSR_Inference" 13 | WEIGHTS_DIR = COMFY_ROOT / "models" / "audio" / "flashsr" 14 | 15 | DEPS.mkdir(parents=True, exist_ok=True) 16 | WEIGHTS_DIR.mkdir(parents=True, exist_ok=True) 17 | 18 | # ---------- Small helpers ---------- 19 | def _download(url: str, dst: Path, sha256: str | None = None): 20 | r = requests.get(url, timeout=180) 21 | r.raise_for_status() 22 | data = r.content 23 | if sha256 and hashlib.sha256(data).hexdigest().lower() != sha256.lower(): 24 | raise RuntimeError(f"SHA256 mismatch for {url}") 25 | dst.write_bytes(data) 26 | 27 | def _pip_install(args: list[str]): 28 | print("[Egregora] pip", " ".join(args)) 29 | cmd = [sys.executable, "-m", "pip", "install", *args] 30 | try: 31 | subprocess.check_call(cmd) 32 | except subprocess.CalledProcessError as e: 33 | print("[Egregora] pip failed:", e) 34 | 35 | def _ensure(import_name: str, pip_name: str | None = None, extra_args: list[str] | None = None, try_no_deps: bool = False): 36 | """ 37 | Import a module, installing it if missing. When try_no_deps is True, 38 | we first attempt '--no-deps' to avoid pulling CPU torch into ComfyUI. 
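    Example (mirrors the calls in ensure_runtime_deps below):
        _ensure("nara_wpe", pip_name="nara-wpe")
        _ensure("df", pip_name="deepfilternet", try_no_deps=True)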
39 | """ 40 | try: 41 | importlib.import_module(import_name) 42 | return True 43 | except Exception: 44 | pass 45 | 46 | target = pip_name or import_name 47 | if try_no_deps: 48 | _pip_install(["--no-deps", target, *(extra_args or [])]) 49 | try: 50 | importlib.import_module(import_name) 51 | return True 52 | except Exception: 53 | print(f"[Egregora] '{target}' import still failing; retrying with full deps…") 54 | 55 | _pip_install([target, *(extra_args or [])]) 56 | try: 57 | importlib.import_module(import_name) 58 | return True 59 | except Exception as e: 60 | print(f"[Egregora] Could not import {import_name}: {e}") 61 | return False 62 | 63 | # ---------- Your existing FlashSR bootstrap ---------- 64 | def grab_repo_zip(): 65 | if FLASH_REPO_DIR.exists(): 66 | return 67 | print("[Egregora] Fetching FlashSR_Inference repository…") 68 | url = "https://github.com/jakeoneijk/FlashSR_Inference/archive/refs/heads/main.zip" 69 | zpath = DEPS / "FlashSR_Inference.zip" 70 | _download(url, zpath) 71 | with zipfile.ZipFile(zpath, "r") as zf: 72 | zf.extractall(DEPS) 73 | inner = next(p for p in DEPS.glob("FlashSR_Inference-*") if p.is_dir()) 74 | inner.rename(FLASH_REPO_DIR) 75 | zpath.unlink(missing_ok=True) 76 | print("[Egregora] FlashSR_Inference ready at:", FLASH_REPO_DIR) 77 | 78 | def try_fetch_weights(): 79 | # If you host the three weights on HF, set EGREGORA_FLASHSR_HF_REPO 80 | # (filenames must be: student_ldm.pth, sr_vocoder.pth, vae.pth) 81 | hf_repo = os.environ.get("EGREGORA_FLASHSR_HF_REPO", "") 82 | need = ["student_ldm.pth", "sr_vocoder.pth", "vae.pth"] 83 | if hf_repo: 84 | for fname in need: 85 | dst = WEIGHTS_DIR / fname 86 | if dst.exists(): 87 | continue 88 | try: 89 | print(f"[Egregora] Downloading {fname} from HF repo {hf_repo} …") 90 | hf_hub_download(repo_id=hf_repo, filename=fname, local_dir=WEIGHTS_DIR) 91 | except Exception as e: 92 | print(f"[Egregora] HF download failed for {fname}: {e}") 93 | 94 | missing = [n for n in need if not (WEIGHTS_DIR / n).exists()] 95 | if missing: 96 | print("\n[Egregora] FlashSR weights missing:", ", ".join(missing)) 97 | print("Place them here:", WEIGHTS_DIR) 98 | print("Filenames are exactly: student_ldm.pth, sr_vocoder.pth, vae.pth") 99 | print("See repo for context: https://github.com/jakeoneijk/FlashSR_Inference") 100 | else: 101 | print("[Egregora] FlashSR weights present:", WEIGHTS_DIR) 102 | 103 | # ---------- New: model/runtime deps + warmups ---------- 104 | def ensure_runtime_deps(): 105 | # keep your requirements light; install optional bits here if missing 106 | _ensure("numpy") 107 | _ensure("soundfile") 108 | _ensure("tqdm") 109 | _ensure("requests") 110 | _ensure("huggingface_hub") 111 | 112 | # Models / processors used by your integrated nodes 113 | _ensure("pyrnnoise") # RNNoise bindings 114 | _ensure("nara_wpe", pip_name="nara-wpe") # dereverb 115 | _ensure("dac", pip_name="descript-audio-codec") # Descript Audio Codec 116 | 117 | # DeepFilterNet (df). Try --no-deps first to avoid pulling a CPU torch. 118 | # ComfyUI already has torch/torchaudio. 
119 | _ensure("df", pip_name="deepfilternet", try_no_deps=True) 120 | 121 | # Fat Llama (already in requirements, but double-check) 122 | _ensure("fat_llama", pip_name="fat-llama") 123 | _ensure("fat_llama_fftw", pip_name="fat-llama-fftw") 124 | 125 | # Optional: SciPy for HQ resampler/metrics in the Eval Pack 126 | _ensure("scipy") 127 | 128 | def warmup_deepfilternet(): 129 | try: 130 | import torch 131 | from df.enhance import init_df, enhance # type: ignore 132 | # This triggers model settings + checkpoint discovery and caches them 133 | model, df_state, _ = init_df(); sr = df_state.sr() # init_df returns (model, df_state, suffix) 134 | x = torch.zeros(1, int(sr * 0.1)) # 100 ms of silence 135 | with torch.no_grad(): 136 | _y = enhance(model, df_state, x) # enhance returns the enhanced tensor 137 | print("[Egregora] DeepFilterNet warmup OK") 138 | except Exception as e: 139 | print("[Egregora] DeepFilterNet warmup skipped:", e) 140 | 141 | def warmup_dac(): 142 | try: 143 | import dac 144 | # Downloads default weights to local cache (~first use) 145 | _ = dac.utils.download(model_type="44khz") 146 | print("[Egregora] DAC warmup OK") 147 | except Exception as e: 148 | print("[Egregora] DAC warmup skipped:", e) 149 | 150 | def warmup_rnnoise(): 151 | # Nothing to download, but a tiny call verifies the backend 152 | try: 153 | import numpy as np 154 | from pyrnnoise import RNNoise 155 | rn = RNNoise(sample_rate=48000) 156 | if getattr(rn, "channels", None) in (None, 0): 157 | setattr(rn, "channels", 1) 158 | test = np.zeros((1, 4800), dtype=np.int16) # 100 ms 159 | _ = list(rn.denoise_chunk(test)) # iterate a few frames 160 | print("[Egregora] RNNoise warmup OK") 161 | except Exception as e: 162 | print("[Egregora] RNNoise warmup skipped:", e) 163 | 164 | # ---------- Entry ---------- 165 | if __name__ == "__main__": 166 | ensure_runtime_deps() 167 | # Keep your original FlashSR bootstrap 168 | grab_repo_zip() 169 | try_fetch_weights() 170 | 171 | # Friendly first-run warmups 172 | warmup_deepfilternet() 173 | warmup_dac() 174 | warmup_rnnoise() 175 | 176 | print("[Egregora] Install complete.") 177 | -------------------------------------------------------------------------------- /egregora_fat_llama_gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import tempfile 5 | import platform 6 | from pathlib import Path 7 | from typing import Tuple 8 | import numpy as np 9 | import soundfile as sf 10 | import torch 11 | 12 | RETURN_TYPES = ("AUDIO",) 13 | FUNCTION = "run" 14 | CATEGORY = "Egregora/Audio" 15 | 16 | # ---------------- I/O helpers ---------------- 17 | 18 | def _to_cs(x: np.ndarray) -> np.ndarray: 19 | """Return channels-first float32 [C,S]; accepts [S], [S,C], [C,S].""" 20 | a = np.asarray(x, dtype=np.float32) 21 | if a.ndim == 1: 22 | a = a[None, :] 23 | elif a.ndim == 2: 24 | h, w = a.shape 25 | if w <= 8 and h > w: # soundfile often returns [S,C] 26 | a = a.T 27 | else: 28 | a = a.reshape(-1)[None, :] 29 | m = float(np.max(np.abs(a))) if a.size else 0.0 30 | if m > 1.0: # safety clamp if upstream sent > 1.0 31 | a = a / (m + 1e-8) 32 | return a.astype(np.float32) 33 | 34 | def _save_temp_wav(cs: np.ndarray, sr: int) -> Path: 35 | p = Path(tempfile.gettempdir()) / f"eg_in_{int(time.time()*1000)}.wav" 36 | sf.write(str(p), cs.T, int(sr)) 37 | return p 38 | 39 | def _normalize_audio_input(AUDIO=None, audio_path: str = "", audio_url: str = "") -> Tuple[np.ndarray, int, Path]: 40 | """ 41 | Accept ComfyUI AUDIO dict, or a file path/url; return ([C,S], sr, temp_wav_path). 
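    Accepted forms, in the order checked below: a ComfyUI AUDIO dict
    ({"waveform": [B,C,T] or [C,T] tensor, "sample_rate": int}), an (array, sr)
    tuple, an existing audio_path, or an http(s) audio_url.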
42 | """ 43 | # ComfyUI's AUDIO: {"waveform": [B,C,T], "sample_rate": sr} 44 | if isinstance(AUDIO, dict) and "waveform" in AUDIO and "sample_rate" in AUDIO: 45 | wf: torch.Tensor = AUDIO["waveform"] 46 | sr = int(AUDIO["sample_rate"]) 47 | if wf.dim() == 3: 48 | wf = wf[0] # [C,T] 49 | if wf.dim() != 2: 50 | raise RuntimeError(f"Unexpected AUDIO tensor shape: {tuple(wf.shape)} (want [C,T])") 51 | cs = wf.detach().cpu().float().numpy() 52 | return cs, sr, _save_temp_wav(cs, sr) 53 | 54 | # (arr, sr) tuple 55 | if isinstance(AUDIO, (list, tuple)) and len(AUDIO) == 2: 56 | arr, sr = AUDIO 57 | cs = _to_cs(np.asarray(arr)) 58 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 59 | 60 | # explicit file path 61 | if audio_path: 62 | p = Path(audio_path) 63 | if not p.exists(): 64 | raise RuntimeError(f"audio_path not found: {audio_path}") 65 | y, sr = sf.read(str(p), dtype="float32", always_2d=False) 66 | cs = _to_cs(y) 67 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 68 | 69 | # URL fetch 70 | if audio_url: 71 | import requests 72 | r = requests.get(audio_url, timeout=60); r.raise_for_status() 73 | p = Path(tempfile.gettempdir()) / f"eg_url_{int(time.time()*1000)}.wav" 74 | p.write_bytes(r.content) 75 | y, sr = sf.read(str(p), dtype="float32", always_2d=False) 76 | cs = _to_cs(y) 77 | return cs, int(sr), _save_temp_wav(cs, int(sr)) 78 | 79 | raise RuntimeError("No AUDIO provided.") 80 | 81 | # ---------------- CUDA/CuPy wiring (Windows) ---------------- 82 | 83 | def _wire_cuda_for_cupy_windows(): 84 | """ 85 | On Windows portable installs, make NVIDIA pip-wheel DLLs & headers discoverable: 86 | • Add ...\site-packages\nvidia\\bin to the DLL search path 87 | • Point CUDA_PATH to ...\site-packages\nvidia\cuda_runtime (has include/) 88 | Must run BEFORE importing cupy. 89 | """ 90 | if platform.system() != "Windows": 91 | return 92 | 93 | sp = Path(sys.executable).parent / "Lib" / "site-packages" / "nvidia" 94 | rt = sp / "cuda_runtime" # contains include/ and bin/ 95 | nvrtc = sp / "cuda_nvrtc" # contains bin/ 96 | 97 | # Let CuPy find headers at runtime (NVRTC needs CUDA runtime headers >= CUDA 12.2) 98 | if rt.exists(): 99 | os.environ.setdefault("CUDA_PATH", str(rt)) 100 | 101 | # Make DLLs loadable for this process (Python 3.8+) 102 | for p in (rt / "bin", nvrtc / "bin"): 103 | if p.exists(): 104 | try: 105 | os.add_dll_directory(str(p)) 106 | except Exception: 107 | os.environ["PATH"] = f"{str(p)};{os.environ.get('PATH','')}" 108 | 109 | # ---------------- Fat Llama wrapper ---------------- 110 | 111 | def _ensure_gpu_stack(): 112 | """ 113 | Validate CUDA/CuPy presence early and give a friendly error if not available. 114 | Also ensure DLL search paths & headers are wired so CuPy can load cudart/nvrtc 115 | and find CUDA runtime headers like vector_types.h. 116 | """ 117 | _wire_cuda_for_cupy_windows() 118 | 119 | if not torch.cuda.is_available(): 120 | raise RuntimeError( 121 | "CUDA GPU not detected. Fat Llama (GPU) requires an NVIDIA GPU. " 122 | "If you need CPU, use the separate Fat Llama — CPU/FFTW node." 123 | ) 124 | 125 | try: 126 | import cupy # noqa: F401 (import after wiring) 127 | except Exception as e: 128 | raise RuntimeError( 129 | "CuPy failed to import. Ensure you've installed a CUDA-12 build " 130 | "(`pip install cupy-cuda12x`) and matching NVIDIA runtime headers & NVRTC " 131 | "(`pip install \"nvidia-cuda-runtime-cu12==12.X.*\" \"nvidia-cuda-nvrtc-cu12==12.X.*\"`)." 
132 | ) from e 133 | 134 | def _fat_llama_upscale( 135 | in_wav: Path, 136 | out_path: Path, 137 | target_format: str, 138 | max_iterations: int, 139 | threshold_value: float, 140 | target_bitrate_kbps: int, 141 | toggle_autoscale: bool, 142 | ): 143 | """Call the public API: fat_llama.audio_fattener.feed.upscale(...)""" 144 | from fat_llama.audio_fattener.feed import upscale # late import 145 | 146 | # Normalize ALWAYS on; Adaptive filter disabled for perf/stability 147 | upscale( 148 | input_file_path=str(in_wav), 149 | output_file_path=str(out_path), 150 | source_format="wav", 151 | target_format=target_format, 152 | max_iterations=int(max_iterations), 153 | threshold_value=float(threshold_value), 154 | target_bitrate_kbps=int(target_bitrate_kbps), 155 | toggle_normalize=True, 156 | toggle_autoscale=bool(toggle_autoscale), 157 | toggle_adaptive_filter=False, 158 | ) 159 | 160 | # ---------------- ComfyUI Node ---------------- 161 | 162 | class EgregoraFatLlamaGPU: 163 | """ 164 | Spectral Enhance (Fat Llama — GPU only) 165 | - Normalize is always ON (clamps final amplitude and prevents clipping). 166 | - Adaptive filter disabled for speed (still available in library if you want a "slow" node). 167 | """ 168 | @classmethod 169 | def INPUT_TYPES(cls): 170 | return { 171 | "required": { 172 | "target_format": (["wav", "flac"],), 173 | "max_iterations": ("INT", {"default": 300, "min": 1, "max": 5000}), 174 | "threshold_value": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 1.0, "step": 0.01}), 175 | "target_bitrate_kbps": ("INT", {"default": 1411, "min": 64, "max": 5000}), 176 | "toggle_autoscale": ("BOOLEAN", {"default": True}), 177 | }, 178 | "optional": { 179 | "AUDIO": ("AUDIO",), 180 | "audio_path": ("STRING", {"default": ""}), 181 | "audio_url": ("STRING", {"default": ""}), 182 | }, 183 | } 184 | 185 | RETURN_TYPES = RETURN_TYPES 186 | FUNCTION = FUNCTION 187 | CATEGORY = CATEGORY 188 | OUTPUT_NODE = False 189 | 190 | def run( 191 | self, 192 | target_format, 193 | max_iterations, 194 | threshold_value, 195 | target_bitrate_kbps, 196 | toggle_autoscale, 197 | AUDIO=None, 198 | audio_path="", 199 | audio_url="", 200 | ): 201 | _ensure_gpu_stack() 202 | 203 | # Normalize inbound audio to a temp WAV we can hand to fat_llama 204 | cs, in_sr, in_wav = _normalize_audio_input(AUDIO, audio_path, audio_url) 205 | 206 | # Choose an output temp path with chosen container 207 | suffix = ".wav" if target_format == "wav" else ".flac" 208 | out_path = Path(tempfile.gettempdir()) / f"eg_fatllama_{int(time.time()*1000)}{suffix}" 209 | 210 | # Run Fat Llama with always-on normalization and no adaptive filter 211 | _fat_llama_upscale( 212 | in_wav=in_wav, 213 | out_path=out_path, 214 | target_format=target_format, 215 | max_iterations=max_iterations, 216 | threshold_value=threshold_value, 217 | target_bitrate_kbps=target_bitrate_kbps, 218 | toggle_autoscale=toggle_autoscale, 219 | ) 220 | 221 | # Read result back into Comfy 222 | y, sr = sf.read(str(out_path), dtype="float32", always_2d=False) 223 | cs_out = _to_cs(y) 224 | wf = torch.from_numpy(cs_out).unsqueeze(0).contiguous() # [1,C,T] 225 | return ({"waveform": wf, "sample_rate": int(sr)},) 226 | 227 | # Register node 228 | NODE_CLASS_MAPPINGS = { 229 | "EgregoraFatLlamaGPU": EgregoraFatLlamaGPU, 230 | } 231 | 232 | NODE_DISPLAY_NAME_MAPPINGS = { 233 | "EgregoraFatLlamaGPU": "🎛️ Spectral Enhance (Fat Llama — GPU)", 234 | } -------------------------------------------------------------------------------- /egregora_audio_super_resolution.py: 
-------------------------------------------------------------------------------- 1 | # 🎧 ComfyUI — Audio Super Resolution (FlashSR) 2 | # Minimal, single-output node with robust shapes and HQ resampling. 3 | # Inputs: audio (AUDIO), lowpass_input (BOOL), output_sr (enum) 4 | # Output: audio (AUDIO) 5 | # 6 | # Internals: 7 | # - Normalize to [C, S] consistently (soundfile returns [S, C] -> transpose) 8 | # - Fixed chunking: 5.12 s, overlap: 0.50 s, Hann WOLA stitching 9 | # - Inference at 48 kHz (FlashSR’s design target), optional post-resample 10 | # - HQ SRC cascade: soxr -> scipy.signal.resample_poly -> torchaudio -> linear 11 | # 12 | # SPDX: MIT 13 | 14 | import os, sys, time 15 | from pathlib import Path 16 | from typing import Optional, Tuple, List, Dict, Any 17 | 18 | import numpy as np 19 | import torch 20 | 21 | FUNCTION = "run" 22 | CATEGORY = "Egregora/Audio" 23 | 24 | # ---------- paths ---------- 25 | def _custom_root() -> Path: 26 | return Path(__file__).resolve().parent 27 | 28 | def _models_dir() -> Path: 29 | # .../ComfyUI/models — this package sits in ComfyUI/custom_nodes/<pkg>, so ComfyUI is parents[1] 30 | return _custom_root().parents[1] / "models" 31 | 32 | def _audio_models_subdir(name: str) -> Path: 33 | d = _models_dir() / "audio" / name 34 | d.mkdir(parents=True, exist_ok=True) 35 | return d 36 | 37 | # ---------- AUDIO helpers ---------- 38 | def _make_audio(sr: int, samples_cs: np.ndarray) -> Dict[str, Any]: 39 | """Build a ComfyUI AUDIO dict from [C, S] float32.""" 40 | s = np.asarray(samples_cs, dtype=np.float32) 41 | if s.ndim == 1: 42 | s = s[None, :] 43 | C, T = s.shape 44 | wf = torch.from_numpy(s).unsqueeze(0).contiguous() # [1, C, T] 45 | return {"waveform": wf, "sample_rate": int(sr)} 46 | 47 | def _from_audio_dict(AUDIO: Any) -> Tuple[np.ndarray, int]: 48 | """ 49 | Accept Comfy AUDIO dict or (ndarray, sr). Return [C, S] float32 and sr. 50 | """ 51 | # Comfy AUDIO dict 52 | if isinstance(AUDIO, dict) and "waveform" in AUDIO and "sample_rate" in AUDIO: 53 | wf: torch.Tensor = AUDIO["waveform"] 54 | sr = int(AUDIO["sample_rate"]) 55 | if wf.dim() == 3: 56 | wf = wf[0] # [C, T] 57 | if wf.dim() != 2: 58 | raise RuntimeError(f"Unexpected AUDIO tensor shape {tuple(wf.shape)}; expected [C, T].") 59 | cs = wf.detach().cpu().float().numpy() # [C, T] 60 | return cs, sr 61 | # (array, sr) 62 | if isinstance(AUDIO, (list, tuple)) and len(AUDIO) == 2: 63 | arr, sr = AUDIO 64 | arr = np.asarray(arr, dtype=np.float32) 65 | if arr.ndim == 1: 66 | # mono [S] -> [1, S] 67 | cs = arr[None, :] 68 | elif arr.ndim == 2: 69 | # could be [S, C] or [C, S]; treat 1st dim as frames if it's much larger 70 | if arr.shape[0] >= arr.shape[1] and arr.shape[1] <= 8: 71 | # soundfile/frames-first -> transpose to [C, S] 72 | cs = arr.T 73 | else: 74 | cs = arr # already [C, S] 75 | else: 76 | cs = arr.reshape(1, -1) 77 | return cs.astype(np.float32), int(sr) 78 | raise RuntimeError("No valid AUDIO provided.") 79 | 80 | # ---------- HQ resampling ---------- 81 | def _resample_hq(x_cs: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray: 82 | """ 83 | Prefer soxr -> scipy.signal.resample_poly -> torchaudio -> linear. 84 | Operates on [C, S] along the sample axis. 
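    Example: _resample_hq(np.zeros((2, 44100), np.float32), 44100, 48000)
    returns an array of shape (2, 48000).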
85 | """ 86 | if src_sr == dst_sr: 87 | return x_cs.astype(np.float32) 88 | 89 | # soxr 90 | try: 91 | import soxr # type: ignore 92 | out = [soxr.resample(x_cs[c], src_sr, dst_sr) for c in range(x_cs.shape[0])] 93 | # equalize length (guard) 94 | L = min(map(len, out)) 95 | out = np.stack([ch[:L] for ch in out], axis=0) 96 | return out.astype(np.float32) 97 | except Exception: 98 | pass 99 | 100 | # SciPy polyphase 101 | try: 102 | from math import gcd 103 | from scipy.signal import resample_poly # type: ignore 104 | g = gcd(src_sr, dst_sr) 105 | up, down = dst_sr // g, src_sr // g 106 | out = [resample_poly(x_cs[c], up=up, down=down).astype(np.float32) for c in range(x_cs.shape[0])] 107 | L = min(map(len, out)) 108 | out = np.stack([ch[:L] for ch in out], axis=0) 109 | return out 110 | except Exception: 111 | pass 112 | 113 | # torchaudio windowed-sinc 114 | try: 115 | import torchaudio # type: ignore 116 | t = torch.from_numpy(x_cs).float() # [C, S] 117 | rs = torchaudio.transforms.Resample(orig_freq=src_sr, new_freq=dst_sr) 118 | y = rs(t) # [C, S'] 119 | return y.numpy().astype(np.float32) 120 | except Exception: 121 | pass 122 | 123 | # linear interp fallback (lowest quality) 124 | ratio = dst_sr / float(src_sr) 125 | n_out = int(round(x_cs.shape[1] * ratio)) 126 | t_in = np.linspace(0.0, 1.0, x_cs.shape[1], endpoint=False, dtype=np.float64) 127 | t_out = np.linspace(0.0, 1.0, n_out, endpoint=False, dtype=np.float64) 128 | out = np.stack([np.interp(t_out, t_in, ch) for ch in x_cs], axis=0).astype(np.float32) 129 | return out 130 | 131 | # ---------- chunking & WOLA ---------- 132 | def _hann(L: int) -> np.ndarray: 133 | return np.hanning(L).astype(np.float32) 134 | 135 | def _iter_chunks(total_samples: int, win: int, hop: int) -> List[Tuple[int, int]]: 136 | """ 137 | Yield (start, length) for each chunk to cover [0, total_samples). 138 | """ 139 | spans: List[Tuple[int, int]] = [] 140 | i = 0 141 | while i < total_samples: 142 | L = min(win, total_samples - i) 143 | spans.append((i, L)) 144 | if i + L >= total_samples: 145 | break 146 | i += hop 147 | return spans 148 | 149 | def _wola_stitch(chunks_pred: List[Tuple[np.ndarray, int, int]], total_len: int, win: int) -> np.ndarray: 150 | """ 151 | Overlap-add predicted chunks with Hann window. 152 | chunks_pred: list of (pred_cs [C, L_pred], start, L_in) 153 | L_in = original (unpadded) input length for that chunk 154 | Returns [C, total_len]. 
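    A handy property for testing: stitching *unmodified* chunks reproduces the
    input exactly, since acc/wsum cancels the per-sample Hann weights wherever
    any chunk contributes nonzero weight (see the sanity-check sketch at the
    end of this file).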
155 | """ 156 | if not chunks_pred: 157 | return np.zeros((1, max(1, total_len)), np.float32) 158 | 159 | C = chunks_pred[0][0].shape[0] 160 | acc = np.zeros((C, total_len), np.float32) 161 | wsum = np.zeros(total_len, np.float32) 162 | w_full = _hann(win) 163 | 164 | for y_cs, start, L_in in chunks_pred: 165 | L_pred = y_cs.shape[1] 166 | L = min(L_in, L_pred) # only weight the valid (unpadded) part 167 | w = w_full[:L] if L <= win else np.ones(L, np.float32) 168 | acc[:, start:start+L] += y_cs[:, :L] * w[None, :] 169 | wsum[start:start+L] += w 170 | 171 | wsum[wsum == 0] = 1.0 172 | out = acc / wsum[None, :] 173 | return out.astype(np.float32) 174 | 175 | # ---------- FlashSR loader ---------- 176 | class _FlashSRRunner: 177 | REQ_SR = 48000 178 | CHUNK_S = 5.12 179 | OVERLAP_S = 0.50 180 | CHUNK_SAMPLES = int(REQ_SR * CHUNK_S) # 245760 181 | 182 | HF_DATASET = "jakeoneijk/FlashSR_weights" 183 | HF_FILES = ("student_ldm.pth", "sr_vocoder.pth", "vae.pth") 184 | 185 | def __init__(self, lowpass: bool = False): 186 | self.lowpass = bool(lowpass) 187 | self.ckpt_dir = _audio_models_subdir("flashsr") 188 | self.repo_path = self._resolve_repo_path() 189 | self._dev = torch.device("cuda" if torch.cuda.is_available() else "cpu") 190 | self._FlashSRClass = None 191 | self._model = None 192 | self._ensure_weights() 193 | self._import() 194 | self._ensure_model() 195 | 196 | def _resolve_repo_path(self) -> Path: 197 | env_repo = os.environ.get("EGREGORA_FLASHSR_REPO") 198 | if env_repo: 199 | return Path(env_repo) 200 | # default: custom_nodes/ComfyUI-Egregora-Audio-Super-Resolution/deps/FlashSR_Inference 201 | return _custom_root() / "deps" / "FlashSR_Inference" # the package dir itself holds deps/ (matches install.py) 202 | 203 | def _ensure_weights(self): 204 | missing = [f for f in self.HF_FILES if not (self.ckpt_dir / f).exists()] 205 | if not missing: 206 | return 207 | # Try huggingface_hub first 208 | try: 209 | from huggingface_hub import hf_hub_download # type: ignore 210 | for fname in missing: 211 | hf_hub_download( 212 | repo_id=self.HF_DATASET, 213 | filename=fname, 214 | repo_type="dataset", 215 | local_dir=str(self.ckpt_dir), 216 | ) 217 | print(f"[FlashSR] Downloaded via huggingface_hub: {', '.join(missing)}") 218 | return 219 | except Exception as e: 220 | print(f"[FlashSR] huggingface_hub unavailable or failed ({e}); falling back to direct HTTP…") 221 | # Fallback: direct HTTP 222 | try: 223 | import requests # type: ignore 224 | for fname in missing: 225 | url = f"https://huggingface.co/datasets/{self.HF_DATASET}/resolve/main/{fname}?download=true" 226 | dst = self.ckpt_dir / fname 227 | with requests.get(url, stream=True, timeout=1800) as r: 228 | r.raise_for_status() 229 | with open(dst, "wb") as f: 230 | for chunk in r.iter_content(chunk_size=1024 * 1024): 231 | if chunk: 232 | f.write(chunk) 233 | print(f"[FlashSR] Downloaded: {dst}") 234 | except Exception as ee: 235 | raise RuntimeError( 236 | "FlashSR weights missing and auto-download failed. 
" 237 | "Place these in models/audio/flashsr: student_ldm.pth, sr_vocoder.pth, vae.pth" 238 | ) from ee 239 | 240 | def _import(self): 241 | if self._FlashSRClass is not None: 242 | return 243 | try: 244 | from FlashSR.FlashSR import FlashSR # type: ignore 245 | self._FlashSRClass = FlashSR 246 | return 247 | except Exception: 248 | cand = self.repo_path 249 | if (cand / "FlashSR").exists(): 250 | sys.path.insert(0, str(cand)) 251 | from FlashSR.FlashSR import FlashSR # type: ignore 252 | self._FlashSRClass = FlashSR 253 | return 254 | raise RuntimeError("FlashSR module not found. Install/clone and set EGREGORA_FLASHSR_REPO if needed.") 255 | 256 | def _ensure_model(self): 257 | if self._model is not None: 258 | return 259 | FlashSR = self._FlashSRClass 260 | s = str(self.ckpt_dir / "student_ldm.pth") 261 | v = str(self.ckpt_dir / "sr_vocoder.pth") 262 | vae = str(self.ckpt_dir / "vae.pth") 263 | model = FlashSR(s, v, vae) 264 | model.eval() 265 | try: 266 | model.to(self._dev) 267 | except Exception: 268 | pass 269 | self._model = model 270 | 271 | def infer(self, x_cs_48k: np.ndarray) -> np.ndarray: 272 | """ 273 | x_cs_48k: [C, S] float32 at 48 kHz. 274 | Returns [C, S] float32 at 48 kHz (same length as input slice passed in). 275 | """ 276 | x = torch.from_numpy(x_cs_48k).to(self._dev).float() # [C, S] 277 | with torch.inference_mode(): 278 | y = self._model(x, lowpass_input=self.lowpass) # [C, S] 279 | return y.detach().to("cpu").float().numpy() 280 | 281 | # ---------- Node ---------- 282 | class EgregoraAudioSuperResolution: 283 | @classmethod 284 | def INPUT_TYPES(cls): 285 | return { 286 | "required": { 287 | "audio": ("AUDIO",), 288 | "lowpass_input": ("BOOLEAN", {"default": False}), 289 | "output_sr": (["48000", "44100", "96000"], {"default": "48000"}), 290 | } 291 | } 292 | 293 | RETURN_TYPES = ("AUDIO",) 294 | FUNCTION = FUNCTION 295 | CATEGORY = CATEGORY 296 | OUTPUT_NODE = False 297 | 298 | def run(self, audio=None, lowpass_input=False, output_sr="48000"): 299 | # 1) Normalize input to [C, S] 300 | in_cs, in_sr = _from_audio_dict(audio) 301 | 302 | # 2) Resample to model SR if needed 303 | runner = _FlashSRRunner(lowpass=bool(lowpass_input)) 304 | req_sr = runner.REQ_SR 305 | if in_sr != req_sr: 306 | in_cs = _resample_hq(in_cs, in_sr, req_sr) 307 | in_sr = req_sr 308 | 309 | # 3) Chunking params (internal, non-user) 310 | win = runner.CHUNK_SAMPLES # 5.12 s @ 48k 311 | hop = int((runner.CHUNK_S - runner.OVERLAP_S) * req_sr) 312 | if hop <= 0 or hop >= win: 313 | # guard-rail: keep a sane overlap in pathological cases 314 | hop = win // 2 315 | 316 | total = in_cs.shape[1] 317 | spans = _iter_chunks(total, win=win, hop=hop) 318 | 319 | # 4) Process chunks in-memory and stitch with Hann WOLA 320 | preds: List[Tuple[np.ndarray, int, int]] = [] 321 | for start, L in spans: 322 | # slice and pad up to win 323 | chunk = in_cs[:, start:start+L] 324 | if L < win: 325 | pad = np.zeros((in_cs.shape[0], win - L), np.float32) 326 | chunk = np.concatenate([chunk, pad], axis=1) 327 | y_pred = runner.infer(chunk) # [C, win] @ 48k 328 | preds.append((y_pred, start, L)) # keep original L for proper weighting 329 | 330 | out_48k = _wola_stitch(preds, total_len=total, win=win) # [C, total] 331 | 332 | # 5) Optional post-resample for delivery 333 | tgt_sr = int(output_sr) 334 | if tgt_sr != in_sr: 335 | out = _resample_hq(out_48k, in_sr, tgt_sr) 336 | out_sr = tgt_sr 337 | else: 338 | out, out_sr = out_48k, in_sr 339 | 340 | # 6) Return single AUDIO 341 | return (_make_audio(out_sr, out),) 342 
| 343 | # ComfyUI registration 344 | NODE_CLASS_MAPPINGS = { 345 | "EgregoraAudioUpscaler": EgregoraAudioSuperResolution, 346 | } 347 | NODE_DISPLAY_NAME_MAPPINGS = { 348 | "EgregoraAudioUpscaler": "🎧 Audio Super Resolution (FlashSR)", 349 | } 350 | -------------------------------------------------------------------------------- /egregora_audio_eval_pack.py: -------------------------------------------------------------------------------- 1 | """ 2 | Egregora · Audio Eval Pack (v1) 3 | =============================== 4 | 5 | Permissive, model-friendly utilities to complement the Null Test Suite: 6 | - ABX Prepare / ABX Judge (double‑blind listening helper) 7 | - Loudness Meter (BS.1770-style*) + Gain Match (LUFS‑I / RMS) 8 | - Metrics: SI‑SDR and LSD (log‑spectral distance) 9 | - Resample Audio (HQ) with optional SciPy/torchaudio backends 10 | 11 | *Note: The 1770 implementation here is a practical approximation for 12 | integrated loudness, momentary/short‑term, LRA, and true‑peak. For 13 | certification-grade measurement, validate against a reference meter. 14 | 15 | All nodes follow ComfyUI conventions: 16 | - AUDIO is a dict with {"waveform": torch.Tensor[B,C,T], "sample_rate": int} 17 | - IMAGE is torch.Tensor[B,H,W,3] in [0,1] 18 | 19 | License: MIT 20 | """ 21 | from __future__ import annotations 22 | 23 | import io 24 | import math 25 | import random 26 | from dataclasses import dataclass 27 | from typing import Any, Dict, Optional, Tuple 28 | 29 | import numpy as np 30 | import torch 31 | from PIL import Image 32 | 33 | # Optional deps 34 | try: 35 | import scipy.signal as sps # resample_poly, firwin 36 | _HAVE_SCIPY = True 37 | except Exception: 38 | _HAVE_SCIPY = False 39 | 40 | try: 41 | import torchaudio 42 | import torchaudio.functional as AF 43 | _HAVE_TA = True 44 | except Exception: 45 | _HAVE_TA = False 46 | 47 | 48 | # ----------------------------- 49 | # Utilities 50 | # ----------------------------- 51 | 52 | def _to_numpy(x: Any) -> np.ndarray: 53 | if isinstance(x, np.ndarray): 54 | return x 55 | if hasattr(x, "detach") and hasattr(x, "cpu"): 56 | return x.detach().cpu().numpy() 57 | return np.asarray(x) 58 | 59 | 60 | def _normalize_CN(arr: np.ndarray) -> np.ndarray: 61 | a = np.asarray(arr) 62 | a = np.squeeze(a) 63 | if a.ndim == 1: 64 | a = a[None, :] 65 | elif a.ndim == 2: 66 | if a.shape[0] > a.shape[1]: 67 | a = a.T 68 | else: 69 | t_axis = int(np.argmax(a.shape)) 70 | a = np.moveaxis(a, t_axis, -1) 71 | C = int(np.prod(a.shape[:-1])) 72 | N = a.shape[-1] 73 | a = a.reshape(C, N) 74 | return a.astype(np.float32) 75 | 76 | 77 | def make_audio(sr: int, samples_CN: np.ndarray, meta: Optional[dict] = None) -> Dict[str, Any]: 78 | s = _normalize_CN(samples_CN) 79 | wf = torch.from_numpy(s).unsqueeze(0) # [1,C,N] 80 | return { 81 | "sr": int(sr), 82 | "sample_rate": int(sr), 83 | "samples": s, 84 | "waveform": wf, 85 | "meta": dict(meta or {}), 86 | } 87 | 88 | 89 | def to_internal_audio(x: Any) -> Dict[str, Any]: 90 | if isinstance(x, dict) and "waveform" in x and ("sample_rate" in x or "sr" in x or "rate" in x): 91 | sr = int(x.get("sample_rate") or x.get("sr") or x.get("rate")) 92 | wf = _to_numpy(x["waveform"]) # [B,C,T] or [C,T] 93 | if wf.ndim == 3: 94 | wf = wf[0] 95 | s = _normalize_CN(wf) 96 | return make_audio(sr, s, x.get("meta", {})) 97 | if isinstance(x, dict) and ("sr" in x or "sample_rate" in x): 98 | sr = int(x.get("sr") or x.get("sample_rate")) 99 | buf = next((x[k] for k in ("samples", "audio", "array") if x.get(k) is not None), None) # an or-chain on ndarrays raises (ambiguous truth value) 100 | if buf is None: 101 | raise 
ValueError("Audio dict missing samples/waveform") 102 | return make_audio(sr, _to_numpy(buf), x.get("meta", {})) 103 | raise ValueError("Unsupported AUDIO object for this node") 104 | 105 | 106 | def _image_from_figure(fig) -> torch.Tensor: 107 | import matplotlib 108 | matplotlib.use("Agg") 109 | import matplotlib.pyplot as plt # noqa: F401 110 | 111 | buf = io.BytesIO() 112 | fig.savefig(buf, format="png", bbox_inches="tight", dpi=110) 113 | try: 114 | fig.clf() 115 | except Exception: 116 | pass 117 | buf.seek(0) 118 | im = Image.open(buf).convert("RGB") 119 | arr = np.array(im).astype(np.float32) / 255.0 120 | return torch.from_numpy(arr).unsqueeze(0) 121 | 122 | 123 | def _rms_db(x: np.ndarray) -> float: 124 | x = x.astype(np.float64) 125 | return 10.0 * math.log10(float(np.mean(x * x) + 1e-20)) 126 | 127 | 128 | # ----------------------------- 129 | # 1770 Loudness helpers (practical approximations) 130 | # ----------------------------- 131 | 132 | def _k_weight(sr: int, x_CN: np.ndarray) -> np.ndarray: 133 | """Very small K-weight approx: 1st-order HPF ~60 Hz + slight HF tilt. 134 | This is sufficient for relative matching; not certification-grade. 135 | """ 136 | x = x_CN 137 | fc = 60.0 / (sr * 0.5) 138 | k = math.exp(-2 * math.pi * fc) 139 | y = np.zeros_like(x, dtype=np.float32) 140 | for c in range(x.shape[0]): 141 | xn = x[c].astype(np.float32) 142 | yc = np.zeros_like(xn) 143 | z = 0.0 144 | for n in range(xn.shape[0]): 145 | z = (1 - k) * xn[n] + k * z 146 | yc[n] = xn[n] - z 147 | y[c] = yc 148 | # tiny HF shelf via first difference 149 | y[:, 1:] += 0.02 * (y[:, 1:] - y[:, :-1]) 150 | return y 151 | 152 | 153 | def integrated_lufs(audio: Dict[str, Any]) -> float: 154 | sr = audio["sample_rate"] 155 | y = _k_weight(sr, audio["samples"]) # [C,N] 156 | mono = y.mean(axis=0) 157 | blk = max(1, int(round(0.400 * sr))) 158 | hop = max(1, int(round(0.100 * sr))) 159 | frames = 1 + max(0, (mono.shape[0] - blk) // hop) 160 | if frames <= 0: 161 | return _rms_db(mono) 162 | ms = [] 163 | for i in range(frames): 164 | s = i * hop 165 | e = s + blk 166 | seg = mono[s:e].astype(np.float64) 167 | ms.append(float(np.mean(seg * seg))) 168 | ms = np.asarray(ms) + 1e-20 169 | lufs_ungated = -0.691 + 10.0 * np.log10(np.mean(ms)) 170 | gate = lufs_ungated - 10.0 171 | mask = (-0.691 + 10.0 * np.log10(ms)) >= gate 172 | if np.any(mask): 173 | ms = ms[mask] 174 | return float(-0.691 + 10.0 * np.log10(np.mean(ms))) 175 | 176 | 177 | def lufs_series(audio: Dict[str, Any], window_s: float, hop_s: float) -> np.ndarray: 178 | sr = audio["sample_rate"] 179 | y = _k_weight(sr, audio["samples"]).mean(axis=0) 180 | w = max(1, int(round(window_s * sr))) 181 | h = max(1, int(round(hop_s * sr))) 182 | frames = 1 + max(0, (y.shape[0] - w) // h) 183 | out = np.empty((frames,), dtype=np.float32) 184 | for i in range(frames): 185 | s = i * h 186 | seg = y[s : s + w].astype(np.float64) 187 | out[i] = -0.691 + 10.0 * np.log10(float(np.mean(seg * seg)) + 1e-20) 188 | return out 189 | 190 | 191 | def lra_short_term(audio: Dict[str, Any]) -> float: 192 | st = lufs_series(audio, 3.0, 1.0) # 3s window, 1s hop (EBU R128) 193 | if st.size == 0: 194 | return 0.0 195 | # Simple gating: remove values near silence 196 | gate = np.percentile(st, 10.0) - 20.0 197 | pool = st[st > gate] 198 | if pool.size == 0: 199 | pool = st 200 | return float(np.percentile(pool, 95.0) - np.percentile(pool, 10.0)) 201 | 202 | 203 | def true_peak_dbfs(audio: Dict[str, Any], oversample: int = 4) -> float: 204 | x = 
audio["samples"].mean(axis=0) 205 | sr = audio["sample_rate"] 206 | if _HAVE_SCIPY: 207 | y = sps.resample_poly(x, oversample, 1) 208 | else: 209 | N = x.shape[0] 210 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 211 | t_new = np.linspace(0.0, 1.0, N * oversample, endpoint=False) 212 | y = np.interp(t_new, t_old, x).astype(np.float32) 213 | peak = float(np.max(np.abs(y))) 214 | return 20.0 * math.log10(peak + 1e-20) 215 | 216 | 217 | # ----------------------------- 218 | # ABX helper 219 | # ----------------------------- 220 | @dataclass 221 | class ABXMeta: 222 | x_is: str # 'A' or 'B' 223 | seed: int 224 | 225 | def to_dict(self) -> Dict[str, Any]: 226 | return {"x_is": self.x_is, "seed": int(self.seed)} 227 | 228 | 229 | # ----------------------------- 230 | # Node: ABX Prepare 231 | # ----------------------------- 232 | class ABX_Prepare: 233 | CATEGORY = "Egregora/Listening" 234 | RETURN_TYPES = ("AUDIO", "AUDIO", "AUDIO", "DICT") 235 | RETURN_NAMES = ("audio_A", "audio_B", "audio_X", "abx_meta") 236 | FUNCTION = "execute" 237 | 238 | @classmethod 239 | def INPUT_TYPES(cls): 240 | return { 241 | "required": { 242 | "audio_A": ("AUDIO", {}), 243 | "audio_B": ("AUDIO", {}), 244 | }, 245 | "optional": { 246 | "clip_seconds": ("FLOAT", {"default": 10.0, "min": 1.0, "max": 60.0, "step": 0.1}), 247 | "random_seed": ("INT", {"default": 0, "min": 0, "max": 2**31 - 1, "step": 1}), 248 | "start_seconds": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10_000.0, "step": 0.1}), 249 | }, 250 | } 251 | 252 | def _clip(self, a: Dict[str, Any], start_s: float, dur_s: float) -> Dict[str, Any]: 253 | sr = a["sample_rate"] 254 | s = int(round(start_s * sr)) 255 | n = int(round(dur_s * sr)) 256 | x = a["samples"] 257 | if s + n > x.shape[1]: 258 | n = max(0, x.shape[1] - s) 259 | y = x[:, s : s + n] 260 | return make_audio(sr, y, a.get("meta", {})) 261 | 262 | def execute(self, audio_A, audio_B, clip_seconds=10.0, random_seed=0, start_seconds=0.0): 263 | A = to_internal_audio(audio_A) 264 | B = to_internal_audio(audio_B) 265 | n = min(A["samples"].shape[1], B["samples"].shape[1]) 266 | A["samples"] = A["samples"][:, :n] 267 | B["samples"] = B["samples"][:, :n] 268 | 269 | A_c = self._clip(A, start_seconds, clip_seconds) 270 | B_c = self._clip(B, start_seconds, clip_seconds) 271 | 272 | rng = random.Random(int(random_seed)) 273 | x_is = rng.choice(["A", "B"]) 274 | X = A_c if x_is == "A" else B_c 275 | meta = ABXMeta(x_is=x_is, seed=int(random_seed)).to_dict() 276 | return A_c, B_c, X, meta 277 | 278 | 279 | # ----------------------------- 280 | # Node: ABX Judge 281 | # ----------------------------- 282 | class ABX_Judge: 283 | CATEGORY = "Egregora/Listening" 284 | RETURN_TYPES = ("DICT",) 285 | RETURN_NAMES = ("abx_result",) 286 | FUNCTION = "execute" 287 | 288 | @classmethod 289 | def INPUT_TYPES(cls): 290 | return { 291 | "required": { 292 | "abx_meta": ("DICT", {}), 293 | "guess": (["A", "B"], {}), 294 | }, 295 | } 296 | 297 | def execute(self, abx_meta, guess): 298 | x_is = str(abx_meta.get("x_is", "?")).upper() 299 | correct = (guess.upper() == x_is) 300 | return ({"x_is": x_is, "guess": guess.upper(), "correct": bool(correct)},) 301 | 302 | 303 | # ----------------------------- 304 | # Node: Loudness Meter (1770) 305 | # ----------------------------- 306 | class Loudness_Meter_1770: 307 | CATEGORY = "Egregora/Analysis" 308 | RETURN_TYPES = ("DICT",) 309 | RETURN_NAMES = ("metrics",) 310 | FUNCTION = "execute" 311 | 312 | @classmethod 313 | def INPUT_TYPES(cls): 314 | return { 315 | "required": 
{ 316 | "audio": ("AUDIO", {}), 317 | }, 318 | "optional": { 319 | "compute_true_peak": ("BOOLEAN", {"default": True}), 320 | "oversample": ("INT", {"default": 4, "min": 1, "max": 8, "step": 1}), 321 | }, 322 | } 323 | 324 | def execute(self, audio, compute_true_peak=True, oversample=4): 325 | a = to_internal_audio(audio) 326 | metrics: Dict[str, Any] = {} 327 | metrics["lufs_integrated"] = float(integrated_lufs(a)) 328 | metrics["lufs_momentary"] = float(lufs_series(a, 0.400, 0.100).mean() if a["samples"].size else 0.0) 329 | metrics["lufs_short_term"] = float(lufs_series(a, 3.0, 1.0).mean() if a["samples"].size else 0.0) 330 | metrics["lra"] = float(lra_short_term(a)) 331 | if compute_true_peak: 332 | metrics["true_peak_dbfs"] = float(true_peak_dbfs(a, oversample=int(oversample))) 333 | return (metrics,) 334 | 335 | 336 | # ----------------------------- 337 | # Node: Audio Gain Match (1770 / RMS) 338 | # ----------------------------- 339 | class Audio_Gain_Match_1770: 340 | CATEGORY = "Egregora/Analysis" 341 | RETURN_TYPES = ("AUDIO", "FLOAT", "FLOAT", "FLOAT") 342 | RETURN_NAMES = ("audio_matched", "gain_db", "ref_level", "in_level") 343 | FUNCTION = "execute" 344 | 345 | @classmethod 346 | def INPUT_TYPES(cls): 347 | return { 348 | "required": { 349 | "audio_ref": ("AUDIO", {}), 350 | "audio_in": ("AUDIO", {}), 351 | }, 352 | "optional": { 353 | "mode": (["LUFS-I", "RMS"], {}), 354 | "max_gain_db": ("FLOAT", {"default": 12.0, "min": -60.0, "max": 60.0, "step": 0.1}), 355 | }, 356 | } 357 | 358 | def execute(self, audio_ref, audio_in, mode="LUFS-I", max_gain_db=12.0): 359 | ref = to_internal_audio(audio_ref) 360 | inn = to_internal_audio(audio_in) 361 | # resample if SR mismatch 362 | if inn["sample_rate"] != ref["sample_rate"]: 363 | sr_old = inn["sample_rate"] 364 | x = inn["samples"] 365 | C, N = x.shape 366 | new_N = int(round(N * ref["sample_rate"] / sr_old)) 367 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 368 | t_new = np.linspace(0.0, 1.0, new_N, endpoint=False) 369 | y = np.stack([np.interp(t_new, t_old, x[c]) for c in range(C)], axis=0).astype(np.float32) 370 | inn = make_audio(ref["sample_rate"], y, inn.get("meta", {})) 371 | 372 | if str(mode).upper().startswith("LUFS"): 373 | ref_level = integrated_lufs(ref) 374 | in_level = integrated_lufs(inn) 375 | else: 376 | ref_level = _rms_db(ref["samples"].mean(axis=0)) 377 | in_level = _rms_db(inn["samples"].mean(axis=0)) 378 | gain_db = float(np.clip(ref_level - in_level, -abs(max_gain_db), abs(max_gain_db))) 379 | gain = 10 ** (gain_db / 20.0) 380 | y = (inn["samples"] * gain).astype(np.float32) 381 | out = make_audio(inn["sample_rate"], y, inn.get("meta", {})) 382 | return (out, float(gain_db), float(ref_level), float(in_level)) 383 | 384 | 385 | # ----------------------------- 386 | # Metrics: SI‑SDR & LSD 387 | # ----------------------------- 388 | 389 | def _stft_mag(x: np.ndarray, n_fft: int = 2048, hop: int = 512) -> np.ndarray: 390 | mono = x if x.ndim == 1 else x.mean(axis=0) 391 | N = mono.shape[0] 392 | win = np.hanning(n_fft).astype(np.float32) 393 | frames = 1 + max(0, (N - n_fft) // hop) 394 | S = np.empty((n_fft // 2 + 1, frames), dtype=np.float32) 395 | for i in range(frames): 396 | s = i * hop 397 | frame = mono[s : s + n_fft] 398 | if frame.shape[0] < n_fft: 399 | frame = np.pad(frame, (0, n_fft - frame.shape[0])) 400 | X = np.fft.rfft(frame * win) 401 | S[:, i] = np.abs(X).astype(np.float32) 402 | return S 403 | 404 | 405 | def _lsd(SA: np.ndarray, SB: np.ndarray) -> Tuple[float, float]: 406 | eps = 1e-12 
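    # Descriptive note: LSD here is the per-frame RMS (across frequency bins) of the
    # dB-magnitude difference; the mean below summarizes typical frames, while the
    # 95th percentile (returned second) captures worst-case frames.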
407 | LA = 20 * np.log10(SA + eps) 408 | LB = 20 * np.log10(SB + eps) 409 | D = (LA - LB) ** 2 410 | per = np.sqrt(np.mean(D, axis=0) + 1e-12) 411 | return float(np.mean(per)), float(np.percentile(per, 95)) 412 | 413 | 414 | def _si_sdr(s: np.ndarray, s_hat: np.ndarray) -> float: 415 | # operate on mono 416 | s = s.astype(np.float64) 417 | s_hat = s_hat.astype(np.float64) 418 | if s.ndim > 1: 419 | s = s.mean(axis=0) 420 | if s_hat.ndim > 1: 421 | s_hat = s_hat.mean(axis=0) 422 | # match length 423 | n = min(s.shape[-1], s_hat.shape[-1]) 424 | s = s[:n] 425 | s_hat = s_hat[:n] 426 | alpha = np.dot(s_hat, s) / (np.dot(s, s) + 1e-20) 427 | s_target = alpha * s 428 | e_noise = s_hat - s_target 429 | return 10.0 * np.log10((np.dot(s_target, s_target) + 1e-20) / (np.dot(e_noise, e_noise) + 1e-20)) 430 | 431 | 432 | class Metrics_LSD_SISDR: 433 | CATEGORY = "Egregora/Analysis" 434 | RETURN_TYPES = ("DICT",) 435 | RETURN_NAMES = ("metrics",) 436 | FUNCTION = "execute" 437 | 438 | @classmethod 439 | def INPUT_TYPES(cls): 440 | return { 441 | "required": { 442 | "audio_ref": ("AUDIO", {}), 443 | "audio_proc": ("AUDIO", {}), 444 | }, 445 | "optional": { 446 | "n_fft": ("INT", {"default": 2048, "min": 512, "max": 8192, "step": 128}), 447 | "hop": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 64}), 448 | "compute_lsd": ("BOOLEAN", {"default": True}), 449 | "compute_si_sdr": ("BOOLEAN", {"default": True}), 450 | }, 451 | } 452 | 453 | def execute(self, audio_ref, audio_proc, n_fft=2048, hop=512, compute_lsd=True, compute_si_sdr=True): 454 | A = to_internal_audio(audio_ref) 455 | B = to_internal_audio(audio_proc) 456 | a = A["samples"].mean(axis=0) 457 | b = B["samples"].mean(axis=0) 458 | n = min(a.size, b.size) 459 | a = a[:n] 460 | b = b[:n] 461 | out: Dict[str, Any] = {} 462 | if compute_lsd: 463 | SA = _stft_mag(a, n_fft=n_fft, hop=hop) 464 | SB = _stft_mag(b, n_fft=n_fft, hop=hop) 465 | lsd_mean, lsd_p95 = _lsd(SA, SB) 466 | out["lsd_mean_db"] = float(lsd_mean) 467 | out["lsd_p95_db"] = float(lsd_p95) 468 | if compute_si_sdr: 469 | out["si_sdr_db"] = float(_si_sdr(a, b)) 470 | return (out,) 471 | 472 | 473 | # ----------------------------- 474 | # Resample Audio (HQ) 475 | # ----------------------------- 476 | class Resample_Audio_HQ: 477 | CATEGORY = "Egregora/Utils" 478 | RETURN_TYPES = ("AUDIO",) 479 | RETURN_NAMES = ("audio_out",) 480 | FUNCTION = "execute" 481 | 482 | @classmethod 483 | def INPUT_TYPES(cls): 484 | modes = ["auto", "scipy_polyphase", "torchaudio", "linear"] 485 | return { 486 | "required": { 487 | "audio": ("AUDIO", {}), 488 | "target_sr": ("INT", {"default": 48000, "min": 4000, "max": 384000, "step": 1}), 489 | }, 490 | "optional": { 491 | "mode": (modes, {}), 492 | "kaiser_beta": ("FLOAT", {"default": 14.769, "min": 5.0, "max": 20.0, "step": 0.1}), 493 | }, 494 | } 495 | 496 | def execute(self, audio, target_sr=48000, mode="auto", kaiser_beta=14.769): 497 | a = to_internal_audio(audio) 498 | src_sr = int(a["sample_rate"]) 499 | if src_sr == int(target_sr): 500 | return (a,) 501 | x = a["samples"] # [C,N] 502 | C, N = x.shape 503 | if mode == "auto": 504 | mode = "scipy_polyphase" if _HAVE_SCIPY else ("torchaudio" if _HAVE_TA else "linear") 505 | if mode == "scipy_polyphase" and _HAVE_SCIPY: 506 | # rational ratio 507 | from math import gcd 508 | g = gcd(src_sr, int(target_sr)) 509 | up = int(target_sr) // g 510 | down = src_sr // g 511 | y = np.stack([sps.resample_poly(x[c], up, down) for c in range(C)], axis=0).astype(np.float32) 512 | elif mode == "torchaudio" 
and _HAVE_TA: 513 | wf = torch.from_numpy(x).unsqueeze(0) # [1,C,N] 514 | y = AF.resample(wf, src_sr, int(target_sr), lowpass_filter_width=64, rolloff=0.945, resampling_method="kaiser_window", beta=kaiser_beta) 515 | y = y.squeeze(0).detach().cpu().numpy().astype(np.float32) 516 | else: 517 | # fallback: linear interp 518 | new_N = int(round(N * (int(target_sr) / src_sr))) 519 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 520 | t_new = np.linspace(0.0, 1.0, new_N, endpoint=False) 521 | y = np.stack([np.interp(t_new, t_old, x[c]) for c in range(C)], axis=0).astype(np.float32) 522 | return (make_audio(int(target_sr), y, a.get("meta", {})),) 523 | 524 | 525 | # ----------------------------- 526 | # Registration 527 | # ----------------------------- 528 | NODE_CLASS_MAPPINGS = { 529 | "ABX Prepare": ABX_Prepare, 530 | "ABX Judge": ABX_Judge, 531 | "Loudness Meter (BS1770)": Loudness_Meter_1770, 532 | "Audio Gain Match (1770)": Audio_Gain_Match_1770, 533 | "Metrics (LSD + SI-SDR)": Metrics_LSD_SISDR, 534 | "Resample Audio (HQ)": Resample_Audio_HQ, 535 | } 536 | 537 | NODE_DISPLAY_NAME_MAPPINGS = { 538 | "ABX Prepare": "Egregora ABX Prepare", 539 | "ABX Judge": "Egregora ABX Judge", 540 | "Loudness Meter (BS1770)": "Egregora Loudness Meter (BS1770)", 541 | "Audio Gain Match (1770)": "Egregora Audio Gain Match (1770)", 542 | "Metrics (LSD + SI-SDR)": "Egregora Metrics (LSD + SI-SDR)", 543 | "Resample Audio (HQ)": "Egregora Resample Audio (HQ)", 544 | } 545 | -------------------------------------------------------------------------------- /egregora_null_test_suite.py: -------------------------------------------------------------------------------- 1 | """ 2 | Egregora · Null Test Suite for ComfyUI (v5) 3 | =========================================== 4 | 5 | This version fixes the UI toggles by using ComfyUI's BOOLEAN widget type 6 | instead of a non-existent BOOL type, and converts some strings to COMBOs. 7 | 8 | Added/changed since v4: 9 | - All on/off controls now use ("BOOLEAN", {"default": ...}) so they render as 10 | real checkboxes in the UI. 11 | - `align_method` is a COMBO (for now just ["gcc-phat"], extensible later). 12 | - `match_mode` is a COMBO: ["LUFS-I", "RMS"]. 13 | - Keeps the v4 compute/plot toggles to save FFT/LUFS work when unneeded. 
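
For reference, a contract-compliant AUDIO payload (see Contracts below) can be
built by hand like this — a minimal sketch, variable names illustrative only:

    import torch
    sr = 48000
    audio = {"waveform": torch.zeros(1, 2, sr), "sample_rate": sr}  # [B,C,T]: 1 batch, stereo, 1 s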
14 | 15 | Contracts (per Comfy docs): 16 | - IMAGE: torch.Tensor [B,H,W,3] in 0..1 17 | - AUDIO: dict with keys {"waveform": torch.Tensor [B,C,T], "sample_rate": int} 18 | """ 19 | from __future__ import annotations 20 | 21 | import io 22 | import math 23 | from typing import Any, Dict, Optional, Tuple 24 | 25 | import numpy as np 26 | import torch 27 | from PIL import Image 28 | 29 | # ----------------------------- 30 | # Array/Tensor helpers 31 | # ----------------------------- 32 | 33 | def _to_numpy(x: Any) -> np.ndarray: 34 | if isinstance(x, np.ndarray): 35 | return x 36 | if hasattr(x, "detach") and hasattr(x, "cpu"): 37 | return x.detach().cpu().numpy() 38 | return np.asarray(x) 39 | 40 | 41 | def _normalize_CN(arr: np.ndarray) -> np.ndarray: 42 | """Coerce arbitrary shapes to channels-first [C, N] float32.""" 43 | a = np.asarray(arr) 44 | a = np.squeeze(a) 45 | if a.ndim == 1: 46 | a = a[None, :] 47 | elif a.ndim == 2: 48 | if a.shape[0] > a.shape[1]: 49 | a = a.T 50 | else: 51 | t_axis = int(np.argmax(a.shape)) 52 | a = np.moveaxis(a, t_axis, -1) 53 | C = int(np.prod(a.shape[:-1])) 54 | N = a.shape[-1] 55 | a = a.reshape(C, N) 56 | return a.astype(np.float32) 57 | 58 | 59 | def _blank_image(h: int = 8, w: int = 8) -> torch.Tensor: 60 | return torch.zeros((1, h, w, 3), dtype=torch.float32) 61 | 62 | 63 | # ----------------------------- 64 | # Comfy interop: AUDIO / IMAGE 65 | # ----------------------------- 66 | 67 | def make_audio(sr: int, samples_CN: np.ndarray, meta: Optional[dict] = None) -> Dict[str, Any]: 68 | s = _normalize_CN(samples_CN) 69 | wf = torch.from_numpy(s).unsqueeze(0) # [1,C,N] 70 | return { 71 | "sr": int(sr), 72 | "sample_rate": int(sr), 73 | "samples": s, # convenience 74 | "waveform": wf, # Comfy contract 75 | "meta": dict(meta or {}), 76 | } 77 | 78 | 79 | def to_internal_audio(x: Any) -> Dict[str, Any]: 80 | """Accept a ComfyUI AUDIO or similar → {sr, samples[C,N], waveform[1,C,N]}""" 81 | if isinstance(x, dict) and "waveform" in x and ("sample_rate" in x or "sr" in x or "rate" in x): 82 | sr = int(x.get("sample_rate") or x.get("sr") or x.get("rate")) 83 | wf = _to_numpy(x["waveform"]) # [B,C,T] or [C,T] 84 | if wf.ndim == 3: 85 | wf = wf[0] 86 | s = _normalize_CN(wf) 87 | return make_audio(sr, s, x.get("meta", {})) 88 | if isinstance(x, dict) and ("sr" in x or "sample_rate" in x): 89 | sr = int(x.get("sr") or x.get("sample_rate")) 90 | buf = x.get("samples") or x.get("audio") or x.get("array") 91 | if buf is None: 92 | raise ValueError("Audio dict missing samples/waveform") 93 | return make_audio(sr, _to_numpy(buf), x.get("meta", {})) 94 | raise ValueError("Unsupported AUDIO object for this node") 95 | 96 | 97 | def image_from_figure(fig) -> torch.Tensor: 98 | """Matplotlib figure → IMAGE torch [1,H,W,3] in 0..1.""" 99 | import matplotlib 100 | matplotlib.use("Agg") 101 | import matplotlib.pyplot as plt # noqa: F401 102 | 103 | buf = io.BytesIO() 104 | fig.savefig(buf, format="png", bbox_inches="tight", dpi=110) 105 | try: 106 | fig.clf() 107 | except Exception: 108 | pass 109 | buf.seek(0) 110 | im = Image.open(buf).convert("RGB") 111 | arr = np.array(im).astype(np.float32) / 255.0 112 | return torch.from_numpy(arr).unsqueeze(0) # [1,H,W,3] 113 | 114 | 115 | # ----------------------------- 116 | # DSP helpers 117 | # ----------------------------- 118 | 119 | def _rms_db(x: np.ndarray) -> float: 120 | x = x.astype(np.float64) 121 | e = float(np.mean(x * x) + 1e-20) 122 | return 10.0 * math.log10(e) 123 | 124 | 125 | def _k_weight(sr: int, x_CN: 
np.ndarray) -> np.ndarray: 126 | # very small K-weight approx: 1st-order HPF @60 Hz + mild HF tilt 127 | x = x_CN 128 | fc = 60.0 / (sr * 0.5) 129 | k = math.exp(-2 * math.pi * fc) 130 | y = np.zeros_like(x, dtype=np.float32) 131 | for c in range(x.shape[0]): 132 | xn = x[c].astype(np.float32) 133 | yc = np.zeros_like(xn) 134 | z = 0.0 135 | for n in range(xn.shape[0]): 136 | z = (1 - k) * xn[n] + k * z 137 | yc[n] = xn[n] - z 138 | y[c] = yc 139 | y[:, 1:] += 0.02 * (y[:, 1:] - y[:, :-1]) 140 | return y 141 | 142 | 143 | def integrated_lufs(audio: Dict[str, Any]) -> float: 144 | sr = audio["sample_rate"] 145 | y = _k_weight(sr, audio["samples"]) # [C,N] 146 | mono = y.mean(axis=0) 147 | blk = max(1, int(round(0.400 * sr))) 148 | hop = max(1, int(round(0.100 * sr))) 149 | frames = 1 + max(0, (mono.shape[0] - blk) // hop) 150 | if frames <= 0: 151 | return _rms_db(mono) 152 | ms = [] 153 | for i in range(frames): 154 | s = i * hop 155 | e = s + blk 156 | seg = mono[s:e].astype(np.float64) 157 | ms.append(float(np.mean(seg * seg))) 158 | ms = np.asarray(ms) + 1e-20 159 | lufs_ungated = -0.691 + 10.0 * np.log10(np.mean(ms)) 160 | gate = lufs_ungated - 10.0 161 | mask = (-0.691 + 10.0 * np.log10(ms)) >= gate 162 | if np.any(mask): 163 | ms = ms[mask] 164 | return float(-0.691 + 10.0 * np.log10(np.mean(ms))) 165 | 166 | 167 | def _stft_mag(x: np.ndarray, n_fft: int = 2048, hop: int = 512) -> np.ndarray: 168 | mono = x if x.ndim == 1 else x.mean(axis=0) 169 | N = mono.shape[0] 170 | win = np.hanning(n_fft).astype(np.float32) 171 | frames = 1 + max(0, (N - n_fft) // hop) 172 | S = np.empty((n_fft // 2 + 1, frames), dtype=np.float32) 173 | for i in range(frames): 174 | s = i * hop 175 | frame = mono[s : s + n_fft] 176 | if frame.shape[0] < n_fft: 177 | frame = np.pad(frame, (0, n_fft - frame.shape[0])) 178 | X = np.fft.rfft(frame * win) 179 | S[:, i] = np.abs(X).astype(np.float32) 180 | return S 181 | 182 | 183 | def _lsd(A: np.ndarray, B: np.ndarray) -> Tuple[float, float]: 184 | eps = 1e-12 185 | LA = 20 * np.log10(A + eps) 186 | LB = 20 * np.log10(B + eps) 187 | D = (LA - LB) ** 2 188 | per = np.sqrt(np.mean(D, axis=0) + 1e-12) 189 | return float(np.mean(per)), float(np.percentile(per, 95)) 190 | 191 | 192 | def _band_energy_hi_db(x_CN: np.ndarray, sr: int, lo_hz: float) -> float: 193 | mono = x_CN.mean(axis=0) 194 | X = np.fft.rfft(mono) 195 | freqs = np.fft.rfftfreq(mono.shape[0], d=1.0 / sr) 196 | mask = freqs >= lo_hz 197 | e_hi = float(np.sum(np.abs(X[mask]) ** 2)) 198 | e_all = float(np.sum(np.abs(X) ** 2) + 1e-20) 199 | return 10.0 * math.log10(e_hi / e_all + 1e-20) 200 | 201 | 202 | def _pad_or_crop_CN(x: np.ndarray, N: int) -> np.ndarray: 203 | C, M = x.shape 204 | if M == N: 205 | return x 206 | if M > N: 207 | return x[:, :N] 208 | y = np.zeros((C, N), dtype=x.dtype) 209 | y[:, :M] = x 210 | return y 211 | 212 | 213 | def _xcorr_delay(a: np.ndarray, b: np.ndarray, sr: int, max_shift_smp: int) -> float: 214 | # GCC-PHAT-ish coarse delay + parabolic refine. 
Returns samples (b lags a > 0) 215 | n = 1 216 | total = a.size + b.size 217 | while n < total: 218 | n <<= 1 219 | A = np.fft.rfft(a, n=n) 220 | B = np.fft.rfft(b, n=n) 221 | R = B * np.conj(A) 222 | R /= (np.abs(R) + 1e-12) 223 | cc = np.fft.irfft(R, n=n) 224 | cc = np.concatenate((cc[-(n // 2 - 1) :], cc[: n // 2 + 1])) 225 | center = len(cc) // 2 226 | sl = center - max_shift_smp 227 | sh = center + max_shift_smp + 1 228 | w = cc[sl:sh] 229 | k = int(np.argmax(w)) 230 | idx = sl + k 231 | if 1 <= idx < len(cc) - 1: 232 | y0, y1, y2 = cc[idx - 1], cc[idx], cc[idx + 1] 233 | denom = 2 * (y0 - 2 * y1 + y2) 234 | frac = 0.0 if abs(denom) < 1e-12 else (y0 - y2) / denom 235 | else: 236 | frac = 0.0 237 | return float((idx - center) + frac) 238 | 239 | 240 | def _apply_frac_delay_CN(x: np.ndarray, delay_samples: float, taps: int = 64) -> np.ndarray: 241 | if abs(delay_samples) < 1e-6: 242 | return x.copy() 243 | C, N = x.shape 244 | int_d = int(math.floor(abs(delay_samples))) 245 | frac = abs(delay_samples) - int_d 246 | sign = 1 if delay_samples >= 0 else -1 247 | y = np.zeros((C, N), dtype=np.float32) 248 | if sign > 0: 249 | if int_d < N: 250 | y[:, int_d:] = x[:, : N - int_d] 251 | else: 252 | if int_d < N: 253 | y[:, : N - int_d] = x[:, int_d:] 254 | if frac > 1e-6: 255 | M = max(16, int(taps)) 256 | n = np.arange(M) 257 | m = (M - 1) / 2.0 258 | h = np.sinc(n - m - frac) 259 | w = np.hanning(M) 260 | h = (h * w).astype(np.float32) 261 | h /= np.sum(h) 262 | for c in range(C): 263 | yc = np.convolve(y[c], h, mode="same") 264 | y[c] = yc.astype(np.float32) 265 | return y 266 | 267 | 268 | # ----------------------------- 269 | # Node 1: Audio Align (XCorr) 270 | # ----------------------------- 271 | class Audio_Align_XCorr: 272 | CATEGORY = "Egregora/Analysis" 273 | RETURN_TYPES = ("AUDIO", "FLOAT", "FLOAT", "FLOAT", "IMAGE") 274 | RETURN_NAMES = ("audio_proc_aligned", "delay_samples", "delay_ms", "peak_corr", "debug_image") 275 | FUNCTION = "execute" 276 | 277 | @classmethod 278 | def INPUT_TYPES(cls): 279 | return { 280 | "required": { 281 | "audio_ref": ("AUDIO", {}), 282 | "audio_proc": ("AUDIO", {}), 283 | }, 284 | "optional": { 285 | "max_shift_ms": ("INT", {"default": 200, "min": 0, "max": 5000, "step": 1}), 286 | # COMBO: list[str] => dropdown 287 | "align_method": (["gcc-phat"], {}), 288 | "fractional": ("BOOLEAN", {"default": True}), 289 | "fir_len": ("INT", {"default": 64, "min": 16, "max": 256, "step": 1}), 290 | }, 291 | } 292 | 293 | def execute(self, audio_ref, audio_proc, max_shift_ms=200, align_method="gcc-phat", fractional=True, fir_len=64): 294 | ref = to_internal_audio(audio_ref) 295 | proc = to_internal_audio(audio_proc) 296 | # resample proc to ref.sr if needed (linear interp is fine for alignment) 297 | if proc["sample_rate"] != ref["sample_rate"]: 298 | sr_old = proc["sample_rate"] 299 | x = proc["samples"] 300 | C, N = x.shape 301 | new_N = int(round(N * ref["sample_rate"] / sr_old)) 302 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 303 | t_new = np.linspace(0.0, 1.0, new_N, endpoint=False) 304 | y = np.stack([np.interp(t_new, t_old, x[c]) for c in range(C)], axis=0).astype(np.float32) 305 | proc = make_audio(ref["sample_rate"], y, proc.get("meta", {})) 306 | 307 | a = ref["samples"].mean(axis=0) 308 | b = proc["samples"].mean(axis=0) 309 | n = min(a.size, b.size) 310 | a = a[:n] 311 | b = b[:n] 312 | 313 | max_shift = int(ref["sample_rate"] * (max_shift_ms / 1000.0)) 314 | lag = _xcorr_delay(a, b, ref["sample_rate"], max_shift) # +ve => proc lags 315 | 
delay_samples = float(lag) 316 | delay_ms = 1000.0 * delay_samples / ref["sample_rate"] 317 | 318 | aligned = _apply_frac_delay_CN(proc["samples"], -delay_samples if fractional else -round(delay_samples), taps=fir_len) 319 | aligned = _pad_or_crop_CN(aligned, ref["samples"].shape[1]) 320 | out = make_audio(ref["sample_rate"], aligned, proc.get("meta", {})) 321 | 322 | # minimal debug plot 323 | try: 324 | import matplotlib 325 | matplotlib.use("Agg") 326 | import matplotlib.pyplot as plt 327 | t = np.arange(n) 328 | fig, ax = plt.subplots(1, 1, figsize=(6, 2.2)) 329 | ax.plot(t, a, linewidth=0.5, label="A") 330 | ax.plot(t, b, linewidth=0.5, label="B") 331 | ax.legend(); ax.grid(alpha=.2); ax.set_title("Align preview") 332 | debug_img = image_from_figure(fig) 333 | except Exception: 334 | debug_img = _blank_image() 335 | 336 | return (out, float(delay_samples), float(delay_ms), 0.0, debug_img) 337 | 338 | 339 | # ----------------------------- 340 | # Node 2: Audio Gain Match 341 | # ----------------------------- 342 | class Audio_Gain_Match: 343 | CATEGORY = "Egregora/Analysis" 344 | RETURN_TYPES = ("AUDIO", "FLOAT", "FLOAT", "FLOAT") 345 | RETURN_NAMES = ("audio_matched", "gain_db", "ref_level", "in_level") 346 | FUNCTION = "execute" 347 | 348 | @classmethod 349 | def INPUT_TYPES(cls): 350 | return { 351 | "required": { 352 | "audio_ref": ("AUDIO", {}), 353 | "audio_in": ("AUDIO", {}), 354 | }, 355 | "optional": { 356 | # COMBO for mode 357 | "mode": (["LUFS-I", "RMS"], {}), 358 | "max_gain_db": ("FLOAT", {"default": 12.0, "min": -48.0, "max": 48.0, "step": 0.1}), 359 | }, 360 | } 361 | 362 | def execute(self, audio_ref, audio_in, mode="LUFS-I", max_gain_db=12.0): 363 | ref = to_internal_audio(audio_ref) 364 | inn = to_internal_audio(audio_in) 365 | if inn["sample_rate"] != ref["sample_rate"]: 366 | sr_old = inn["sample_rate"] 367 | x = inn["samples"] 368 | C, N = x.shape 369 | new_N = int(round(N * ref["sample_rate"] / sr_old)) 370 | t_old = np.linspace(0.0, 1.0, N, endpoint=False) 371 | t_new = np.linspace(0.0, 1.0, new_N, endpoint=False) 372 | y = np.stack([np.interp(t_new, t_old, x[c]) for c in range(C)], axis=0).astype(np.float32) 373 | inn = make_audio(ref["sample_rate"], y, inn.get("meta", {})) 374 | 375 | if str(mode).upper().startswith("LUFS"): 376 | ref_level = integrated_lufs(ref) 377 | in_level = integrated_lufs(inn) 378 | else: 379 | ref_level = _rms_db(ref["samples"].mean(axis=0)) 380 | in_level = _rms_db(inn["samples"].mean(axis=0)) 381 | gain_db = float(np.clip(ref_level - in_level, -abs(max_gain_db), abs(max_gain_db))) 382 | gain = 10 ** (gain_db / 20.0) 383 | y = (inn["samples"] * gain).astype(np.float32) 384 | out = make_audio(inn["sample_rate"], y, inn.get("meta", {})) 385 | return (out, float(gain_db), float(ref_level), float(in_level)) 386 | 387 | 388 | # ----------------------------- 389 | # Node 3: Audio Null Test (with metric toggles) 390 | # ----------------------------- 391 | class Audio_Null_Test: 392 | CATEGORY = "Egregora/Analysis" 393 | RETURN_TYPES = ("AUDIO", "DICT") 394 | RETURN_NAMES = ("audio_null", "metrics") 395 | FUNCTION = "execute" 396 | 397 | @classmethod 398 | def INPUT_TYPES(cls): 399 | return { 400 | "required": { 401 | "audio_ref": ("AUDIO", {}), 402 | "audio_proc_aligned_matched": ("AUDIO", {}), 403 | }, 404 | "optional": { 405 | "invert_b": ("BOOLEAN", {"default": True}), 406 | "least_squares_scale": ("BOOLEAN", {"default": False}), 407 | # Metric toggles 408 | "compute_corr": ("BOOLEAN", {"default": True}), 409 | "compute_null_rms": 
("BOOLEAN", {"default": True}), 410 | "compute_null_lufs": ("BOOLEAN", {"default": True}), 411 | "compute_lsd": ("BOOLEAN", {"default": True}), 412 | "compute_hf_residual": ("BOOLEAN", {"default": False}), 413 | # STFT controls (used only if LSD requested) 414 | "n_fft": ("INT", {"default": 2048, "min": 512, "max": 8192, "step": 128}), 415 | "hop": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 64}), 416 | "hf_band_hz": ("INT", {"default": 8000, "min": 1000, "max": 20000, "step": 100}), 417 | }, 418 | } 419 | 420 | def execute(self, audio_ref, audio_proc_aligned_matched, invert_b=True, least_squares_scale=False, 421 | compute_corr=True, compute_null_rms=True, compute_null_lufs=True, 422 | compute_lsd=True, compute_hf_residual=False, n_fft=2048, hop=512, hf_band_hz=8000): 423 | ref = to_internal_audio(audio_ref) 424 | pro = to_internal_audio(audio_proc_aligned_matched) 425 | if pro["sample_rate"] != ref["sample_rate"]: 426 | raise ValueError("Sample rate mismatch after alignment stage") 427 | A = ref["samples"] 428 | B = pro["samples"] 429 | N = min(A.shape[1], B.shape[1]) 430 | A = A[:, :N] 431 | B = B[:, :N] 432 | k = 1.0 433 | if least_squares_scale: 434 | a = A.mean(axis=0).astype(np.float64) 435 | b = B.mean(axis=0).astype(np.float64) 436 | denom = float(np.dot(b, b) + 1e-20) 437 | k = float(np.dot(a, b) / denom) 438 | B = (B * k).astype(np.float32) 439 | if invert_b: 440 | B = -B 441 | null = (A + B).astype(np.float32) 442 | 443 | metrics: Dict[str, Any] = {} 444 | a_m = A.mean(axis=0) 445 | b_m = (-B).mean(axis=0) 446 | 447 | if compute_corr: 448 | am = a_m - np.mean(a_m) 449 | bm = b_m - np.mean(b_m) 450 | corr = float(np.dot(am, bm) / (np.linalg.norm(am) * np.linalg.norm(bm) + 1e-20)) 451 | metrics["corr_coef"] = corr 452 | if compute_null_rms: 453 | metrics["null_rms_dbfs"] = float(_rms_db(null.mean(axis=0))) 454 | if compute_null_lufs: 455 | metrics["null_lufs"] = float(integrated_lufs(make_audio(ref["sample_rate"], null))) 456 | if compute_lsd: 457 | SA = _stft_mag(a_m, n_fft=n_fft, hop=hop) 458 | SB = _stft_mag(b_m, n_fft=n_fft, hop=hop) 459 | lsd_mean, lsd_p95 = _lsd(SA, SB) 460 | metrics["lsd_mean_db"] = float(lsd_mean) 461 | metrics["lsd_p95_db"] = float(lsd_p95) 462 | if compute_hf_residual: 463 | metrics["hf_residual_db"] = float(_band_energy_hi_db(null, ref["sample_rate"], hf_band_hz)) 464 | # Always include safety stats 465 | overs = int(np.sum(np.abs(null) > 1.0)) 466 | metrics["overshoot_count"] = int(overs) 467 | metrics["clipped_pct"] = float(100.0 * overs / null.size) 468 | metrics["scale_k"] = float(k) 469 | 470 | return make_audio(ref["sample_rate"], null, {}), metrics 471 | 472 | 473 | # ----------------------------- 474 | # Node 4: Audio Plotter (with draw toggles) 475 | # ----------------------------- 476 | class Audio_Plotter: 477 | CATEGORY = "Egregora/Visualization" 478 | RETURN_TYPES = ("IMAGE", "IMAGE", "IMAGE") 479 | RETURN_NAMES = ("image_waveforms", "image_spectrograms", "image_diffspec") 480 | FUNCTION = "execute" 481 | 482 | @classmethod 483 | def INPUT_TYPES(cls): 484 | return { 485 | "required": { 486 | "audio_ref": ("AUDIO", {}), 487 | "audio_proc": ("AUDIO", {}), 488 | "audio_null": ("AUDIO", {}), 489 | }, 490 | "optional": { 491 | "draw_waveforms": ("BOOLEAN", {"default": True}), 492 | "draw_spectrograms": ("BOOLEAN", {"default": True}), 493 | "draw_diffspec": ("BOOLEAN", {"default": True}), 494 | "n_fft": ("INT", {"default": 2048, "min": 512, "max": 8192, "step": 128}), 495 | "hop": ("INT", {"default": 512, "min": 64, "max": 4096, 
"step": 64}), 496 | }, 497 | } 498 | 499 | def execute(self, audio_ref, audio_proc, audio_null, draw_waveforms=True, draw_spectrograms=True, draw_diffspec=True, n_fft=2048, hop=512): 500 | import matplotlib 501 | matplotlib.use("Agg") 502 | import matplotlib.pyplot as plt 503 | 504 | ref = to_internal_audio(audio_ref) 505 | pro = to_internal_audio(audio_proc) 506 | nul = to_internal_audio(audio_null) 507 | 508 | a = ref["samples"].mean(axis=0) 509 | b = pro["samples"].mean(axis=0) 510 | n = min(a.size, b.size, nul["samples"].shape[1]) 511 | a = a[:n] 512 | b = b[:n] 513 | null = nul["samples"].mean(axis=0)[:n] 514 | 515 | # Waveforms 516 | if draw_waveforms: 517 | t = np.arange(n) 518 | fig1, axes = plt.subplots(3, 1, figsize=(10, 6), sharex=True) 519 | for ax, y, ttl in zip(axes, [a, b, null], ["A: original", "B: processed", "Null: A−B"]): 520 | ax.plot(t, y, linewidth=0.7) 521 | ax.set_ylim(-1.05, 1.05) 522 | ax.set_title(ttl) 523 | ax.grid(alpha=0.25) 524 | axes[-1].set_xlabel("samples") 525 | fig1.tight_layout() 526 | img_wave = image_from_figure(fig1) 527 | else: 528 | img_wave = _blank_image(1, 1) 529 | 530 | # Spectrograms (A, B, Null) 531 | if draw_spectrograms: 532 | def _spec(y): 533 | S = _stft_mag(y, n_fft=n_fft, hop=hop) 534 | return 20 * np.log10(S + 1e-9) 535 | SA = _spec(a) 536 | SB = _spec(b) 537 | SN = _spec(null) 538 | fig2, axes2 = plt.subplots(3, 1, figsize=(10, 7)) 539 | for ax, S, ttl in zip(axes2, [SA, SB, SN], ["A: spec", "B: spec", "Null: spec"]): 540 | ax.imshow(S, origin="lower", aspect="auto") 541 | ax.set_title(ttl) 542 | fig2.tight_layout() 543 | img_spec = image_from_figure(fig2) 544 | else: 545 | img_spec = _blank_image(1, 1) 546 | 547 | # Diff-spec |A-B| 548 | if draw_diffspec: 549 | def _spec(y): 550 | S = _stft_mag(y, n_fft=n_fft, hop=hop) 551 | return 20 * np.log10(S + 1e-9) 552 | SA = _spec(a) 553 | SB = _spec(b) 554 | D = np.abs(10 ** (SA / 20.0) - 10 ** (SB / 20.0)) 555 | fig3 = plt.figure(figsize=(10, 3)) 556 | import matplotlib.pyplot as plt # noqa 557 | plt.imshow(20 * np.log10(D + 1e-9), origin="lower", aspect="auto") 558 | plt.title("|Spec(A) − Spec(B)| (dB)") 559 | plt.tight_layout() 560 | img_diff = image_from_figure(fig3) 561 | else: 562 | img_diff = _blank_image(1, 1) 563 | 564 | return (img_wave, img_spec, img_diff) 565 | 566 | 567 | # ----------------------------- 568 | # Node 5: Null Test (Full) – with toggles exposed 569 | # ----------------------------- 570 | class Null_Test_Full: 571 | CATEGORY = "Egregora/Analysis" 572 | RETURN_TYPES = ("AUDIO", "AUDIO", "FLOAT", "FLOAT", "DICT", "IMAGE", "IMAGE", "IMAGE") 573 | RETURN_NAMES = ( 574 | "audio_proc_aligned_matched", 575 | "audio_null", 576 | "delay_ms", 577 | "gain_db", 578 | "metrics", 579 | "image_waveforms", 580 | "image_spectrograms", 581 | "image_diffspec", 582 | ) 583 | FUNCTION = "execute" 584 | 585 | @classmethod 586 | def INPUT_TYPES(cls): 587 | return { 588 | "required": { 589 | "audio_ref": ("AUDIO", {}), 590 | "audio_proc": ("AUDIO", {}), 591 | }, 592 | "optional": { 593 | # Align/Gain params 594 | "align_max_shift_ms": ("INT", {"default": 200, "min": 0, "max": 5000, "step": 1}), 595 | "align_method": (["gcc-phat"], {}), 596 | "fractional": ("BOOLEAN", {"default": True}), 597 | "fir_len": ("INT", {"default": 64, "min": 16, "max": 256, "step": 1}), 598 | "match_mode": (["LUFS-I", "RMS"], {}), 599 | "least_squares_scale": ("BOOLEAN", {"default": False}), 600 | # Metric toggles 601 | "compute_corr": ("BOOLEAN", {"default": True}), 602 | "compute_null_rms": ("BOOLEAN", 
{"default": True}), 603 | "compute_null_lufs": ("BOOLEAN", {"default": True}), 604 | "compute_lsd": ("BOOLEAN", {"default": True}), 605 | "compute_hf_residual": ("BOOLEAN", {"default": False}), 606 | # Plot toggles 607 | "draw_waveforms": ("BOOLEAN", {"default": True}), 608 | "draw_spectrograms": ("BOOLEAN", {"default": True}), 609 | "draw_diffspec": ("BOOLEAN", {"default": True}), 610 | # STFT controls 611 | "n_fft": ("INT", {"default": 2048, "min": 512, "max": 8192, "step": 128}), 612 | "hop": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 64}), 613 | }, 614 | } 615 | 616 | def execute(self, audio_ref, audio_proc, align_max_shift_ms=200, align_method="gcc-phat", fractional=True, 617 | fir_len=64, match_mode="LUFS-I", least_squares_scale=False, 618 | compute_corr=True, compute_null_rms=True, compute_null_lufs=True, 619 | compute_lsd=True, compute_hf_residual=False, 620 | draw_waveforms=True, draw_spectrograms=True, draw_diffspec=True, 621 | n_fft=2048, hop=512): 622 | # 1) Align 623 | align = Audio_Align_XCorr() 624 | ap_aligned, delay_samples, delay_ms, _pc, _dbg = align.execute( 625 | audio_ref, audio_proc, 626 | max_shift_ms=align_max_shift_ms, 627 | align_method=align_method, 628 | fractional=fractional, 629 | fir_len=fir_len, 630 | ) 631 | # 2) Gain-match 632 | gm = Audio_Gain_Match() 633 | ap_matched, gain_db, _ref_lvl, _in_lvl = gm.execute(audio_ref, ap_aligned, mode=match_mode) 634 | # 3) Null (+ metrics) 635 | nt = Audio_Null_Test() 636 | audio_null, metrics = nt.execute( 637 | audio_ref, ap_matched, 638 | invert_b=True, 639 | least_squares_scale=least_squares_scale, 640 | compute_corr=compute_corr, 641 | compute_null_rms=compute_null_rms, 642 | compute_null_lufs=compute_null_lufs, 643 | compute_lsd=compute_lsd, 644 | compute_hf_residual=compute_hf_residual, 645 | n_fft=n_fft, hop=hop, 646 | ) 647 | # 4) Plots (respect draw toggles) 648 | pl = Audio_Plotter() 649 | img_waves, img_spec, img_diff = pl.execute( 650 | audio_ref, ap_matched, audio_null, 651 | draw_waveforms=draw_waveforms, 652 | draw_spectrograms=draw_spectrograms, 653 | draw_diffspec=draw_diffspec, 654 | n_fft=n_fft, hop=hop, 655 | ) 656 | 657 | return ap_matched, audio_null, float(delay_ms), float(gain_db), metrics, img_waves, img_spec, img_diff 658 | 659 | 660 | # ----------------------------- 661 | # Registration (original names maintained for retro-compatibility) 662 | # ----------------------------- 663 | NODE_CLASS_MAPPINGS = { 664 | "Audio Align (XCorr)": Audio_Align_XCorr, 665 | "Audio Gain Match": Audio_Gain_Match, 666 | "Audio Null Test": Audio_Null_Test, 667 | "Audio Plotter": Audio_Plotter, 668 | "Null Test (Full)": Null_Test_Full, 669 | } 670 | 671 | NODE_DISPLAY_NAME_MAPPINGS = { 672 | "Audio Align (XCorr)": "Audio Align (XCorr)", 673 | "Audio Gain Match": "Audio Gain Match", 674 | "Audio Null Test": "Audio Null Test", 675 | "Audio Plotter": "Audio Plotter", 676 | "Null Test (Full)": "Null Test (Full)", 677 | } 678 | -------------------------------------------------------------------------------- /egregora_audio_enhance_extras.py: -------------------------------------------------------------------------------- 1 | # Egregora Enhance Extras - Fixed Version 2 | # Adds: RNNoise Denoise, WPE Dereverb, DeepFilterNet Denoise, DAC Encode/Decode, ViSQOL Meter 3 | # Licenses: 4 | # - RNNoise wrappers (pyrnnoise): Apache-2.0 5 | # - NARA-WPE: MIT 6 | # - DeepFilterNet: MIT/Apache-2.0 (dual) 7 | # - DAC: MIT 8 | # - ViSQOL (binary) + Audiocraft wrapper docs: Apache-2.0 (wrapper), ViSQOL itself under 
Apache-2.0 9 | 10 | import os 11 | import io 12 | import json 13 | import math 14 | import subprocess 15 | from pathlib import Path 16 | from typing import Dict, Tuple, Optional 17 | 18 | import torch 19 | import torchaudio 20 | import numpy as np 21 | 22 | # ---------------------------- 23 | # Small audio helpers (Comfy-style) 24 | # ---------------------------- 25 | 26 | def _is_audio(x): 27 | return isinstance(x, dict) and "waveform" in x and "sample_rate" in x 28 | 29 | def _coerce_audio(x): 30 | # Returns (wave[B,C,T], sr:int, meta:dict) 31 | if _is_audio(x): 32 | wav = x["waveform"] 33 | sr = int(x["sample_rate"]) 34 | meta = x.get("meta", {}) 35 | if wav.dim() == 2: 36 | # [C,T] -> [1,C,T] 37 | wav = wav.unsqueeze(0) 38 | elif wav.dim() == 1: 39 | # [T] -> [1,1,T] 40 | wav = wav.unsqueeze(0).unsqueeze(0) 41 | elif wav.dim() != 3: 42 | raise ValueError("Audio waveform must be 1D, 2D or 3D [B,C,T].") 43 | return wav.float(), sr, meta 44 | # Torch tensor passthrough (assume [C,T] or [B,C,T] with default sr=48000) 45 | if isinstance(x, torch.Tensor): 46 | wav = x 47 | if wav.dim() == 2: # [C,T] -> [1,C,T] 48 | wav = wav.unsqueeze(0) 49 | elif wav.dim() != 3: 50 | raise ValueError("Tensor audio must be [C,T] or [B,C,T].") 51 | return wav.float(), 48000, {} 52 | raise TypeError("Unsupported audio input type.") 53 | 54 | def _make_audio(sr: int, wav: torch.Tensor, meta: Optional[dict] = None): 55 | # Ensure [B,C,T] 56 | if wav.dim() == 2: 57 | wav = wav.unsqueeze(0) 58 | if wav.dim() != 3: 59 | raise ValueError("samples must be 1D/2D/3D; got shape %r" % (wav.shape,)) 60 | return { 61 | "waveform": wav.contiguous(), 62 | "sample_rate": int(sr), 63 | "meta": meta or {}, 64 | } 65 | 66 | def _resample(wav: torch.Tensor, sr_in: int, sr_out: int): 67 | if sr_in == sr_out: 68 | return wav, sr_in 69 | B, C, T = wav.shape 70 | res = [] 71 | for b in range(B): 72 | # torchaudio expects [C,T] 73 | res.append(torchaudio.functional.resample(wav[b], sr_in, sr_out)) 74 | wav_out = torch.stack(res, dim=0) 75 | return wav_out, sr_out 76 | 77 | def _to_mono(wav: torch.Tensor): 78 | # [B,C,T] -> [B,1,T] 79 | if wav.size(1) == 1: 80 | return wav 81 | return wav.mean(dim=1, keepdim=True) 82 | 83 | def _device_for(wav: torch.Tensor): 84 | return "cuda" if wav.is_cuda else ("cuda" if torch.cuda.is_available() else "cpu") 85 | 86 | # ---------------------------- 87 | # RNNoise (pyrnnoise) 88 | # ---------------------------- 89 | 90 | class Egregora_RNNoise_Denoise: 91 | """ 92 | RNNoise denoiser (speech-focused), ComfyUI node. 93 | • Runs at 48 kHz (10 ms = 480 samples). 94 | • Mono/stereo: per-channel or downmix to mono. 95 | • Uses pyrnnoise>=0.3.x 'denoise_chunk' API. 96 | • Adds static strength + adaptive mix (driven by per-frame VAD) + post-gain with ceiling. 
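    • Call sketch (parameter values illustrative; `audio` is any ComfyUI AUDIO
      dict, and the node returns a one-tuple):
          (denoised,) = Egregora_RNNoise_Denoise().execute(
              audio, strength=0.8, adaptive_mode="more_on_noise")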
97 |     """
98 |     @classmethod
99 |     def INPUT_TYPES(cls):
100 |         return {
101 |             "required": {
102 |                 "audio": ("AUDIO",),
103 |                 "frame_ms": ("INT", {"default": 20, "min": 5, "max": 60, "step": 5}),
104 |                 "stereo_mode": (["per_channel", "downmix_mono"], {"default": "per_channel"}),
105 | 
106 |                 # mix controls
107 |                 "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
108 |                 "mix_curve": (["equal_power", "linear"], {"default": "equal_power"}),
109 | 
110 |                 # adaptive controls
111 |                 "adaptive_mode": (["off", "more_on_noise", "more_on_speech", "gate_on_noise"], {"default": "more_on_noise"}),
112 |                 "adaptive_amount": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01}),
113 |                 "vad_threshold": ("FLOAT", {"default": 0.90, "min": 0.0, "max": 1.0, "step": 0.01}),
114 |                 "vad_smooth_ms": ("INT", {"default": 50, "min": 0, "max": 500, "step": 5}),
115 | 
116 |                 # post gain
117 |                 "post_gain_db": ("FLOAT", {"default": 0.0, "min": -24.0, "max": 24.0, "step": 0.1}),
118 |                 "limit_ceiling": ("BOOLEAN", {"default": True}),  # BOOLEAN renders a real checkbox; "BOOL" is not a ComfyUI widget type
119 |                 "ceiling": ("FLOAT", {"default": 0.999, "min": 0.1, "max": 1.0, "step": 0.001}),
120 |             }
121 |         }
122 | 
123 |     RETURN_TYPES = ("AUDIO",)
124 |     FUNCTION = "execute"
125 |     CATEGORY = "Egregora/Enhance"
126 | 
127 |     # ---------- helpers ----------
128 |     def _silence_destructor(self, rn):
129 |         try:
130 |             type(rn).__del__ = lambda self: None
131 |         except Exception:
132 |             pass
133 | 
134 |     def _init_rn(self, channels: int):
135 |         from pyrnnoise import RNNoise
136 |         rn = RNNoise(sample_rate=48000)
137 |         try:
138 |             if getattr(rn, "channels", None) in (None, 0):
139 |                 setattr(rn, "channels", channels)
140 |         except Exception:
141 |             pass
142 |         return rn
143 | 
144 |     def _denoise_chunk_with_probs(self, rn, x_i16):
145 |         """
146 |         Preferred path on pyrnnoise>=0.3.x: returns (wet_i16, vad_probs_per_frame)
147 |         where each frame is 480 samples at 48 kHz.
148 |         """
149 |         import numpy as np
150 |         pad = (-len(x_i16)) % 480
151 |         x_pad = np.pad(x_i16, (0, pad), mode="constant") if pad else x_i16
152 | 
153 |         out_frames, probs = [], []
154 |         x2 = x_pad[np.newaxis, :]  # (1, N)
155 |         for p, den in rn.denoise_chunk(x2):
156 |             # p may be scalar or array-like; we're per-channel, so take float(p)
157 |             try:
158 |                 p_val = float(p[0]) if hasattr(p, "__len__") else float(p)
159 |             except Exception:
160 |                 p_val = float(p)
161 |             probs.append(p_val)
162 | 
163 |             den = np.asarray(den, dtype=np.int16)
164 |             if den.ndim == 2 and den.shape[0] == 1:
165 |                 den = den[0]
166 |             out_frames.append(den)
167 | 
168 |         wet = np.concatenate(out_frames, axis=0)
169 |         return wet[:len(x_i16)], np.asarray(probs, dtype=np.float32)
170 | 
171 |     def _fallback_frame_loop(self, rn, x_i16, frame_len):
172 |         """
173 |         Very old wheels only: try process_frame/filter; else passthrough.
174 |         (No VAD probs here, so adaptive becomes effectively 'off' on fallback.)
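        Framing arithmetic, for reference: RNNoise frames are 10 ms = 480 samples
        at 48 kHz, so frame_ms=20 gives frame_len = int(48000 * 20 / 1000) = 960,
        which the loop below keeps as-is since it is already a multiple of 480.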
175 | """ 176 | import numpy as np 177 | call = None 178 | if hasattr(rn, "process_frame"): 179 | call = lambda fr: np.asarray(rn.process_frame(fr), dtype=np.int16) 180 | elif hasattr(rn, "filter"): 181 | call = lambda fr: np.asarray(rn.filter(fr), dtype=np.int16) 182 | if call is None: 183 | return x_i16, None 184 | 185 | frame_len = max(1, frame_len // 480) * 480 186 | pad = (-len(x_i16)) % frame_len 187 | x_work = np.pad(x_i16, (0, pad), mode="constant") if pad else x_i16 188 | 189 | outs = [] 190 | for start in range(0, len(x_work), frame_len): 191 | chunk = x_work[start:start + frame_len] 192 | pos, sub = 0, [] 193 | while pos < frame_len: 194 | fr = chunk[pos:pos + 480] 195 | if fr.shape[0] < 480: 196 | fr = np.pad(fr, (0, 480 - fr.shape[0]), mode="constant") 197 | try: 198 | y = call(fr) 199 | except Exception: 200 | y = fr 201 | sub.append(y) 202 | pos += 480 203 | outs.append(np.concatenate(sub, axis=0)) 204 | out = np.concatenate(outs, axis=0) 205 | return out[:len(x_i16)], None 206 | 207 | def _smooth_vad_probs(self, probs, smooth_ms: int): 208 | import numpy as np, math 209 | if probs is None or probs.size == 0 or smooth_ms <= 0: 210 | return probs 211 | hop_ms = 10.0 # RNNoise frame = 10 ms @ 48 kHz 212 | tau = max(1e-3, float(smooth_ms)) 213 | alpha = math.exp(-hop_ms / tau) 214 | y = np.empty_like(probs) 215 | acc = probs[0] 216 | for i, p in enumerate(probs): 217 | acc = alpha * acc + (1.0 - alpha) * p 218 | y[i] = acc 219 | return y 220 | 221 | def _strength_per_frame(self, base_s, vad_smooth, adaptive_mode, adaptive_amount, vad_threshold): 222 | import numpy as np 223 | if vad_smooth is None: 224 | return np.array([base_s], dtype=np.float32) # will be broadcast 225 | s0 = float(base_s) 226 | a = float(adaptive_amount) 227 | v = np.clip(vad_smooth, 0.0, 1.0) 228 | if adaptive_mode == "off": 229 | s_eff = np.full_like(v, s0, dtype=np.float32) 230 | elif adaptive_mode == "more_on_noise": 231 | # more denoise when speech-prob low 232 | s_eff = s0 + a * (1.0 - v) * (1.0 - s0) 233 | elif adaptive_mode == "more_on_speech": 234 | # more denoise when speech-prob high 235 | s_eff = s0 + a * v * (1.0 - s0) 236 | elif adaptive_mode == "gate_on_noise": 237 | # if below threshold => denoise-heavy; else denoise-light 238 | s_noise = s0 + a * (1.0 - s0) # push toward 1 239 | s_speech = s0 * (1.0 - a) # pull toward 0 240 | s_eff = np.where(v < vad_threshold, s_noise, s_speech).astype(np.float32) 241 | else: 242 | s_eff = np.full_like(v, s0, dtype=np.float32) 243 | return np.clip(s_eff.astype(np.float32), 0.0, 1.0) 244 | 245 | def _gains_from_strength(self, s_eff, curve): 246 | import numpy as np, math 247 | s = np.clip(s_eff, 0.0, 1.0).astype(np.float32) 248 | if curve == "equal_power": 249 | # equal-power crossfade: keep power ~constant 250 | g_wet = np.sin(0.5 * math.pi * s, dtype=np.float32) 251 | g_dry = np.cos(0.5 * math.pi * s, dtype=np.float32) 252 | else: 253 | # linear 254 | g_wet = s 255 | g_dry = 1.0 - s 256 | return g_dry.astype(np.float32), g_wet.astype(np.float32) 257 | 258 | # ---------- main ---------- 259 | def execute( 260 | self, 261 | audio, 262 | frame_ms=20, 263 | stereo_mode="per_channel", 264 | strength=1.0, 265 | mix_curve="equal_power", 266 | adaptive_mode="more_on_noise", 267 | adaptive_amount=0.5, 268 | vad_threshold=0.90, 269 | vad_smooth_ms=50, 270 | post_gain_db=0.0, 271 | limit_ceiling=True, 272 | ceiling=0.999, 273 | ): 274 | import numpy as np 275 | import torch 276 | import math 277 | 278 | # Coerce to [B,C,T], resample to 48k (RNNoise domain) 279 | wav, 
sr, meta = _coerce_audio(audio)
280 |         wav48, _ = _resample(wav, sr, 48000)
281 | 
282 |         if stereo_mode == "downmix_mono":
283 |             wav48 = _to_mono(wav48)
284 | 
285 |         B, C, T = wav48.shape
286 |         frame_len = int(48000 * max(5, min(60, frame_ms)) / 1000)
287 | 
288 |         out_batches = []
289 |         for b in range(B):
290 |             ch_out = []
291 |             for c in range(C):
292 |                 dry = wav48[b, c].detach()  # float32 [-1,1] at 48k
293 |                 x = dry.cpu().numpy().astype(np.float32)
294 |                 x_i16 = (np.clip(x, -1.0, 1.0) * 32767.0).astype(np.int16)
295 | 
296 |                 rn = self._init_rn(channels=1)
297 | 
298 |                 if hasattr(rn, "denoise_chunk"):
299 |                     try:
300 |                         wet_i16, probs = self._denoise_chunk_with_probs(rn, x_i16)
301 |                     except Exception:
302 |                         self._silence_destructor(rn)
303 |                         rn = self._init_rn(channels=1)
304 |                         wet_i16, probs = self._fallback_frame_loop(rn, x_i16, frame_len)
305 |                 else:
306 |                     wet_i16, probs = self._fallback_frame_loop(rn, x_i16, frame_len)
307 | 
308 |                 wet = torch.from_numpy(wet_i16.astype(np.float32) / 32768.0).to(dry.device)
309 | 
310 |                 # ----- Adaptive mixing -----
311 |                 vad_s = self._smooth_vad_probs(probs, vad_smooth_ms)
312 |                 s_eff = self._strength_per_frame(strength, vad_s, adaptive_mode, adaptive_amount, vad_threshold)
313 |                 # expand per-frame strengths (10 ms = 480 samples) to per-sample gains
314 |                 if s_eff.size == 1:  # covers the no-VAD fallback, which yields a single strength for the whole clip
315 |                     s_per_sample = np.full(T, float(s_eff.ravel()[0]), dtype=np.float32)
316 |                 else:
317 |                     s_per_sample = np.repeat(s_eff, 480)[:T].astype(np.float32)
318 | 
319 |                 g_dry_np, g_wet_np = self._gains_from_strength(s_per_sample, mix_curve)
320 |                 g_dry = torch.from_numpy(g_dry_np).to(dry.device)
321 |                 g_wet = torch.from_numpy(g_wet_np).to(dry.device)
322 | 
323 |                 y = g_dry * dry + g_wet * wet
324 |                 y = torch.clamp(y, -1.0, 1.0)
325 | 
326 |                 ch_out.append(y)
327 | 
328 |             y_st = torch.stack(ch_out, dim=0).unsqueeze(0)  # [1,C,T]
329 |             out_batches.append(y_st)
330 | 
331 |         y48 = torch.cat(out_batches, dim=0)  # [B,C,T]
332 | 
333 |         # Back to original sample rate
334 |         y, _ = _resample(y48, 48000, sr)
335 | 
336 |         # ----- Post-gain + optional ceiling limiter -----
337 |         if post_gain_db != 0.0:
338 |             gain = float(10.0 ** (post_gain_db / 20.0))
339 |             y = y * gain
340 | 
341 |         if limit_ceiling:
342 |             peak = torch.max(torch.abs(y)).item()
343 |             if peak > ceiling and peak > 0:
344 |                 y = y * (ceiling / peak)
345 | 
346 |         y = torch.clamp(y, -1.0, 1.0)
347 | 
348 |         meta2 = dict(meta)
349 |         meta2["rnnoise"] = {
350 |             "frame_ms": frame_ms,
351 |             "stereo_mode": stereo_mode,
352 |             "strength": strength,
353 |             "mix_curve": mix_curve,
354 |             "adaptive_mode": adaptive_mode,
355 |             "adaptive_amount": adaptive_amount,
356 |             "vad_threshold": vad_threshold,
357 |             "vad_smooth_ms": vad_smooth_ms,
358 |             "post_gain_db": post_gain_db,
359 |             "limit_ceiling": bool(limit_ceiling),
360 |             "ceiling": ceiling,
361 |         }
362 |         return (_make_audio(sr, y, meta2),)
363 | 
364 | # ----------------------------
365 | # WPE Dereverb (nara_wpe)
366 | # ----------------------------
367 | 
368 | class Egregora_WPE_Dereverb:
369 |     """
370 |     Weighted Prediction Error dereverberation.
371 |     Works mono or multi-channel. Uses STFT -> WPE -> iSTFT.
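    The shape convention follows the nara_wpe README; a condensed sketch of the
    calls made below (assuming numpy as np and a (channels, samples) array y):

        from nara_wpe import wpe as np_wpe
        from nara_wpe.utils import stft, istft
        Y = stft(y, size=1024, shift=256)                 # (channels, frames, freqs)
        Z = np_wpe.wpe(np.transpose(Y, (2, 0, 1)),
                       taps=10, delay=3, iterations=3)    # wpe() wants (freqs, channels, frames)
        z = istft(np.transpose(Z, (1, 2, 0)), size=1024, shift=256)  # back to (channels, samples)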
372 |     """
373 |     @classmethod
374 |     def INPUT_TYPES(cls):
375 |         return {
376 |             "required": {
377 |                 "audio": ("AUDIO",),
378 |                 "taps": ("INT", {"default": 10, "min": 3, "max": 32}),
379 |                 "delay": ("INT", {"default": 3, "min": 1, "max": 16}),
380 |                 "iterations": ("INT", {"default": 3, "min": 1, "max": 10}),
381 |                 "n_fft": ("INT", {"default": 1024, "min": 256, "max": 4096, "step": 256}),
382 |                 "hop": ("INT", {"default": 256, "min": 64, "max": 1024, "step": 64}),
383 |                 "use_float32": ("BOOLEAN", {"default": True}),
384 |             }
385 |         }
386 | 
387 |     RETURN_TYPES = ("AUDIO",)
388 |     FUNCTION = "execute"
389 |     CATEGORY = "Egregora/Enhance"
390 | 
391 |     def execute(self, audio, taps=10, delay=3, iterations=3, n_fft=1024, hop=256, use_float32=True):
392 |         try:
393 |             import numpy as np
394 |             from nara_wpe import wpe as np_wpe
395 |             from nara_wpe.utils import stft, istft
396 |         except Exception as e:
397 |             raise RuntimeError("nara-wpe not installed. pip install nara-wpe") from e
398 | 
399 |         wav, sr, meta = _coerce_audio(audio)  # [B,C,T]
400 |         B, C, T = wav.shape
401 | 
402 |         out_list = []
403 |         for b in range(B):
404 |             # nara_wpe expects numpy with shape (channels, samples)
405 |             y = wav[b].cpu().numpy()  # [C,T]
406 | 
407 |             # Keep memory in check on long files by working in float32
408 |             if use_float32:
409 |                 y = y.astype(np.float32)
410 | 
411 |             try:
412 |                 # STFT over the last axis: returns (channels, frames, freqs)
413 |                 Y = stft(y, size=n_fft, shift=hop)
414 | 
415 |                 # Downcast the complex spectra too when float32 is requested
416 |                 if Y.dtype == np.complex128 and use_float32:
417 |                     Y = Y.astype(np.complex64)
418 | 
419 |                 # Rearrange to (freqs, channels, frames) as expected by wpe()
420 |                 Y = np.transpose(Y, (2, 0, 1))
421 | 
422 |                 # Apply WPE with memory-conscious settings
423 |                 Z = np_wpe.wpe(Y, taps=taps, delay=delay, iterations=iterations)
424 | 
425 |                 # Back to (channels, frames, freqs) for istft
426 |                 Z = np.transpose(Z, (1, 2, 0))
427 |                 z = istft(Z, size=n_fft, shift=hop)  # (channels, samples)
428 | 
429 |             except MemoryError:
430 |                 # Fallback: pass the batch through untouched rather than crash
431 |                 print(f"Warning: WPE processing failed due to memory constraints for batch {b}")
432 |                 z = y  # Pass through original audio
433 |             except Exception as e:
434 |                 print(f"Warning: WPE processing failed: {e}")
435 |                 z = y  # Pass through original audio
436 | 
437 |             z_t = torch.from_numpy(z).to(wav.device).float()  # [C,T]
438 |             out_list.append(z_t.unsqueeze(0))  # [1,C,T]
439 | 
440 |         out = torch.cat(out_list, dim=0)
441 |         meta2 = dict(meta)
442 |         meta2["wpe"] = {"taps": taps, "delay": delay, "iterations": iterations, "n_fft": n_fft, "hop": hop}
443 |         return (_make_audio(sr, out, meta2),)
444 | 
445 | 
446 | # ----------------------------
447 | # DeepFilterNet (DFN/DFN2/DFN3)
448 | # ----------------------------
449 | 
450 | class Egregora_DeepFilterNet_Denoise:
451 |     """
452 |     DeepFilterNet denoiser (speech enhancement) for ComfyUI.
453 | 
454 |     • Runs DeepFilterNet at 48 kHz (its native rate), using tensor I/O.
455 |     • Mono or stereo (per-channel or downmix to mono before DFN).
456 |     • Adds 'strength' wet/dry mix with equal-power or linear curve.
457 |     • Adaptive mix driven by VAD (RNNoise if available, else energy/RMS proxy).
458 |     • Post-gain (dB) and a simple peak ceiling limiter.
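    • The equal-power curve keeps g_dry² + g_wet² = 1 for every strength s,
      since g_wet = sin(πs/2) and g_dry = cos(πs/2); a quick self-check:
          s = np.linspace(0.0, 1.0, 5)
          assert np.allclose(np.sin(0.5*np.pi*s)**2 + np.cos(0.5*np.pi*s)**2, 1.0)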
459 | """ 460 | 461 | @classmethod 462 | def INPUT_TYPES(cls): 463 | return { 464 | "required": { 465 | "audio": ("AUDIO",), 466 | 467 | # DFN options 468 | "dfn_model": (["DeepFilterNet2", "DeepFilterNet3"], {"default": "DeepFilterNet2"}), 469 | "device": (["auto", "cuda:0", "cpu"], {"default": "auto"}), 470 | 471 | # proper BOOLEAN toggles (not sockets) 472 | "use_postfilter": ("BOOLEAN", {"default": False, "label_on": "postfilter on", "label_off": "postfilter off"}), 473 | "limit_ceiling": ("BOOLEAN", {"default": True, "label_on": "limit on", "label_off": "limit off"}), 474 | 475 | # channel / framing 476 | "stereo_mode": (["per_channel", "downmix_mono"], {"default": "per_channel"}), 477 | "frame_ms": ("INT", {"default": 20, "min": 5, "max": 60, "step": 5}), 478 | 479 | # mixing 480 | "strength": ("FLOAT", {"default": 0.65, "min": 0.0, "max": 1.0, "step": 0.01}), 481 | "mix_curve": (["equal_power", "linear"], {"default": "equal_power"}), 482 | 483 | # adaptive controls 484 | "adaptive_vad_source": (["rms", "rnnoise", "none"], {"default": "rms"}), 485 | "adaptive_mode": (["off", "more_on_noise", "more_on_speech", "gate_on_noise"], {"default": "more_on_noise"}), 486 | "adaptive_amount": ("FLOAT", {"default": 0.45, "min": 0.0, "max": 1.0, "step": 0.01}), 487 | "vad_threshold": ("FLOAT", {"default": 0.90, "min": 0.0, "max": 1.0, "step": 0.01}), 488 | "vad_smooth_ms": ("INT", {"default": 60, "min": 0, "max": 500, "step": 5}), 489 | 490 | # post 491 | "post_gain_db": ("FLOAT", {"default": 0.5, "min": -24.0, "max": 24.0, "step": 0.1}), 492 | "ceiling": ("FLOAT", {"default": 0.98, "min": 0.1, "max": 1.0, "step": 0.001}), 493 | } 494 | } 495 | 496 | RETURN_TYPES = ("AUDIO",) 497 | FUNCTION = "execute" 498 | CATEGORY = "Egregora/Enhance" 499 | 500 | # ------------------------- DFN backend & cache ------------------------- 501 | _DF_CACHE = {} # (model_name, device) -> (model, df_state) 502 | 503 | def _pick_device(self, choice: str): 504 | import torch 505 | if choice == "auto": 506 | return "cuda:0" if torch.cuda.is_available() else "cpu" 507 | return choice 508 | 509 | def _df_get(self, model_name: str, device: str): 510 | from df.enhance import init_df 511 | key = (model_name, device) 512 | if key in self._DF_CACHE: 513 | return self._DF_CACHE[key] 514 | model, df_state, _ = init_df(model_name, config_allow_defaults=True) 515 | model = model.to(device).eval() 516 | self._DF_CACHE[key] = (model, df_state) 517 | return model, df_state 518 | 519 | # ----------------------------- VAD helpers ----------------------------- 520 | def _vad_probs_rnnoise_48k(self, x48_np): 521 | import numpy as np 522 | try: 523 | from pyrnnoise import RNNoise 524 | except Exception: 525 | return None # RNNoise not installed 526 | 527 | x_i16 = (np.clip(x48_np, -1.0, 1.0) * 32767.0).astype(np.int16) 528 | rn = RNNoise(sample_rate=48000) 529 | try: 530 | if getattr(rn, "channels", None) in (None, 0): 531 | setattr(rn, "channels", 1) 532 | except Exception: 533 | pass 534 | 535 | probs = [] 536 | if hasattr(rn, "denoise_chunk"): 537 | pad = (-len(x_i16)) % 480 538 | x_pad = np.pad(x_i16, (0, pad), mode="constant") if pad else x_i16 539 | X = x_pad[np.newaxis, :] 540 | for p, _ in rn.denoise_chunk(X): 541 | try: 542 | probs.append(float(p[0]) if hasattr(p, "__len__") else float(p)) 543 | except Exception: 544 | probs.append(float(p)) 545 | return np.asarray(probs, dtype=np.float32) 546 | return None # fallback APIs don't expose p 547 | 548 | def _vad_probs_rms_48k(self, x48_np): 549 | import numpy as np 550 | hop = 480 # 
10 ms at 48 kHz 551 | n = (len(x48_np) + hop - 1) // hop 552 | rms = [] 553 | for i in range(n): 554 | fr = x48_np[i*hop:(i+1)*hop] 555 | rms.append(float(np.sqrt(np.mean(fr*fr))) if len(fr) else 0.0) 556 | rms = np.asarray(rms, dtype=np.float32) 557 | p95 = float(np.percentile(rms, 95)) or 1e-6 558 | return np.clip(rms / p95, 0.0, 1.0).astype(np.float32) 559 | 560 | def _smooth_probs(self, probs, smooth_ms: int): 561 | import numpy as np, math 562 | if probs is None or probs.size == 0 or smooth_ms <= 0: 563 | return probs 564 | hop_ms = 10.0 565 | tau = max(1e-3, float(smooth_ms)) 566 | alpha = math.exp(-hop_ms / tau) 567 | y = np.empty_like(probs) 568 | acc = probs[0] 569 | for i, p in enumerate(probs): 570 | acc = alpha * acc + (1.0 - alpha) * p 571 | y[i] = acc 572 | return y 573 | 574 | def _strength_per_frame(self, base_s, vad_smooth, adaptive_mode, adaptive_amount, vad_threshold): 575 | import numpy as np 576 | if vad_smooth is None: 577 | return np.array([float(base_s)], dtype=np.float32) 578 | s0 = float(base_s) 579 | a = float(adaptive_amount) 580 | v = np.clip(vad_smooth, 0.0, 1.0) 581 | if adaptive_mode == "off": 582 | s_eff = np.full_like(v, s0, dtype=np.float32) 583 | elif adaptive_mode == "more_on_noise": 584 | s_eff = s0 + a * (1.0 - v) * (1.0 - s0) 585 | elif adaptive_mode == "more_on_speech": 586 | s_eff = s0 + a * v * (1.0 - s0) 587 | elif adaptive_mode == "gate_on_noise": 588 | s_noise = s0 + a * (1.0 - s0) 589 | s_speech = s0 * (1.0 - a) 590 | s_eff = (s_noise * (v < vad_threshold) + s_speech * (v >= vad_threshold)).astype(np.float32) 591 | else: 592 | s_eff = np.full_like(v, s0, dtype=np.float32) 593 | return np.clip(s_eff, 0.0, 1.0).astype(np.float32) 594 | 595 | def _gains_from_strength(self, s_eff, curve): 596 | import numpy as np, math 597 | s = np.clip(s_eff, 0.0, 1.0).astype(np.float32) 598 | if curve == "equal_power": 599 | g_wet = np.sin(0.5 * math.pi * s, dtype=np.float32) 600 | g_dry = np.cos(0.5 * math.pi * s, dtype=np.float32) 601 | else: 602 | g_wet = s 603 | g_dry = 1.0 - s 604 | return g_dry.astype(np.float32), g_wet.astype(np.float32) 605 | 606 | # ------------------------------ main op ------------------------------ 607 | def execute( 608 | self, 609 | audio, 610 | dfn_model="DeepFilterNet2", 611 | device="auto", 612 | use_postfilter=False, 613 | limit_ceiling=True, 614 | stereo_mode="per_channel", 615 | frame_ms=20, 616 | strength=0.65, 617 | mix_curve="equal_power", 618 | adaptive_vad_source="rms", 619 | adaptive_mode="more_on_noise", 620 | adaptive_amount=0.45, 621 | vad_threshold=0.90, 622 | vad_smooth_ms=60, 623 | post_gain_db=0.5, 624 | ceiling=0.98, 625 | ): 626 | import torch, numpy as np 627 | from df.enhance import enhance 628 | from df.io import resample # DFN tensor resampler (48k native) 629 | 630 | # 1) Coerce to [B,C,T], then tensorize & resample to 48 kHz (DFN native) 631 | wav, sr, meta = _coerce_audio(audio) 632 | if stereo_mode == "downmix_mono": 633 | wav = _to_mono(wav) 634 | B, C, T = wav.shape 635 | 636 | x_ct = wav.reshape(-1, T).to(torch.float32) # (C,T) 637 | x48 = resample(x_ct, sr, 48000) if sr != 48000 else x_ct 638 | 639 | # 2) Load DFN once 640 | dev = self._pick_device(device) 641 | model, df_state = self._df_get(dfn_model, dev) 642 | 643 | # 3) Run DFN per channel (tensors-in/out) 644 | wet_ch = [] 645 | with torch.no_grad(): 646 | for ch in range(x48.shape[0]): 647 | xin = x48[ch:ch+1] # (1,T) 648 | y = enhance(model, df_state, xin) # (1,T) 649 | # Some DFN builds expose post_filter kwarg; keep flag for future wheels 
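                # A hedged sketch of a version-safe probe for that kwarg (uses only
                # stdlib inspect; untested against every DFN wheel):
                #   import inspect
                #   if "post_filter" in inspect.signature(enhance).parameters:
                #       y = enhance(model, df_state, xin, post_filter=use_postfilter)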
656 |                 # if use_postfilter:
657 |                 #     y = enhance(model, df_state, xin, post_filter=True)
658 |                 wet_ch.append(y)
659 |         wet48 = torch.cat(wet_ch, dim=0)  # (C,T)
660 | 
661 |         # 4) Back to original sample rate (tensors)
662 |         wet = resample(wet48, 48000, sr) if sr != 48000 else wet48
663 |         # x_ct is already at sr; round-trip x48 instead, so dry takes the same
664 |         # resampling path as wet and the two stay sample-aligned
665 |         dry = x_ct if sr == 48000 else resample(x48, 48000, sr)
666 | 
667 |         # 5) Adaptive mix (10 ms frame gains expanded to per-sample)
668 |         hop = int(sr * 0.010)  # 10 ms at current sr for expansion
669 |         out_ch = []
670 |         for ch in range(dry.shape[0]):
671 |             dry_np = dry[ch].detach().cpu().numpy()
672 |             wet_np = wet[ch].detach().cpu().numpy()
673 | 
674 |             # VAD at 48k domain, then expand
675 |             if adaptive_vad_source == "rnnoise":
676 |                 x48_np = (resample(dry[ch:ch+1], sr, 48000)[0].cpu().numpy()
677 |                           if sr != 48000 else dry_np)
678 |                 probs = self._vad_probs_rnnoise_48k(x48_np)
679 |             elif adaptive_vad_source == "rms":
680 |                 x48_np = (resample(dry[ch:ch+1], sr, 48000)[0].cpu().numpy()
681 |                           if sr != 48000 else dry_np)
682 |                 probs = self._vad_probs_rms_48k(x48_np)
683 |             else:
684 |                 probs = None
685 | 
686 |             vad_s = self._smooth_probs(probs, vad_smooth_ms)
687 |             s_eff = self._strength_per_frame(strength, vad_s, adaptive_mode, adaptive_amount, vad_threshold)
688 | 
689 |             if s_eff.size == 1:  # no VAD probs: constant strength everywhere
690 |                 s_per = np.full(dry_np.shape[0], float(s_eff.reshape(-1)[0]), dtype=np.float32)
691 |             else:
692 |                 # expand 10 ms frame strengths to samples; pad the tail with the last frame
693 |                 s_per = np.repeat(s_eff, max(1, hop))
694 |                 s_per = np.pad(s_per, (0, max(0, dry_np.shape[0] - s_per.shape[0])), mode="edge")
695 |                 s_per = s_per[:dry_np.shape[0]].astype(np.float32)
696 |             g_dry_np, g_wet_np = self._gains_from_strength(s_per, mix_curve)
697 |             y_np = g_dry_np * dry_np + g_wet_np * wet_np
698 |             y_np = np.clip(y_np, -1.0, 1.0)
699 |             out_ch.append(torch.from_numpy(y_np))
700 | 
701 |         y = torch.stack(out_ch, dim=0)  # (C,T)
702 |         y = y.reshape(B, C, -1)
703 | 
704 |         # 6) Post-gain + limiter
705 |         if post_gain_db != 0.0:
706 |             gain = float(10.0 ** (post_gain_db / 20.0))
707 |             y = y * gain
708 | 
709 |         if limit_ceiling:
710 |             peak = torch.max(torch.abs(y)).item()
711 |             if peak > ceiling and peak > 0:
712 |                 y = y * (ceiling / peak)
713 | 
714 |         y = torch.clamp(y, -1.0, 1.0)
715 | 
716 |         meta2 = dict(meta)
717 |         meta2["deepfilternet"] = {
718 |             "model": dfn_model,
719 |             "device": dev,
720 |             "use_postfilter": bool(use_postfilter),
721 |             "stereo_mode": stereo_mode,
722 |             "frame_ms": frame_ms,
723 |             "strength": strength,
724 |             "mix_curve": mix_curve,
725 |             "adaptive_vad_source": adaptive_vad_source,
726 |             "adaptive_mode": adaptive_mode,
727 |             "adaptive_amount": adaptive_amount,
728 |             "vad_threshold": vad_threshold,
729 |             "vad_smooth_ms": vad_smooth_ms,
730 |             "post_gain_db": post_gain_db,
731 |             "limit_ceiling": bool(limit_ceiling),
732 |             "ceiling": ceiling,
733 |         }
734 |         return (_make_audio(sr, y, meta2),)
735 | 
736 | # ----------------------------
737 | # Descript Audio Codec (DAC) encode/decode
738 | # ----------------------------
739 | 
740 | class Egregora_DAC_Encode:
741 |     """
742 |     Encodes audio with DAC and returns the latent 'z' & metadata in a DICT.
743 |     Auto-downloads weights for the chosen model_type on first use.
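744 | 
745 |     Returned DICT layout (descriptive note; it mirrors `codes_dict` built below):
746 |         model_type        : "44khz" | "24khz" | "16khz"
747 |         sample_rate       : original input rate (Hz)
748 |         model_sample_rate : DAC's native rate (Hz)
749 |         latents           : list over batch of list[tensor] (the encoder 'z')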
734 | """ 735 | @classmethod 736 | def INPUT_TYPES(cls): 737 | return { 738 | "required": { 739 | "audio": ("AUDIO",), 740 | "model_type": (["44khz", "24khz", "16khz"], {"default": "44khz"}), 741 | "device": (["auto", "cpu", "cuda"], {"default": "auto"}), 742 | } 743 | } 744 | 745 | RETURN_TYPES = ("DICT", "STRING") 746 | RETURN_NAMES = ("codes", "log") 747 | FUNCTION = "execute" 748 | CATEGORY = "Egregora/Codecs" 749 | 750 | def execute(self, audio, model_type="44khz", device="auto"): 751 | try: 752 | import dac 753 | except Exception as e: 754 | raise RuntimeError("descript-audio-codec not installed. pip install descript-audio-codec") from e 755 | 756 | wav, sr, meta = _coerce_audio(audio) # [B,C,T] float 757 | B, C, T = wav.shape 758 | 759 | # Auto-download 760 | ckpt = dac.utils.download(model_type=model_type) 761 | model = dac.DAC.load(ckpt) 762 | 763 | dev = _device_for(wav) if device == "auto" else device 764 | model = model.to(dev) 765 | 766 | # FIX: Get model's expected sample rate 767 | model_sr = model.sample_rate 768 | 769 | # Compress each batch separately, concat codes 770 | with torch.no_grad(): 771 | z_all = [] 772 | for b in range(B): 773 | x = wav[b].to(dev) # [C,T] 774 | 775 | # FIX: Resample to model's expected sample rate before preprocessing 776 | if sr != model_sr: 777 | x_resampled = torchaudio.functional.resample(x, sr, model_sr) 778 | else: 779 | x_resampled = x 780 | 781 | # preprocess expects the correct sample rate 782 | x_prep = model.preprocess(x_resampled, model_sr) 783 | z, codes, latents, _, _ = model.encode(x_prep) 784 | 785 | # Store z (list of tensors) into CPU tensors for DICT 786 | if isinstance(z, (list, tuple)): 787 | z_cpu = [t.detach().cpu() for t in z] 788 | else: 789 | z_cpu = [z.detach().cpu()] 790 | z_all.append(z_cpu) 791 | 792 | codes_dict = { 793 | "model_type": model_type, 794 | "sample_rate": sr, # Store original sample rate 795 | "model_sample_rate": model_sr, # Store model's sample rate 796 | "latents": z_all, # list over batch of list[tensor] 797 | } 798 | log = f"DAC encode ok: model={model_type}, B={B}, C={C}, sr={sr}->{model_sr}" 799 | return (codes_dict, log) 800 | 801 | 802 | class Egregora_DAC_Decode: 803 | """ 804 | Decodes DICT produced by Egregora_DAC_Encode back to AUDIO. 805 | """ 806 | @classmethod 807 | def INPUT_TYPES(cls): 808 | return { 809 | "required": { 810 | "codes": ("DICT",), 811 | "device": (["auto", "cpu", "cuda"], {"default": "auto"}), 812 | } 813 | } 814 | 815 | RETURN_TYPES = ("AUDIO", "STRING") 816 | RETURN_NAMES = ("audio", "log") 817 | FUNCTION = "execute" 818 | CATEGORY = "Egregora/Codecs" 819 | 820 | def execute(self, codes, device="auto"): 821 | try: 822 | import dac 823 | except Exception as e: 824 | raise RuntimeError("descript-audio-codec not installed. 
pip install descript-audio-codec") from e 825 | 826 | model_type = codes.get("model_type", "44khz") 827 | sr = int(codes.get("sample_rate", 48000)) 828 | model_sr = int(codes.get("model_sample_rate", sr)) 829 | latents_b = codes.get("latents", []) 830 | if not latents_b: 831 | raise ValueError("codes.latents empty") 832 | 833 | ckpt = dac.utils.download(model_type=model_type) 834 | model = dac.DAC.load(ckpt) 835 | 836 | dev = "cuda" if torch.cuda.is_available() and device in ("auto", "cuda") else "cpu" 837 | model = model.to(dev) 838 | 839 | outs = [] 840 | with torch.no_grad(): 841 | for z_list in latents_b: 842 | # z_list: list[tensor] shaped as model expects 843 | z_dev = [t.to(dev).float() for t in z_list] 844 | y = model.decode(z_dev) # [C,T] at model's native sr 845 | outs.append(y.unsqueeze(0).cpu()) 846 | 847 | y_cat = torch.cat(outs, dim=0) # [B,C,T] 848 | 849 | # FIX: Resample back to original sample rate if needed 850 | if model_sr != sr: 851 | y_resampled, _ = _resample(y_cat, model_sr, sr) 852 | else: 853 | y_resampled = y_cat 854 | 855 | audio = _make_audio(sr=sr, wav=y_resampled) 856 | log = f"DAC decode ok: model={model_type}, B={y_cat.size(0)}, C={y_cat.size(1)}, {model_sr}->{sr}" 857 | return (audio, log) 858 | 859 | # ---------------------------- 860 | # Node registration 861 | # ---------------------------- 862 | 863 | NODE_CLASS_MAPPINGS = { 864 | "Egregora_RNNoise_Denoise": Egregora_RNNoise_Denoise, 865 | "Egregora_WPE_Dereverb": Egregora_WPE_Dereverb, 866 | "Egregora_DeepFilterNet_Denoise": Egregora_DeepFilterNet_Denoise, 867 | "Egregora_DAC_Encode": Egregora_DAC_Encode, 868 | "Egregora_DAC_Decode": Egregora_DAC_Decode, 869 | 870 | } 871 | 872 | NODE_DISPLAY_NAME_MAPPINGS = { 873 | "Egregora_RNNoise_Denoise": "Egregora RNNoise Denoise", 874 | "Egregora_WPE_Dereverb": "Egregora WPE Dereverb", 875 | "Egregora_DeepFilterNet_Denoise": "Egregora DeepFilterNet Denoise", 876 | "Egregora_DAC_Encode": "Egregora DAC Encode", 877 | "Egregora_DAC_Decode": "Egregora DAC Decode", 878 | 879 | } --------------------------------------------------------------------------------