├── Elevenlabs-TTS ├── Elevenlabs-TTS.zip ├── plugin.py └── requirements.txt ├── Example-Plugin ├── plugin.py └── requirements.txt ├── README.md └── UVR ├── UVR.zip ├── models └── download_checks.json ├── plugin.py ├── requirements.txt ├── uvr.py └── uvr ├── __init__.py ├── architectures ├── __init__.py ├── demucs_separator.py ├── mdx_separator.py ├── mdxc_separator.py └── vr_separator.py ├── common_separator.py ├── separator.py └── uvr_lib_v5 ├── __init__.py ├── attend.py ├── bs_roformer.py ├── demucs ├── __init__.py ├── __main__.py ├── apply.py ├── demucs.py ├── filtering.py ├── hdemucs.py ├── htdemucs.py ├── model.py ├── model_v2.py ├── pretrained.py ├── repo.py ├── spec.py ├── states.py ├── tasnet.py ├── tasnet_v2.py ├── transformer.py └── utils.py ├── mdxnet.py ├── mel_band_roformer.py ├── mixer.ckpt ├── modules.py ├── playsound.py ├── pyrb.py ├── results.py ├── spec_utils.py ├── stft.py ├── tfc_tdf_v3.py └── vr_network ├── __init__.py ├── layers.py ├── layers_new.py ├── model_param_init.py ├── modelparams ├── 1band_sr16000_hl512.json ├── 1band_sr32000_hl512.json ├── 1band_sr33075_hl384.json ├── 1band_sr44100_hl1024.json ├── 1band_sr44100_hl256.json ├── 1band_sr44100_hl512.json ├── 1band_sr44100_hl512_cut.json ├── 1band_sr44100_hl512_nf1024.json ├── 2band_32000.json ├── 2band_44100_lofi.json ├── 2band_48000.json ├── 3band_44100.json ├── 3band_44100_mid.json ├── 3band_44100_msb2.json ├── 4band_44100.json ├── 4band_44100_mid.json ├── 4band_44100_msb.json ├── 4band_44100_msb2.json ├── 4band_44100_reverse.json ├── 4band_44100_sw.json ├── 4band_v2.json ├── 4band_v2_sn.json ├── 4band_v3.json ├── 4band_v3_sn.json └── ensemble.json ├── nets.py └── nets_new.py /Elevenlabs-TTS/Elevenlabs-TTS.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/Elevenlabs-TTS/Elevenlabs-TTS.zip -------------------------------------------------------------------------------- /Elevenlabs-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | elevenlabs -------------------------------------------------------------------------------- /Example-Plugin/plugin.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | def applio_plugin(): 4 | gr.Markdown( 5 | value= 6 | "This code snippet introduces an Applio plugin. The heart of the plugin lies in the `def applio_plugin()` function, acting as the interface for the Gradio tab. This function will be brought into the plugins tab later on. It's crucial to maintain the original names of both the function and the `plugin.py` file, as they are integral to the import process. Additionally, there's a requirements file that cannot be relocated or renamed but can be removed if not needed." 7 | ) -------------------------------------------------------------------------------- /Example-Plugin/requirements.txt: -------------------------------------------------------------------------------- 1 | # Example requirements.txt file, this file cannot be relocated or renamed but can be removed if not needed. 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Applio Plugins 2 | 3 | > [!IMPORTANT] 4 | > We've just launched our new plugin system and we're looking for your assistance in creating amazing plugins! 
Below, you'll find all the information you need to get started. 5 | 6 | Welcome to **Applio Plugins**, a repository specifically designed for Applio plugins. 7 | 8 | If you're not familiar with Applio, check it out on our incredible webpage, [applio.org](https://applio.org), or visit our [GitHub repository](https://github.com/IAHispano/Applio). 9 | 10 | ## Instructions 11 | 12 | - The core of the plugin lies in the `def applio_plugin()` function, acting as the interface for the Gradio tab. *This function will be brought into the plugins tab later on.* 13 | - It's crucial to maintain the original names of both the function and the `plugin.py` file, as they are integral to the import process. 14 | 15 | *Additionally, there's a requirements file that cannot be relocated or renamed but can be removed if not needed.* 16 | -------------------------------------------------------------------------------- /UVR/UVR.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/UVR/UVR.zip -------------------------------------------------------------------------------- /UVR/models/download_checks.json: -------------------------------------------------------------------------------- 1 | { 2 | "current_version": "UVR_Patch_10_6_23_4_27", 3 | "current_version_ocl": "UVR_Patch_10_6_23_4_27", 4 | "current_version_mac": "UVR_Patch_10_6_23_4_27", 5 | "current_version_linux": "UVR_Patch_10_6_23_4_27", 6 | "vr_download_list": { 7 | "VR Arch Single Model v5: 1_HP-UVR": "1_HP-UVR.pth", 8 | "VR Arch Single Model v5: 2_HP-UVR": "2_HP-UVR.pth", 9 | "VR Arch Single Model v5: 3_HP-Vocal-UVR": "3_HP-Vocal-UVR.pth", 10 | "VR Arch Single Model v5: 4_HP-Vocal-UVR": "4_HP-Vocal-UVR.pth", 11 | "VR Arch Single Model v5: 5_HP-Karaoke-UVR": "5_HP-Karaoke-UVR.pth", 12 | "VR Arch Single Model v5: 6_HP-Karaoke-UVR": "6_HP-Karaoke-UVR.pth", 13 | "VR Arch Single Model v5: 7_HP2-UVR": "7_HP2-UVR.pth", 14 | "VR Arch Single Model v5: 8_HP2-UVR": "8_HP2-UVR.pth", 15 | "VR Arch Single Model v5: 9_HP2-UVR": "9_HP2-UVR.pth", 16 | "VR Arch Single Model v5: 10_SP-UVR-2B-32000-1": "10_SP-UVR-2B-32000-1.pth", 17 | "VR Arch Single Model v5: 11_SP-UVR-2B-32000-2": "11_SP-UVR-2B-32000-2.pth", 18 | "VR Arch Single Model v5: 12_SP-UVR-3B-44100": "12_SP-UVR-3B-44100.pth", 19 | "VR Arch Single Model v5: 13_SP-UVR-4B-44100-1": "13_SP-UVR-4B-44100-1.pth", 20 | "VR Arch Single Model v5: 14_SP-UVR-4B-44100-2": "14_SP-UVR-4B-44100-2.pth", 21 | "VR Arch Single Model v5: 15_SP-UVR-MID-44100-1": "15_SP-UVR-MID-44100-1.pth", 22 | "VR Arch Single Model v5: 16_SP-UVR-MID-44100-2": "16_SP-UVR-MID-44100-2.pth", 23 | "VR Arch Single Model v5: 17_HP-Wind_Inst-UVR": "17_HP-Wind_Inst-UVR.pth", 24 | "VR Arch Single Model v5: UVR-De-Echo-Aggressive by FoxJoy": "UVR-De-Echo-Aggressive.pth", 25 | "VR Arch Single Model v5: UVR-De-Echo-Normal by FoxJoy": "UVR-De-Echo-Normal.pth", 26 | "VR Arch Single Model v5: UVR-DeEcho-DeReverb by FoxJoy": "UVR-DeEcho-DeReverb.pth", 27 | "VR Arch Single Model v5: UVR-DeNoise-Lite by FoxJoy": "UVR-DeNoise-Lite.pth", 28 | "VR Arch Single Model v5: UVR-DeNoise by FoxJoy": "UVR-DeNoise.pth", 29 | "VR Arch Single Model v5: UVR-BVE-4B_SN-44100-1": "UVR-BVE-4B_SN-44100-1.pth", 30 | "VR Arch Single Model v4: MGM_HIGHEND_v4": "MGM_HIGHEND_v4.pth", 31 | "VR Arch Single Model v4: MGM_LOWEND_A_v4": "MGM_LOWEND_A_v4.pth", 32 | "VR Arch Single Model v4: MGM_LOWEND_B_v4": "MGM_LOWEND_B_v4.pth", 33 | "VR Arch Single Model v4: 
MGM_MAIN_v4": "MGM_MAIN_v4.pth" 34 | }, 35 | 36 | "mdx_download_list": { 37 | "MDX-Net Model: UVR-MDX-NET Inst HQ 1": "UVR-MDX-NET-Inst_HQ_1.onnx", 38 | "MDX-Net Model: UVR-MDX-NET Inst HQ 2": "UVR-MDX-NET-Inst_HQ_2.onnx", 39 | "MDX-Net Model: UVR-MDX-NET Inst HQ 3": "UVR-MDX-NET-Inst_HQ_3.onnx", 40 | "MDX-Net Model: UVR-MDX-NET Inst HQ 4": "UVR-MDX-NET-Inst_HQ_4.onnx", 41 | "MDX-Net Model: UVR-MDX-NET Main": "UVR_MDXNET_Main.onnx", 42 | "MDX-Net Model: UVR-MDX-NET Inst Main": "UVR-MDX-NET-Inst_Main.onnx", 43 | "MDX-Net Model: UVR-MDX-NET 1": "UVR_MDXNET_1_9703.onnx", 44 | "MDX-Net Model: UVR-MDX-NET 2": "UVR_MDXNET_2_9682.onnx", 45 | "MDX-Net Model: UVR-MDX-NET 3": "UVR_MDXNET_3_9662.onnx", 46 | "MDX-Net Model: UVR-MDX-NET Inst 1": "UVR-MDX-NET-Inst_1.onnx", 47 | "MDX-Net Model: UVR-MDX-NET Inst 2": "UVR-MDX-NET-Inst_2.onnx", 48 | "MDX-Net Model: UVR-MDX-NET Inst 3": "UVR-MDX-NET-Inst_3.onnx", 49 | "MDX-Net Model: UVR-MDX-NET Karaoke": "UVR_MDXNET_KARA.onnx", 50 | "MDX-Net Model: UVR-MDX-NET Karaoke 2": "UVR_MDXNET_KARA_2.onnx", 51 | "MDX-Net Model: UVR_MDXNET_9482": "UVR_MDXNET_9482.onnx", 52 | "MDX-Net Model: UVR-MDX-NET Voc FT": "UVR-MDX-NET-Voc_FT.onnx", 53 | "MDX-Net Model: Kim Vocal 1": "Kim_Vocal_1.onnx", 54 | "MDX-Net Model: Kim Vocal 2": "Kim_Vocal_2.onnx", 55 | "MDX-Net Model: Kim Inst": "Kim_Inst.onnx", 56 | "MDX-Net Model: Reverb HQ By FoxJoy": "Reverb_HQ_By_FoxJoy.onnx", 57 | "MDX-Net Model: UVR-MDX-NET Crowd HQ 1 By Aufr33": "UVR-MDX-NET_Crowd_HQ_1.onnx", 58 | "MDX-Net Model: kuielab_a_vocals": "kuielab_a_vocals.onnx", 59 | "MDX-Net Model: kuielab_a_other": "kuielab_a_other.onnx", 60 | "MDX-Net Model: kuielab_a_bass": "kuielab_a_bass.onnx", 61 | "MDX-Net Model: kuielab_a_drums": "kuielab_a_drums.onnx", 62 | "MDX-Net Model: kuielab_b_vocals": "kuielab_b_vocals.onnx", 63 | "MDX-Net Model: kuielab_b_other": "kuielab_b_other.onnx", 64 | "MDX-Net Model: kuielab_b_bass": "kuielab_b_bass.onnx", 65 | "MDX-Net Model: kuielab_b_drums": "kuielab_b_drums.onnx" 66 | }, 67 | 68 | "demucs_download_list":{ 69 | 70 | "Demucs v4: htdemucs_ft":{ 71 | "f7e0c4bc-ba3fe64a.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th", 72 | "d12395a8-e57c48e6.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th", 73 | "92cfc3b6-ef3bcb9c.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th", 74 | "04573f0d-f3cf25b2.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th", 75 | "htdemucs_ft.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_ft.yaml" 76 | }, 77 | 78 | "Demucs v4: htdemucs":{ 79 | "955717e8-8726e21a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th", 80 | "htdemucs.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs.yaml" 81 | }, 82 | 83 | "Demucs v4: hdemucs_mmi":{ 84 | "75fc33f5-1941ce65.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/75fc33f5-1941ce65.th", 85 | "hdemucs_mmi.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/hdemucs_mmi.yaml" 86 | }, 87 | "Demucs v4: htdemucs_6s":{ 88 | "5c90dfd2-34c22ccb.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/5c90dfd2-34c22ccb.th", 89 | "htdemucs_6s.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_6s.yaml" 90 | }, 91 | "Demucs v3: mdx":{ 92 | "0d19c1c6-0f06f20e.th": 
"https://dl.fbaipublicfiles.com/demucs/mdx_final/0d19c1c6-0f06f20e.th", 93 | "7ecf8ec1-70f50cc9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7ecf8ec1-70f50cc9.th", 94 | "c511e2ab-fe698775.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/c511e2ab-fe698775.th", 95 | "7d865c68-3d5dd56b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7d865c68-3d5dd56b.th", 96 | "mdx.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx.yaml" 97 | }, 98 | 99 | "Demucs v3: mdx_q":{ 100 | "6b9c2ca1-3fd82607.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/6b9c2ca1-3fd82607.th", 101 | "b72baf4e-8778635e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/b72baf4e-8778635e.th", 102 | "42e558d4-196e0e1b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/42e558d4-196e0e1b.th", 103 | "305bc58f-18378783.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/305bc58f-18378783.th", 104 | "mdx_q.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_q.yaml" 105 | }, 106 | 107 | "Demucs v3: mdx_extra":{ 108 | "e51eebcc-c1b80bdd.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/e51eebcc-c1b80bdd.th", 109 | "a1d90b5c-ae9d2452.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/a1d90b5c-ae9d2452.th", 110 | "5d2d6c55-db83574e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/5d2d6c55-db83574e.th", 111 | "cfa93e08-61801ae1.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/cfa93e08-61801ae1.th", 112 | "mdx_extra.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_extra.yaml" 113 | }, 114 | 115 | "Demucs v3: mdx_extra_q": { 116 | "83fc094f-4a16d450.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/83fc094f-4a16d450.th", 117 | "464b36d7-e5a9386e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/464b36d7-e5a9386e.th", 118 | "14fc6a69-a89dd0ee.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/14fc6a69-a89dd0ee.th", 119 | "7fd6ef75-a905dd85.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7fd6ef75-a905dd85.th", 120 | "mdx_extra_q.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_extra_q.yaml" 121 | }, 122 | 123 | "Demucs v3: UVR Model":{ 124 | "ebf34a2db.th": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/ebf34a2db.th", 125 | "UVR_Demucs_Model_1.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/UVR_Demucs_Model_1.yaml" 126 | }, 127 | 128 | "Demucs v3: repro_mdx_a":{ 129 | "9a6b4851-03af0aa6.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th", 130 | "1ef250f1-592467ce.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th", 131 | "fa0cb7f9-100d8bf4.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th", 132 | "902315c2-b39ce9c9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th", 133 | "repro_mdx_a.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a.yaml" 134 | }, 135 | 136 | "Demucs v3: repro_mdx_a_time_only":{ 137 | "9a6b4851-03af0aa6.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th", 138 | "1ef250f1-592467ce.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th", 139 | "repro_mdx_a_time_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_time_only.yaml" 140 | }, 141 | 142 | "Demucs v3: repro_mdx_a_hybrid_only":{ 143 | 
"fa0cb7f9-100d8bf4.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th", 144 | "902315c2-b39ce9c9.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th", 145 | "repro_mdx_a_hybrid_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_hybrid_only.yaml" 146 | }, 147 | 148 | "Demucs v2: demucs": { 149 | "demucs-e07c671f.th": "https://dl.fbaipublicfiles.com/demucs/v3.0/demucs-e07c671f.th" 150 | }, 151 | 152 | "Demucs v2: demucs_extra": { 153 | "demucs_extra-3646af93.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_extra-3646af93.th" 154 | }, 155 | 156 | "Demucs v2: demucs48_hq": { 157 | "demucs48_hq-28a1282c.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs48_hq-28a1282c.th" 158 | }, 159 | 160 | "Demucs v2: tasnet": { 161 | "tasnet-beb46fac.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet-beb46fac.th" 162 | }, 163 | 164 | "Demucs v2: tasnet_extra": { 165 | "tasnet_extra-df3777b2.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet_extra-df3777b2.th" 166 | }, 167 | 168 | "Demucs v2: demucs_unittest": { 169 | "demucs_unittest-09ebc15f.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_unittest-09ebc15f.th" 170 | }, 171 | 172 | "Demucs v1: demucs": { 173 | "demucs.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs.th" 174 | }, 175 | 176 | "Demucs v1: demucs_extra": { 177 | "demucs_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs_extra.th" 178 | }, 179 | 180 | "Demucs v1: light": { 181 | "light.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light.th" 182 | }, 183 | 184 | "Demucs v1: light_extra": { 185 | "light_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light_extra.th" 186 | }, 187 | 188 | "Demucs v1: tasnet": { 189 | "tasnet.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet.th" 190 | }, 191 | 192 | "Demucs v1: tasnet_extra": { 193 | "tasnet_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet_extra.th" 194 | } 195 | }, 196 | 197 | "mdx_download_vip_list": { 198 | "MDX-Net Model VIP: UVR-MDX-NET_Main_340": "UVR-MDX-NET_Main_340.onnx", 199 | "MDX-Net Model VIP: UVR-MDX-NET_Main_390": "UVR-MDX-NET_Main_390.onnx", 200 | "MDX-Net Model VIP: UVR-MDX-NET_Main_406": "UVR-MDX-NET_Main_406.onnx", 201 | "MDX-Net Model VIP: UVR-MDX-NET_Main_427": "UVR-MDX-NET_Main_427.onnx", 202 | "MDX-Net Model VIP: UVR-MDX-NET_Main_438": "UVR-MDX-NET_Main_438.onnx", 203 | "MDX-Net Model VIP: UVR-MDX-NET_Inst_82_beta": "UVR-MDX-NET_Inst_82_beta.onnx", 204 | "MDX-Net Model VIP: UVR-MDX-NET_Inst_90_beta": "UVR-MDX-NET_Inst_90_beta.onnx", 205 | "MDX-Net Model VIP: UVR-MDX-NET_Inst_187_beta": "UVR-MDX-NET_Inst_187_beta.onnx", 206 | "MDX-Net Model VIP: UVR-MDX-NET-Inst_full_292": "UVR-MDX-NET-Inst_full_292.onnx" 207 | }, 208 | 209 | "mdx23_download_list": { 210 | "MDX23C Model: MDX23C_D1581": {"MDX23C_D1581.ckpt":"model_2_stem_061321.yaml"} 211 | }, 212 | 213 | "mdx23c_download_list": { 214 | "MDX23C Model: MDX23C-InstVoc HQ": {"MDX23C-8KFFT-InstVoc_HQ.ckpt":"model_2_stem_full_band_8k.yaml"} 215 | }, 216 | 217 | "roformer_download_list": { 218 | "Roformer Model: BS-Roformer-Viperx-1297": {"model_bs_roformer_ep_317_sdr_12.9755.ckpt":"model_bs_roformer_ep_317_sdr_12.9755.yaml"}, 219 | "Roformer Model: BS-Roformer-Viperx-1296": {"model_bs_roformer_ep_368_sdr_12.9628.ckpt":"model_bs_roformer_ep_368_sdr_12.9628.yaml"}, 220 | "Roformer Model: BS-Roformer-Viperx-1053": {"model_bs_roformer_ep_937_sdr_10.5309.ckpt":"model_bs_roformer_ep_937_sdr_10.5309.yaml"}, 221 | "Roformer Model: 
Mel-Roformer-Viperx-1143": {"model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt":"model_mel_band_roformer_ep_3005_sdr_11.4360.yaml"} 222 | }, 223 | 224 | "mdx23c_download_vip_list": { 225 | "MDX23C Model VIP: MDX23C_D1581": {"MDX23C_D1581.ckpt":"model_2_stem_061321.yaml"}, 226 | "MDX23C Model VIP: MDX23C-InstVoc HQ 2": {"MDX23C-8KFFT-InstVoc_HQ_2.ckpt":"model_2_stem_full_band_8k.yaml"} 227 | }, 228 | 229 | "vr_download_vip_list": [], 230 | "demucs_download_vip_list": [] 231 | } 232 | -------------------------------------------------------------------------------- /UVR/plugin.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import json 3 | import gradio as gr 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | 8 | from tabs.plugins.installed.UVR.uvr import Separator 9 | 10 | plugin_folder = os.path.relpath( 11 | os.path.join(now_dir, "tabs", "plugins", "installed", "UVR") 12 | ) 13 | 14 | 15 | def get_models_by_type(type): 16 | download_checks_path = os.path.join(plugin_folder, "models", "download_checks.json") 17 | 18 | model_downloads_list = json.load(open(download_checks_path, encoding="utf-8")) 19 | 20 | filtered_demucs_v4 = { 21 | key: value 22 | for key, value in model_downloads_list["demucs_download_list"].items() 23 | if key.startswith("Demucs v4") 24 | } 25 | 26 | model_files_grouped_by_type = { 27 | "VR": model_downloads_list["vr_download_list"], 28 | "MDX": { 29 | **model_downloads_list["mdx_download_list"], 30 | **model_downloads_list["mdx_download_vip_list"], 31 | }, 32 | "Demucs": filtered_demucs_v4, 33 | "MDXC": { 34 | **model_downloads_list["mdx23c_download_list"], 35 | **model_downloads_list["mdx23c_download_vip_list"], 36 | **model_downloads_list["roformer_download_list"], 37 | }, 38 | } 39 | 40 | results = [] 41 | for model_name, model_info in model_files_grouped_by_type[type].items(): 42 | results.append(model_info) 43 | 44 | return results 45 | 46 | 47 | def run_uvr( 48 | audio, 49 | output_format, 50 | output_dir, 51 | invert_spect, 52 | normalization, 53 | single_stem, 54 | sample_rate, 55 | vr_model, 56 | vr_batch_size, 57 | vr_window_size, 58 | vr_aggression, 59 | vr_enable_tta, 60 | vr_high_end_process, 61 | vr_enable_post_process, 62 | vr_post_process_threshold, 63 | mdx_model, 64 | mdx_segment_size, 65 | mdx_overlap, 66 | mdx_batch_size, 67 | mdx_hop_length, 68 | mdx_enable_denoise, 69 | mdxc_model, 70 | mdxc_segment_size, 71 | mdxc_override_model_segment_size, 72 | mdxc_overlap, 73 | mdxc_batch_size, 74 | mdxc_pitch_shift, 75 | # demucs_model, 76 | # demucs_segment_size, 77 | # demucs_shifts, 78 | # demucs_overlap, 79 | # demucs_segments_enabled, 80 | tab_selected, 81 | ): 82 | if tab_selected == "VR": 83 | model = vr_model 84 | elif tab_selected == "MDX": 85 | model = mdx_model 86 | elif tab_selected == "MDXC": 87 | model = mdxc_model 88 | # elif tab_selected == "Demucs": 89 | # model = demucs_model 90 | 91 | if single_stem == "None": 92 | single_stem = None 93 | 94 | separator = Separator( 95 | model_file_dir=os.path.join(plugin_folder, "models"), 96 | output_dir=output_dir, 97 | output_format=output_format, 98 | normalization_threshold=float(normalization), 99 | output_single_stem=single_stem, 100 | invert_using_spec=invert_spect, 101 | sample_rate=int(sample_rate), 102 | mdx_params={ 103 | "hop_length": int(mdx_hop_length), 104 | "segment_size": int(mdx_segment_size), 105 | "overlap": float(mdx_overlap), 106 | "batch_size": int(mdx_batch_size), 107 | "enable_denoise": mdx_enable_denoise, 108 
| }, 109 | vr_params={ 110 | "batch_size": int(vr_batch_size), 111 | "window_size": int(vr_window_size), 112 | "aggression": int(vr_aggression), 113 | "enable_tta": vr_enable_tta, 114 | "enable_post_process": vr_enable_post_process, 115 | "post_process_threshold": float(vr_post_process_threshold), 116 | "high_end_process": vr_high_end_process, 117 | }, 118 | mdxc_params={ 119 | "segment_size": int(mdxc_segment_size), 120 | "batch_size": int(mdxc_batch_size), 121 | "overlap": int(mdxc_overlap), 122 | "override_model_segment_size": mdxc_override_model_segment_size, 123 | "pitch_shift": int(mdxc_pitch_shift), 124 | }, 125 | ) 126 | """ 127 | demucs_params={ 128 | "segment_size": demucs_segment_size, 129 | "shifts": demucs_shifts, 130 | "overlap": demucs_overlap, 131 | "segments_enabled": demucs_segments_enabled, 132 | }, 133 | """ 134 | separator.load_model(model_filename=model) 135 | 136 | results = [] 137 | files = separator.separate(audio) 138 | try: 139 | for file in files: 140 | file_path = os.path.join(output_dir, file) 141 | results.append(file_path) 142 | return results 143 | except AttributeError: 144 | return os.path.join(output_dir, files) 145 | 146 | 147 | def applio_plugin(): 148 | audio = gr.Audio( 149 | label="Input audio", 150 | sources=["upload", "microphone"], 151 | type="filepath", 152 | interactive=True, 153 | ) 154 | 155 | single_stem = gr.Radio( 156 | label="Single stem", 157 | choices=[ 158 | "None", 159 | "Instrumental", 160 | "Vocals", 161 | "Drums", 162 | "Bass", 163 | "Guitar", 164 | "Piano", 165 | "Other", 166 | ], 167 | value="None", 168 | interactive=True, 169 | ) 170 | 171 | with gr.Accordion("Advanced Settings", open=False): 172 | invert_spect = gr.Checkbox( 173 | label="Invert spectrogram", 174 | value=False, 175 | interactive=True, 176 | ) 177 | 178 | output_format = gr.Radio( 179 | label="Output format", 180 | choices=["wav", "mp3"], 181 | value="wav", 182 | interactive=True, 183 | ) 184 | 185 | output_dir = gr.Textbox( 186 | label="Output directory", 187 | value=os.path.join(plugin_folder, "output"), 188 | interactive=True, 189 | ) 190 | 191 | with gr.Row(): 192 | sample_rate = gr.Textbox( 193 | label="Sample rate", 194 | value=44100, 195 | interactive=True, 196 | ) 197 | 198 | normalization = gr.Textbox( 199 | label="Normalization", 200 | value=0.9, 201 | interactive=True, 202 | ) 203 | 204 | with gr.Tab("VR") as vr_tab: 205 | vr_model = gr.Dropdown( 206 | label="Model", 207 | choices=get_models_by_type("VR"), 208 | interactive=True, 209 | ) 210 | with gr.Accordion("Settings", open=False): 211 | vr_enable_tta = gr.Checkbox( 212 | label="Enable TTA", 213 | value=False, 214 | interactive=True, 215 | ) 216 | vr_high_end_process = gr.Checkbox( 217 | label="High-end process", 218 | value=False, 219 | interactive=True, 220 | ) 221 | vr_enable_post_process = gr.Checkbox( 222 | label="Enable post-process", 223 | value=False, 224 | interactive=True, 225 | ) 226 | with gr.Row(): 227 | vr_aggression = gr.Slider( 228 | label="Aggression", 229 | minimum=-100, 230 | maximum=100, 231 | value=5, 232 | interactive=True, 233 | ) 234 | vr_post_process_threshold = gr.Slider( 235 | label="Post-process threshold", 236 | minimum=0.1, 237 | maximum=0.3, 238 | step=0.01, 239 | value=0.2, 240 | interactive=True, 241 | ) 242 | with gr.Row(): 243 | vr_batch_size = gr.Textbox( 244 | label="Batch size", 245 | value=4, 246 | interactive=True, 247 | ) 248 | vr_window_size = gr.Dropdown( 249 | label="Window size", 250 | choices=[1024, 512, 320], 251 | value=512, 252 | interactive=True, 
253 | allow_custom_value=True, 254 | ) 255 | 256 | with gr.Tab("MDX") as mdx_tab: 257 | mdx_model = gr.Dropdown( 258 | label="Model", 259 | choices=get_models_by_type("MDX"), 260 | interactive=True, 261 | ) 262 | with gr.Accordion("Settings", open=False): 263 | mdx_enable_denoise = gr.Checkbox( 264 | label="Enable denoise", 265 | value=False, 266 | interactive=True, 267 | ) 268 | with gr.Row(): 269 | mdx_overlap = gr.Slider( 270 | label="Overlap", 271 | minimum=0.001, 272 | maximum=0.999, 273 | value=0.25, 274 | interactive=True, 275 | ) 276 | with gr.Row(): 277 | mdx_batch_size = gr.Textbox( 278 | label="Batch size", 279 | value=1, 280 | interactive=True, 281 | ) 282 | mdx_segment_size = gr.Textbox( 283 | label="Segment size", 284 | value=256, 285 | interactive=True, 286 | ) 287 | mdx_hop_length = gr.Textbox( 288 | label="Hop length", 289 | value=1024, 290 | interactive=True, 291 | ) 292 | 293 | with gr.Tab("MDXC") as mdxc_tab: 294 | mdxc_model = gr.Dropdown( 295 | label="Model", 296 | choices=get_models_by_type("MDXC"), 297 | interactive=True, 298 | ) 299 | with gr.Accordion("Settings", open=False): 300 | mdxc_override_model_segment_size = gr.Checkbox( 301 | label="Override model segment size", 302 | value=False, 303 | ) 304 | with gr.Row(): 305 | mdxc_overlap = gr.Slider( 306 | label="Overlap", 307 | minimum=0.001, 308 | maximum=0.999, 309 | value=0.25, 310 | interactive=True, 311 | ) 312 | with gr.Row(): 313 | mdxc_batch_size = gr.Textbox( 314 | label="Batch size", 315 | value=1, 316 | interactive=True, 317 | ) 318 | mdxc_segment_size = gr.Textbox( 319 | label="Segment size", 320 | value=256, 321 | interactive=True, 322 | ) 323 | mdxc_pitch_shift = gr.Textbox( 324 | label="Hop length", 325 | value=0, 326 | interactive=True, 327 | ) 328 | 329 | with gr.Tab("Demucs") as demucs_tab: 330 | gr.Markdown("Demucs is not available in this version of the plugin.") 331 | """ 332 | demucs_model = gr.Dropdown( 333 | label="Model", 334 | choices=get_models_by_type("Demucs"), 335 | interactive=True, 336 | ) 337 | with gr.Accordion("Settings", open=False): 338 | demucs_segments_enabled = gr.Checkbox( 339 | label="Segments enabled", 340 | value=True, 341 | interactive=True, 342 | ) 343 | demucs_overlap = gr.Slider( 344 | label="Overlap", 345 | minimum=0.001, 346 | maximum=0.999, 347 | value=0.25, 348 | interactive=True, 349 | ) 350 | with gr.Row(): 351 | demucs_segment_size = gr.Textbox( 352 | label="Segment size", 353 | value="Default", 354 | interactive=True, 355 | ) 356 | demucs_shifts = gr.Textbox( 357 | label="Shifts", 358 | value=2, 359 | interactive=True, 360 | ) 361 | """ 362 | 363 | tab_selected = gr.Textbox( 364 | label="Tab selected", 365 | value="VR", 366 | interactive=False, 367 | visible=False, 368 | ) 369 | 370 | run_uvr_button = gr.Button("Run") 371 | output_files = gr.File( 372 | label="Output files", file_count="multiple", type="filepath", interactive=False 373 | ) 374 | 375 | run_uvr_button.click( 376 | fn=run_uvr, 377 | inputs=[ 378 | audio, 379 | output_format, 380 | output_dir, 381 | invert_spect, 382 | normalization, 383 | single_stem, 384 | sample_rate, 385 | vr_model, 386 | vr_batch_size, 387 | vr_window_size, 388 | vr_aggression, 389 | vr_enable_tta, 390 | vr_high_end_process, 391 | vr_enable_post_process, 392 | vr_post_process_threshold, 393 | mdx_model, 394 | mdx_segment_size, 395 | mdx_overlap, 396 | mdx_batch_size, 397 | mdx_hop_length, 398 | mdx_enable_denoise, 399 | mdxc_model, 400 | mdxc_segment_size, 401 | mdxc_override_model_segment_size, 402 | mdxc_overlap, 403 | 
mdxc_batch_size, 404 | mdxc_pitch_shift, 405 | # demucs_model, 406 | # demucs_segment_size, 407 | # demucs_shifts, 408 | # demucs_overlap, 409 | # demucs_segments_enabled, 410 | tab_selected, 411 | ], 412 | outputs=output_files, 413 | ) 414 | 415 | vr_tab.select(lambda: "VR", None, tab_selected) 416 | mdx_tab.select(lambda: "MDX", None, tab_selected) 417 | mdxc_tab.select(lambda: "MDXC", None, tab_selected) 418 | demucs_tab.select(lambda: "Demucs", None, tab_selected) 419 | -------------------------------------------------------------------------------- /UVR/requirements.txt: -------------------------------------------------------------------------------- 1 | six>=1.16 2 | samplerate==0.1.0 3 | pyyaml 4 | ml_collections 5 | 6 | onnx2torch>=1.5 7 | onnx>=1.14 8 | onnxruntime 9 | onnxruntime_gpu==1.15.1 10 | 11 | julius>=0.2 12 | diffq>=0.2 13 | 14 | beartype==0.18.5 15 | rotary-embedding-torch==0.6.1 -------------------------------------------------------------------------------- /UVR/uvr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import json 4 | import sys 5 | 6 | 7 | def main(): 8 | """Main entry point for the CLI.""" 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | parser = argparse.ArgumentParser( 13 | description="Separate audio file into different stems.", 14 | formatter_class=lambda prog: argparse.RawTextHelpFormatter( 15 | prog, max_help_position=60 16 | ), 17 | ) 18 | 19 | parser.add_argument( 20 | "--audio_file", 21 | nargs="?", 22 | help="The audio file path to separate, in any common format.", 23 | default=argparse.SUPPRESS, 24 | ) 25 | 26 | debug_help = "Enable debug logging, equivalent to --log_level=debug." 27 | env_info_help = "Print environment information and exit." 28 | list_models_help = "List all supported models and exit." 29 | log_level_help = "Log level, e.g. info, debug, warning (default: %(default)s)." 30 | 31 | info_params = parser.add_argument_group("Info and Debugging") 32 | info_params.add_argument("-d", "--debug", action="store_true", help=debug_help) 33 | info_params.add_argument( 34 | "-e", "--env_info", action="store_true", help=env_info_help 35 | ) 36 | info_params.add_argument( 37 | "-l", "--list_models", action="store_true", help=list_models_help 38 | ) 39 | info_params.add_argument("--log_level", default="info", help=log_level_help) 40 | 41 | model_filename_help = ( 42 | "model to use for separation (default: %(default)s). Example: -m 2_HP-UVR.pth" 43 | ) 44 | output_format_help = "output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3" 45 | output_dir_help = "directory to write output files (default: ). Example: --output_dir=/app/separated" 46 | model_file_dir_help = "model files directory (default: %(default)s). Example: --model_file_dir=/app/models" 47 | 48 | io_params = parser.add_argument_group("Separation I/O Params") 49 | io_params.add_argument( 50 | "-m", 51 | "--model_filename", 52 | default="model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt", 53 | help=model_filename_help, 54 | ) 55 | io_params.add_argument("--output_format", default="WAV", help=output_format_help) 56 | io_params.add_argument("--output_dir", default=None, help=output_dir_help) 57 | io_params.add_argument( 58 | "--model_file_dir", 59 | default="uvr/tmp/audio-separator-models/", 60 | help=model_file_dir_help, 61 | ) 62 | 63 | invert_spect_help = "invert secondary stem using spectogram (default: %(default)s). 
Example: --invert_spect" 64 | normalization_help = "max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7" 65 | single_stem_help = "output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental" 66 | sample_rate_help = "modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100" 67 | 68 | common_params = parser.add_argument_group("Common Separation Parameters") 69 | common_params.add_argument( 70 | "--invert_spect", action="store_true", help=invert_spect_help 71 | ) 72 | common_params.add_argument( 73 | "--normalization", type=float, default=0.9, help=normalization_help 74 | ) 75 | common_params.add_argument("--single_stem", default=None, help=single_stem_help) 76 | common_params.add_argument( 77 | "--sample_rate", type=int, default=44100, help=sample_rate_help 78 | ) 79 | 80 | mdx_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256" 81 | mdx_overlap_help = "amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25" 82 | mdx_batch_size_help = "larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdx_batch_size=4" 83 | mdx_hop_length_help = "usually called stride in neural networks, only change if you know what you're doing (default: %(default)s). Example: --mdx_hop_length=1024" 84 | mdx_enable_denoise_help = "enable denoising during separation (default: %(default)s). Example: --mdx_enable_denoise" 85 | 86 | mdx_params = parser.add_argument_group("MDX Architecture Parameters") 87 | mdx_params.add_argument( 88 | "--mdx_segment_size", type=int, default=256, help=mdx_segment_size_help 89 | ) 90 | mdx_params.add_argument( 91 | "--mdx_overlap", type=float, default=0.25, help=mdx_overlap_help 92 | ) 93 | mdx_params.add_argument( 94 | "--mdx_batch_size", type=int, default=1, help=mdx_batch_size_help 95 | ) 96 | mdx_params.add_argument( 97 | "--mdx_hop_length", type=int, default=1024, help=mdx_hop_length_help 98 | ) 99 | mdx_params.add_argument( 100 | "--mdx_enable_denoise", action="store_true", help=mdx_enable_denoise_help 101 | ) 102 | 103 | vr_batch_size_help = "number of batches to process at a time. higher = more RAM, slightly faster processing (default: %(default)s). Example: --vr_batch_size=16" 104 | vr_window_size_help = "balance quality and speed. 1024 = fast but lower, 320 = slower but better quality. (default: %(default)s). Example: --vr_window_size=320" 105 | vr_aggression_help = "intensity of primary stem extraction, -100 - 100. typically 5 for vocals & instrumentals (default: %(default)s). Example: --vr_aggression=2" 106 | vr_enable_tta_help = "enable Test-Time-Augmentation; slow but improves quality (default: %(default)s). Example: --vr_enable_tta" 107 | vr_high_end_process_help = "mirror the missing frequency range of the output (default: %(default)s). Example: --vr_high_end_process" 108 | vr_enable_post_process_help = "identify leftover artifacts within vocal output; may improve separation for some songs (default: %(default)s). Example: --vr_enable_post_process" 109 | vr_post_process_threshold_help = "threshold for post_process feature: 0.1-0.3 (default: %(default)s). 
Example: --vr_post_process_threshold=0.1" 110 | 111 | vr_params = parser.add_argument_group("VR Architecture Parameters") 112 | vr_params.add_argument( 113 | "--vr_batch_size", type=int, default=4, help=vr_batch_size_help 114 | ) 115 | vr_params.add_argument( 116 | "--vr_window_size", type=int, default=512, help=vr_window_size_help 117 | ) 118 | vr_params.add_argument( 119 | "--vr_aggression", type=int, default=5, help=vr_aggression_help 120 | ) 121 | vr_params.add_argument( 122 | "--vr_enable_tta", action="store_true", help=vr_enable_tta_help 123 | ) 124 | vr_params.add_argument( 125 | "--vr_high_end_process", action="store_true", help=vr_high_end_process_help 126 | ) 127 | vr_params.add_argument( 128 | "--vr_enable_post_process", 129 | action="store_true", 130 | help=vr_enable_post_process_help, 131 | ) 132 | vr_params.add_argument( 133 | "--vr_post_process_threshold", 134 | type=float, 135 | default=0.2, 136 | help=vr_post_process_threshold_help, 137 | ) 138 | 139 | demucs_segment_size_help = "size of segments into which the audio is split, 1-100. higher = slower but better quality (default: %(default)s). Example: --demucs_segment_size=256" 140 | demucs_shifts_help = "number of predictions with random shifts, higher = slower but better quality (default: %(default)s). Example: --demucs_shifts=4" 141 | demucs_overlap_help = "overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: %(default)s). Example: --demucs_overlap=0.25" 142 | demucs_segments_enabled_help = "enable segment-wise processing (default: %(default)s). Example: --demucs_segments_enabled=False" 143 | 144 | demucs_params = parser.add_argument_group("Demucs Architecture Parameters") 145 | demucs_params.add_argument( 146 | "--demucs_segment_size", 147 | type=str, 148 | default="Default", 149 | help=demucs_segment_size_help, 150 | ) 151 | demucs_params.add_argument( 152 | "--demucs_shifts", type=int, default=2, help=demucs_shifts_help 153 | ) 154 | demucs_params.add_argument( 155 | "--demucs_overlap", type=float, default=0.25, help=demucs_overlap_help 156 | ) 157 | demucs_params.add_argument( 158 | "--demucs_segments_enabled", 159 | type=bool, 160 | default=True, 161 | help=demucs_segments_enabled_help, 162 | ) 163 | 164 | mdxc_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdxc_segment_size=256" 165 | mdxc_override_model_segment_size_help = "override model default segment size instead of using the model default value. Example: --mdxc_override_model_segment_size" 166 | mdxc_overlap_help = "amount of overlap between prediction windows, 2-50. higher is better but slower (default: %(default)s). Example: --mdxc_overlap=8" 167 | mdxc_batch_size_help = "larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdxc_batch_size=4" 168 | mdxc_pitch_shift_help = "shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals. (default: %(default)s). 
Example: --mdxc_pitch_shift=2" 169 | 170 | mdxc_params = parser.add_argument_group("MDXC Architecture Parameters") 171 | mdxc_params.add_argument( 172 | "--mdxc_segment_size", type=int, default=256, help=mdxc_segment_size_help 173 | ) 174 | mdxc_params.add_argument( 175 | "--mdxc_override_model_segment_size", 176 | action="store_true", 177 | help=mdxc_override_model_segment_size_help, 178 | ) 179 | mdxc_params.add_argument( 180 | "--mdxc_overlap", type=int, default=8, help=mdxc_overlap_help 181 | ) 182 | mdxc_params.add_argument( 183 | "--mdxc_batch_size", type=int, default=1, help=mdxc_batch_size_help 184 | ) 185 | mdxc_params.add_argument( 186 | "--mdxc_pitch_shift", type=int, default=0, help=mdxc_pitch_shift_help 187 | ) 188 | 189 | args = parser.parse_args() 190 | 191 | if args.debug: 192 | log_level = logging.DEBUG 193 | else: 194 | log_level = getattr(logging, args.log_level.upper()) 195 | 196 | logger.setLevel(log_level) 197 | 198 | from tabs.plugins.installed.UVR.uvr.separator import Separator 199 | 200 | if args.env_info: 201 | separator = Separator() 202 | sys.exit(0) 203 | 204 | if args.list_models: 205 | separator = Separator() 206 | print( 207 | json.dumps(separator.list_supported_model_files(), indent=4, sort_keys=True) 208 | ) 209 | sys.exit(0) 210 | 211 | if not hasattr(args, "audio_file"): 212 | parser.print_help() 213 | sys.exit(1) 214 | 215 | separator = Separator( 216 | log_level=log_level, 217 | model_file_dir=args.model_file_dir, 218 | output_dir=args.output_dir, 219 | output_format=args.output_format, 220 | normalization_threshold=args.normalization, 221 | output_single_stem=args.single_stem, 222 | invert_using_spec=args.invert_spect, 223 | sample_rate=args.sample_rate, 224 | mdx_params={ 225 | "hop_length": args.mdx_hop_length, 226 | "segment_size": args.mdx_segment_size, 227 | "overlap": args.mdx_overlap, 228 | "batch_size": args.mdx_batch_size, 229 | "enable_denoise": args.mdx_enable_denoise, 230 | }, 231 | vr_params={ 232 | "batch_size": args.vr_batch_size, 233 | "window_size": args.vr_window_size, 234 | "aggression": args.vr_aggression, 235 | "enable_tta": args.vr_enable_tta, 236 | "enable_post_process": args.vr_enable_post_process, 237 | "post_process_threshold": args.vr_post_process_threshold, 238 | "high_end_process": args.vr_high_end_process, 239 | }, 240 | demucs_params={ 241 | "segment_size": args.demucs_segment_size, 242 | "shifts": args.demucs_shifts, 243 | "overlap": args.demucs_overlap, 244 | "segments_enabled": args.demucs_segments_enabled, 245 | }, 246 | mdxc_params={ 247 | "segment_size": args.mdxc_segment_size, 248 | "batch_size": args.mdxc_batch_size, 249 | "overlap": args.mdxc_overlap, 250 | "override_model_segment_size": args.mdxc_override_model_segment_size, 251 | "pitch_shift": args.mdxc_pitch_shift, 252 | }, 253 | ) 254 | 255 | separator.load_model(model_filename=args.model_filename) 256 | 257 | output_files = separator.separate(args.audio_file) 258 | 259 | logger.info(f"Separation complete! 
Output file(s): {' '.join(output_files)}") 260 | 261 | 262 | if __name__ == "__main__": 263 | main() 264 | -------------------------------------------------------------------------------- /UVR/uvr/__init__.py: -------------------------------------------------------------------------------- 1 | from .separator import Separator 2 | -------------------------------------------------------------------------------- /UVR/uvr/architectures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/UVR/uvr/architectures/__init__.py -------------------------------------------------------------------------------- /UVR/uvr/architectures/demucs_separator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import torch 5 | import numpy as np 6 | from tabs.plugins.installed.UVR.uvr.common_separator import CommonSeparator 7 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5.demucs.apply import apply_model, demucs_segments 8 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5.demucs.hdemucs import HDemucs 9 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5.demucs.pretrained import get_model as get_demucs_model 10 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5 import spec_utils 11 | 12 | DEMUCS_4_SOURCE = ["drums", "bass", "other", "vocals"] 13 | 14 | DEMUCS_2_SOURCE_MAPPER = {CommonSeparator.INST_STEM: 0, CommonSeparator.VOCAL_STEM: 1} 15 | DEMUCS_4_SOURCE_MAPPER = { 16 | CommonSeparator.BASS_STEM: 0, 17 | CommonSeparator.DRUM_STEM: 1, 18 | CommonSeparator.OTHER_STEM: 2, 19 | CommonSeparator.VOCAL_STEM: 3, 20 | } 21 | DEMUCS_6_SOURCE_MAPPER = { 22 | CommonSeparator.BASS_STEM: 0, 23 | CommonSeparator.DRUM_STEM: 1, 24 | CommonSeparator.OTHER_STEM: 2, 25 | CommonSeparator.VOCAL_STEM: 3, 26 | CommonSeparator.GUITAR_STEM: 4, 27 | CommonSeparator.PIANO_STEM: 5, 28 | } 29 | 30 | 31 | class DemucsSeparator(CommonSeparator): 32 | """ 33 | DemucsSeparator is responsible for separating audio sources using Demucs models. 34 | It initializes with configuration parameters and prepares the model for separation tasks. 35 | """ 36 | 37 | def __init__(self, common_config, arch_config): 38 | # Any configuration values which can be shared between architectures should be set already in CommonSeparator, 39 | # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name) 40 | super().__init__(config=common_config) 41 | 42 | # Initializing user-configurable parameters, passed through with an mdx_from the CLI or Separator instance 43 | 44 | # Adjust segments to manage RAM or V-RAM usage: 45 | # - Smaller sizes consume less resources. 46 | # - Bigger sizes consume more resources, but may provide better results. 47 | # - "Default" picks the optimal size. 48 | # DEMUCS_SEGMENTS = (DEF_OPT, '1', '5', '10', '15', '20', 49 | # '25', '30', '35', '40', '45', '50', 50 | # '55', '60', '65', '70', '75', '80', 51 | # '85', '90', '95', '100') 52 | self.segment_size = arch_config.get("segment_size", "Default") 53 | 54 | # Performs multiple predictions with random shifts of the input and averages them. 55 | # The higher number of shifts, the longer the prediction will take. 56 | # Not recommended unless you have a GPU. 
57 | # DEMUCS_SHIFTS = (0, 1, 2, 3, 4, 5, 58 | # 6, 7, 8, 9, 10, 11, 59 | # 12, 13, 14, 15, 16, 17, 60 | # 18, 19, 20) 61 | self.shifts = arch_config.get("shifts", 2) 62 | 63 | # This option controls the amount of overlap between prediction windows. 64 | # - Higher values can provide better results, but will lead to longer processing times. 65 | # - You can choose between 0.001-0.999 66 | # DEMUCS_OVERLAP = (0.25, 0.50, 0.75, 0.99) 67 | self.overlap = arch_config.get("overlap", 0.25) 68 | 69 | # Enables "Segments". Deselecting this option is only recommended for those with powerful PCs. 70 | self.segments_enabled = arch_config.get("segments_enabled", True) 71 | 72 | self.logger.debug( 73 | f"Demucs arch params: segment_size={self.segment_size}, segments_enabled={self.segments_enabled}" 74 | ) 75 | self.logger.debug( 76 | f"Demucs arch params: shifts={self.shifts}, overlap={self.overlap}" 77 | ) 78 | 79 | self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER 80 | 81 | self.audio_file_path = None 82 | self.audio_file_base = None 83 | self.demucs_model_instance = None 84 | 85 | # Add uvr_lib_v5 folder to system path so pytorch serialization can find the demucs module 86 | current_dir = os.path.dirname(__file__) 87 | uvr_lib_v5_path = os.path.join(current_dir, "..", "uvr_lib_v5") 88 | sys.path.insert(0, uvr_lib_v5_path) 89 | 90 | self.logger.info("Demucs Separator initialisation complete") 91 | 92 | def separate(self, audio_file_path): 93 | """ 94 | Separates the audio file into its component stems using the Demucs model. 95 | """ 96 | self.logger.debug("Starting separation process...") 97 | source = None 98 | stem_source = None 99 | inst_source = {} 100 | 101 | self.audio_file_path = audio_file_path 102 | self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] 103 | 104 | # Prepare the mix for processing 105 | self.logger.debug("Preparing mix...") 106 | mix = self.prepare_mix(self.audio_file_path) 107 | 108 | self.logger.debug(f"Mix prepared for demixing. 
Shape: {mix.shape}") 109 | 110 | self.logger.debug("Loading model for demixing...") 111 | 112 | self.demucs_model_instance = HDemucs(sources=DEMUCS_4_SOURCE) 113 | self.demucs_model_instance = get_demucs_model( 114 | name=os.path.splitext(os.path.basename(self.model_path))[0], 115 | repo=Path(os.path.dirname(self.model_path)), 116 | ) 117 | self.demucs_model_instance = demucs_segments( 118 | self.segment_size, self.demucs_model_instance 119 | ) 120 | self.demucs_model_instance.to(self.torch_device) 121 | self.demucs_model_instance.eval() 122 | 123 | self.logger.debug("Model loaded and set to evaluation mode.") 124 | 125 | source = self.demix_demucs(mix) 126 | 127 | del self.demucs_model_instance 128 | self.clear_gpu_cache() 129 | self.logger.debug("Model and GPU cache cleared after demixing.") 130 | 131 | output_files = [] 132 | self.logger.debug("Processing output files...") 133 | 134 | if isinstance(inst_source, np.ndarray): 135 | self.logger.debug("Processing instance source...") 136 | source_reshape = spec_utils.reshape_sources( 137 | inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]], 138 | source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]], 139 | ) 140 | inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]] = ( 141 | source_reshape 142 | ) 143 | source = inst_source 144 | 145 | if isinstance(source, np.ndarray): 146 | source_length = len(source) 147 | self.logger.debug( 148 | f"Processing source array, source length is {source_length}" 149 | ) 150 | if source_length == 2: 151 | self.logger.debug("Setting source map to 2-stem...") 152 | self.demucs_source_map = DEMUCS_2_SOURCE_MAPPER 153 | elif source_length == 6: 154 | self.logger.debug("Setting source map to 6-stem...") 155 | self.demucs_source_map = DEMUCS_6_SOURCE_MAPPER 156 | else: 157 | self.logger.debug("Setting source map to 4-stem...") 158 | self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER 159 | 160 | self.logger.debug("Processing for all stems...") 161 | for stem_name, stem_value in self.demucs_source_map.items(): 162 | if self.output_single_stem is not None: 163 | if stem_name.lower() != self.output_single_stem.lower(): 164 | self.logger.debug( 165 | f"Skipping writing stem {stem_name} as output_single_stem is set to {self.output_single_stem}..." 166 | ) 167 | continue 168 | 169 | stem_path = os.path.join( 170 | f"{self.audio_file_base}_({stem_name})_{self.model_name}.{self.output_format.lower()}" 171 | ) 172 | stem_source = source[stem_value].T 173 | 174 | self.final_process(stem_path, stem_source, stem_name) 175 | output_files.append(stem_path) 176 | 177 | return output_files 178 | 179 | def demix_demucs(self, mix): 180 | """ 181 | Demixes the input mix using the demucs model. 
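        In outline, the code below converts the mix to a float32 tensor,
        normalises it by the mean/std of its channel-averaged reference,
        runs apply_model() with the configured shifts/overlap (and
        split=segments_enabled), then de-normalises the result, swaps the
        first two sources and returns the concatenated stem array.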
182 | """ 183 | self.logger.debug("Starting demixing process in demix_demucs...") 184 | 185 | processed = {} 186 | mix = torch.tensor(mix, dtype=torch.float32) 187 | ref = mix.mean(0) 188 | mix = (mix - ref.mean()) / ref.std() 189 | mix_infer = mix 190 | 191 | with torch.no_grad(): 192 | self.logger.debug("Running model inference...") 193 | sources = apply_model( 194 | model=self.demucs_model_instance, 195 | mix=mix_infer[None], 196 | shifts=self.shifts, 197 | split=self.segments_enabled, 198 | overlap=self.overlap, 199 | static_shifts=1 if self.shifts == 0 else self.shifts, 200 | set_progress_bar=None, 201 | device=self.torch_device, 202 | progress=True, 203 | )[0] 204 | 205 | sources = (sources * ref.std() + ref.mean()).cpu().numpy() 206 | sources[[0, 1]] = sources[[1, 0]] 207 | processed[mix] = sources[:, :, 0:None].copy() 208 | sources = list(processed.values()) 209 | sources = [s[:, :, 0:None] for s in sources] 210 | sources = np.concatenate(sources, axis=-1) 211 | 212 | return sources 213 | -------------------------------------------------------------------------------- /UVR/uvr/common_separator.py: -------------------------------------------------------------------------------- 1 | """ This file contains the CommonSeparator class, common to all architecture-specific Separator classes. """ 2 | 3 | from logging import Logger 4 | import os 5 | import gc 6 | import numpy as np 7 | import librosa 8 | import torch 9 | from pydub import AudioSegment 10 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5 import spec_utils 11 | 12 | 13 | class CommonSeparator: 14 | """ 15 | This class contains the common methods and attributes common to all architecture-specific Separator classes. 16 | """ 17 | 18 | ALL_STEMS = "All Stems" 19 | VOCAL_STEM = "Vocals" 20 | INST_STEM = "Instrumental" 21 | OTHER_STEM = "Other" 22 | BASS_STEM = "Bass" 23 | DRUM_STEM = "Drums" 24 | GUITAR_STEM = "Guitar" 25 | PIANO_STEM = "Piano" 26 | SYNTH_STEM = "Synthesizer" 27 | STRINGS_STEM = "Strings" 28 | WOODWINDS_STEM = "Woodwinds" 29 | BRASS_STEM = "Brass" 30 | WIND_INST_STEM = "Wind Inst" 31 | NO_OTHER_STEM = "No Other" 32 | NO_BASS_STEM = "No Bass" 33 | NO_DRUM_STEM = "No Drums" 34 | NO_GUITAR_STEM = "No Guitar" 35 | NO_PIANO_STEM = "No Piano" 36 | NO_SYNTH_STEM = "No Synthesizer" 37 | NO_STRINGS_STEM = "No Strings" 38 | NO_WOODWINDS_STEM = "No Woodwinds" 39 | NO_WIND_INST_STEM = "No Wind Inst" 40 | NO_BRASS_STEM = "No Brass" 41 | PRIMARY_STEM = "Primary Stem" 42 | SECONDARY_STEM = "Secondary Stem" 43 | LEAD_VOCAL_STEM = "lead_only" 44 | BV_VOCAL_STEM = "backing_only" 45 | LEAD_VOCAL_STEM_I = "with_lead_vocals" 46 | BV_VOCAL_STEM_I = "with_backing_vocals" 47 | LEAD_VOCAL_STEM_LABEL = "Lead Vocals" 48 | BV_VOCAL_STEM_LABEL = "Backing Vocals" 49 | 50 | NON_ACCOM_STEMS = ( 51 | VOCAL_STEM, 52 | OTHER_STEM, 53 | BASS_STEM, 54 | DRUM_STEM, 55 | GUITAR_STEM, 56 | PIANO_STEM, 57 | SYNTH_STEM, 58 | STRINGS_STEM, 59 | WOODWINDS_STEM, 60 | BRASS_STEM, 61 | WIND_INST_STEM, 62 | ) 63 | 64 | def __init__(self, config): 65 | 66 | self.logger: Logger = config.get("logger") 67 | self.log_level: int = config.get("log_level") 68 | 69 | # Inferencing device / acceleration config 70 | self.torch_device = config.get("torch_device") 71 | self.torch_device_cpu = config.get("torch_device_cpu") 72 | self.torch_device_mps = config.get("torch_device_mps") 73 | self.onnx_execution_provider = config.get("onnx_execution_provider") 74 | 75 | # Model data 76 | self.model_name = config.get("model_name") 77 | self.model_path = config.get("model_path") 78 
| self.model_data = config.get("model_data") 79 | 80 | # Output directory and format 81 | self.output_dir = config.get("output_dir") 82 | self.output_format = config.get("output_format") 83 | 84 | # Functional options which are applicable to all architectures and the user may tweak to affect the output 85 | self.normalization_threshold = config.get("normalization_threshold") 86 | self.enable_denoise = config.get("enable_denoise") 87 | self.output_single_stem = config.get("output_single_stem") 88 | self.invert_using_spec = config.get("invert_using_spec") 89 | self.sample_rate = config.get("sample_rate") 90 | 91 | # Model specific properties 92 | self.primary_stem_name = self.model_data.get("primary_stem", "Vocals") 93 | self.secondary_stem_name = ( 94 | "Vocals" if self.primary_stem_name == "Instrumental" else "Instrumental" 95 | ) 96 | self.is_karaoke = self.model_data.get("is_karaoke", False) 97 | self.is_bv_model = self.model_data.get("is_bv_model", False) 98 | self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0) 99 | 100 | self.logger.debug( 101 | f"Common params: model_name={self.model_name}, model_path={self.model_path}" 102 | ) 103 | self.logger.debug( 104 | f"Common params: output_dir={self.output_dir}, output_format={self.output_format}" 105 | ) 106 | self.logger.debug( 107 | f"Common params: normalization_threshold={self.normalization_threshold}" 108 | ) 109 | self.logger.debug( 110 | f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}" 111 | ) 112 | self.logger.debug( 113 | f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}" 114 | ) 115 | 116 | self.logger.debug( 117 | f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}" 118 | ) 119 | self.logger.debug( 120 | f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}" 121 | ) 122 | 123 | # File-specific variables which need to be cleared between processing different audio inputs 124 | self.audio_file_path = None 125 | self.audio_file_base = None 126 | 127 | self.primary_source = None 128 | self.secondary_source = None 129 | 130 | self.primary_stem_output_path = None 131 | self.secondary_stem_output_path = None 132 | 133 | self.cached_sources_map = {} 134 | 135 | def separate(self, audio_file_path): 136 | """ 137 | Placeholder method for separating audio sources. Should be overridden by subclasses. 138 | """ 139 | raise NotImplementedError("This method should be overridden by subclasses.") 140 | 141 | def final_process(self, stem_path, source, stem_name): 142 | """ 143 | Finalizes the processing of a stem by writing the audio to a file and returning the processed source. 144 | """ 145 | self.logger.debug( 146 | f"Finalizing {stem_name} stem processing and writing audio..." 147 | ) 148 | self.write_audio(stem_path, source) 149 | 150 | return {stem_name: source} 151 | 152 | def cached_sources_clear(self): 153 | """ 154 | Clears the cache dictionaries for VR, MDX, and Demucs models. 155 | 156 | This function is essential for ensuring that the cache does not hold outdated or irrelevant data 157 | between different processing sessions or when a new batch of audio files is processed. 158 | It helps in managing memory efficiently and prevents potential errors due to stale data. 
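        The cache is keyed by architecture type (e.g. VR, MDX, Demucs), with
        each entry holding a {model_name: sources} dictionary as populated by
        cached_model_source_holder() below.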
159 | """ 160 | self.cached_sources_map = {} 161 | 162 | def cached_source_callback(self, model_architecture, model_name=None): 163 | """ 164 | Retrieves the model and sources from the cache based on the processing method and model name. 165 | 166 | Args: 167 | model_architecture: The architecture type (VR, MDX, or Demucs) being used for processing. 168 | model_name: The specific model name within the architecture type, if applicable. 169 | 170 | Returns: 171 | A tuple containing the model and its sources if found in the cache; otherwise, None. 172 | 173 | This function is crucial for optimizing performance by avoiding redundant processing. 174 | If the requested model and its sources are already in the cache, they can be reused directly, 175 | saving time and computational resources. 176 | """ 177 | model, sources = None, None 178 | 179 | mapper = self.cached_sources_map[model_architecture] 180 | 181 | for key, value in mapper.items(): 182 | if model_name in key: 183 | model = key 184 | sources = value 185 | 186 | return model, sources 187 | 188 | def cached_model_source_holder(self, model_architecture, sources, model_name=None): 189 | """ 190 | Update the dictionary for the given model_architecture with the new model name and its sources. 191 | Use the model_architecture as a key to access the corresponding cache source mapper dictionary. 192 | """ 193 | self.cached_sources_map[model_architecture] = { 194 | **self.cached_sources_map.get(model_architecture, {}), 195 | **{model_name: sources}, 196 | } 197 | 198 | def prepare_mix(self, mix): 199 | """ 200 | Prepares the mix for processing. This includes loading the audio from a file if necessary, 201 | ensuring the mix is in the correct format, and converting mono to stereo if needed. 202 | """ 203 | # Store the original path or the mix itself for later checks 204 | audio_path = mix 205 | 206 | # Check if the input is a file path (string) and needs to be loaded 207 | if not isinstance(mix, np.ndarray): 208 | self.logger.debug(f"Loading audio from file: {mix}") 209 | mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) 210 | self.logger.debug( 211 | f"Audio loaded. Sample rate: {sr}, Audio shape: {mix.shape}" 212 | ) 213 | else: 214 | # Transpose the mix if it's already an ndarray (expected shape: [channels, samples]) 215 | self.logger.debug("Transposing the provided mix array.") 216 | mix = mix.T 217 | self.logger.debug(f"Transposed mix shape: {mix.shape}") 218 | 219 | # If the original input was a filepath, check if the loaded mix is empty 220 | if isinstance(audio_path, str): 221 | if not np.any(mix): 222 | error_msg = f"Audio file {audio_path} is empty or not valid" 223 | self.logger.error(error_msg) 224 | raise ValueError(error_msg) 225 | else: 226 | self.logger.debug("Audio file is valid and contains data.") 227 | 228 | # Ensure the mix is in stereo format 229 | if mix.ndim == 1: 230 | self.logger.debug("Mix is mono. Converting to stereo.") 231 | mix = np.asfortranarray([mix, mix]) 232 | self.logger.debug("Converted to stereo mix.") 233 | 234 | # Final log indicating successful preparation of the mix 235 | self.logger.debug("Mix preparation completed.") 236 | return mix 237 | 238 | def write_audio(self, stem_path: str, stem_source): 239 | """ 240 | Writes the separated audio source to a file. 
241 | """ 242 | self.logger.debug(f"Entering write_audio with stem_path: {stem_path}") 243 | 244 | stem_source = spec_utils.normalize( 245 | wave=stem_source, max_peak=self.normalization_threshold 246 | ) 247 | 248 | # Check if the numpy array is empty or contains very low values 249 | if np.max(np.abs(stem_source)) < 1e-6: 250 | self.logger.warning("Warning: stem_source array is near-silent or empty.") 251 | return 252 | 253 | # If output_dir is specified, create it and join it with stem_path 254 | if self.output_dir: 255 | os.makedirs(self.output_dir, exist_ok=True) 256 | stem_path = os.path.join(self.output_dir, stem_path) 257 | 258 | self.logger.debug(f"Audio data shape before processing: {stem_source.shape}") 259 | self.logger.debug(f"Data type before conversion: {stem_source.dtype}") 260 | 261 | # Ensure the audio data is in the correct format (e.g., int16) 262 | if stem_source.dtype != np.int16: 263 | stem_source = (stem_source * 32767).astype(np.int16) 264 | self.logger.debug("Converted stem_source to int16.") 265 | 266 | # Correctly interleave stereo channels 267 | stem_source_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16) 268 | stem_source_interleaved[0::2] = stem_source[:, 0] # Left channel 269 | stem_source_interleaved[1::2] = stem_source[:, 1] # Right channel 270 | 271 | self.logger.debug( 272 | f"Interleaved audio data shape: {stem_source_interleaved.shape}" 273 | ) 274 | 275 | # Create a pydub AudioSegment 276 | try: 277 | audio_segment = AudioSegment( 278 | stem_source_interleaved.tobytes(), 279 | frame_rate=self.sample_rate, 280 | sample_width=stem_source.dtype.itemsize, 281 | channels=2, 282 | ) 283 | self.logger.debug("Created AudioSegment successfully.") 284 | except (IOError, ValueError) as e: 285 | self.logger.error(f"Specific error creating AudioSegment: {e}") 286 | return 287 | 288 | # Determine file format based on the file extension 289 | file_format = stem_path.lower().split(".")[-1] 290 | 291 | # For m4a files, specify mp4 as the container format as the extension doesn't match the format name 292 | if file_format == "m4a": 293 | file_format = "mp4" 294 | elif file_format == "mka": 295 | file_format = "matroska" 296 | 297 | # Export using the determined format 298 | try: 299 | audio_segment.export(stem_path, format=file_format) 300 | self.logger.debug(f"Exported audio file successfully to {stem_path}") 301 | except (IOError, ValueError) as e: 302 | self.logger.error(f"Error exporting audio file: {e}") 303 | 304 | def clear_gpu_cache(self): 305 | """ 306 | This method clears the GPU cache to free up memory. 307 | """ 308 | self.logger.debug("Running garbage collection...") 309 | gc.collect() 310 | if self.torch_device == torch.device("mps"): 311 | self.logger.debug("Clearing MPS cache...") 312 | torch.mps.empty_cache() 313 | if self.torch_device == torch.device("cuda"): 314 | self.logger.debug("Clearing CUDA cache...") 315 | torch.cuda.empty_cache() 316 | 317 | def clear_file_specific_paths(self): 318 | """ 319 | Clears the file-specific variables which need to be cleared between processing different audio inputs. 
320 | """ 321 | self.logger.info("Clearing input audio file paths, sources and stems...") 322 | 323 | self.audio_file_path = None 324 | self.audio_file_base = None 325 | 326 | self.primary_source = None 327 | self.secondary_source = None 328 | 329 | self.primary_stem_output_path = None 330 | self.secondary_stem_output_path = None 331 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/UVR/uvr/uvr_lib_v5/__init__.py -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/attend.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from packaging import version 3 | from collections import namedtuple 4 | 5 | import torch 6 | from torch import nn, einsum 7 | import torch.nn.functional as F 8 | 9 | from einops import rearrange, reduce 10 | 11 | # constants 12 | 13 | FlashAttentionConfig = namedtuple( 14 | "FlashAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"] 15 | ) 16 | 17 | # helpers 18 | 19 | 20 | def exists(val): 21 | return val is not None 22 | 23 | 24 | def once(fn): 25 | called = False 26 | 27 | @wraps(fn) 28 | def inner(x): 29 | nonlocal called 30 | if called: 31 | return 32 | called = True 33 | return fn(x) 34 | 35 | return inner 36 | 37 | 38 | print_once = once(print) 39 | 40 | # main class 41 | 42 | 43 | class Attend(nn.Module): 44 | def __init__(self, dropout=0.0, flash=False): 45 | super().__init__() 46 | self.dropout = dropout 47 | self.attn_dropout = nn.Dropout(dropout) 48 | 49 | self.flash = flash 50 | assert not ( 51 | flash and version.parse(torch.__version__) < version.parse("2.0.0") 52 | ), "in order to use flash attention, you must be using pytorch 2.0 or above" 53 | 54 | # determine efficient attention configs for cuda and cpu 55 | 56 | self.cpu_config = FlashAttentionConfig(True, True, True) 57 | self.cuda_config = None 58 | 59 | if not torch.cuda.is_available() or not flash: 60 | return 61 | 62 | device_properties = torch.cuda.get_device_properties(torch.device("cuda")) 63 | 64 | if device_properties.major == 8 and device_properties.minor == 0: 65 | print_once( 66 | "A100 GPU detected, using flash attention if input tensor is on cuda" 67 | ) 68 | self.cuda_config = FlashAttentionConfig(True, False, False) 69 | else: 70 | self.cuda_config = FlashAttentionConfig(False, True, True) 71 | 72 | def flash_attn(self, q, k, v): 73 | _, heads, q_len, _, k_len, is_cuda, device = ( 74 | *q.shape, 75 | k.shape[-2], 76 | q.is_cuda, 77 | q.device, 78 | ) 79 | 80 | # Check if there is a compatible device for flash attention 81 | 82 | config = self.cuda_config if is_cuda else self.cpu_config 83 | 84 | # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale 85 | 86 | with torch.backends.cuda.sdp_kernel(**config._asdict()): 87 | out = F.scaled_dot_product_attention( 88 | q, k, v, dropout_p=self.dropout if self.training else 0.0 89 | ) 90 | 91 | return out 92 | 93 | def forward(self, q, k, v): 94 | """ 95 | einstein notation 96 | b - batch 97 | h - heads 98 | n, i, j - sequence length (base sequence length, source, target) 99 | d - feature dimension 100 | """ 101 | 102 | q_len, k_len, device = q.shape[-2], k.shape[-2], q.device 103 | 104 | scale = q.shape[-1] ** -0.5 105 | 106 | if self.flash: 107 | return 
self.flash_attn(q, k, v) 108 | 109 | # similarity 110 | 111 | sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale 112 | 113 | # attention 114 | 115 | attn = sim.softmax(dim=-1) 116 | attn = self.attn_dropout(attn) 117 | 118 | # aggregate values 119 | 120 | out = einsum(f"b h i j, b h j d -> b h i d", attn, v) 121 | 122 | return out 123 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import json 8 | import os 9 | import sys 10 | import time 11 | from dataclasses import dataclass, field 12 | from fractions import Fraction 13 | 14 | import torch as th 15 | from torch import distributed, nn 16 | from torch.nn.parallel.distributed import DistributedDataParallel 17 | 18 | from .augment import FlipChannels, FlipSign, Remix, Shift 19 | from .compressed import StemsSet, build_musdb_metadata, get_musdb_tracks 20 | from .model import Demucs 21 | from .parser import get_name, get_parser 22 | from .raw import Rawset 23 | from .tasnet import ConvTasNet 24 | from .test import evaluate 25 | from .train import train_model, validate_model 26 | from .utils import human_seconds, load_model, save_model, sizeof_fmt 27 | 28 | 29 | @dataclass 30 | class SavedState: 31 | metrics: list = field(default_factory=list) 32 | last_state: dict = None 33 | best_state: dict = None 34 | optimizer: dict = None 35 | 36 | 37 | def main(): 38 | parser = get_parser() 39 | args = parser.parse_args() 40 | name = get_name(parser, args) 41 | print(f"Experiment {name}") 42 | 43 | if args.musdb is None and args.rank == 0: 44 | print( 45 | "You must provide the path to the MusDB dataset with the --musdb flag. " 46 | "To download the MusDB dataset, see https://sigsep.github.io/datasets/musdb.html.", 47 | file=sys.stderr, 48 | ) 49 | sys.exit(1) 50 | 51 | eval_folder = args.evals / name 52 | eval_folder.mkdir(exist_ok=True, parents=True) 53 | args.logs.mkdir(exist_ok=True) 54 | metrics_path = args.logs / f"{name}.json" 55 | eval_folder.mkdir(exist_ok=True, parents=True) 56 | args.checkpoints.mkdir(exist_ok=True, parents=True) 57 | args.models.mkdir(exist_ok=True, parents=True) 58 | 59 | if args.device is None: 60 | device = "cpu" 61 | if th.cuda.is_available(): 62 | device = "cuda" 63 | else: 64 | device = args.device 65 | 66 | th.manual_seed(args.seed) 67 | # Prevents too many threads to be started when running `museval` as it can be quite 68 | # inefficient on NUMA architectures. 
69 | os.environ["OMP_NUM_THREADS"] = "1" 70 | 71 | if args.world_size > 1: 72 | if device != "cuda" and args.rank == 0: 73 | print( 74 | "Error: distributed training is only available with cuda device", 75 | file=sys.stderr, 76 | ) 77 | sys.exit(1) 78 | th.cuda.set_device(args.rank % th.cuda.device_count()) 79 | distributed.init_process_group( 80 | backend="nccl", 81 | init_method="tcp://" + args.master, 82 | rank=args.rank, 83 | world_size=args.world_size, 84 | ) 85 | 86 | checkpoint = args.checkpoints / f"{name}.th" 87 | checkpoint_tmp = args.checkpoints / f"{name}.th.tmp" 88 | if args.restart and checkpoint.exists(): 89 | checkpoint.unlink() 90 | 91 | if args.test: 92 | args.epochs = 1 93 | args.repeat = 0 94 | model = load_model(args.models / args.test) 95 | elif args.tasnet: 96 | model = ConvTasNet( 97 | audio_channels=args.audio_channels, samplerate=args.samplerate, X=args.X 98 | ) 99 | else: 100 | model = Demucs( 101 | audio_channels=args.audio_channels, 102 | channels=args.channels, 103 | context=args.context, 104 | depth=args.depth, 105 | glu=args.glu, 106 | growth=args.growth, 107 | kernel_size=args.kernel_size, 108 | lstm_layers=args.lstm_layers, 109 | rescale=args.rescale, 110 | rewrite=args.rewrite, 111 | sources=4, 112 | stride=args.conv_stride, 113 | upsample=args.upsample, 114 | samplerate=args.samplerate, 115 | ) 116 | model.to(device) 117 | if args.show: 118 | print(model) 119 | size = sizeof_fmt(4 * sum(p.numel() for p in model.parameters())) 120 | print(f"Model size {size}") 121 | return 122 | 123 | optimizer = th.optim.Adam(model.parameters(), lr=args.lr) 124 | 125 | try: 126 | saved = th.load(checkpoint, map_location="cpu") 127 | except IOError: 128 | saved = SavedState() 129 | else: 130 | model.load_state_dict(saved.last_state) 131 | optimizer.load_state_dict(saved.optimizer) 132 | 133 | if args.save_model: 134 | if args.rank == 0: 135 | model.to("cpu") 136 | model.load_state_dict(saved.best_state) 137 | save_model(model, args.models / f"{name}.th") 138 | return 139 | 140 | if args.rank == 0: 141 | done = args.logs / f"{name}.done" 142 | if done.exists(): 143 | done.unlink() 144 | 145 | if args.augment: 146 | augment = nn.Sequential( 147 | FlipSign(), 148 | FlipChannels(), 149 | Shift(args.data_stride), 150 | Remix(group_size=args.remix_group_size), 151 | ).to(device) 152 | else: 153 | augment = Shift(args.data_stride) 154 | 155 | if args.mse: 156 | criterion = nn.MSELoss() 157 | else: 158 | criterion = nn.L1Loss() 159 | 160 | # Setting number of samples so that all convolution windows are full. 161 | # Prevents hard to debug mistake with the prediction being shifted compared 162 | # to the input mixture. 
163 | samples = model.valid_length(args.samples) 164 | print(f"Number of training samples adjusted to {samples}") 165 | 166 | if args.raw: 167 | train_set = Rawset( 168 | args.raw / "train", 169 | samples=samples + args.data_stride, 170 | channels=args.audio_channels, 171 | streams=[0, 1, 2, 3, 4], 172 | stride=args.data_stride, 173 | ) 174 | 175 | valid_set = Rawset(args.raw / "valid", channels=args.audio_channels) 176 | else: 177 | if not args.metadata.is_file() and args.rank == 0: 178 | build_musdb_metadata(args.metadata, args.musdb, args.workers) 179 | if args.world_size > 1: 180 | distributed.barrier() 181 | metadata = json.load(open(args.metadata)) 182 | duration = Fraction(samples + args.data_stride, args.samplerate) 183 | stride = Fraction(args.data_stride, args.samplerate) 184 | train_set = StemsSet( 185 | get_musdb_tracks(args.musdb, subsets=["train"], split="train"), 186 | metadata, 187 | duration=duration, 188 | stride=stride, 189 | samplerate=args.samplerate, 190 | channels=args.audio_channels, 191 | ) 192 | valid_set = StemsSet( 193 | get_musdb_tracks(args.musdb, subsets=["train"], split="valid"), 194 | metadata, 195 | samplerate=args.samplerate, 196 | channels=args.audio_channels, 197 | ) 198 | 199 | best_loss = float("inf") 200 | for epoch, metrics in enumerate(saved.metrics): 201 | print( 202 | f"Epoch {epoch:03d}: " 203 | f"train={metrics['train']:.8f} " 204 | f"valid={metrics['valid']:.8f} " 205 | f"best={metrics['best']:.4f} " 206 | f"duration={human_seconds(metrics['duration'])}" 207 | ) 208 | best_loss = metrics["best"] 209 | 210 | if args.world_size > 1: 211 | dmodel = DistributedDataParallel( 212 | model, 213 | device_ids=[th.cuda.current_device()], 214 | output_device=th.cuda.current_device(), 215 | ) 216 | else: 217 | dmodel = model 218 | 219 | for epoch in range(len(saved.metrics), args.epochs): 220 | begin = time.time() 221 | model.train() 222 | train_loss = train_model( 223 | epoch, 224 | train_set, 225 | dmodel, 226 | criterion, 227 | optimizer, 228 | augment, 229 | batch_size=args.batch_size, 230 | device=device, 231 | repeat=args.repeat, 232 | seed=args.seed, 233 | workers=args.workers, 234 | world_size=args.world_size, 235 | ) 236 | model.eval() 237 | valid_loss = validate_model( 238 | epoch, 239 | valid_set, 240 | model, 241 | criterion, 242 | device=device, 243 | rank=args.rank, 244 | split=args.split_valid, 245 | world_size=args.world_size, 246 | ) 247 | 248 | duration = time.time() - begin 249 | if valid_loss < best_loss: 250 | best_loss = valid_loss 251 | saved.best_state = { 252 | key: value.to("cpu").clone() 253 | for key, value in model.state_dict().items() 254 | } 255 | saved.metrics.append( 256 | { 257 | "train": train_loss, 258 | "valid": valid_loss, 259 | "best": best_loss, 260 | "duration": duration, 261 | } 262 | ) 263 | if args.rank == 0: 264 | json.dump(saved.metrics, open(metrics_path, "w")) 265 | 266 | saved.last_state = model.state_dict() 267 | saved.optimizer = optimizer.state_dict() 268 | if args.rank == 0 and not args.test: 269 | th.save(saved, checkpoint_tmp) 270 | checkpoint_tmp.rename(checkpoint) 271 | 272 | print( 273 | f"Epoch {epoch:03d}: " 274 | f"train={train_loss:.8f} valid={valid_loss:.8f} best={best_loss:.4f} " 275 | f"duration={human_seconds(duration)}" 276 | ) 277 | 278 | del dmodel 279 | model.load_state_dict(saved.best_state) 280 | if args.eval_cpu: 281 | device = "cpu" 282 | model.to(device) 283 | model.eval() 284 | evaluate( 285 | model, 286 | args.musdb, 287 | eval_folder, 288 | rank=args.rank, 289 | 
world_size=args.world_size, 290 | device=device, 291 | save=args.save, 292 | split=args.split_valid, 293 | shifts=args.shifts, 294 | workers=args.eval_workers, 295 | ) 296 | model.to("cpu") 297 | save_model(model, args.models / f"{name}.th") 298 | if args.rank == 0: 299 | print("done") 300 | done.write_text("done") 301 | 302 | 303 | if __name__ == "__main__": 304 | main() 305 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/apply.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Code to apply a model to a mix. It will handle chunking with overlaps and 8 | inteprolation between chunks, as well as the "shift trick". 9 | """ 10 | from concurrent.futures import ThreadPoolExecutor 11 | import random 12 | import typing as tp 13 | from multiprocessing import Process, Queue, Pipe 14 | 15 | import torch as th 16 | from torch import nn 17 | from torch.nn import functional as F 18 | import tqdm 19 | import tkinter as tk 20 | 21 | from .demucs import Demucs 22 | from .hdemucs import HDemucs 23 | from .utils import center_trim, DummyPoolExecutor 24 | 25 | Model = tp.Union[Demucs, HDemucs] 26 | 27 | progress_bar_num = 0 28 | 29 | 30 | class BagOfModels(nn.Module): 31 | def __init__( 32 | self, 33 | models: tp.List[Model], 34 | weights: tp.Optional[tp.List[tp.List[float]]] = None, 35 | segment: tp.Optional[float] = None, 36 | ): 37 | """ 38 | Represents a bag of models with specific weights. 39 | You should call `apply_model` rather than calling directly the forward here for 40 | optimal performance. 41 | 42 | Args: 43 | models (list[nn.Module]): list of Demucs/HDemucs models. 44 | weights (list[list[float]]): list of weights. If None, assumed to 45 | be all ones, otherwise it should be a list of N list (N number of models), 46 | each containing S floats (S number of sources). 47 | segment (None or float): overrides the `segment` attribute of each model 48 | (this is performed inplace, be careful if you reuse the models passed). 
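        Illustrative sketch (`model_a`, `model_b` and `mix` are hypothetical; in practice
        the bag is usually built for you by the pretrained-model loader):

            bag = BagOfModels([model_a, model_b], weights=[[1.0] * 4, [0.5] * 4])
            estimates = apply_model(bag, mix, shifts=1, split=True, overlap=0.25)

        Both models must expose identical `sources`, `samplerate` and `audio_channels`,
        `mix` is a (batch, channels, samples) tensor, and each inner weights list holds
        one float per source.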
49 | """ 50 | 51 | super().__init__() 52 | assert len(models) > 0 53 | first = models[0] 54 | for other in models: 55 | assert other.sources == first.sources 56 | assert other.samplerate == first.samplerate 57 | assert other.audio_channels == first.audio_channels 58 | if segment is not None: 59 | other.segment = segment 60 | 61 | self.audio_channels = first.audio_channels 62 | self.samplerate = first.samplerate 63 | self.sources = first.sources 64 | self.models = nn.ModuleList(models) 65 | 66 | if weights is None: 67 | weights = [[1.0 for _ in first.sources] for _ in models] 68 | else: 69 | assert len(weights) == len(models) 70 | for weight in weights: 71 | assert len(weight) == len(first.sources) 72 | self.weights = weights 73 | 74 | def forward(self, x): 75 | raise NotImplementedError("Call `apply_model` on this.") 76 | 77 | 78 | class TensorChunk: 79 | def __init__(self, tensor, offset=0, length=None): 80 | total_length = tensor.shape[-1] 81 | assert offset >= 0 82 | assert offset < total_length 83 | 84 | if length is None: 85 | length = total_length - offset 86 | else: 87 | length = min(total_length - offset, length) 88 | 89 | if isinstance(tensor, TensorChunk): 90 | self.tensor = tensor.tensor 91 | self.offset = offset + tensor.offset 92 | else: 93 | self.tensor = tensor 94 | self.offset = offset 95 | self.length = length 96 | self.device = tensor.device 97 | 98 | @property 99 | def shape(self): 100 | shape = list(self.tensor.shape) 101 | shape[-1] = self.length 102 | return shape 103 | 104 | def padded(self, target_length): 105 | delta = target_length - self.length 106 | total_length = self.tensor.shape[-1] 107 | assert delta >= 0 108 | 109 | start = self.offset - delta // 2 110 | end = start + target_length 111 | 112 | correct_start = max(0, start) 113 | correct_end = min(total_length, end) 114 | 115 | pad_left = correct_start - start 116 | pad_right = end - correct_end 117 | 118 | out = F.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right)) 119 | assert out.shape[-1] == target_length 120 | return out 121 | 122 | 123 | def tensor_chunk(tensor_or_chunk): 124 | if isinstance(tensor_or_chunk, TensorChunk): 125 | return tensor_or_chunk 126 | else: 127 | assert isinstance(tensor_or_chunk, th.Tensor) 128 | return TensorChunk(tensor_or_chunk) 129 | 130 | 131 | def apply_model( 132 | model, 133 | mix, 134 | shifts=1, 135 | split=True, 136 | overlap=0.25, 137 | transition_power=1.0, 138 | static_shifts=1, 139 | set_progress_bar=None, 140 | device=None, 141 | progress=False, 142 | num_workers=0, 143 | pool=None, 144 | ): 145 | """ 146 | Apply model to a given mixture. 147 | 148 | Args: 149 | shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec 150 | and apply the oppositve shift to the output. This is repeated `shifts` time and 151 | all predictions are averaged. This effectively makes the model time equivariant 152 | and improves SDR by up to 0.2 points. 153 | split (bool): if True, the input will be broken down in 8 seconds extracts 154 | and predictions will be performed individually on each and concatenated. 155 | Useful for model with large memory footprint like Tasnet. 156 | progress (bool): if True, show a progress bar (requires split=True) 157 | device (torch.device, str, or None): if provided, device on which to 158 | execute the computation, otherwise `mix.device` is assumed. 
159 | When `device` is different from `mix.device`, only local computations will 160 | be on `device`, while the entire tracks will be stored on `mix.device`. 161 | """ 162 | 163 | global fut_length 164 | global bag_num 165 | global prog_bar 166 | 167 | if device is None: 168 | device = mix.device 169 | else: 170 | device = th.device(device) 171 | if pool is None: 172 | if num_workers > 0 and device.type == "cpu": 173 | pool = ThreadPoolExecutor(num_workers) 174 | else: 175 | pool = DummyPoolExecutor() 176 | 177 | kwargs = { 178 | "shifts": shifts, 179 | "split": split, 180 | "overlap": overlap, 181 | "transition_power": transition_power, 182 | "progress": progress, 183 | "device": device, 184 | "pool": pool, 185 | "set_progress_bar": set_progress_bar, 186 | "static_shifts": static_shifts, 187 | } 188 | 189 | if isinstance(model, BagOfModels): 190 | # Special treatment for bag of model. 191 | # We explicitely apply multiple times `apply_model` so that the random shifts 192 | # are different for each model. 193 | 194 | estimates = 0 195 | totals = [0] * len(model.sources) 196 | bag_num = len(model.models) 197 | fut_length = 0 198 | prog_bar = 0 199 | current_model = 0 # (bag_num + 1) 200 | for sub_model, weight in zip(model.models, model.weights): 201 | original_model_device = next(iter(sub_model.parameters())).device 202 | sub_model.to(device) 203 | fut_length += fut_length 204 | current_model += 1 205 | out = apply_model(sub_model, mix, **kwargs) 206 | sub_model.to(original_model_device) 207 | for k, inst_weight in enumerate(weight): 208 | out[:, k, :, :] *= inst_weight 209 | totals[k] += inst_weight 210 | estimates += out 211 | del out 212 | 213 | for k in range(estimates.shape[1]): 214 | estimates[:, k, :, :] /= totals[k] 215 | return estimates 216 | 217 | model.to(device) 218 | model.eval() 219 | assert transition_power >= 1, "transition_power < 1 leads to weird behavior." 220 | batch, channels, length = mix.shape 221 | 222 | if shifts: 223 | kwargs["shifts"] = 0 224 | max_shift = int(0.5 * model.samplerate) 225 | mix = tensor_chunk(mix) 226 | padded_mix = mix.padded(length + 2 * max_shift) 227 | out = 0 228 | for _ in range(shifts): 229 | offset = random.randint(0, max_shift) 230 | shifted = TensorChunk(padded_mix, offset, length + max_shift - offset) 231 | shifted_out = apply_model(model, shifted, **kwargs) 232 | out += shifted_out[..., max_shift - offset :] 233 | out /= shifts 234 | return out 235 | elif split: 236 | kwargs["split"] = False 237 | out = th.zeros(batch, len(model.sources), channels, length, device=mix.device) 238 | sum_weight = th.zeros(length, device=mix.device) 239 | segment = int(model.samplerate * model.segment) 240 | stride = int((1 - overlap) * segment) 241 | offsets = range(0, length, stride) 242 | scale = float(format(stride / model.samplerate, ".2f")) 243 | # We start from a triangle shaped weight, with maximal weight in the middle 244 | # of the segment. Then we normalize and take to the power `transition_power`. 245 | # Large values of transition power will lead to sharper transitions. 246 | weight = th.cat( 247 | [ 248 | th.arange(1, segment // 2 + 1, device=device), 249 | th.arange(segment - segment // 2, 0, -1, device=device), 250 | ] 251 | ) 252 | assert len(weight) == segment 253 | # If the overlap < 50%, this will translate to linear transition when 254 | # transition_power is 1. 
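# Worked example of the weighting above (segment of 6 samples, purely illustrative):
# the raw triangle is [1, 2, 3, 3, 2, 1]; after dividing by its max and raising to
# transition_power p it becomes [(1/3)**p, (2/3)**p, 1, 1, (2/3)**p, (1/3)**p], so a
# larger p concentrates weight in the centre of each chunk and sharpens transitions.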
255 | weight = (weight / weight.max()) ** transition_power 256 | futures = [] 257 | for offset in offsets: 258 | chunk = TensorChunk(mix, offset, segment) 259 | future = pool.submit(apply_model, model, chunk, **kwargs) 260 | futures.append((future, offset)) 261 | offset += segment 262 | if progress: 263 | futures = tqdm.tqdm(futures) 264 | for future, offset in futures: 265 | if set_progress_bar: 266 | fut_length = len(futures) * bag_num * static_shifts 267 | prog_bar += 1 268 | set_progress_bar(0.1, (0.8 / fut_length * prog_bar)) 269 | chunk_out = future.result() 270 | chunk_length = chunk_out.shape[-1] 271 | out[..., offset : offset + segment] += ( 272 | weight[:chunk_length] * chunk_out 273 | ).to(mix.device) 274 | sum_weight[offset : offset + segment] += weight[:chunk_length].to( 275 | mix.device 276 | ) 277 | assert sum_weight.min() > 0 278 | out /= sum_weight 279 | return out 280 | else: 281 | if hasattr(model, "valid_length"): 282 | valid_length = model.valid_length(length) 283 | else: 284 | valid_length = length 285 | mix = tensor_chunk(mix) 286 | padded_mix = mix.padded(valid_length).to(device) 287 | with th.no_grad(): 288 | out = model(padded_mix) 289 | return center_trim(out, length) 290 | 291 | 292 | def demucs_segments(demucs_segment, demucs_model): 293 | 294 | if demucs_segment == "Default": 295 | segment = None 296 | if isinstance(demucs_model, BagOfModels): 297 | if segment is not None: 298 | for sub in demucs_model.models: 299 | sub.segment = segment 300 | else: 301 | if segment is not None: 302 | sub.segment = segment 303 | else: 304 | try: 305 | segment = int(demucs_segment) 306 | if isinstance(demucs_model, BagOfModels): 307 | if segment is not None: 308 | for sub in demucs_model.models: 309 | sub.segment = segment 310 | else: 311 | if segment is not None: 312 | sub.segment = segment 313 | except: 314 | segment = None 315 | if isinstance(demucs_model, BagOfModels): 316 | if segment is not None: 317 | for sub in demucs_model.models: 318 | sub.segment = segment 319 | else: 320 | if segment is not None: 321 | sub.segment = segment 322 | 323 | return demucs_model 324 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import math 8 | 9 | import torch as th 10 | from torch import nn 11 | 12 | from .utils import capture_init, center_trim 13 | 14 | 15 | class BLSTM(nn.Module): 16 | def __init__(self, dim, layers=1): 17 | super().__init__() 18 | self.lstm = nn.LSTM( 19 | bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim 20 | ) 21 | self.linear = nn.Linear(2 * dim, dim) 22 | 23 | def forward(self, x): 24 | x = x.permute(2, 0, 1) 25 | x = self.lstm(x)[0] 26 | x = self.linear(x) 27 | x = x.permute(1, 2, 0) 28 | return x 29 | 30 | 31 | def rescale_conv(conv, reference): 32 | std = conv.weight.std().detach() 33 | scale = (std / reference) ** 0.5 34 | conv.weight.data /= scale 35 | if conv.bias is not None: 36 | conv.bias.data /= scale 37 | 38 | 39 | def rescale_module(module, reference): 40 | for sub in module.modules(): 41 | if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)): 42 | rescale_conv(sub, reference) 43 | 44 | 45 | def upsample(x, stride): 46 | """ 47 | Linear upsampling, the output will be `stride` times longer. 48 | """ 49 | batch, channels, time = x.size() 50 | weight = th.arange(stride, device=x.device, dtype=th.float) / stride 51 | x = x.view(batch, channels, time, 1) 52 | out = x[..., :-1, :] * (1 - weight) + x[..., 1:, :] * weight 53 | return out.reshape(batch, channels, -1) 54 | 55 | 56 | def downsample(x, stride): 57 | """ 58 | Downsample x by decimation. 59 | """ 60 | return x[:, :, ::stride] 61 | 62 | 63 | class Demucs(nn.Module): 64 | @capture_init 65 | def __init__( 66 | self, 67 | sources=4, 68 | audio_channels=2, 69 | channels=64, 70 | depth=6, 71 | rewrite=True, 72 | glu=True, 73 | upsample=False, 74 | rescale=0.1, 75 | kernel_size=8, 76 | stride=4, 77 | growth=2.0, 78 | lstm_layers=2, 79 | context=3, 80 | samplerate=44100, 81 | ): 82 | """ 83 | Args: 84 | sources (int): number of sources to separate 85 | audio_channels (int): stereo or mono 86 | channels (int): first convolution channels 87 | depth (int): number of encoder/decoder layers 88 | rewrite (bool): add 1x1 convolution to each encoder layer 89 | and a convolution to each decoder layer. 90 | For the decoder layer, `context` gives the kernel size. 91 | glu (bool): use glu instead of ReLU 92 | upsample (bool): use linear upsampling with convolutions 93 | Wave-U-Net style, instead of transposed convolutions 94 | rescale (int): rescale initial weights of convolutions 95 | to get their standard deviation closer to `rescale` 96 | kernel_size (int): kernel size for convolutions 97 | stride (int): stride for convolutions 98 | growth (float): multiply (resp divide) number of channels by that 99 | for each layer of the encoder (resp decoder) 100 | lstm_layers (int): number of lstm layers, 0 = no lstm 101 | context (int): kernel size of the convolution in the 102 | decoder before the transposed convolution. If > 1, 103 | will provide some context from neighboring time 104 | steps. 
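        Illustrative construction sketch (the values shown simply repeat the defaults
        above; `mix` is a hypothetical (batch, audio_channels, samples) tensor):

            model = Demucs(sources=4, audio_channels=2, channels=64, depth=6)
            out = model(mix)   # shape (batch, sources, audio_channels, T_out)

        For clean alignment the mixture should first be padded to `valid_length` (see
        below).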
105 | """ 106 | 107 | super().__init__() 108 | self.audio_channels = audio_channels 109 | self.sources = sources 110 | self.kernel_size = kernel_size 111 | self.context = context 112 | self.stride = stride 113 | self.depth = depth 114 | self.upsample = upsample 115 | self.channels = channels 116 | self.samplerate = samplerate 117 | 118 | self.encoder = nn.ModuleList() 119 | self.decoder = nn.ModuleList() 120 | 121 | self.final = None 122 | if upsample: 123 | self.final = nn.Conv1d( 124 | channels + audio_channels, sources * audio_channels, 1 125 | ) 126 | stride = 1 127 | 128 | if glu: 129 | activation = nn.GLU(dim=1) 130 | ch_scale = 2 131 | else: 132 | activation = nn.ReLU() 133 | ch_scale = 1 134 | in_channels = audio_channels 135 | for index in range(depth): 136 | encode = [] 137 | encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()] 138 | if rewrite: 139 | encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation] 140 | self.encoder.append(nn.Sequential(*encode)) 141 | 142 | decode = [] 143 | if index > 0: 144 | out_channels = in_channels 145 | else: 146 | if upsample: 147 | out_channels = channels 148 | else: 149 | out_channels = sources * audio_channels 150 | if rewrite: 151 | decode += [ 152 | nn.Conv1d(channels, ch_scale * channels, context), 153 | activation, 154 | ] 155 | if upsample: 156 | decode += [nn.Conv1d(channels, out_channels, kernel_size, stride=1)] 157 | else: 158 | decode += [ 159 | nn.ConvTranspose1d(channels, out_channels, kernel_size, stride) 160 | ] 161 | if index > 0: 162 | decode.append(nn.ReLU()) 163 | self.decoder.insert(0, nn.Sequential(*decode)) 164 | in_channels = channels 165 | channels = int(growth * channels) 166 | 167 | channels = in_channels 168 | 169 | if lstm_layers: 170 | self.lstm = BLSTM(channels, lstm_layers) 171 | else: 172 | self.lstm = None 173 | 174 | if rescale: 175 | rescale_module(self, reference=rescale) 176 | 177 | def valid_length(self, length): 178 | """ 179 | Return the nearest valid length to use with the model so that 180 | there is no time steps left over in a convolutions, e.g. for all 181 | layers, size of the input - kernel_size % stride = 0. 182 | 183 | If the mixture has a valid length, the estimated sources 184 | will have exactly the same length when context = 1. If context > 1, 185 | the two signals can be center trimmed to match. 186 | 187 | For training, extracts should have a valid length.For evaluation 188 | on full tracks we recommend passing `pad = True` to :method:`forward`. 
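        Illustrative sketch (a hypothetical ten-second mixture at 44.1 kHz; `F` is
        torch.nn.functional and the exact value returned depends on depth, kernel_size,
        stride and context):

            target = model.valid_length(10 * 44100)           # a valid length >= the request
            padded = F.pad(mix, (0, target - mix.shape[-1]))
            sources = model(padded)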
189 | """ 190 | for _ in range(self.depth): 191 | if self.upsample: 192 | length = math.ceil(length / self.stride) + self.kernel_size - 1 193 | else: 194 | length = math.ceil((length - self.kernel_size) / self.stride) + 1 195 | length = max(1, length) 196 | length += self.context - 1 197 | for _ in range(self.depth): 198 | if self.upsample: 199 | length = length * self.stride + self.kernel_size - 1 200 | else: 201 | length = (length - 1) * self.stride + self.kernel_size 202 | 203 | return int(length) 204 | 205 | def forward(self, mix): 206 | x = mix 207 | saved = [x] 208 | for encode in self.encoder: 209 | x = encode(x) 210 | saved.append(x) 211 | if self.upsample: 212 | x = downsample(x, self.stride) 213 | if self.lstm: 214 | x = self.lstm(x) 215 | for decode in self.decoder: 216 | if self.upsample: 217 | x = upsample(x, stride=self.stride) 218 | skip = center_trim(saved.pop(-1), x) 219 | x = x + skip 220 | x = decode(x) 221 | if self.final: 222 | skip = center_trim(saved.pop(-1), x) 223 | x = th.cat([x, skip], dim=1) 224 | x = self.final(x) 225 | 226 | x = x.view(x.size(0), self.sources, self.audio_channels, x.size(-1)) 227 | return x 228 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/model_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | 9 | import julius 10 | from torch import nn 11 | from .tasnet_v2 import ConvTasNet 12 | 13 | from .utils import capture_init, center_trim 14 | 15 | 16 | class BLSTM(nn.Module): 17 | def __init__(self, dim, layers=1): 18 | super().__init__() 19 | self.lstm = nn.LSTM( 20 | bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim 21 | ) 22 | self.linear = nn.Linear(2 * dim, dim) 23 | 24 | def forward(self, x): 25 | x = x.permute(2, 0, 1) 26 | x = self.lstm(x)[0] 27 | x = self.linear(x) 28 | x = x.permute(1, 2, 0) 29 | return x 30 | 31 | 32 | def rescale_conv(conv, reference): 33 | std = conv.weight.std().detach() 34 | scale = (std / reference) ** 0.5 35 | conv.weight.data /= scale 36 | if conv.bias is not None: 37 | conv.bias.data /= scale 38 | 39 | 40 | def rescale_module(module, reference): 41 | for sub in module.modules(): 42 | if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)): 43 | rescale_conv(sub, reference) 44 | 45 | 46 | def auto_load_demucs_model_v2(sources, demucs_model_name): 47 | 48 | if "48" in demucs_model_name: 49 | channels = 48 50 | elif "unittest" in demucs_model_name: 51 | channels = 4 52 | else: 53 | channels = 64 54 | 55 | if "tasnet" in demucs_model_name: 56 | init_demucs_model = ConvTasNet(sources, X=10) 57 | else: 58 | init_demucs_model = Demucs(sources, channels=channels) 59 | 60 | return init_demucs_model 61 | 62 | 63 | class Demucs(nn.Module): 64 | @capture_init 65 | def __init__( 66 | self, 67 | sources, 68 | audio_channels=2, 69 | channels=64, 70 | depth=6, 71 | rewrite=True, 72 | glu=True, 73 | rescale=0.1, 74 | resample=True, 75 | kernel_size=8, 76 | stride=4, 77 | growth=2.0, 78 | lstm_layers=2, 79 | context=3, 80 | normalize=False, 81 | samplerate=44100, 82 | segment_length=4 * 10 * 44100, 83 | ): 84 | """ 85 | Args: 86 | sources (list[str]): list of source names 87 | audio_channels (int): stereo or mono 88 | channels (int): first convolution channels 89 | depth (int): 
number of encoder/decoder layers 90 | rewrite (bool): add 1x1 convolution to each encoder layer 91 | and a convolution to each decoder layer. 92 | For the decoder layer, `context` gives the kernel size. 93 | glu (bool): use glu instead of ReLU 94 | resample_input (bool): upsample x2 the input and downsample /2 the output. 95 | rescale (int): rescale initial weights of convolutions 96 | to get their standard deviation closer to `rescale` 97 | kernel_size (int): kernel size for convolutions 98 | stride (int): stride for convolutions 99 | growth (float): multiply (resp divide) number of channels by that 100 | for each layer of the encoder (resp decoder) 101 | lstm_layers (int): number of lstm layers, 0 = no lstm 102 | context (int): kernel size of the convolution in the 103 | decoder before the transposed convolution. If > 1, 104 | will provide some context from neighboring time 105 | steps. 106 | samplerate (int): stored as meta information for easing 107 | future evaluations of the model. 108 | segment_length (int): stored as meta information for easing 109 | future evaluations of the model. Length of the segments on which 110 | the model was trained. 111 | """ 112 | 113 | super().__init__() 114 | self.audio_channels = audio_channels 115 | self.sources = sources 116 | self.kernel_size = kernel_size 117 | self.context = context 118 | self.stride = stride 119 | self.depth = depth 120 | self.resample = resample 121 | self.channels = channels 122 | self.normalize = normalize 123 | self.samplerate = samplerate 124 | self.segment_length = segment_length 125 | 126 | self.encoder = nn.ModuleList() 127 | self.decoder = nn.ModuleList() 128 | 129 | if glu: 130 | activation = nn.GLU(dim=1) 131 | ch_scale = 2 132 | else: 133 | activation = nn.ReLU() 134 | ch_scale = 1 135 | in_channels = audio_channels 136 | for index in range(depth): 137 | encode = [] 138 | encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()] 139 | if rewrite: 140 | encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation] 141 | self.encoder.append(nn.Sequential(*encode)) 142 | 143 | decode = [] 144 | if index > 0: 145 | out_channels = in_channels 146 | else: 147 | out_channels = len(self.sources) * audio_channels 148 | if rewrite: 149 | decode += [ 150 | nn.Conv1d(channels, ch_scale * channels, context), 151 | activation, 152 | ] 153 | decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)] 154 | if index > 0: 155 | decode.append(nn.ReLU()) 156 | self.decoder.insert(0, nn.Sequential(*decode)) 157 | in_channels = channels 158 | channels = int(growth * channels) 159 | 160 | channels = in_channels 161 | 162 | if lstm_layers: 163 | self.lstm = BLSTM(channels, lstm_layers) 164 | else: 165 | self.lstm = None 166 | 167 | if rescale: 168 | rescale_module(self, reference=rescale) 169 | 170 | def valid_length(self, length): 171 | """ 172 | Return the nearest valid length to use with the model so that 173 | there is no time steps left over in a convolutions, e.g. for all 174 | layers, size of the input - kernel_size % stride = 0. 175 | 176 | If the mixture has a valid length, the estimated sources 177 | will have exactly the same length when context = 1. If context > 1, 178 | the two signals can be center trimmed to match. 179 | 180 | For training, extracts should have a valid length.For evaluation 181 | on full tracks we recommend passing `pad = True` to :method:`forward`. 
182 | """ 183 | if self.resample: 184 | length *= 2 185 | for _ in range(self.depth): 186 | length = math.ceil((length - self.kernel_size) / self.stride) + 1 187 | length = max(1, length) 188 | length += self.context - 1 189 | for _ in range(self.depth): 190 | length = (length - 1) * self.stride + self.kernel_size 191 | 192 | if self.resample: 193 | length = math.ceil(length / 2) 194 | return int(length) 195 | 196 | def forward(self, mix): 197 | x = mix 198 | 199 | if self.normalize: 200 | mono = mix.mean(dim=1, keepdim=True) 201 | mean = mono.mean(dim=-1, keepdim=True) 202 | std = mono.std(dim=-1, keepdim=True) 203 | else: 204 | mean = 0 205 | std = 1 206 | 207 | x = (x - mean) / (1e-5 + std) 208 | 209 | if self.resample: 210 | x = julius.resample_frac(x, 1, 2) 211 | 212 | saved = [] 213 | for encode in self.encoder: 214 | x = encode(x) 215 | saved.append(x) 216 | if self.lstm: 217 | x = self.lstm(x) 218 | for decode in self.decoder: 219 | skip = center_trim(saved.pop(-1), x) 220 | x = x + skip 221 | x = decode(x) 222 | 223 | if self.resample: 224 | x = julius.resample_frac(x, 2, 1) 225 | x = x * std + mean 226 | x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1)) 227 | return x 228 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/pretrained.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Loading pretrained models. 7 | """ 8 | 9 | import logging 10 | from pathlib import Path 11 | import typing as tp 12 | 13 | # from dora.log import fatal 14 | 15 | import logging 16 | 17 | from diffq import DiffQuantizer 18 | import torch.hub 19 | 20 | from .model import Demucs 21 | from .tasnet_v2 import ConvTasNet 22 | from .utils import set_state 23 | 24 | from .hdemucs import HDemucs 25 | from .repo import ( 26 | RemoteRepo, 27 | LocalRepo, 28 | ModelOnlyRepo, 29 | BagOnlyRepo, 30 | AnyModelRepo, 31 | ModelLoadingError, 32 | ) # noqa 33 | 34 | logger = logging.getLogger(__name__) 35 | ROOT_URL = "https://dl.fbaipublicfiles.com/demucs/mdx_final/" 36 | REMOTE_ROOT = Path(__file__).parent / "remote" 37 | 38 | SOURCES = ["drums", "bass", "other", "vocals"] 39 | 40 | 41 | def demucs_unittest(): 42 | model = HDemucs(channels=4, sources=SOURCES) 43 | return model 44 | 45 | 46 | def add_model_flags(parser): 47 | group = parser.add_mutually_exclusive_group(required=False) 48 | group.add_argument("-s", "--sig", help="Locally trained XP signature.") 49 | group.add_argument( 50 | "-n", 51 | "--name", 52 | default="mdx_extra_q", 53 | help="Pretrained model name or signature. 
Default is mdx_extra_q.", 54 | ) 55 | parser.add_argument( 56 | "--repo", 57 | type=Path, 58 | help="Folder containing all pre-trained models for use with -n.", 59 | ) 60 | 61 | 62 | def _parse_remote_files(remote_file_list) -> tp.Dict[str, str]: 63 | root: str = "" 64 | models: tp.Dict[str, str] = {} 65 | for line in remote_file_list.read_text().split("\n"): 66 | line = line.strip() 67 | if line.startswith("#"): 68 | continue 69 | elif line.startswith("root:"): 70 | root = line.split(":", 1)[1].strip() 71 | else: 72 | sig = line.split("-", 1)[0] 73 | assert sig not in models 74 | models[sig] = ROOT_URL + root + line 75 | return models 76 | 77 | 78 | def get_model(name: str, repo: tp.Optional[Path] = None): 79 | """`name` must be a bag of models name or a pretrained signature 80 | from the remote AWS model repo or the specified local repo if `repo` is not None. 81 | """ 82 | if name == "demucs_unittest": 83 | return demucs_unittest() 84 | model_repo: ModelOnlyRepo 85 | if repo is None: 86 | models = _parse_remote_files(REMOTE_ROOT / "files.txt") 87 | model_repo = RemoteRepo(models) 88 | bag_repo = BagOnlyRepo(REMOTE_ROOT, model_repo) 89 | else: 90 | if not repo.is_dir(): 91 | fatal(f"{repo} must exist and be a directory.") 92 | model_repo = LocalRepo(repo) 93 | bag_repo = BagOnlyRepo(repo, model_repo) 94 | any_repo = AnyModelRepo(model_repo, bag_repo) 95 | model = any_repo.get_model(name) 96 | model.eval() 97 | return model 98 | 99 | 100 | def get_model_from_args(args): 101 | """ 102 | Load local model package or pre-trained model. 103 | """ 104 | return get_model(name=args.name, repo=args.repo) 105 | 106 | 107 | logger = logging.getLogger(__name__) 108 | ROOT = "https://dl.fbaipublicfiles.com/demucs/v3.0/" 109 | 110 | PRETRAINED_MODELS = { 111 | "demucs": "e07c671f", 112 | "demucs48_hq": "28a1282c", 113 | "demucs_extra": "3646af93", 114 | "demucs_quantized": "07afea75", 115 | "tasnet": "beb46fac", 116 | "tasnet_extra": "df3777b2", 117 | "demucs_unittest": "09ebc15f", 118 | } 119 | 120 | SOURCES = ["drums", "bass", "other", "vocals"] 121 | 122 | 123 | def get_url(name): 124 | sig = PRETRAINED_MODELS[name] 125 | return ROOT + name + "-" + sig[:8] + ".th" 126 | 127 | 128 | def is_pretrained(name): 129 | return name in PRETRAINED_MODELS 130 | 131 | 132 | def load_pretrained(name): 133 | if name == "demucs": 134 | return demucs(pretrained=True) 135 | elif name == "demucs48_hq": 136 | return demucs(pretrained=True, hq=True, channels=48) 137 | elif name == "demucs_extra": 138 | return demucs(pretrained=True, extra=True) 139 | elif name == "demucs_quantized": 140 | return demucs(pretrained=True, quantized=True) 141 | elif name == "demucs_unittest": 142 | return demucs_unittest(pretrained=True) 143 | elif name == "tasnet": 144 | return tasnet(pretrained=True) 145 | elif name == "tasnet_extra": 146 | return tasnet(pretrained=True, extra=True) 147 | else: 148 | raise ValueError(f"Invalid pretrained name {name}") 149 | 150 | 151 | def _load_state(name, model, quantizer=None): 152 | url = get_url(name) 153 | state = torch.hub.load_state_dict_from_url(url, map_location="cpu", check_hash=True) 154 | set_state(model, quantizer, state) 155 | if quantizer: 156 | quantizer.detach() 157 | 158 | 159 | def demucs_unittest(pretrained=True): 160 | model = Demucs(channels=4, sources=SOURCES) 161 | if pretrained: 162 | _load_state("demucs_unittest", model) 163 | return model 164 | 165 | 166 | def demucs(pretrained=True, extra=False, quantized=False, hq=False, channels=64): 167 | if not pretrained and (extra or 
quantized or hq): 168 | raise ValueError("if extra or quantized is True, pretrained must be True.") 169 | model = Demucs(sources=SOURCES, channels=channels) 170 | if pretrained: 171 | name = "demucs" 172 | if channels != 64: 173 | name += str(channels) 174 | quantizer = None 175 | if sum([extra, quantized, hq]) > 1: 176 | raise ValueError("Only one of extra, quantized, hq, can be True.") 177 | if quantized: 178 | quantizer = DiffQuantizer(model, group_size=8, min_size=1) 179 | name += "_quantized" 180 | if extra: 181 | name += "_extra" 182 | if hq: 183 | name += "_hq" 184 | _load_state(name, model, quantizer) 185 | return model 186 | 187 | 188 | def tasnet(pretrained=True, extra=False): 189 | if not pretrained and extra: 190 | raise ValueError("if extra is True, pretrained must be True.") 191 | model = ConvTasNet(X=10, sources=SOURCES) 192 | if pretrained: 193 | name = "tasnet" 194 | if extra: 195 | name = "tasnet_extra" 196 | _load_state(name, model) 197 | return model 198 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/repo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Represents a model repository, including pre-trained models and bags of models. 7 | A repo can either be the main remote repository stored in AWS, or a local repository 8 | with your own models. 9 | """ 10 | 11 | from hashlib import sha256 12 | from pathlib import Path 13 | import typing as tp 14 | 15 | import torch 16 | import yaml 17 | 18 | from .apply import BagOfModels, Model 19 | from .states import load_model 20 | 21 | 22 | AnyModel = tp.Union[Model, BagOfModels] 23 | 24 | 25 | class ModelLoadingError(RuntimeError): 26 | pass 27 | 28 | 29 | def check_checksum(path: Path, checksum: str): 30 | sha = sha256() 31 | with open(path, "rb") as file: 32 | while True: 33 | buf = file.read(2**20) 34 | if not buf: 35 | break 36 | sha.update(buf) 37 | actual_checksum = sha.hexdigest()[: len(checksum)] 38 | if actual_checksum != checksum: 39 | raise ModelLoadingError( 40 | f"Invalid checksum for file {path}, " 41 | f"expected {checksum} but got {actual_checksum}" 42 | ) 43 | 44 | 45 | class ModelOnlyRepo: 46 | """Base class for all model only repos.""" 47 | 48 | def has_model(self, sig: str) -> bool: 49 | raise NotImplementedError() 50 | 51 | def get_model(self, sig: str) -> Model: 52 | raise NotImplementedError() 53 | 54 | 55 | class RemoteRepo(ModelOnlyRepo): 56 | def __init__(self, models: tp.Dict[str, str]): 57 | self._models = models 58 | 59 | def has_model(self, sig: str) -> bool: 60 | return sig in self._models 61 | 62 | def get_model(self, sig: str) -> Model: 63 | try: 64 | url = self._models[sig] 65 | except KeyError: 66 | raise ModelLoadingError( 67 | f"Could not find a pre-trained model with signature {sig}." 
68 | ) 69 | pkg = torch.hub.load_state_dict_from_url( 70 | url, map_location="cpu", check_hash=True 71 | ) 72 | return load_model(pkg) 73 | 74 | 75 | class LocalRepo(ModelOnlyRepo): 76 | def __init__(self, root: Path): 77 | self.root = root 78 | self.scan() 79 | 80 | def scan(self): 81 | self._models = {} 82 | self._checksums = {} 83 | for file in self.root.iterdir(): 84 | if file.suffix == ".th": 85 | if "-" in file.stem: 86 | xp_sig, checksum = file.stem.split("-") 87 | self._checksums[xp_sig] = checksum 88 | else: 89 | xp_sig = file.stem 90 | if xp_sig in self._models: 91 | print("Whats xp? ", xp_sig) 92 | raise ModelLoadingError( 93 | f"Duplicate pre-trained model exist for signature {xp_sig}. " 94 | "Please delete all but one." 95 | ) 96 | self._models[xp_sig] = file 97 | 98 | def has_model(self, sig: str) -> bool: 99 | return sig in self._models 100 | 101 | def get_model(self, sig: str) -> Model: 102 | try: 103 | file = self._models[sig] 104 | except KeyError: 105 | raise ModelLoadingError( 106 | f"Could not find pre-trained model with signature {sig}." 107 | ) 108 | if sig in self._checksums: 109 | check_checksum(file, self._checksums[sig]) 110 | return load_model(file) 111 | 112 | 113 | class BagOnlyRepo: 114 | """Handles only YAML files containing bag of models, leaving the actual 115 | model loading to some Repo. 116 | """ 117 | 118 | def __init__(self, root: Path, model_repo: ModelOnlyRepo): 119 | self.root = root 120 | self.model_repo = model_repo 121 | self.scan() 122 | 123 | def scan(self): 124 | self._bags = {} 125 | for file in self.root.iterdir(): 126 | if file.suffix == ".yaml": 127 | self._bags[file.stem] = file 128 | 129 | def has_model(self, name: str) -> bool: 130 | return name in self._bags 131 | 132 | def get_model(self, name: str) -> BagOfModels: 133 | try: 134 | yaml_file = self._bags[name] 135 | except KeyError: 136 | raise ModelLoadingError( 137 | f"{name} is neither a single pre-trained model or " "a bag of models." 138 | ) 139 | bag = yaml.safe_load(open(yaml_file)) 140 | signatures = bag["models"] 141 | models = [self.model_repo.get_model(sig) for sig in signatures] 142 | weights = bag.get("weights") 143 | segment = bag.get("segment") 144 | return BagOfModels(models, weights, segment) 145 | 146 | 147 | class AnyModelRepo: 148 | def __init__(self, model_repo: ModelOnlyRepo, bag_repo: BagOnlyRepo): 149 | self.model_repo = model_repo 150 | self.bag_repo = bag_repo 151 | 152 | def has_model(self, name_or_sig: str) -> bool: 153 | return self.model_repo.has_model(name_or_sig) or self.bag_repo.has_model( 154 | name_or_sig 155 | ) 156 | 157 | def get_model(self, name_or_sig: str) -> AnyModel: 158 | # print('name_or_sig: ', name_or_sig) 159 | if self.model_repo.has_model(name_or_sig): 160 | return self.model_repo.get_model(name_or_sig) 161 | else: 162 | return self.bag_repo.get_model(name_or_sig) 163 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/spec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | """Conveniance wrapper to perform STFT and iSTFT""" 7 | 8 | import torch as th 9 | 10 | 11 | def spectro(x, n_fft=512, hop_length=None, pad=0): 12 | *other, length = x.shape 13 | x = x.reshape(-1, length) 14 | 15 | device_type = x.device.type 16 | is_other_gpu = not device_type in ["cuda", "cpu"] 17 | 18 | if is_other_gpu: 19 | x = x.cpu() 20 | z = th.stft( 21 | x, 22 | n_fft * (1 + pad), 23 | hop_length or n_fft // 4, 24 | window=th.hann_window(n_fft).to(x), 25 | win_length=n_fft, 26 | normalized=True, 27 | center=True, 28 | return_complex=True, 29 | pad_mode="reflect", 30 | ) 31 | _, freqs, frame = z.shape 32 | return z.view(*other, freqs, frame) 33 | 34 | 35 | def ispectro(z, hop_length=None, length=None, pad=0): 36 | *other, freqs, frames = z.shape 37 | n_fft = 2 * freqs - 2 38 | z = z.view(-1, freqs, frames) 39 | win_length = n_fft // (1 + pad) 40 | 41 | device_type = z.device.type 42 | is_other_gpu = not device_type in ["cuda", "cpu"] 43 | 44 | if is_other_gpu: 45 | z = z.cpu() 46 | x = th.istft( 47 | z, 48 | n_fft, 49 | hop_length, 50 | window=th.hann_window(win_length).to(z.real), 51 | win_length=win_length, 52 | normalized=True, 53 | length=length, 54 | center=True, 55 | ) 56 | _, length = x.shape 57 | return x.view(*other, length) 58 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/states.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Utilities to save and load models. 8 | """ 9 | from contextlib import contextmanager 10 | 11 | import functools 12 | import hashlib 13 | import inspect 14 | import io 15 | from pathlib import Path 16 | import warnings 17 | 18 | from diffq import DiffQuantizer, UniformQuantizer, restore_quantized_state 19 | import torch 20 | 21 | 22 | def get_quantizer(model, args, optimizer=None): 23 | """Return the quantizer given the XP quantization args.""" 24 | quantizer = None 25 | if args.diffq: 26 | quantizer = DiffQuantizer( 27 | model, min_size=args.min_size, group_size=args.group_size 28 | ) 29 | if optimizer is not None: 30 | quantizer.setup_optimizer(optimizer) 31 | elif args.qat: 32 | quantizer = UniformQuantizer(model, bits=args.qat, min_size=args.min_size) 33 | return quantizer 34 | 35 | 36 | def load_model(path_or_package, strict=False): 37 | """Load a model from the given serialized model, either given as a dict (already loaded) 38 | or a path to a file on disk.""" 39 | if isinstance(path_or_package, dict): 40 | package = path_or_package 41 | elif isinstance(path_or_package, (str, Path)): 42 | with warnings.catch_warnings(): 43 | warnings.simplefilter("ignore") 44 | path = path_or_package 45 | package = torch.load(path, "cpu") 46 | else: 47 | raise ValueError(f"Invalid type for {path_or_package}.") 48 | 49 | klass = package["klass"] 50 | args = package["args"] 51 | kwargs = package["kwargs"] 52 | 53 | if strict: 54 | model = klass(*args, **kwargs) 55 | else: 56 | sig = inspect.signature(klass) 57 | for key in list(kwargs): 58 | if key not in sig.parameters: 59 | warnings.warn("Dropping inexistant parameter " + key) 60 | del kwargs[key] 61 | model = klass(*args, **kwargs) 62 | 63 | state = package["state"] 64 | 65 | set_state(model, state) 66 | return model 67 | 68 | 69 | def get_state(model, quantizer, half=False): 70 | 
"""Get the state from a model, potentially with quantization applied. 71 | If `half` is True, model are stored as half precision, which shouldn't impact performance 72 | but half the state size.""" 73 | if quantizer is None: 74 | dtype = torch.half if half else None 75 | state = { 76 | k: p.data.to(device="cpu", dtype=dtype) 77 | for k, p in model.state_dict().items() 78 | } 79 | else: 80 | state = quantizer.get_quantized_state() 81 | state["__quantized"] = True 82 | return state 83 | 84 | 85 | def set_state(model, state, quantizer=None): 86 | """Set the state on a given model.""" 87 | if state.get("__quantized"): 88 | if quantizer is not None: 89 | quantizer.restore_quantized_state(model, state["quantized"]) 90 | else: 91 | restore_quantized_state(model, state) 92 | else: 93 | model.load_state_dict(state) 94 | return state 95 | 96 | 97 | def save_with_checksum(content, path): 98 | """Save the given value on disk, along with a sha256 hash. 99 | Should be used with the output of either `serialize_model` or `get_state`.""" 100 | buf = io.BytesIO() 101 | torch.save(content, buf) 102 | sig = hashlib.sha256(buf.getvalue()).hexdigest()[:8] 103 | 104 | path = path.parent / (path.stem + "-" + sig + path.suffix) 105 | path.write_bytes(buf.getvalue()) 106 | 107 | 108 | def copy_state(state): 109 | return {k: v.cpu().clone() for k, v in state.items()} 110 | 111 | 112 | @contextmanager 113 | def swap_state(model, state): 114 | """ 115 | Context manager that swaps the state of a model, e.g: 116 | 117 | # model is in old state 118 | with swap_state(model, new_state): 119 | # model in new state 120 | # model back to old state 121 | """ 122 | old_state = copy_state(model.state_dict()) 123 | model.load_state_dict(state, strict=False) 124 | try: 125 | yield 126 | finally: 127 | model.load_state_dict(old_state) 128 | 129 | 130 | def capture_init(init): 131 | @functools.wraps(init) 132 | def __init__(self, *args, **kwargs): 133 | self._init_args_kwargs = (args, kwargs) 134 | init(self, *args, **kwargs) 135 | 136 | return __init__ 137 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/mdxnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .modules import TFC_TDF 4 | from pytorch_lightning import LightningModule 5 | 6 | dim_s = 4 7 | 8 | 9 | class AbstractMDXNet(LightningModule): 10 | def __init__( 11 | self, 12 | target_name, 13 | lr, 14 | optimizer, 15 | dim_c, 16 | dim_f, 17 | dim_t, 18 | n_fft, 19 | hop_length, 20 | overlap, 21 | ): 22 | super().__init__() 23 | self.target_name = target_name 24 | self.lr = lr 25 | self.optimizer = optimizer 26 | self.dim_c = dim_c 27 | self.dim_f = dim_f 28 | self.dim_t = dim_t 29 | self.n_fft = n_fft 30 | self.n_bins = n_fft // 2 + 1 31 | self.hop_length = hop_length 32 | self.window = nn.Parameter( 33 | torch.hann_window(window_length=self.n_fft, periodic=True), 34 | requires_grad=False, 35 | ) 36 | self.freq_pad = nn.Parameter( 37 | torch.zeros([1, dim_c, self.n_bins - self.dim_f, self.dim_t]), 38 | requires_grad=False, 39 | ) 40 | 41 | def get_optimizer(self): 42 | if self.optimizer == "rmsprop": 43 | return torch.optim.RMSprop(self.parameters(), self.lr) 44 | 45 | if self.optimizer == "adamw": 46 | return torch.optim.AdamW(self.parameters(), self.lr) 47 | 48 | 49 | class ConvTDFNet(AbstractMDXNet): 50 | def __init__( 51 | self, 52 | target_name, 53 | lr, 54 | optimizer, 55 | dim_c, 56 | dim_f, 57 | dim_t, 58 | n_fft, 59 | 
hop_length, 60 | num_blocks, 61 | l, 62 | g, 63 | k, 64 | bn, 65 | bias, 66 | overlap, 67 | ): 68 | 69 | super(ConvTDFNet, self).__init__( 70 | target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap 71 | ) 72 | # self.save_hyperparameters() 73 | 74 | self.num_blocks = num_blocks 75 | self.l = l 76 | self.g = g 77 | self.k = k 78 | self.bn = bn 79 | self.bias = bias 80 | 81 | if optimizer == "rmsprop": 82 | norm = nn.BatchNorm2d 83 | 84 | if optimizer == "adamw": 85 | norm = lambda input: nn.GroupNorm(2, input) 86 | 87 | self.n = num_blocks // 2 88 | scale = (2, 2) 89 | 90 | self.first_conv = nn.Sequential( 91 | nn.Conv2d(in_channels=self.dim_c, out_channels=g, kernel_size=(1, 1)), 92 | norm(g), 93 | nn.ReLU(), 94 | ) 95 | 96 | f = self.dim_f 97 | c = g 98 | self.encoding_blocks = nn.ModuleList() 99 | self.ds = nn.ModuleList() 100 | for i in range(self.n): 101 | self.encoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)) 102 | self.ds.append( 103 | nn.Sequential( 104 | nn.Conv2d( 105 | in_channels=c, 106 | out_channels=c + g, 107 | kernel_size=scale, 108 | stride=scale, 109 | ), 110 | norm(c + g), 111 | nn.ReLU(), 112 | ) 113 | ) 114 | f = f // 2 115 | c += g 116 | 117 | self.bottleneck_block = TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm) 118 | 119 | self.decoding_blocks = nn.ModuleList() 120 | self.us = nn.ModuleList() 121 | for i in range(self.n): 122 | self.us.append( 123 | nn.Sequential( 124 | nn.ConvTranspose2d( 125 | in_channels=c, 126 | out_channels=c - g, 127 | kernel_size=scale, 128 | stride=scale, 129 | ), 130 | norm(c - g), 131 | nn.ReLU(), 132 | ) 133 | ) 134 | f = f * 2 135 | c -= g 136 | 137 | self.decoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)) 138 | 139 | self.final_conv = nn.Sequential( 140 | nn.Conv2d(in_channels=c, out_channels=self.dim_c, kernel_size=(1, 1)), 141 | ) 142 | 143 | def forward(self, x): 144 | 145 | x = self.first_conv(x) 146 | 147 | x = x.transpose(-1, -2) 148 | 149 | ds_outputs = [] 150 | for i in range(self.n): 151 | x = self.encoding_blocks[i](x) 152 | ds_outputs.append(x) 153 | x = self.ds[i](x) 154 | 155 | x = self.bottleneck_block(x) 156 | 157 | for i in range(self.n): 158 | x = self.us[i](x) 159 | x *= ds_outputs[-i - 1] 160 | x = self.decoding_blocks[i](x) 161 | 162 | x = x.transpose(-1, -2) 163 | 164 | x = self.final_conv(x) 165 | 166 | return x 167 | 168 | 169 | class Mixer(nn.Module): 170 | def __init__(self, device, mixer_path): 171 | 172 | super(Mixer, self).__init__() 173 | 174 | self.linear = nn.Linear((dim_s + 1) * 2, dim_s * 2, bias=False) 175 | 176 | self.load_state_dict(torch.load(mixer_path, map_location=device)) 177 | 178 | def forward(self, x): 179 | x = x.reshape(1, (dim_s + 1) * 2, -1).transpose(-1, -2) 180 | x = self.linear(x) 181 | return x.transpose(-1, -2).reshape(dim_s, 2, -1) 182 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/mixer.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/UVR/uvr/uvr_lib_v5/mixer.ckpt -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class TFC(nn.Module): 6 | def __init__(self, c, l, k, norm): 7 | super(TFC, self).__init__() 8 | 9 | self.H = nn.ModuleList() 10 
| for i in range(l): 11 | self.H.append( 12 | nn.Sequential( 13 | nn.Conv2d( 14 | in_channels=c, 15 | out_channels=c, 16 | kernel_size=k, 17 | stride=1, 18 | padding=k // 2, 19 | ), 20 | norm(c), 21 | nn.ReLU(), 22 | ) 23 | ) 24 | 25 | def forward(self, x): 26 | for h in self.H: 27 | x = h(x) 28 | return x 29 | 30 | 31 | class DenseTFC(nn.Module): 32 | def __init__(self, c, l, k, norm): 33 | super(DenseTFC, self).__init__() 34 | 35 | self.conv = nn.ModuleList() 36 | for i in range(l): 37 | self.conv.append( 38 | nn.Sequential( 39 | nn.Conv2d( 40 | in_channels=c, 41 | out_channels=c, 42 | kernel_size=k, 43 | stride=1, 44 | padding=k // 2, 45 | ), 46 | norm(c), 47 | nn.ReLU(), 48 | ) 49 | ) 50 | 51 | def forward(self, x): 52 | for layer in self.conv[:-1]: 53 | x = torch.cat([layer(x), x], 1) 54 | return self.conv[-1](x) 55 | 56 | 57 | class TFC_TDF(nn.Module): 58 | def __init__(self, c, l, f, k, bn, dense=False, bias=True, norm=nn.BatchNorm2d): 59 | 60 | super(TFC_TDF, self).__init__() 61 | 62 | self.use_tdf = bn is not None 63 | 64 | self.tfc = DenseTFC(c, l, k, norm) if dense else TFC(c, l, k, norm) 65 | 66 | if self.use_tdf: 67 | if bn == 0: 68 | self.tdf = nn.Sequential(nn.Linear(f, f, bias=bias), norm(c), nn.ReLU()) 69 | else: 70 | self.tdf = nn.Sequential( 71 | nn.Linear(f, f // bn, bias=bias), 72 | norm(c), 73 | nn.ReLU(), 74 | nn.Linear(f // bn, f, bias=bias), 75 | norm(c), 76 | nn.ReLU(), 77 | ) 78 | 79 | def forward(self, x): 80 | x = self.tfc(x) 81 | return x + self.tdf(x) if self.use_tdf else x 82 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/playsound.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | 6 | class PlaysoundException(Exception): 7 | pass 8 | 9 | 10 | def _canonicalizePath(path): 11 | """ 12 | Support passing in a pathlib.Path-like object by converting to str. 13 | """ 14 | import sys 15 | 16 | if sys.version_info[0] >= 3: 17 | return str(path) 18 | else: 19 | # On earlier Python versions, str is a byte string, so attempting to 20 | # convert a unicode string to str will fail. Leave it alone in this case. 21 | return path 22 | 23 | 24 | def _playsoundWin(sound, block=True): 25 | """ 26 | Utilizes windll.winmm. Tested and known to work with MP3 and WAVE on 27 | Windows 7 with Python 2.7. Probably works with more file formats. 28 | Probably works on Windows XP thru Windows 10. Probably works with all 29 | versions of Python. 30 | 31 | Inspired by (but not copied from) Michael Gundlach 's mp3play: 32 | https://github.com/michaelgundlach/mp3play 33 | 34 | I never would have tried using windll.winmm without seeing his code. 
35 | """ 36 | sound = '"' + _canonicalizePath(sound) + '"' 37 | 38 | from ctypes import create_unicode_buffer, windll, wintypes 39 | from time import sleep 40 | 41 | windll.winmm.mciSendStringW.argtypes = [ 42 | wintypes.LPCWSTR, 43 | wintypes.LPWSTR, 44 | wintypes.UINT, 45 | wintypes.HANDLE, 46 | ] 47 | windll.winmm.mciGetErrorStringW.argtypes = [ 48 | wintypes.DWORD, 49 | wintypes.LPWSTR, 50 | wintypes.UINT, 51 | ] 52 | 53 | def winCommand(*command): 54 | bufLen = 600 55 | buf = create_unicode_buffer(bufLen) 56 | command = " ".join(command) 57 | errorCode = int( 58 | windll.winmm.mciSendStringW(command, buf, bufLen - 1, 0) 59 | ) # use widestring version of the function 60 | if errorCode: 61 | errorBuffer = create_unicode_buffer(bufLen) 62 | windll.winmm.mciGetErrorStringW( 63 | errorCode, errorBuffer, bufLen - 1 64 | ) # use widestring version of the function 65 | exceptionMessage = ( 66 | "\n Error " + str(errorCode) + " for command:" 67 | "\n " + command + "\n " + errorBuffer.value 68 | ) 69 | logger.error(exceptionMessage) 70 | raise PlaysoundException(exceptionMessage) 71 | return buf.value 72 | 73 | try: 74 | logger.debug("Starting") 75 | winCommand("open {}".format(sound)) 76 | winCommand("play {}{}".format(sound, " wait" if block else "")) 77 | logger.debug("Returning") 78 | finally: 79 | try: 80 | winCommand("close {}".format(sound)) 81 | except PlaysoundException: 82 | logger.warning("Failed to close the file: {}".format(sound)) 83 | # If it fails, there's nothing more that can be done... 84 | pass 85 | 86 | 87 | def _handlePathOSX(sound): 88 | sound = _canonicalizePath(sound) 89 | 90 | if "://" not in sound: 91 | if not sound.startswith("/"): 92 | from os import getcwd 93 | 94 | sound = getcwd() + "/" + sound 95 | sound = "file://" + sound 96 | 97 | try: 98 | # Don't double-encode it. 99 | sound.encode("ascii") 100 | return sound.replace(" ", "%20") 101 | except UnicodeEncodeError: 102 | try: 103 | from urllib.parse import quote # Try the Python 3 import first... 104 | except ImportError: 105 | from urllib import ( 106 | quote, 107 | ) # Try using the Python 2 import before giving up entirely... 108 | 109 | parts = sound.split("://", 1) 110 | return parts[0] + "://" + quote(parts[1].encode("utf-8")).replace(" ", "%20") 111 | 112 | 113 | def _playsoundOSX(sound, block=True): 114 | """ 115 | Utilizes AppKit.NSSound. Tested and known to work with MP3 and WAVE on 116 | OS X 10.11 with Python 2.7. Probably works with anything QuickTime supports. 117 | Probably works on OS X 10.5 and newer. Probably works with all versions of 118 | Python. 119 | 120 | Inspired by (but not copied from) Aaron's Stack Overflow answer here: 121 | http://stackoverflow.com/a/34568298/901641 122 | 123 | I never would have tried using AppKit.NSSound without seeing his code. 124 | """ 125 | try: 126 | from AppKit import NSSound 127 | except ImportError: 128 | logger.warning( 129 | "playsound could not find a copy of AppKit - falling back to using macOS's system copy." 
130 | ) 131 | sys.path.append( 132 | "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC" 133 | ) 134 | from AppKit import NSSound 135 | 136 | from Foundation import NSURL 137 | from time import sleep 138 | 139 | sound = _handlePathOSX(sound) 140 | url = NSURL.URLWithString_(sound) 141 | if not url: 142 | raise PlaysoundException("Cannot find a sound with filename: " + sound) 143 | 144 | for i in range(5): 145 | nssound = NSSound.alloc().initWithContentsOfURL_byReference_(url, True) 146 | if nssound: 147 | break 148 | else: 149 | logger.debug("Failed to load sound, although url was good... " + sound) 150 | else: 151 | raise PlaysoundException( 152 | "Could not load sound with filename, although URL was good... " + sound 153 | ) 154 | nssound.play() 155 | 156 | if block: 157 | sleep(nssound.duration()) 158 | 159 | 160 | def _playsoundNix(sound, block=True): 161 | """Play a sound using GStreamer. 162 | 163 | Inspired by this: 164 | https://gstreamer.freedesktop.org/documentation/tutorials/playback/playbin-usage.html 165 | """ 166 | sound = _canonicalizePath(sound) 167 | 168 | # pathname2url escapes non-URL-safe characters 169 | from os.path import abspath, exists 170 | 171 | try: 172 | from urllib.request import pathname2url 173 | except ImportError: 174 | # python 2 175 | from urllib import pathname2url 176 | 177 | import gi 178 | 179 | gi.require_version("Gst", "1.0") 180 | from gi.repository import Gst 181 | 182 | Gst.init(None) 183 | 184 | playbin = Gst.ElementFactory.make("playbin", "playbin") 185 | if sound.startswith(("http://", "https://")): 186 | playbin.props.uri = sound 187 | else: 188 | path = abspath(sound) 189 | if not exists(path): 190 | raise PlaysoundException("File not found: {}".format(path)) 191 | playbin.props.uri = "file://" + pathname2url(path) 192 | 193 | set_result = playbin.set_state(Gst.State.PLAYING) 194 | if set_result != Gst.StateChangeReturn.ASYNC: 195 | raise PlaysoundException("playbin.set_state returned " + repr(set_result)) 196 | 197 | # FIXME: use some other bus method than poll() with block=False 198 | # https://lazka.github.io/pgi-docs/#Gst-1.0/classes/Bus.html 199 | logger.debug("Starting play") 200 | if block: 201 | bus = playbin.get_bus() 202 | try: 203 | bus.poll(Gst.MessageType.EOS, Gst.CLOCK_TIME_NONE) 204 | finally: 205 | playbin.set_state(Gst.State.NULL) 206 | 207 | logger.debug("Finishing play") 208 | 209 | 210 | def _playsoundAnotherPython(otherPython, sound, block=True, macOS=False): 211 | """ 212 | Mostly written so that when this is run on python3 on macOS, it can invoke 213 | python2 on macOS... but maybe this idea could be useful on linux, too. 214 | """ 215 | from inspect import getsourcefile 216 | from os.path import abspath, exists 217 | from subprocess import check_call 218 | from threading import Thread 219 | 220 | sound = _canonicalizePath(sound) 221 | 222 | class PropogatingThread(Thread): 223 | def run(self): 224 | self.exc = None 225 | try: 226 | self.ret = self._target(*self._args, **self._kwargs) 227 | except BaseException as e: 228 | self.exc = e 229 | 230 | def join(self, timeout=None): 231 | super().join(timeout) 232 | if self.exc: 233 | raise self.exc 234 | return self.ret 235 | 236 | # Check if the file exists... 
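# Doing the check here, in the parent interpreter, raises a clear PlaysoundException up front instead of surfacing later as a non-zero exit code from the spawned python process.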
237 | if not exists(abspath(sound)): 238 | raise PlaysoundException("Cannot find a sound with filename: " + sound) 239 | 240 | playsoundPath = abspath(getsourcefile(lambda: 0)) 241 | t = PropogatingThread( 242 | target=lambda: check_call( 243 | [otherPython, playsoundPath, _handlePathOSX(sound) if macOS else sound] 244 | ) 245 | ) 246 | t.start() 247 | if block: 248 | t.join() 249 | 250 | 251 | from platform import system 252 | 253 | system = system() 254 | 255 | if system == "Windows": 256 | playsound_func = _playsoundWin 257 | elif system == "Darwin": 258 | playsound_func = _playsoundOSX 259 | import sys 260 | 261 | if sys.version_info[0] > 2: 262 | try: 263 | from AppKit import NSSound 264 | except ImportError: 265 | logger.warning( 266 | "playsound is relying on a python 2 subprocess. Please use `pip3 install PyObjC` if you want playsound to run more efficiently." 267 | ) 268 | playsound_func = lambda sound, block=True: _playsoundAnotherPython( 269 | "/System/Library/Frameworks/Python.framework/Versions/2.7/bin/python", 270 | sound, 271 | block, 272 | macOS=True, 273 | ) 274 | else: 275 | playsound_func = _playsoundNix 276 | if ( 277 | __name__ != "__main__" 278 | ): # Ensure we don't infinitely recurse trying to get another python instance. 279 | try: 280 | import gi 281 | 282 | gi.require_version("Gst", "1.0") 283 | from gi.repository import Gst 284 | except: 285 | logger.warning( 286 | "playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently." 287 | ) 288 | playsound_func = lambda sound, block=True: _playsoundAnotherPython( 289 | "/usr/bin/python3", sound, block, macOS=False 290 | ) 291 | 292 | del system 293 | 294 | 295 | def play(audio_filepath): 296 | playsound_func(audio_filepath) 297 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/pyrb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import tempfile 4 | import six 5 | import numpy as np 6 | import soundfile as sf 7 | import sys 8 | 9 | if getattr(sys, "frozen", False): 10 | BASE_PATH_RUB = sys._MEIPASS 11 | else: 12 | BASE_PATH_RUB = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | __all__ = ["time_stretch", "pitch_shift"] 15 | 16 | __RUBBERBAND_UTIL = os.path.join(BASE_PATH_RUB, "rubberband") 17 | 18 | if six.PY2: 19 | DEVNULL = open(os.devnull, "w") 20 | else: 21 | DEVNULL = subprocess.DEVNULL 22 | 23 | 24 | def __rubberband(y, sr, **kwargs): 25 | 26 | assert sr > 0 27 | 28 | # Get the input and output tempfile 29 | fd, infile = tempfile.mkstemp(suffix=".wav") 30 | os.close(fd) 31 | fd, outfile = tempfile.mkstemp(suffix=".wav") 32 | os.close(fd) 33 | 34 | # dump the audio 35 | sf.write(infile, y, sr) 36 | 37 | try: 38 | # Execute rubberband 39 | arguments = [__RUBBERBAND_UTIL, "-q"] 40 | 41 | for key, value in six.iteritems(kwargs): 42 | arguments.append(str(key)) 43 | arguments.append(str(value)) 44 | 45 | arguments.extend([infile, outfile]) 46 | 47 | subprocess.check_call(arguments, stdout=DEVNULL, stderr=DEVNULL) 48 | 49 | # Load the processed audio. 50 | y_out, _ = sf.read(outfile, always_2d=True) 51 | 52 | # make sure that output dimensions matches input 53 | if y.ndim == 1: 54 | y_out = np.squeeze(y_out) 55 | 56 | except OSError as exc: 57 | six.raise_from( 58 | RuntimeError( 59 | "Failed to execute rubberband. " 60 | "Please verify that rubberband-cli " 61 | "is installed." 
62 | ), 63 | exc, 64 | ) 65 | 66 | finally: 67 | # Remove temp files 68 | os.unlink(infile) 69 | os.unlink(outfile) 70 | 71 | return y_out 72 | 73 | 74 | def time_stretch(y, sr, rate, rbargs=None): 75 | if rate <= 0: 76 | raise ValueError("rate must be strictly positive") 77 | 78 | if rate == 1.0: 79 | return y 80 | 81 | if rbargs is None: 82 | rbargs = dict() 83 | 84 | rbargs.setdefault("--tempo", rate) 85 | 86 | return __rubberband(y, sr, **rbargs) 87 | 88 | 89 | def pitch_shift(y, sr, n_steps, rbargs=None): 90 | 91 | if n_steps == 0: 92 | return y 93 | 94 | if rbargs is None: 95 | rbargs = dict() 96 | 97 | rbargs.setdefault("--pitch", n_steps) 98 | 99 | return __rubberband(y, sr, **rbargs) 100 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/results.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Matchering - Audio Matching and Mastering Python Library 5 | Copyright (C) 2016-2022 Sergree 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 19 | """ 20 | 21 | import os 22 | import soundfile as sf 23 | 24 | 25 | class Result: 26 | def __init__( 27 | self, file: str, subtype: str, use_limiter: bool = True, normalize: bool = True 28 | ): 29 | _, file_ext = os.path.splitext(file) 30 | file_ext = file_ext[1:].upper() 31 | if not sf.check_format(file_ext): 32 | raise TypeError(f"{file_ext} format is not supported") 33 | if not sf.check_format(file_ext, subtype): 34 | raise TypeError(f"{file_ext} format does not have {subtype} subtype") 35 | self.file = file 36 | self.subtype = subtype 37 | self.use_limiter = use_limiter 38 | self.normalize = normalize 39 | 40 | 41 | def pcm16(file: str) -> Result: 42 | return Result(file, "PCM_16") 43 | 44 | 45 | def pcm24(file: str) -> Result: 46 | return Result(file, "FLOAT") 47 | 48 | 49 | def save_audiofile(file: str, wav_set="PCM_16") -> Result: 50 | return Result(file, wav_set) 51 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/stft.py: -------------------------------------------------------------------------------- 1 | import torch 2 | cpu_device=torch.device("cpu") 3 | 4 | class STFT: 5 | """ 6 | This class performs the Short-Time Fourier Transform (STFT) and its inverse (ISTFT). 7 | These functions are essential for converting the audio between the time domain and the frequency domain, 8 | which is a crucial aspect of audio processing in neural networks. 9 | """ 10 | 11 | def __init__(self, logger, n_fft, hop_length, dim_f, device): 12 | self.logger = logger 13 | self.n_fft = n_fft 14 | self.hop_length = hop_length 15 | self.dim_f = dim_f 16 | self.device = device 17 | # Create a Hann window tensor for use in the STFT. 
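# periodic=True produces the DFT-periodic form of the window conventionally used with torch.stft; the window is built once here and moved to the input tensor's device on each call.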
18 | self.hann_window = torch.hann_window(window_length=self.n_fft, periodic=True) 19 | 20 | def __call__(self, input_tensor): 21 | # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA). 22 | is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"] 23 | 24 | # If on a non-standard device, temporarily move the tensor to CPU for processing. 25 | if is_non_standard_device: 26 | input_tensor = input_tensor.cpu() 27 | 28 | # Transfer the pre-defined window tensor to the same device as the input tensor. 29 | stft_window = self.hann_window.to(input_tensor.device) 30 | 31 | # Extract batch dimensions (all dimensions except the last two which are channel and time). 32 | batch_dimensions = input_tensor.shape[:-2] 33 | 34 | # Extract channel and time dimensions (last two dimensions of the tensor). 35 | channel_dim, time_dim = input_tensor.shape[-2:] 36 | 37 | # Reshape the tensor to merge batch and channel dimensions for STFT processing. 38 | reshaped_tensor = input_tensor.reshape([-1, time_dim]) 39 | 40 | # Perform the Short-Time Fourier Transform (STFT) on the reshaped tensor. 41 | source_device = reshaped_tensor.device 42 | if reshaped_tensor.device.type == "cuda" and torch.cuda.get_device_name(reshaped_tensor.device.index).endswith("[ZLUDA]"): 43 | reshaped_tensor = reshaped_tensor.to("cpu") 44 | stft_window = stft_window.to("cpu") 45 | 46 | stft_output = torch.stft( 47 | reshaped_tensor, 48 | n_fft=self.n_fft, 49 | hop_length=self.hop_length, 50 | window=stft_window, 51 | center=True, 52 | return_complex=False, 53 | ).to(source_device) 54 | 55 | # Rearrange the dimensions of the STFT output to bring the frequency dimension forward. 56 | permuted_stft_output = stft_output.permute([0, 3, 1, 2]) 57 | 58 | # Reshape the output to restore the original batch and channel dimensions, while keeping the newly formed frequency and time dimensions. 59 | final_output = permuted_stft_output.reshape( 60 | [*batch_dimensions, channel_dim, 2, -1, permuted_stft_output.shape[-1]] 61 | ).reshape( 62 | [*batch_dimensions, channel_dim * 2, -1, permuted_stft_output.shape[-1]] 63 | ) 64 | 65 | # If the original tensor was on a non-standard device, move the processed tensor back to that device. 66 | if is_non_standard_device: 67 | final_output = final_output.to(self.device) 68 | 69 | # Return the transformed tensor, sliced to retain only the required frequency dimension (`dim_f`). 70 | return final_output[..., : self.dim_f, :] 71 | 72 | def pad_frequency_dimension( 73 | self, 74 | input_tensor, 75 | batch_dimensions, 76 | channel_dim, 77 | freq_dim, 78 | time_dim, 79 | num_freq_bins, 80 | ): 81 | """ 82 | Adds zero padding to the frequency dimension of the input tensor. 83 | """ 84 | # Create a padding tensor for the frequency dimension 85 | freq_padding = torch.zeros( 86 | [*batch_dimensions, channel_dim, num_freq_bins - freq_dim, time_dim] 87 | ).to(input_tensor.device) 88 | 89 | # Concatenate the padding to the input tensor along the frequency dimension. 90 | padded_tensor = torch.cat([input_tensor, freq_padding], -2) 91 | 92 | return padded_tensor 93 | 94 | def calculate_inverse_dimensions(self, input_tensor): 95 | # Extract batch dimensions and frequency-time dimensions. 96 | batch_dimensions = input_tensor.shape[:-3] 97 | channel_dim, freq_dim, time_dim = input_tensor.shape[-3:] 98 | 99 | # Calculate the number of frequency bins for the inverse STFT. 
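# (one-sided spectrum: for example, n_fft = 2048 gives 2048 // 2 + 1 = 1025 bins)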
100 | num_freq_bins = self.n_fft // 2 + 1 101 | 102 | return batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins 103 | 104 | def prepare_for_istft( 105 | self, padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim 106 | ): 107 | """ 108 | Prepares the tensor for Inverse Short-Time Fourier Transform (ISTFT) by reshaping 109 | and creating a complex tensor from the real and imaginary parts. 110 | """ 111 | # Reshape the tensor to separate real and imaginary parts and prepare for ISTFT. 112 | reshaped_tensor = padded_tensor.reshape( 113 | [*batch_dimensions, channel_dim // 2, 2, num_freq_bins, time_dim] 114 | ) 115 | 116 | # Flatten batch dimensions and rearrange for ISTFT. 117 | flattened_tensor = reshaped_tensor.reshape([-1, 2, num_freq_bins, time_dim]) 118 | 119 | # Rearrange the dimensions of the tensor to bring the frequency dimension forward. 120 | permuted_tensor = flattened_tensor.permute([0, 2, 3, 1]) 121 | 122 | # Combine real and imaginary parts into a complex tensor. 123 | complex_tensor = permuted_tensor[..., 0] + permuted_tensor[..., 1] * 1.0j 124 | 125 | return complex_tensor 126 | 127 | def inverse(self, input_tensor): 128 | # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA). 129 | is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"] 130 | 131 | # If on a non-standard device, temporarily move the tensor to CPU for processing. 132 | if is_non_standard_device: 133 | input_tensor = input_tensor.cpu() 134 | 135 | # Transfer the pre-defined Hann window tensor to the same device as the input tensor. 136 | stft_window = self.hann_window.to(input_tensor.device) 137 | 138 | batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins = ( 139 | self.calculate_inverse_dimensions(input_tensor) 140 | ) 141 | 142 | padded_tensor = self.pad_frequency_dimension( 143 | input_tensor, 144 | batch_dimensions, 145 | channel_dim, 146 | freq_dim, 147 | time_dim, 148 | num_freq_bins, 149 | ) 150 | 151 | complex_tensor = self.prepare_for_istft( 152 | padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim 153 | ) 154 | 155 | # Perform the Inverse Short-Time Fourier Transform (ISTFT). 156 | source_device = complex_tensor.device 157 | if complex_tensor.device.type == "cuda" and torch.cuda.get_device_name(complex_tensor.device.index).endswith("[ZLUDA]"): 158 | complex_tensor = complex_tensor.to("cpu") 159 | stft_window = stft_window.to(cpu_device) 160 | 161 | istft_result = torch.istft( 162 | complex_tensor, 163 | n_fft=self.n_fft, 164 | hop_length=self.hop_length, 165 | window=stft_window, 166 | center=True, 167 | ).to(source_device) 168 | 169 | # Reshape ISTFT result to restore original batch and channel dimensions. 170 | final_output = istft_result.reshape([*batch_dimensions, 2, -1]) 171 | 172 | # If the original tensor was on a non-standard device, move the processed tensor back to that device. 
173 | if is_non_standard_device: 174 | final_output = final_output.to(self.device) 175 | 176 | return final_output 177 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/tfc_tdf_v3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial 4 | 5 | 6 | class STFT: 7 | def __init__(self, n_fft, hop_length, dim_f, device): 8 | self.n_fft = n_fft 9 | self.hop_length = hop_length 10 | self.window = torch.hann_window(window_length=self.n_fft, periodic=True) 11 | self.dim_f = dim_f 12 | self.device = device 13 | 14 | def __call__(self, x): 15 | 16 | x_is_mps = not x.device.type in ["cuda", "cpu"] 17 | if x_is_mps: 18 | x = x.cpu() 19 | 20 | window = self.window.to(x.device) 21 | batch_dims = x.shape[:-2] 22 | c, t = x.shape[-2:] 23 | x = x.reshape([-1, t]) 24 | x = torch.stft( 25 | x, 26 | n_fft=self.n_fft, 27 | hop_length=self.hop_length, 28 | window=window, 29 | center=True, 30 | return_complex=False, 31 | ) 32 | x = x.permute([0, 3, 1, 2]) 33 | x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape( 34 | [*batch_dims, c * 2, -1, x.shape[-1]] 35 | ) 36 | 37 | if x_is_mps: 38 | x = x.to(self.device) 39 | 40 | return x[..., : self.dim_f, :] 41 | 42 | def inverse(self, x): 43 | 44 | x_is_mps = not x.device.type in ["cuda", "cpu"] 45 | if x_is_mps: 46 | x = x.cpu() 47 | 48 | window = self.window.to(x.device) 49 | batch_dims = x.shape[:-3] 50 | c, f, t = x.shape[-3:] 51 | n = self.n_fft // 2 + 1 52 | f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device) 53 | x = torch.cat([x, f_pad], -2) 54 | x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t]) 55 | x = x.permute([0, 2, 3, 1]) 56 | x = x[..., 0] + x[..., 1] * 1.0j 57 | x = torch.istft( 58 | x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True 59 | ) 60 | x = x.reshape([*batch_dims, 2, -1]) 61 | 62 | if x_is_mps: 63 | x = x.to(self.device) 64 | 65 | return x 66 | 67 | 68 | def get_norm(norm_type): 69 | def norm(c, norm_type): 70 | if norm_type == "BatchNorm": 71 | return nn.BatchNorm2d(c) 72 | elif norm_type == "InstanceNorm": 73 | return nn.InstanceNorm2d(c, affine=True) 74 | elif "GroupNorm" in norm_type: 75 | g = int(norm_type.replace("GroupNorm", "")) 76 | return nn.GroupNorm(num_groups=g, num_channels=c) 77 | else: 78 | return nn.Identity() 79 | 80 | return partial(norm, norm_type=norm_type) 81 | 82 | 83 | def get_act(act_type): 84 | if act_type == "gelu": 85 | return nn.GELU() 86 | elif act_type == "relu": 87 | return nn.ReLU() 88 | elif act_type[:3] == "elu": 89 | alpha = float(act_type.replace("elu", "")) 90 | return nn.ELU(alpha) 91 | else: 92 | raise Exception 93 | 94 | 95 | class Upscale(nn.Module): 96 | def __init__(self, in_c, out_c, scale, norm, act): 97 | super().__init__() 98 | self.conv = nn.Sequential( 99 | norm(in_c), 100 | act, 101 | nn.ConvTranspose2d( 102 | in_channels=in_c, 103 | out_channels=out_c, 104 | kernel_size=scale, 105 | stride=scale, 106 | bias=False, 107 | ), 108 | ) 109 | 110 | def forward(self, x): 111 | return self.conv(x) 112 | 113 | 114 | class Downscale(nn.Module): 115 | def __init__(self, in_c, out_c, scale, norm, act): 116 | super().__init__() 117 | self.conv = nn.Sequential( 118 | norm(in_c), 119 | act, 120 | nn.Conv2d( 121 | in_channels=in_c, 122 | out_channels=out_c, 123 | kernel_size=scale, 124 | stride=scale, 125 | bias=False, 126 | ), 127 | ) 128 | 129 | def forward(self, x): 130 | return self.conv(x) 
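# --- illustrative sketch, not part of the original file ----------------------
# How the factories and scale blocks above fit together. The shapes and
# hyper-parameters below are assumptions for illustration only; `torch` is
# already imported at the top of this module. With scale=(2, 2) a Downscale
# halves the frequency and time axes and the matching Upscale restores them.
#
#     norm_example = get_norm("GroupNorm4")   # partial -> nn.GroupNorm(4, channels)
#     act_example = get_act("gelu")
#     down = Downscale(in_c=32, out_c=64, scale=(2, 2), norm=norm_example, act=act_example)
#     up = Upscale(in_c=64, out_c=32, scale=(2, 2), norm=norm_example, act=act_example)
#     x = torch.randn(1, 32, 128, 256)         # (batch, channels, freq, time)
#     assert up(down(x)).shape == x.shape      # halved to (1, 64, 64, 128), then restored
# ------------------------------------------------------------------------------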
131 | 132 | 133 | class TFC_TDF(nn.Module): 134 | def __init__(self, in_c, c, l, f, bn, norm, act): 135 | super().__init__() 136 | 137 | self.blocks = nn.ModuleList() 138 | for i in range(l): 139 | block = nn.Module() 140 | 141 | block.tfc1 = nn.Sequential( 142 | norm(in_c), 143 | act, 144 | nn.Conv2d(in_c, c, 3, 1, 1, bias=False), 145 | ) 146 | block.tdf = nn.Sequential( 147 | norm(c), 148 | act, 149 | nn.Linear(f, f // bn, bias=False), 150 | norm(c), 151 | act, 152 | nn.Linear(f // bn, f, bias=False), 153 | ) 154 | block.tfc2 = nn.Sequential( 155 | norm(c), 156 | act, 157 | nn.Conv2d(c, c, 3, 1, 1, bias=False), 158 | ) 159 | block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False) 160 | 161 | self.blocks.append(block) 162 | in_c = c 163 | 164 | def forward(self, x): 165 | for block in self.blocks: 166 | s = block.shortcut(x) 167 | x = block.tfc1(x) 168 | x = x + block.tdf(x) 169 | x = block.tfc2(x) 170 | x = x + s 171 | return x 172 | 173 | 174 | class TFC_TDF_net(nn.Module): 175 | def __init__(self, config, device): 176 | super().__init__() 177 | self.config = config 178 | self.device = device 179 | 180 | norm = get_norm(norm_type=config.model.norm) 181 | act = get_act(act_type=config.model.act) 182 | 183 | self.num_target_instruments = ( 184 | 1 if config.training.target_instrument else len(config.training.instruments) 185 | ) 186 | self.num_subbands = config.model.num_subbands 187 | 188 | dim_c = self.num_subbands * config.audio.num_channels * 2 189 | n = config.model.num_scales 190 | scale = config.model.scale 191 | l = config.model.num_blocks_per_scale 192 | c = config.model.num_channels 193 | g = config.model.growth 194 | bn = config.model.bottleneck_factor 195 | f = config.audio.dim_f // self.num_subbands 196 | 197 | self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False) 198 | 199 | self.encoder_blocks = nn.ModuleList() 200 | for i in range(n): 201 | block = nn.Module() 202 | block.tfc_tdf = TFC_TDF(c, c, l, f, bn, norm, act) 203 | block.downscale = Downscale(c, c + g, scale, norm, act) 204 | f = f // scale[1] 205 | c += g 206 | self.encoder_blocks.append(block) 207 | 208 | self.bottleneck_block = TFC_TDF(c, c, l, f, bn, norm, act) 209 | 210 | self.decoder_blocks = nn.ModuleList() 211 | for i in range(n): 212 | block = nn.Module() 213 | block.upscale = Upscale(c, c - g, scale, norm, act) 214 | f = f * scale[1] 215 | c -= g 216 | block.tfc_tdf = TFC_TDF(2 * c, c, l, f, bn, norm, act) 217 | self.decoder_blocks.append(block) 218 | 219 | self.final_conv = nn.Sequential( 220 | nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False), 221 | act, 222 | nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False), 223 | ) 224 | 225 | self.stft = STFT( 226 | config.audio.n_fft, config.audio.hop_length, config.audio.dim_f, self.device 227 | ) 228 | 229 | def cac2cws(self, x): 230 | k = self.num_subbands 231 | b, c, f, t = x.shape 232 | x = x.reshape(b, c, k, f // k, t) 233 | x = x.reshape(b, c * k, f // k, t) 234 | return x 235 | 236 | def cws2cac(self, x): 237 | k = self.num_subbands 238 | b, c, f, t = x.shape 239 | x = x.reshape(b, c // k, k, f, t) 240 | x = x.reshape(b, c // k, f * k, t) 241 | return x 242 | 243 | def forward(self, x): 244 | 245 | x = self.stft(x) 246 | 247 | mix = x = self.cac2cws(x) 248 | 249 | first_conv_out = x = self.first_conv(x) 250 | 251 | x = x.transpose(-1, -2) 252 | 253 | encoder_outputs = [] 254 | for block in self.encoder_blocks: 255 | x = block.tfc_tdf(x) 256 | encoder_outputs.append(x) 257 | x = block.downscale(x) 258 | 259 | x = self.bottleneck_block(x) 
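# Decoder path mirrors the encoder: each step upsamples, concatenates the matching skip connection (encoder_outputs.pop()), then refines it with a TFC_TDF block, U-Net style.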
260 | 261 | for block in self.decoder_blocks: 262 | x = block.upscale(x) 263 | x = torch.cat([x, encoder_outputs.pop()], 1) 264 | x = block.tfc_tdf(x) 265 | 266 | x = x.transpose(-1, -2) 267 | 268 | x = x * first_conv_out # reduce artifacts 269 | 270 | x = self.final_conv(torch.cat([mix, x], 1)) 271 | 272 | x = self.cws2cac(x) 273 | 274 | if self.num_target_instruments > 1: 275 | b, c, f, t = x.shape 276 | x = x.reshape(b, self.num_target_instruments, -1, f, t) 277 | 278 | x = self.stft.inverse(x) 279 | 280 | return x 281 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/__init__.py: -------------------------------------------------------------------------------- 1 | # VR init. 2 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5 import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | """ 10 | Conv2DBNActiv Class: 11 | This class implements a convolutional layer followed by batch normalization and an activation function. 12 | It is a fundamental building block for constructing neural networks, especially useful in image and audio processing tasks. 13 | The class encapsulates the pattern of applying a convolution, normalizing the output, and then applying a non-linear activation. 14 | """ 15 | 16 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 17 | super(Conv2DBNActiv, self).__init__() 18 | 19 | # Sequential model combining Conv2D, BatchNorm, and activation function into a single module 20 | self.conv = nn.Sequential( 21 | nn.Conv2d( 22 | nin, 23 | nout, 24 | kernel_size=ksize, 25 | stride=stride, 26 | padding=pad, 27 | dilation=dilation, 28 | bias=False, 29 | ), 30 | nn.BatchNorm2d(nout), 31 | activ(), 32 | ) 33 | 34 | def __call__(self, input_tensor): 35 | # Forward pass through the sequential model 36 | return self.conv(input_tensor) 37 | 38 | 39 | class Encoder(nn.Module): 40 | """ 41 | Encoder Class: 42 | This class defines an encoder module typically used in autoencoder architectures. 43 | It consists of two convolutional layers, each followed by batch normalization and an activation function. 44 | """ 45 | 46 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 47 | super(Encoder, self).__init__() 48 | 49 | # First convolutional layer of the encoder 50 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 51 | # Second convolutional layer of the encoder 52 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 53 | 54 | def __call__(self, input_tensor): 55 | # Applying the first and then the second convolutional layers 56 | hidden = self.conv1(input_tensor) 57 | hidden = self.conv2(hidden) 58 | 59 | return hidden 60 | 61 | 62 | class Decoder(nn.Module): 63 | """ 64 | Decoder Class: 65 | This class defines a decoder module, which is the counterpart of the Encoder class in autoencoder architectures. 66 | It applies a convolutional layer followed by batch normalization and an activation function, with an optional dropout layer for regularization. 
67 | """ 68 | 69 | def __init__( 70 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 71 | ): 72 | super(Decoder, self).__init__() 73 | # Convolutional layer with optional dropout for regularization 74 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 75 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 76 | self.dropout = nn.Dropout2d(0.1) if dropout else None 77 | 78 | def __call__(self, input_tensor, skip=None): 79 | # Forward pass through the convolutional layer and optional dropout 80 | input_tensor = F.interpolate( 81 | input_tensor, scale_factor=2, mode="bilinear", align_corners=True 82 | ) 83 | 84 | if skip is not None: 85 | skip = spec_utils.crop_center(skip, input_tensor) 86 | input_tensor = torch.cat([input_tensor, skip], dim=1) 87 | 88 | hidden = self.conv1(input_tensor) 89 | # hidden = self.conv2(hidden) 90 | 91 | if self.dropout is not None: 92 | hidden = self.dropout(hidden) 93 | 94 | return hidden 95 | 96 | 97 | class ASPPModule(nn.Module): 98 | """ 99 | ASPPModule Class: 100 | This class implements the Atrous Spatial Pyramid Pooling (ASPP) module, which is useful for semantic image segmentation tasks. 101 | It captures multi-scale contextual information by applying convolutions at multiple dilation rates. 102 | """ 103 | 104 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 105 | super(ASPPModule, self).__init__() 106 | 107 | # Global context convolution captures the overall context 108 | self.conv1 = nn.Sequential( 109 | nn.AdaptiveAvgPool2d((1, None)), 110 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 111 | ) 112 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 113 | self.conv3 = Conv2DBNActiv( 114 | nin, nout, 3, 1, dilations[0], dilations[0], activ=activ 115 | ) 116 | self.conv4 = Conv2DBNActiv( 117 | nin, nout, 3, 1, dilations[1], dilations[1], activ=activ 118 | ) 119 | self.conv5 = Conv2DBNActiv( 120 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ 121 | ) 122 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 123 | self.dropout = nn.Dropout2d(0.1) if dropout else None 124 | 125 | def forward(self, input_tensor): 126 | _, _, h, w = input_tensor.size() 127 | 128 | # Upsample global context to match input size and combine with local and multi-scale features 129 | feat1 = F.interpolate( 130 | self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True 131 | ) 132 | feat2 = self.conv2(input_tensor) 133 | feat3 = self.conv3(input_tensor) 134 | feat4 = self.conv4(input_tensor) 135 | feat5 = self.conv5(input_tensor) 136 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 137 | out = self.bottleneck(out) 138 | 139 | if self.dropout is not None: 140 | out = self.dropout(out) 141 | 142 | return out 143 | 144 | 145 | class LSTMModule(nn.Module): 146 | """ 147 | LSTMModule Class: 148 | This class defines a module that combines convolutional feature extraction with a bidirectional LSTM for sequence modeling. 149 | It is useful for tasks that require understanding temporal dynamics in data, such as speech and audio processing. 
150 | """ 151 | 152 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 153 | super(LSTMModule, self).__init__() 154 | # Convolutional layer for initial feature extraction 155 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 156 | 157 | # Bidirectional LSTM for capturing temporal dynamics 158 | self.lstm = nn.LSTM( 159 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True 160 | ) 161 | 162 | # Dense layer for output dimensionality matching 163 | self.dense = nn.Sequential( 164 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() 165 | ) 166 | 167 | def forward(self, input_tensor): 168 | N, _, nbins, nframes = input_tensor.size() 169 | 170 | # Extract features and prepare for LSTM 171 | hidden = self.conv(input_tensor)[:, 0] # N, nbins, nframes 172 | hidden = hidden.permute(2, 0, 1) # nframes, N, nbins 173 | hidden, _ = self.lstm(hidden) 174 | 175 | # Apply dense layer and reshape to match expected output format 176 | hidden = self.dense(hidden.reshape(-1, hidden.size()[-1])) # nframes * N, nbins 177 | hidden = hidden.reshape(nframes, N, 1, nbins) 178 | hidden = hidden.permute(1, 2, 3, 0) 179 | 180 | return hidden 181 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | default_param = {} 4 | default_param["bins"] = -1 5 | default_param["unstable_bins"] = -1 # training only 6 | default_param["stable_bins"] = -1 # training only 7 | default_param["sr"] = 44100 8 | default_param["pre_filter_start"] = -1 9 | default_param["pre_filter_stop"] = -1 10 | default_param["band"] = {} 11 | 12 | N_BINS = "n_bins" 13 | 14 | 15 | def int_keys(d): 16 | """ 17 | Converts string keys that represent integers into actual integer keys in a list. 18 | 19 | This function is particularly useful when dealing with JSON data that may represent 20 | integer keys as strings due to the nature of JSON encoding. By converting these keys 21 | back to integers, it ensures that the data can be used in a manner consistent with 22 | its original representation, especially in contexts where the distinction between 23 | string and integer keys is important. 24 | 25 | Args: 26 | input_list (list of tuples): A list of (key, value) pairs where keys are strings 27 | that may represent integers. 28 | 29 | Returns: 30 | dict: A dictionary with keys converted to integers where applicable. 31 | """ 32 | # Initialize an empty dictionary to hold the converted key-value pairs. 33 | result_dict = {} 34 | # Iterate through each key-value pair in the input list. 35 | for key, value in d: 36 | # Check if the key is a digit (i.e., represents an integer). 37 | if key.isdigit(): 38 | # Convert the key from a string to an integer. 39 | key = int(key) 40 | result_dict[key] = value 41 | return result_dict 42 | 43 | 44 | class ModelParameters(object): 45 | """ 46 | A class to manage model parameters, including loading from a configuration file. 47 | 48 | Attributes: 49 | param (dict): Dictionary holding all parameters for the model. 50 | """ 51 | 52 | def __init__(self, config_path=""): 53 | """ 54 | Initializes the ModelParameters object by loading parameters from a JSON configuration file. 55 | 56 | Args: 57 | config_path (str): Path to the JSON configuration file. 58 | """ 59 | 60 | # Load parameters from the given configuration file path. 
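# object_pairs_hook=int_keys (defined above) converts the numeric band keys ("1", "2", ...) back into integers while leaving ordinary string keys untouched.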
61 | with open(config_path, "r") as f: 62 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 63 | 64 | # Ensure certain parameters are set to False if not specified in the configuration. 65 | for k in [ 66 | "mid_side", 67 | "mid_side_b", 68 | "mid_side_b2", 69 | "stereo_w", 70 | "stereo_n", 71 | "reverse", 72 | ]: 73 | if not k in self.param: 74 | self.param[k] = False 75 | 76 | # If 'n_bins' is specified in the parameters, it's used as the value for 'bins'. 77 | if N_BINS in self.param: 78 | self.param["bins"] = self.param[N_BINS] 79 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- 
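The band-parameter files in this directory are consumed through the `ModelParameters` class from `model_param_init.py` shown above. A minimal, hypothetical sketch of that usage follows; the import path and file location are assumptions and should be adjusted to wherever the package actually lives:

    # Assumed import path and relative file location, for illustration only.
    from uvr.uvr_lib_v5.vr_network.model_param_init import ModelParameters

    mp = ModelParameters("modelparams/1band_sr44100_hl512.json")
    print(mp.param["sr"])               # 44100
    print(mp.param["band"][1]["hl"])    # 512 -- band keys arrive as ints via int_keys
    print(mp.param["mid_side"])         # False -- defaulted by the constructor when absent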
/UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_nf1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 512, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 511, 18 | "pre_filter_stop": 512 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | 
"1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 
639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | 
"res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | 
"hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 
| "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_v3_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_bins": 672, 3 | "unstable_bins": 8, 4 | "stable_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/nets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from . 
import layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | """ 10 | BaseASPPNet Class: 11 | This class defines the base architecture for an Atrous Spatial Pyramid Pooling (ASPP) network. 12 | It is designed to extract features from input data at multiple scales by using dilated convolutions. 13 | This is particularly useful for tasks that benefit from understanding context at different resolutions, 14 | such as semantic segmentation. The network consists of a series of encoder layers for downsampling and feature extraction, 15 | followed by an ASPP module for multi-scale feature extraction, and finally a series of decoder layers for upsampling. 16 | """ 17 | 18 | def __init__(self, nn_architecture, nin, ch, dilations=(4, 8, 16)): 19 | super(BaseASPPNet, self).__init__() 20 | self.nn_architecture = nn_architecture 21 | 22 | # Encoder layers progressively increase the number of channels while reducing spatial dimensions. 23 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 24 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 25 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 26 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 27 | 28 | # Depending on the network architecture, an additional encoder layer and a specific ASPP module are initialized. 29 | if self.nn_architecture == 129605: 30 | self.enc5 = layers.Encoder(ch * 8, ch * 16, 3, 2, 1) 31 | self.aspp = layers.ASPPModule(nn_architecture, ch * 16, ch * 32, dilations) 32 | self.dec5 = layers.Decoder(ch * (16 + 32), ch * 16, 3, 1, 1) 33 | else: 34 | self.aspp = layers.ASPPModule(nn_architecture, ch * 8, ch * 16, dilations) 35 | 36 | # Decoder layers progressively decrease the number of channels while increasing spatial dimensions. 37 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 38 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 39 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 40 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 41 | 42 | def __call__(self, input_tensor): 43 | # The input tensor is passed through a series of encoder layers. 44 | hidden_state, encoder_output1 = self.enc1(input_tensor) 45 | hidden_state, encoder_output2 = self.enc2(hidden_state) 46 | hidden_state, encoder_output3 = self.enc3(hidden_state) 47 | hidden_state, encoder_output4 = self.enc4(hidden_state) 48 | 49 | # Depending on the network architecture, the hidden state is processed by an additional encoder layer and the ASPP module. 50 | if self.nn_architecture == 129605: 51 | hidden_state, encoder_output5 = self.enc5(hidden_state) 52 | hidden_state = self.aspp(hidden_state) 53 | # The decoder layers use skip connections from the encoder layers for better feature integration. 54 | hidden_state = self.dec5(hidden_state, encoder_output5) 55 | else: 56 | hidden_state = self.aspp(hidden_state) 57 | 58 | # The hidden state is further processed by the decoder layers, using skip connections for feature integration. 59 | hidden_state = self.dec4(hidden_state, encoder_output4) 60 | hidden_state = self.dec3(hidden_state, encoder_output3) 61 | hidden_state = self.dec2(hidden_state, encoder_output2) 62 | hidden_state = self.dec1(hidden_state, encoder_output1) 63 | 64 | return hidden_state 65 | 66 | 67 | def determine_model_capacity(n_fft_bins, nn_architecture): 68 | """ 69 | The determine_model_capacity function is designed to select the appropriate model configuration 70 | based on the frequency bins and network architecture. 
It maps specific architectures to predefined 71 | model capacities, which dictate the structure and parameters of the CascadedASPPNet model. 72 | """ 73 | 74 | # Predefined model architectures categorized by their precision level. 75 | sp_model_arch = [31191, 33966, 129605] 76 | hp_model_arch = [123821, 123812] 77 | hp2_model_arch = [537238, 537227] 78 | 79 | # Mapping network architectures to their corresponding model capacity data. 80 | if nn_architecture in sp_model_arch: 81 | model_capacity_data = [ 82 | (2, 16), 83 | (2, 16), 84 | (18, 8, 1, 1, 0), 85 | (8, 16), 86 | (34, 16, 1, 1, 0), 87 | (16, 32), 88 | (32, 2, 1), 89 | (16, 2, 1), 90 | (16, 2, 1), 91 | ] 92 | 93 | if nn_architecture in hp_model_arch: 94 | model_capacity_data = [ 95 | (2, 32), 96 | (2, 32), 97 | (34, 16, 1, 1, 0), 98 | (16, 32), 99 | (66, 32, 1, 1, 0), 100 | (32, 64), 101 | (64, 2, 1), 102 | (32, 2, 1), 103 | (32, 2, 1), 104 | ] 105 | 106 | if nn_architecture in hp2_model_arch: 107 | model_capacity_data = [ 108 | (2, 64), 109 | (2, 64), 110 | (66, 32, 1, 1, 0), 111 | (32, 64), 112 | (130, 64, 1, 1, 0), 113 | (64, 128), 114 | (128, 2, 1), 115 | (64, 2, 1), 116 | (64, 2, 1), 117 | ] 118 | 119 | # Initializing the CascadedASPPNet model with the selected model capacity data. 120 | cascaded = CascadedASPPNet 121 | model = cascaded(n_fft_bins, model_capacity_data, nn_architecture) 122 | 123 | return model 124 | 125 | 126 | class CascadedASPPNet(nn.Module): 127 | """ 128 | CascadedASPPNet Class: 129 | This class implements a cascaded version of the ASPP network, designed for processing audio signals 130 | for tasks such as vocal removal. It consists of multiple stages, each with its own ASPP network, 131 | to process different frequency bands of the input signal. This allows the model to effectively 132 | handle the full spectrum of audio frequencies by focusing on different frequency bands separately. 133 | """ 134 | 135 | def __init__(self, n_fft, model_capacity_data, nn_architecture): 136 | super(CascadedASPPNet, self).__init__() 137 | # The first stage processes the low and high frequency bands separately. 138 | self.stg1_low_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[0]) 139 | self.stg1_high_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[1]) 140 | 141 | # Bridge layers connect different stages of the network. 142 | self.stg2_bridge = layers.Conv2DBNActiv(*model_capacity_data[2]) 143 | self.stg2_full_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[3]) 144 | 145 | self.stg3_bridge = layers.Conv2DBNActiv(*model_capacity_data[4]) 146 | self.stg3_full_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[5]) 147 | 148 | # Output layers for the final mask prediction and auxiliary outputs. 149 | self.out = nn.Conv2d(*model_capacity_data[6], bias=False) 150 | self.aux1_out = nn.Conv2d(*model_capacity_data[7], bias=False) 151 | self.aux2_out = nn.Conv2d(*model_capacity_data[8], bias=False) 152 | 153 | # Parameters for handling the frequency bins of the input signal. 154 | self.max_bin = n_fft // 2 155 | self.output_bin = n_fft // 2 + 1 156 | 157 | self.offset = 128 158 | 159 | def forward(self, input_tensor): 160 | # The forward pass processes the input tensor through each stage of the network, 161 | # combining the outputs of different frequency bands and stages to produce the final mask. 162 | mix = input_tensor.detach() 163 | input_tensor = input_tensor.clone() 164 | 165 | # Preparing the input tensor by selecting the maximum frequency bin. 
166 | input_tensor = input_tensor[:, :, : self.max_bin] 167 | 168 | # Processing the low and high frequency bands separately in the first stage. 169 | bandwidth = input_tensor.size()[2] // 2 170 | aux1 = torch.cat( 171 | [ 172 | self.stg1_low_band_net(input_tensor[:, :, :bandwidth]), 173 | self.stg1_high_band_net(input_tensor[:, :, bandwidth:]), 174 | ], 175 | dim=2, 176 | ) 177 | 178 | # Combining the outputs of the first stage and passing through the second stage. 179 | hidden_state = torch.cat([input_tensor, aux1], dim=1) 180 | aux2 = self.stg2_full_band_net(self.stg2_bridge(hidden_state)) 181 | 182 | # Further processing the combined outputs through the third stage. 183 | hidden_state = torch.cat([input_tensor, aux1, aux2], dim=1) 184 | hidden_state = self.stg3_full_band_net(self.stg3_bridge(hidden_state)) 185 | 186 | # Applying the final output layer to produce the mask. 187 | mask = torch.sigmoid(self.out(hidden_state)) 188 | 189 | # Padding the mask to match the output frequency bin size. 190 | mask = F.pad( 191 | input=mask, 192 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 193 | mode="replicate", 194 | ) 195 | 196 | # During training, auxiliary outputs are also produced and padded accordingly. 197 | if self.training: 198 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 199 | aux1 = F.pad( 200 | input=aux1, 201 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 202 | mode="replicate", 203 | ) 204 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 205 | aux2 = F.pad( 206 | input=aux2, 207 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 208 | mode="replicate", 209 | ) 210 | return mask * mix, aux1 * mix, aux2 * mix 211 | else: 212 | return mask # * mix 213 | 214 | def predict_mask(self, input_tensor): 215 | # This method predicts the mask for the input tensor by calling the forward method 216 | # and applying any necessary padding adjustments. 217 | mask = self.forward(input_tensor) 218 | 219 | # Adjusting the mask by removing padding offsets if present. 220 | if self.offset > 0: 221 | mask = mask[:, :, :, self.offset : -self.offset] 222 | 223 | return mask 224 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/nets_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from . import layers_new as layers 5 | 6 | 7 | class BaseNet(nn.Module): 8 | """ 9 | BaseNet Class: 10 | This class defines the base network architecture for vocal removal. It includes a series of encoders for feature extraction, 11 | an ASPP module for capturing multi-scale context, and a series of decoders for reconstructing the output. Additionally, 12 | it incorporates an LSTM module for capturing temporal dependencies. 13 | """ 14 | 15 | def __init__( 16 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) 17 | ): 18 | super(BaseNet, self).__init__() 19 | # Initialize the encoder layers with increasing output channels for hierarchical feature extraction. 20 | self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1) 21 | self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1) 22 | self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1) 23 | self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1) 24 | self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1) 25 | 26 | # ASPP module for capturing multi-scale features with different dilation rates. 
27 | self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) 28 | 29 | # Decoder layers for upscaling and merging features from different levels of the encoder and ASPP module. 30 | self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) 31 | self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) 32 | self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) 33 | 34 | # LSTM module for capturing temporal dependencies in the sequence of features. 35 | self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm) 36 | self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) 37 | 38 | def __call__(self, input_tensor): 39 | # Sequentially pass the input through the encoder layers. 40 | encoded1 = self.enc1(input_tensor) 41 | encoded2 = self.enc2(encoded1) 42 | encoded3 = self.enc3(encoded2) 43 | encoded4 = self.enc4(encoded3) 44 | encoded5 = self.enc5(encoded4) 45 | 46 | # Pass the deepest encoder output through the ASPP module. 47 | bottleneck = self.aspp(encoded5) 48 | 49 | # Sequentially upscale and merge the features using the decoder layers. 50 | bottleneck = self.dec4(bottleneck, encoded4) 51 | bottleneck = self.dec3(bottleneck, encoded3) 52 | bottleneck = self.dec2(bottleneck, encoded2) 53 | # Concatenate the LSTM module output for temporal feature enhancement. 54 | bottleneck = torch.cat([bottleneck, self.lstm_dec2(bottleneck)], dim=1) 55 | bottleneck = self.dec1(bottleneck, encoded1) 56 | 57 | return bottleneck 58 | 59 | 60 | class CascadedNet(nn.Module): 61 | """ 62 | CascadedNet Class: 63 | This class defines a cascaded network architecture that processes input in multiple stages, each stage focusing on different frequency bands. 64 | It utilizes the BaseNet for processing, and combines outputs from different stages to produce the final mask for vocal removal. 65 | """ 66 | 67 | def __init__(self, n_fft, nn_arch_size=51000, nout=32, nout_lstm=128): 68 | super(CascadedNet, self).__init__() 69 | # Calculate frequency bins based on FFT size. 70 | self.max_bin = n_fft // 2 71 | self.output_bin = n_fft // 2 + 1 72 | self.nin_lstm = self.max_bin // 2 73 | self.offset = 64 74 | # Adjust output channels based on the architecture size. 75 | nout = 64 if nn_arch_size == 218409 else nout 76 | 77 | # print(nout, nout_lstm, n_fft) 78 | 79 | # Initialize the network stages, each focusing on different frequency bands and progressively refining the output. 80 | self.stg1_low_band_net = nn.Sequential( 81 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), 82 | layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), 83 | ) 84 | self.stg1_high_band_net = BaseNet( 85 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 86 | ) 87 | 88 | self.stg2_low_band_net = nn.Sequential( 89 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), 90 | layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), 91 | ) 92 | self.stg2_high_band_net = BaseNet( 93 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 94 | ) 95 | 96 | self.stg3_full_band_net = BaseNet( 97 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm 98 | ) 99 | 100 | # Output layer for generating the final mask. 101 | self.out = nn.Conv2d(nout, 2, 1, bias=False) 102 | # Auxiliary output layer for intermediate supervision during training. 103 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) 104 | 105 | def forward(self, input_tensor): 106 | # Preprocess input tensor to match the maximum frequency bin. 
107 | input_tensor = input_tensor[:, :, : self.max_bin] 108 | 109 | # Split the input into low and high frequency bands. 110 | bandw = input_tensor.size()[2] // 2 111 | l1_in = input_tensor[:, :, :bandw] 112 | h1_in = input_tensor[:, :, bandw:] 113 | 114 | # Process each band through the first stage networks. 115 | l1 = self.stg1_low_band_net(l1_in) 116 | h1 = self.stg1_high_band_net(h1_in) 117 | 118 | # Combine the outputs for auxiliary supervision. 119 | aux1 = torch.cat([l1, h1], dim=2) 120 | 121 | # Prepare inputs for the second stage by concatenating the original and processed bands. 122 | l2_in = torch.cat([l1_in, l1], dim=1) 123 | h2_in = torch.cat([h1_in, h1], dim=1) 124 | 125 | # Process through the second stage networks. 126 | l2 = self.stg2_low_band_net(l2_in) 127 | h2 = self.stg2_high_band_net(h2_in) 128 | 129 | # Combine the outputs for auxiliary supervision. 130 | aux2 = torch.cat([l2, h2], dim=2) 131 | 132 | # Prepare input for the third stage by concatenating all previous outputs with the original input. 133 | f3_in = torch.cat([input_tensor, aux1, aux2], dim=1) 134 | 135 | # Process through the third stage network. 136 | f3 = self.stg3_full_band_net(f3_in) 137 | 138 | # Apply the output layer to generate the final mask and apply sigmoid for normalization. 139 | mask = torch.sigmoid(self.out(f3)) 140 | 141 | # Pad the mask to match the output frequency bin size. 142 | mask = F.pad( 143 | input=mask, 144 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 145 | mode="replicate", 146 | ) 147 | 148 | # During training, generate and pad the auxiliary output for additional supervision. 149 | if self.training: 150 | aux = torch.cat([aux1, aux2], dim=1) 151 | aux = torch.sigmoid(self.aux_out(aux)) 152 | aux = F.pad( 153 | input=aux, 154 | pad=(0, 0, 0, self.output_bin - aux.size()[2]), 155 | mode="replicate", 156 | ) 157 | return mask, aux 158 | else: 159 | return mask 160 | 161 | # Method for predicting the mask given an input tensor. 162 | def predict_mask(self, input_tensor): 163 | mask = self.forward(input_tensor) 164 | 165 | # If an offset is specified, crop the mask to remove edge artifacts. 166 | if self.offset > 0: 167 | mask = mask[:, :, :, self.offset : -self.offset] 168 | assert mask.size()[3] > 0 169 | 170 | return mask 171 | 172 | # Method for applying the predicted mask to the input tensor to obtain the predicted magnitude. 173 | def predict(self, input_tensor): 174 | mask = self.forward(input_tensor) 175 | pred_mag = input_tensor * mask 176 | 177 | # If an offset is specified, crop the predicted magnitude to remove edge artifacts. 178 | if self.offset > 0: 179 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset] 180 | assert pred_mag.size()[3] > 0 181 | 182 | return pred_mag 183 | --------------------------------------------------------------------------------
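The two network files above (`nets.py` and `nets_new.py`) are plain PyTorch modules, so they can be exercised directly. Below is a minimal, hypothetical sketch — not code from this repository — showing roughly how `CascadedNet` could be instantiated and asked for a mask on a dummy spectrogram. The import paths, FFT size, and tensor shape are assumptions inferred from the directory layout and the code above; in practice the separator classes under `UVR/uvr/architectures` (e.g. `vr_separator.py`) appear to handle model construction, parameter selection from the modelparams JSON files, and weight loading.

```python
# Hypothetical usage sketch (assumes the UVR/uvr package directory is on sys.path
# so that the relative imports inside vr_network, such as layers_new, resolve).
import torch

from uvr.uvr_lib_v5.vr_network.nets_new import CascadedNet
from uvr.uvr_lib_v5.vr_network.nets import determine_model_capacity

n_fft = 2048                      # assumed FFT size; real values come from a modelparams JSON
model = CascadedNet(n_fft=n_fft)  # defaults: nn_arch_size=51000, nout=32, nout_lstm=128
model.eval()                      # outside training, forward() returns only the mask

# Dummy magnitude spectrogram: (batch, channels, freq_bins, time_frames).
# The network keeps only the first n_fft // 2 frequency bins internally.
spectrogram = torch.rand(1, 2, n_fft // 2 + 1, 256)

with torch.no_grad():
    mask = model.predict_mask(spectrogram)  # self.offset frames are cropped from each end

print(mask.shape)  # torch.Size([1, 2, 1025, 128]) with the default offset of 64

# The older VR architecture in nets.py is assembled the same way, but through
# determine_model_capacity, which maps an nn_architecture id to preset layer widths.
hp_model = determine_model_capacity(n_fft_bins=2048, nn_architecture=123821)
```

The sketch only illustrates the tensor shapes the networks expect; loading a pretrained `.pth` checkpoint from `models/download_checks.json` and converting audio to and from spectrograms is left to the surrounding UVR plugin code.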