├── Elevenlabs-TTS ├── Elevenlabs-TTS.zip ├── plugin.py └── requirements.txt ├── Example-Plugin ├── plugin.py └── requirements.txt ├── README.md └── UVR ├── UVR.zip ├── models └── download_checks.json ├── plugin.py ├── requirements.txt ├── uvr.py └── uvr ├── __init__.py ├── architectures ├── __init__.py ├── demucs_separator.py ├── mdx_separator.py ├── mdxc_separator.py └── vr_separator.py ├── common_separator.py ├── separator.py └── uvr_lib_v5 ├── __init__.py ├── attend.py ├── bs_roformer.py ├── demucs ├── __init__.py ├── __main__.py ├── apply.py ├── demucs.py ├── filtering.py ├── hdemucs.py ├── htdemucs.py ├── model.py ├── model_v2.py ├── pretrained.py ├── repo.py ├── spec.py ├── states.py ├── tasnet.py ├── tasnet_v2.py ├── transformer.py └── utils.py ├── mdxnet.py ├── mel_band_roformer.py ├── mixer.ckpt ├── modules.py ├── playsound.py ├── pyrb.py ├── results.py ├── spec_utils.py ├── stft.py ├── tfc_tdf_v3.py └── vr_network ├── __init__.py ├── layers.py ├── layers_new.py ├── model_param_init.py ├── modelparams ├── 1band_sr16000_hl512.json ├── 1band_sr32000_hl512.json ├── 1band_sr33075_hl384.json ├── 1band_sr44100_hl1024.json ├── 1band_sr44100_hl256.json ├── 1band_sr44100_hl512.json ├── 1band_sr44100_hl512_cut.json ├── 1band_sr44100_hl512_nf1024.json ├── 2band_32000.json ├── 2band_44100_lofi.json ├── 2band_48000.json ├── 3band_44100.json ├── 3band_44100_mid.json ├── 3band_44100_msb2.json ├── 4band_44100.json ├── 4band_44100_mid.json ├── 4band_44100_msb.json ├── 4band_44100_msb2.json ├── 4band_44100_reverse.json ├── 4band_44100_sw.json ├── 4band_v2.json ├── 4band_v2_sn.json ├── 4band_v3.json ├── 4band_v3_sn.json └── ensemble.json ├── nets.py └── nets_new.py /Elevenlabs-TTS/Elevenlabs-TTS.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/Elevenlabs-TTS/Elevenlabs-TTS.zip -------------------------------------------------------------------------------- /Elevenlabs-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | elevenlabs -------------------------------------------------------------------------------- /Example-Plugin/plugin.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | def applio_plugin(): 4 | gr.Markdown( 5 | value= 6 | "This code snippet introduces an Applio plugin. The heart of the plugin lies in the `def applio_plugin()` function, acting as the interface for the Gradio tab. This function will be brought into the plugins tab later on. It's crucial to maintain the original names of both the function and the `plugin.py` file, as they are integral to the import process. Additionally, there's a requirements file that cannot be relocated or renamed but can be removed if not needed." 7 | ) -------------------------------------------------------------------------------- /Example-Plugin/requirements.txt: -------------------------------------------------------------------------------- 1 | # Example requirements.txt file, this file cannot be relocated or renamed but can be removed if not needed. 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Applio Plugins 2 | 3 | > [!IMPORTANT] 4 | > We've just launched our new plugin system and we're looking for your assistance in creating amazing plugins! 
Below, you'll find all the information you need to get started. 5 | 6 | Welcome to **Applio Plugins**, a repository specifically designed for Applio plugins. 7 | 8 | If you're not familiar with Applio, check it out on our incredible webpage, [applio.org](https://applio.org), or visit our [GitHub repository](https://github.com/IAHispano/Applio). 9 | 10 | ## Instructions 11 | 12 | - The core of the plugin lies in the `def applio_plugin()` function, acting as the interface for the Gradio tab. *This function will be brought into the plugins tab later on.* 13 | - It's crucial to maintain the original names of both the function and the `plugin.py` file, as they are integral to the import process. 14 | 15 | *Additionally, there's a requirements file that cannot be relocated or renamed but can be removed if not needed.* 16 | -------------------------------------------------------------------------------- /UVR/UVR.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/UVR/UVR.zip -------------------------------------------------------------------------------- /UVR/models/download_checks.json: -------------------------------------------------------------------------------- 1 | { 2 | "current_version": "UVR_Patch_10_6_23_4_27", 3 | "current_version_ocl": "UVR_Patch_10_6_23_4_27", 4 | "current_version_mac": "UVR_Patch_10_6_23_4_27", 5 | "current_version_linux": "UVR_Patch_10_6_23_4_27", 6 | "vr_download_list": { 7 | "VR Arch Single Model v5: 1_HP-UVR": "1_HP-UVR.pth", 8 | "VR Arch Single Model v5: 2_HP-UVR": "2_HP-UVR.pth", 9 | "VR Arch Single Model v5: 3_HP-Vocal-UVR": "3_HP-Vocal-UVR.pth", 10 | "VR Arch Single Model v5: 4_HP-Vocal-UVR": "4_HP-Vocal-UVR.pth", 11 | "VR Arch Single Model v5: 5_HP-Karaoke-UVR": "5_HP-Karaoke-UVR.pth", 12 | "VR Arch Single Model v5: 6_HP-Karaoke-UVR": "6_HP-Karaoke-UVR.pth", 13 | "VR Arch Single Model v5: 7_HP2-UVR": "7_HP2-UVR.pth", 14 | "VR Arch Single Model v5: 8_HP2-UVR": "8_HP2-UVR.pth", 15 | "VR Arch Single Model v5: 9_HP2-UVR": "9_HP2-UVR.pth", 16 | "VR Arch Single Model v5: 10_SP-UVR-2B-32000-1": "10_SP-UVR-2B-32000-1.pth", 17 | "VR Arch Single Model v5: 11_SP-UVR-2B-32000-2": "11_SP-UVR-2B-32000-2.pth", 18 | "VR Arch Single Model v5: 12_SP-UVR-3B-44100": "12_SP-UVR-3B-44100.pth", 19 | "VR Arch Single Model v5: 13_SP-UVR-4B-44100-1": "13_SP-UVR-4B-44100-1.pth", 20 | "VR Arch Single Model v5: 14_SP-UVR-4B-44100-2": "14_SP-UVR-4B-44100-2.pth", 21 | "VR Arch Single Model v5: 15_SP-UVR-MID-44100-1": "15_SP-UVR-MID-44100-1.pth", 22 | "VR Arch Single Model v5: 16_SP-UVR-MID-44100-2": "16_SP-UVR-MID-44100-2.pth", 23 | "VR Arch Single Model v5: 17_HP-Wind_Inst-UVR": "17_HP-Wind_Inst-UVR.pth", 24 | "VR Arch Single Model v5: UVR-De-Echo-Aggressive by FoxJoy": "UVR-De-Echo-Aggressive.pth", 25 | "VR Arch Single Model v5: UVR-De-Echo-Normal by FoxJoy": "UVR-De-Echo-Normal.pth", 26 | "VR Arch Single Model v5: UVR-DeEcho-DeReverb by FoxJoy": "UVR-DeEcho-DeReverb.pth", 27 | "VR Arch Single Model v5: UVR-DeNoise-Lite by FoxJoy": "UVR-DeNoise-Lite.pth", 28 | "VR Arch Single Model v5: UVR-DeNoise by FoxJoy": "UVR-DeNoise.pth", 29 | "VR Arch Single Model v5: UVR-BVE-4B_SN-44100-1": "UVR-BVE-4B_SN-44100-1.pth", 30 | "VR Arch Single Model v4: MGM_HIGHEND_v4": "MGM_HIGHEND_v4.pth", 31 | "VR Arch Single Model v4: MGM_LOWEND_A_v4": "MGM_LOWEND_A_v4.pth", 32 | "VR Arch Single Model v4: MGM_LOWEND_B_v4": "MGM_LOWEND_B_v4.pth", 33 | "VR Arch Single Model v4: 
MGM_MAIN_v4": "MGM_MAIN_v4.pth" 34 | }, 35 | 36 | "mdx_download_list": { 37 | "MDX-Net Model: UVR-MDX-NET Inst HQ 1": "UVR-MDX-NET-Inst_HQ_1.onnx", 38 | "MDX-Net Model: UVR-MDX-NET Inst HQ 2": "UVR-MDX-NET-Inst_HQ_2.onnx", 39 | "MDX-Net Model: UVR-MDX-NET Inst HQ 3": "UVR-MDX-NET-Inst_HQ_3.onnx", 40 | "MDX-Net Model: UVR-MDX-NET Inst HQ 4": "UVR-MDX-NET-Inst_HQ_4.onnx", 41 | "MDX-Net Model: UVR-MDX-NET Main": "UVR_MDXNET_Main.onnx", 42 | "MDX-Net Model: UVR-MDX-NET Inst Main": "UVR-MDX-NET-Inst_Main.onnx", 43 | "MDX-Net Model: UVR-MDX-NET 1": "UVR_MDXNET_1_9703.onnx", 44 | "MDX-Net Model: UVR-MDX-NET 2": "UVR_MDXNET_2_9682.onnx", 45 | "MDX-Net Model: UVR-MDX-NET 3": "UVR_MDXNET_3_9662.onnx", 46 | "MDX-Net Model: UVR-MDX-NET Inst 1": "UVR-MDX-NET-Inst_1.onnx", 47 | "MDX-Net Model: UVR-MDX-NET Inst 2": "UVR-MDX-NET-Inst_2.onnx", 48 | "MDX-Net Model: UVR-MDX-NET Inst 3": "UVR-MDX-NET-Inst_3.onnx", 49 | "MDX-Net Model: UVR-MDX-NET Karaoke": "UVR_MDXNET_KARA.onnx", 50 | "MDX-Net Model: UVR-MDX-NET Karaoke 2": "UVR_MDXNET_KARA_2.onnx", 51 | "MDX-Net Model: UVR_MDXNET_9482": "UVR_MDXNET_9482.onnx", 52 | "MDX-Net Model: UVR-MDX-NET Voc FT": "UVR-MDX-NET-Voc_FT.onnx", 53 | "MDX-Net Model: Kim Vocal 1": "Kim_Vocal_1.onnx", 54 | "MDX-Net Model: Kim Vocal 2": "Kim_Vocal_2.onnx", 55 | "MDX-Net Model: Kim Inst": "Kim_Inst.onnx", 56 | "MDX-Net Model: Reverb HQ By FoxJoy": "Reverb_HQ_By_FoxJoy.onnx", 57 | "MDX-Net Model: UVR-MDX-NET Crowd HQ 1 By Aufr33": "UVR-MDX-NET_Crowd_HQ_1.onnx", 58 | "MDX-Net Model: kuielab_a_vocals": "kuielab_a_vocals.onnx", 59 | "MDX-Net Model: kuielab_a_other": "kuielab_a_other.onnx", 60 | "MDX-Net Model: kuielab_a_bass": "kuielab_a_bass.onnx", 61 | "MDX-Net Model: kuielab_a_drums": "kuielab_a_drums.onnx", 62 | "MDX-Net Model: kuielab_b_vocals": "kuielab_b_vocals.onnx", 63 | "MDX-Net Model: kuielab_b_other": "kuielab_b_other.onnx", 64 | "MDX-Net Model: kuielab_b_bass": "kuielab_b_bass.onnx", 65 | "MDX-Net Model: kuielab_b_drums": "kuielab_b_drums.onnx" 66 | }, 67 | 68 | "demucs_download_list":{ 69 | 70 | "Demucs v4: htdemucs_ft":{ 71 | "f7e0c4bc-ba3fe64a.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th", 72 | "d12395a8-e57c48e6.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th", 73 | "92cfc3b6-ef3bcb9c.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th", 74 | "04573f0d-f3cf25b2.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th", 75 | "htdemucs_ft.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_ft.yaml" 76 | }, 77 | 78 | "Demucs v4: htdemucs":{ 79 | "955717e8-8726e21a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th", 80 | "htdemucs.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs.yaml" 81 | }, 82 | 83 | "Demucs v4: hdemucs_mmi":{ 84 | "75fc33f5-1941ce65.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/75fc33f5-1941ce65.th", 85 | "hdemucs_mmi.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/hdemucs_mmi.yaml" 86 | }, 87 | "Demucs v4: htdemucs_6s":{ 88 | "5c90dfd2-34c22ccb.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/5c90dfd2-34c22ccb.th", 89 | "htdemucs_6s.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_6s.yaml" 90 | }, 91 | "Demucs v3: mdx":{ 92 | "0d19c1c6-0f06f20e.th": 
"https://dl.fbaipublicfiles.com/demucs/mdx_final/0d19c1c6-0f06f20e.th", 93 | "7ecf8ec1-70f50cc9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7ecf8ec1-70f50cc9.th", 94 | "c511e2ab-fe698775.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/c511e2ab-fe698775.th", 95 | "7d865c68-3d5dd56b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7d865c68-3d5dd56b.th", 96 | "mdx.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx.yaml" 97 | }, 98 | 99 | "Demucs v3: mdx_q":{ 100 | "6b9c2ca1-3fd82607.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/6b9c2ca1-3fd82607.th", 101 | "b72baf4e-8778635e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/b72baf4e-8778635e.th", 102 | "42e558d4-196e0e1b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/42e558d4-196e0e1b.th", 103 | "305bc58f-18378783.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/305bc58f-18378783.th", 104 | "mdx_q.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_q.yaml" 105 | }, 106 | 107 | "Demucs v3: mdx_extra":{ 108 | "e51eebcc-c1b80bdd.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/e51eebcc-c1b80bdd.th", 109 | "a1d90b5c-ae9d2452.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/a1d90b5c-ae9d2452.th", 110 | "5d2d6c55-db83574e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/5d2d6c55-db83574e.th", 111 | "cfa93e08-61801ae1.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/cfa93e08-61801ae1.th", 112 | "mdx_extra.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_extra.yaml" 113 | }, 114 | 115 | "Demucs v3: mdx_extra_q": { 116 | "83fc094f-4a16d450.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/83fc094f-4a16d450.th", 117 | "464b36d7-e5a9386e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/464b36d7-e5a9386e.th", 118 | "14fc6a69-a89dd0ee.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/14fc6a69-a89dd0ee.th", 119 | "7fd6ef75-a905dd85.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7fd6ef75-a905dd85.th", 120 | "mdx_extra_q.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_extra_q.yaml" 121 | }, 122 | 123 | "Demucs v3: UVR Model":{ 124 | "ebf34a2db.th": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/ebf34a2db.th", 125 | "UVR_Demucs_Model_1.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/UVR_Demucs_Model_1.yaml" 126 | }, 127 | 128 | "Demucs v3: repro_mdx_a":{ 129 | "9a6b4851-03af0aa6.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th", 130 | "1ef250f1-592467ce.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th", 131 | "fa0cb7f9-100d8bf4.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th", 132 | "902315c2-b39ce9c9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th", 133 | "repro_mdx_a.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a.yaml" 134 | }, 135 | 136 | "Demucs v3: repro_mdx_a_time_only":{ 137 | "9a6b4851-03af0aa6.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th", 138 | "1ef250f1-592467ce.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th", 139 | "repro_mdx_a_time_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_time_only.yaml" 140 | }, 141 | 142 | "Demucs v3: repro_mdx_a_hybrid_only":{ 143 | 
"fa0cb7f9-100d8bf4.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th", 144 | "902315c2-b39ce9c9.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th", 145 | "repro_mdx_a_hybrid_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_hybrid_only.yaml" 146 | }, 147 | 148 | "Demucs v2: demucs": { 149 | "demucs-e07c671f.th": "https://dl.fbaipublicfiles.com/demucs/v3.0/demucs-e07c671f.th" 150 | }, 151 | 152 | "Demucs v2: demucs_extra": { 153 | "demucs_extra-3646af93.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_extra-3646af93.th" 154 | }, 155 | 156 | "Demucs v2: demucs48_hq": { 157 | "demucs48_hq-28a1282c.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs48_hq-28a1282c.th" 158 | }, 159 | 160 | "Demucs v2: tasnet": { 161 | "tasnet-beb46fac.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet-beb46fac.th" 162 | }, 163 | 164 | "Demucs v2: tasnet_extra": { 165 | "tasnet_extra-df3777b2.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet_extra-df3777b2.th" 166 | }, 167 | 168 | "Demucs v2: demucs_unittest": { 169 | "demucs_unittest-09ebc15f.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_unittest-09ebc15f.th" 170 | }, 171 | 172 | "Demucs v1: demucs": { 173 | "demucs.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs.th" 174 | }, 175 | 176 | "Demucs v1: demucs_extra": { 177 | "demucs_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs_extra.th" 178 | }, 179 | 180 | "Demucs v1: light": { 181 | "light.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light.th" 182 | }, 183 | 184 | "Demucs v1: light_extra": { 185 | "light_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light_extra.th" 186 | }, 187 | 188 | "Demucs v1: tasnet": { 189 | "tasnet.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet.th" 190 | }, 191 | 192 | "Demucs v1: tasnet_extra": { 193 | "tasnet_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet_extra.th" 194 | } 195 | }, 196 | 197 | "mdx_download_vip_list": { 198 | "MDX-Net Model VIP: UVR-MDX-NET_Main_340": "UVR-MDX-NET_Main_340.onnx", 199 | "MDX-Net Model VIP: UVR-MDX-NET_Main_390": "UVR-MDX-NET_Main_390.onnx", 200 | "MDX-Net Model VIP: UVR-MDX-NET_Main_406": "UVR-MDX-NET_Main_406.onnx", 201 | "MDX-Net Model VIP: UVR-MDX-NET_Main_427": "UVR-MDX-NET_Main_427.onnx", 202 | "MDX-Net Model VIP: UVR-MDX-NET_Main_438": "UVR-MDX-NET_Main_438.onnx", 203 | "MDX-Net Model VIP: UVR-MDX-NET_Inst_82_beta": "UVR-MDX-NET_Inst_82_beta.onnx", 204 | "MDX-Net Model VIP: UVR-MDX-NET_Inst_90_beta": "UVR-MDX-NET_Inst_90_beta.onnx", 205 | "MDX-Net Model VIP: UVR-MDX-NET_Inst_187_beta": "UVR-MDX-NET_Inst_187_beta.onnx", 206 | "MDX-Net Model VIP: UVR-MDX-NET-Inst_full_292": "UVR-MDX-NET-Inst_full_292.onnx" 207 | }, 208 | 209 | "mdx23_download_list": { 210 | "MDX23C Model: MDX23C_D1581": {"MDX23C_D1581.ckpt":"model_2_stem_061321.yaml"} 211 | }, 212 | 213 | "mdx23c_download_list": { 214 | "MDX23C Model: MDX23C-InstVoc HQ": {"MDX23C-8KFFT-InstVoc_HQ.ckpt":"model_2_stem_full_band_8k.yaml"} 215 | }, 216 | 217 | "roformer_download_list": { 218 | "Roformer Model: BS-Roformer-Viperx-1297": {"model_bs_roformer_ep_317_sdr_12.9755.ckpt":"model_bs_roformer_ep_317_sdr_12.9755.yaml"}, 219 | "Roformer Model: BS-Roformer-Viperx-1296": {"model_bs_roformer_ep_368_sdr_12.9628.ckpt":"model_bs_roformer_ep_368_sdr_12.9628.yaml"}, 220 | "Roformer Model: BS-Roformer-Viperx-1053": {"model_bs_roformer_ep_937_sdr_10.5309.ckpt":"model_bs_roformer_ep_937_sdr_10.5309.yaml"}, 221 | "Roformer Model: 
Mel-Roformer-Viperx-1143": {"model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt":"model_mel_band_roformer_ep_3005_sdr_11.4360.yaml"} 222 | }, 223 | 224 | "mdx23c_download_vip_list": { 225 | "MDX23C Model VIP: MDX23C_D1581": {"MDX23C_D1581.ckpt":"model_2_stem_061321.yaml"}, 226 | "MDX23C Model VIP: MDX23C-InstVoc HQ 2": {"MDX23C-8KFFT-InstVoc_HQ_2.ckpt":"model_2_stem_full_band_8k.yaml"} 227 | }, 228 | 229 | "vr_download_vip_list": [], 230 | "demucs_download_vip_list": [] 231 | } 232 | -------------------------------------------------------------------------------- /UVR/plugin.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import json 3 | import gradio as gr 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | 8 | from tabs.plugins.installed.UVR.uvr import Separator 9 | 10 | plugin_folder = os.path.relpath( 11 | os.path.join(now_dir, "tabs", "plugins", "installed", "UVR") 12 | ) 13 | 14 | 15 | def get_models_by_type(type): 16 | download_checks_path = os.path.join(plugin_folder, "models", "download_checks.json") 17 | 18 | model_downloads_list = json.load(open(download_checks_path, encoding="utf-8")) 19 | 20 | filtered_demucs_v4 = { 21 | key: value 22 | for key, value in model_downloads_list["demucs_download_list"].items() 23 | if key.startswith("Demucs v4") 24 | } 25 | 26 | model_files_grouped_by_type = { 27 | "VR": model_downloads_list["vr_download_list"], 28 | "MDX": { 29 | **model_downloads_list["mdx_download_list"], 30 | **model_downloads_list["mdx_download_vip_list"], 31 | }, 32 | "Demucs": filtered_demucs_v4, 33 | "MDXC": { 34 | **model_downloads_list["mdx23c_download_list"], 35 | **model_downloads_list["mdx23c_download_vip_list"], 36 | **model_downloads_list["roformer_download_list"], 37 | }, 38 | } 39 | 40 | results = [] 41 | for model_name, model_info in model_files_grouped_by_type[type].items(): 42 | results.append(model_info) 43 | 44 | return results 45 | 46 | 47 | def run_uvr( 48 | audio, 49 | output_format, 50 | output_dir, 51 | invert_spect, 52 | normalization, 53 | single_stem, 54 | sample_rate, 55 | vr_model, 56 | vr_batch_size, 57 | vr_window_size, 58 | vr_aggression, 59 | vr_enable_tta, 60 | vr_high_end_process, 61 | vr_enable_post_process, 62 | vr_post_process_threshold, 63 | mdx_model, 64 | mdx_segment_size, 65 | mdx_overlap, 66 | mdx_batch_size, 67 | mdx_hop_length, 68 | mdx_enable_denoise, 69 | mdxc_model, 70 | mdxc_segment_size, 71 | mdxc_override_model_segment_size, 72 | mdxc_overlap, 73 | mdxc_batch_size, 74 | mdxc_pitch_shift, 75 | # demucs_model, 76 | # demucs_segment_size, 77 | # demucs_shifts, 78 | # demucs_overlap, 79 | # demucs_segments_enabled, 80 | tab_selected, 81 | ): 82 | if tab_selected == "VR": 83 | model = vr_model 84 | elif tab_selected == "MDX": 85 | model = mdx_model 86 | elif tab_selected == "MDXC": 87 | model = mdxc_model 88 | # elif tab_selected == "Demucs": 89 | # model = demucs_model 90 | 91 | if single_stem == "None": 92 | single_stem = None 93 | 94 | separator = Separator( 95 | model_file_dir=os.path.join(plugin_folder, "models"), 96 | output_dir=output_dir, 97 | output_format=output_format, 98 | normalization_threshold=float(normalization), 99 | output_single_stem=single_stem, 100 | invert_using_spec=invert_spect, 101 | sample_rate=int(sample_rate), 102 | mdx_params={ 103 | "hop_length": int(mdx_hop_length), 104 | "segment_size": int(mdx_segment_size), 105 | "overlap": float(mdx_overlap), 106 | "batch_size": int(mdx_batch_size), 107 | "enable_denoise": mdx_enable_denoise, 108 
| }, 109 | vr_params={ 110 | "batch_size": int(vr_batch_size), 111 | "window_size": int(vr_window_size), 112 | "aggression": int(vr_aggression), 113 | "enable_tta": vr_enable_tta, 114 | "enable_post_process": vr_enable_post_process, 115 | "post_process_threshold": float(vr_post_process_threshold), 116 | "high_end_process": vr_high_end_process, 117 | }, 118 | mdxc_params={ 119 | "segment_size": int(mdxc_segment_size), 120 | "batch_size": int(mdxc_batch_size), 121 | "overlap": int(mdxc_overlap), 122 | "override_model_segment_size": mdxc_override_model_segment_size, 123 | "pitch_shift": int(mdxc_pitch_shift), 124 | }, 125 | ) 126 | """ 127 | demucs_params={ 128 | "segment_size": demucs_segment_size, 129 | "shifts": demucs_shifts, 130 | "overlap": demucs_overlap, 131 | "segments_enabled": demucs_segments_enabled, 132 | }, 133 | """ 134 | separator.load_model(model_filename=model) 135 | 136 | results = [] 137 | files = separator.separate(audio) 138 | try: 139 | for file in files: 140 | file_path = os.path.join(output_dir, file) 141 | results.append(file_path) 142 | return results 143 | except AttributeError: 144 | return os.path.join(output_dir, files) 145 | 146 | 147 | def applio_plugin(): 148 | audio = gr.Audio( 149 | label="Input audio", 150 | sources=["upload", "microphone"], 151 | type="filepath", 152 | interactive=True, 153 | ) 154 | 155 | single_stem = gr.Radio( 156 | label="Single stem", 157 | choices=[ 158 | "None", 159 | "Instrumental", 160 | "Vocals", 161 | "Drums", 162 | "Bass", 163 | "Guitar", 164 | "Piano", 165 | "Other", 166 | ], 167 | value="None", 168 | interactive=True, 169 | ) 170 | 171 | with gr.Accordion("Advanced Settings", open=False): 172 | invert_spect = gr.Checkbox( 173 | label="Invert spectrogram", 174 | value=False, 175 | interactive=True, 176 | ) 177 | 178 | output_format = gr.Radio( 179 | label="Output format", 180 | choices=["wav", "mp3"], 181 | value="wav", 182 | interactive=True, 183 | ) 184 | 185 | output_dir = gr.Textbox( 186 | label="Output directory", 187 | value=os.path.join(plugin_folder, "output"), 188 | interactive=True, 189 | ) 190 | 191 | with gr.Row(): 192 | sample_rate = gr.Textbox( 193 | label="Sample rate", 194 | value=44100, 195 | interactive=True, 196 | ) 197 | 198 | normalization = gr.Textbox( 199 | label="Normalization", 200 | value=0.9, 201 | interactive=True, 202 | ) 203 | 204 | with gr.Tab("VR") as vr_tab: 205 | vr_model = gr.Dropdown( 206 | label="Model", 207 | choices=get_models_by_type("VR"), 208 | interactive=True, 209 | ) 210 | with gr.Accordion("Settings", open=False): 211 | vr_enable_tta = gr.Checkbox( 212 | label="Enable TTA", 213 | value=False, 214 | interactive=True, 215 | ) 216 | vr_high_end_process = gr.Checkbox( 217 | label="High-end process", 218 | value=False, 219 | interactive=True, 220 | ) 221 | vr_enable_post_process = gr.Checkbox( 222 | label="Enable post-process", 223 | value=False, 224 | interactive=True, 225 | ) 226 | with gr.Row(): 227 | vr_aggression = gr.Slider( 228 | label="Aggression", 229 | minimum=-100, 230 | maximum=100, 231 | value=5, 232 | interactive=True, 233 | ) 234 | vr_post_process_threshold = gr.Slider( 235 | label="Post-process threshold", 236 | minimum=0.1, 237 | maximum=0.3, 238 | step=0.01, 239 | value=0.2, 240 | interactive=True, 241 | ) 242 | with gr.Row(): 243 | vr_batch_size = gr.Textbox( 244 | label="Batch size", 245 | value=4, 246 | interactive=True, 247 | ) 248 | vr_window_size = gr.Dropdown( 249 | label="Window size", 250 | choices=[1024, 512, 320], 251 | value=512, 252 | interactive=True, 
253 | allow_custom_value=True, 254 | ) 255 | 256 | with gr.Tab("MDX") as mdx_tab: 257 | mdx_model = gr.Dropdown( 258 | label="Model", 259 | choices=get_models_by_type("MDX"), 260 | interactive=True, 261 | ) 262 | with gr.Accordion("Settings", open=False): 263 | mdx_enable_denoise = gr.Checkbox( 264 | label="Enable denoise", 265 | value=False, 266 | interactive=True, 267 | ) 268 | with gr.Row(): 269 | mdx_overlap = gr.Slider( 270 | label="Overlap", 271 | minimum=0.001, 272 | maximum=0.999, 273 | value=0.25, 274 | interactive=True, 275 | ) 276 | with gr.Row(): 277 | mdx_batch_size = gr.Textbox( 278 | label="Batch size", 279 | value=1, 280 | interactive=True, 281 | ) 282 | mdx_segment_size = gr.Textbox( 283 | label="Segment size", 284 | value=256, 285 | interactive=True, 286 | ) 287 | mdx_hop_length = gr.Textbox( 288 | label="Hop length", 289 | value=1024, 290 | interactive=True, 291 | ) 292 | 293 | with gr.Tab("MDXC") as mdxc_tab: 294 | mdxc_model = gr.Dropdown( 295 | label="Model", 296 | choices=get_models_by_type("MDXC"), 297 | interactive=True, 298 | ) 299 | with gr.Accordion("Settings", open=False): 300 | mdxc_override_model_segment_size = gr.Checkbox( 301 | label="Override model segment size", 302 | value=False, 303 | ) 304 | with gr.Row(): 305 | mdxc_overlap = gr.Slider( 306 | label="Overlap", 307 | minimum=0.001, 308 | maximum=0.999, 309 | value=0.25, 310 | interactive=True, 311 | ) 312 | with gr.Row(): 313 | mdxc_batch_size = gr.Textbox( 314 | label="Batch size", 315 | value=1, 316 | interactive=True, 317 | ) 318 | mdxc_segment_size = gr.Textbox( 319 | label="Segment size", 320 | value=256, 321 | interactive=True, 322 | ) 323 | mdxc_pitch_shift = gr.Textbox( 324 | label="Hop length", 325 | value=0, 326 | interactive=True, 327 | ) 328 | 329 | with gr.Tab("Demucs") as demucs_tab: 330 | gr.Markdown("Demucs is not available in this version of the plugin.") 331 | """ 332 | demucs_model = gr.Dropdown( 333 | label="Model", 334 | choices=get_models_by_type("Demucs"), 335 | interactive=True, 336 | ) 337 | with gr.Accordion("Settings", open=False): 338 | demucs_segments_enabled = gr.Checkbox( 339 | label="Segments enabled", 340 | value=True, 341 | interactive=True, 342 | ) 343 | demucs_overlap = gr.Slider( 344 | label="Overlap", 345 | minimum=0.001, 346 | maximum=0.999, 347 | value=0.25, 348 | interactive=True, 349 | ) 350 | with gr.Row(): 351 | demucs_segment_size = gr.Textbox( 352 | label="Segment size", 353 | value="Default", 354 | interactive=True, 355 | ) 356 | demucs_shifts = gr.Textbox( 357 | label="Shifts", 358 | value=2, 359 | interactive=True, 360 | ) 361 | """ 362 | 363 | tab_selected = gr.Textbox( 364 | label="Tab selected", 365 | value="VR", 366 | interactive=False, 367 | visible=False, 368 | ) 369 | 370 | run_uvr_button = gr.Button("Run") 371 | output_files = gr.File( 372 | label="Output files", file_count="multiple", type="filepath", interactive=False 373 | ) 374 | 375 | run_uvr_button.click( 376 | fn=run_uvr, 377 | inputs=[ 378 | audio, 379 | output_format, 380 | output_dir, 381 | invert_spect, 382 | normalization, 383 | single_stem, 384 | sample_rate, 385 | vr_model, 386 | vr_batch_size, 387 | vr_window_size, 388 | vr_aggression, 389 | vr_enable_tta, 390 | vr_high_end_process, 391 | vr_enable_post_process, 392 | vr_post_process_threshold, 393 | mdx_model, 394 | mdx_segment_size, 395 | mdx_overlap, 396 | mdx_batch_size, 397 | mdx_hop_length, 398 | mdx_enable_denoise, 399 | mdxc_model, 400 | mdxc_segment_size, 401 | mdxc_override_model_segment_size, 402 | mdxc_overlap, 403 | 
mdxc_batch_size, 404 | mdxc_pitch_shift, 405 | # demucs_model, 406 | # demucs_segment_size, 407 | # demucs_shifts, 408 | # demucs_overlap, 409 | # demucs_segments_enabled, 410 | tab_selected, 411 | ], 412 | outputs=output_files, 413 | ) 414 | 415 | vr_tab.select(lambda: "VR", None, tab_selected) 416 | mdx_tab.select(lambda: "MDX", None, tab_selected) 417 | mdxc_tab.select(lambda: "MDXC", None, tab_selected) 418 | demucs_tab.select(lambda: "Demucs", None, tab_selected) 419 | -------------------------------------------------------------------------------- /UVR/requirements.txt: -------------------------------------------------------------------------------- 1 | six>=1.16 2 | samplerate==0.1.0 3 | pyyaml 4 | ml_collections 5 | 6 | onnx2torch>=1.5 7 | onnx>=1.14 8 | onnxruntime 9 | onnxruntime_gpu==1.15.1 10 | 11 | julius>=0.2 12 | diffq>=0.2 13 | 14 | beartype==0.18.5 15 | rotary-embedding-torch==0.6.1 -------------------------------------------------------------------------------- /UVR/uvr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import json 4 | import sys 5 | 6 | 7 | def main(): 8 | """Main entry point for the CLI.""" 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | parser = argparse.ArgumentParser( 13 | description="Separate audio file into different stems.", 14 | formatter_class=lambda prog: argparse.RawTextHelpFormatter( 15 | prog, max_help_position=60 16 | ), 17 | ) 18 | 19 | parser.add_argument( 20 | "--audio_file", 21 | nargs="?", 22 | help="The audio file path to separate, in any common format.", 23 | default=argparse.SUPPRESS, 24 | ) 25 | 26 | debug_help = "Enable debug logging, equivalent to --log_level=debug." 27 | env_info_help = "Print environment information and exit." 28 | list_models_help = "List all supported models and exit." 29 | log_level_help = "Log level, e.g. info, debug, warning (default: %(default)s)." 30 | 31 | info_params = parser.add_argument_group("Info and Debugging") 32 | info_params.add_argument("-d", "--debug", action="store_true", help=debug_help) 33 | info_params.add_argument( 34 | "-e", "--env_info", action="store_true", help=env_info_help 35 | ) 36 | info_params.add_argument( 37 | "-l", "--list_models", action="store_true", help=list_models_help 38 | ) 39 | info_params.add_argument("--log_level", default="info", help=log_level_help) 40 | 41 | model_filename_help = ( 42 | "model to use for separation (default: %(default)s). Example: -m 2_HP-UVR.pth" 43 | ) 44 | output_format_help = "output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3" 45 | output_dir_help = "directory to write output files (default: ). Example: --output_dir=/app/separated" 46 | model_file_dir_help = "model files directory (default: %(default)s). Example: --model_file_dir=/app/models" 47 | 48 | io_params = parser.add_argument_group("Separation I/O Params") 49 | io_params.add_argument( 50 | "-m", 51 | "--model_filename", 52 | default="model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt", 53 | help=model_filename_help, 54 | ) 55 | io_params.add_argument("--output_format", default="WAV", help=output_format_help) 56 | io_params.add_argument("--output_dir", default=None, help=output_dir_help) 57 | io_params.add_argument( 58 | "--model_file_dir", 59 | default="uvr/tmp/audio-separator-models/", 60 | help=model_file_dir_help, 61 | ) 62 | 63 | invert_spect_help = "invert secondary stem using spectogram (default: %(default)s). 
Example: --invert_spect" 64 | normalization_help = "max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7" 65 | single_stem_help = "output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental" 66 | sample_rate_help = "modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100" 67 | 68 | common_params = parser.add_argument_group("Common Separation Parameters") 69 | common_params.add_argument( 70 | "--invert_spect", action="store_true", help=invert_spect_help 71 | ) 72 | common_params.add_argument( 73 | "--normalization", type=float, default=0.9, help=normalization_help 74 | ) 75 | common_params.add_argument("--single_stem", default=None, help=single_stem_help) 76 | common_params.add_argument( 77 | "--sample_rate", type=int, default=44100, help=sample_rate_help 78 | ) 79 | 80 | mdx_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256" 81 | mdx_overlap_help = "amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25" 82 | mdx_batch_size_help = "larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdx_batch_size=4" 83 | mdx_hop_length_help = "usually called stride in neural networks, only change if you know what you're doing (default: %(default)s). Example: --mdx_hop_length=1024" 84 | mdx_enable_denoise_help = "enable denoising during separation (default: %(default)s). Example: --mdx_enable_denoise" 85 | 86 | mdx_params = parser.add_argument_group("MDX Architecture Parameters") 87 | mdx_params.add_argument( 88 | "--mdx_segment_size", type=int, default=256, help=mdx_segment_size_help 89 | ) 90 | mdx_params.add_argument( 91 | "--mdx_overlap", type=float, default=0.25, help=mdx_overlap_help 92 | ) 93 | mdx_params.add_argument( 94 | "--mdx_batch_size", type=int, default=1, help=mdx_batch_size_help 95 | ) 96 | mdx_params.add_argument( 97 | "--mdx_hop_length", type=int, default=1024, help=mdx_hop_length_help 98 | ) 99 | mdx_params.add_argument( 100 | "--mdx_enable_denoise", action="store_true", help=mdx_enable_denoise_help 101 | ) 102 | 103 | vr_batch_size_help = "number of batches to process at a time. higher = more RAM, slightly faster processing (default: %(default)s). Example: --vr_batch_size=16" 104 | vr_window_size_help = "balance quality and speed. 1024 = fast but lower, 320 = slower but better quality. (default: %(default)s). Example: --vr_window_size=320" 105 | vr_aggression_help = "intensity of primary stem extraction, -100 - 100. typically 5 for vocals & instrumentals (default: %(default)s). Example: --vr_aggression=2" 106 | vr_enable_tta_help = "enable Test-Time-Augmentation; slow but improves quality (default: %(default)s). Example: --vr_enable_tta" 107 | vr_high_end_process_help = "mirror the missing frequency range of the output (default: %(default)s). Example: --vr_high_end_process" 108 | vr_enable_post_process_help = "identify leftover artifacts within vocal output; may improve separation for some songs (default: %(default)s). Example: --vr_enable_post_process" 109 | vr_post_process_threshold_help = "threshold for post_process feature: 0.1-0.3 (default: %(default)s). 
Example: --vr_post_process_threshold=0.1" 110 | 111 | vr_params = parser.add_argument_group("VR Architecture Parameters") 112 | vr_params.add_argument( 113 | "--vr_batch_size", type=int, default=4, help=vr_batch_size_help 114 | ) 115 | vr_params.add_argument( 116 | "--vr_window_size", type=int, default=512, help=vr_window_size_help 117 | ) 118 | vr_params.add_argument( 119 | "--vr_aggression", type=int, default=5, help=vr_aggression_help 120 | ) 121 | vr_params.add_argument( 122 | "--vr_enable_tta", action="store_true", help=vr_enable_tta_help 123 | ) 124 | vr_params.add_argument( 125 | "--vr_high_end_process", action="store_true", help=vr_high_end_process_help 126 | ) 127 | vr_params.add_argument( 128 | "--vr_enable_post_process", 129 | action="store_true", 130 | help=vr_enable_post_process_help, 131 | ) 132 | vr_params.add_argument( 133 | "--vr_post_process_threshold", 134 | type=float, 135 | default=0.2, 136 | help=vr_post_process_threshold_help, 137 | ) 138 | 139 | demucs_segment_size_help = "size of segments into which the audio is split, 1-100. higher = slower but better quality (default: %(default)s). Example: --demucs_segment_size=256" 140 | demucs_shifts_help = "number of predictions with random shifts, higher = slower but better quality (default: %(default)s). Example: --demucs_shifts=4" 141 | demucs_overlap_help = "overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: %(default)s). Example: --demucs_overlap=0.25" 142 | demucs_segments_enabled_help = "enable segment-wise processing (default: %(default)s). Example: --demucs_segments_enabled=False" 143 | 144 | demucs_params = parser.add_argument_group("Demucs Architecture Parameters") 145 | demucs_params.add_argument( 146 | "--demucs_segment_size", 147 | type=str, 148 | default="Default", 149 | help=demucs_segment_size_help, 150 | ) 151 | demucs_params.add_argument( 152 | "--demucs_shifts", type=int, default=2, help=demucs_shifts_help 153 | ) 154 | demucs_params.add_argument( 155 | "--demucs_overlap", type=float, default=0.25, help=demucs_overlap_help 156 | ) 157 | demucs_params.add_argument( 158 | "--demucs_segments_enabled", 159 | type=bool, 160 | default=True, 161 | help=demucs_segments_enabled_help, 162 | ) 163 | 164 | mdxc_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdxc_segment_size=256" 165 | mdxc_override_model_segment_size_help = "override model default segment size instead of using the model default value. Example: --mdxc_override_model_segment_size" 166 | mdxc_overlap_help = "amount of overlap between prediction windows, 2-50. higher is better but slower (default: %(default)s). Example: --mdxc_overlap=8" 167 | mdxc_batch_size_help = "larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdxc_batch_size=4" 168 | mdxc_pitch_shift_help = "shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals. (default: %(default)s). 
Example: --mdxc_pitch_shift=2" 169 | 170 | mdxc_params = parser.add_argument_group("MDXC Architecture Parameters") 171 | mdxc_params.add_argument( 172 | "--mdxc_segment_size", type=int, default=256, help=mdxc_segment_size_help 173 | ) 174 | mdxc_params.add_argument( 175 | "--mdxc_override_model_segment_size", 176 | action="store_true", 177 | help=mdxc_override_model_segment_size_help, 178 | ) 179 | mdxc_params.add_argument( 180 | "--mdxc_overlap", type=int, default=8, help=mdxc_overlap_help 181 | ) 182 | mdxc_params.add_argument( 183 | "--mdxc_batch_size", type=int, default=1, help=mdxc_batch_size_help 184 | ) 185 | mdxc_params.add_argument( 186 | "--mdxc_pitch_shift", type=int, default=0, help=mdxc_pitch_shift_help 187 | ) 188 | 189 | args = parser.parse_args() 190 | 191 | if args.debug: 192 | log_level = logging.DEBUG 193 | else: 194 | log_level = getattr(logging, args.log_level.upper()) 195 | 196 | logger.setLevel(log_level) 197 | 198 | from tabs.plugins.installed.UVR.uvr.separator import Separator 199 | 200 | if args.env_info: 201 | separator = Separator() 202 | sys.exit(0) 203 | 204 | if args.list_models: 205 | separator = Separator() 206 | print( 207 | json.dumps(separator.list_supported_model_files(), indent=4, sort_keys=True) 208 | ) 209 | sys.exit(0) 210 | 211 | if not hasattr(args, "audio_file"): 212 | parser.print_help() 213 | sys.exit(1) 214 | 215 | separator = Separator( 216 | log_level=log_level, 217 | model_file_dir=args.model_file_dir, 218 | output_dir=args.output_dir, 219 | output_format=args.output_format, 220 | normalization_threshold=args.normalization, 221 | output_single_stem=args.single_stem, 222 | invert_using_spec=args.invert_spect, 223 | sample_rate=args.sample_rate, 224 | mdx_params={ 225 | "hop_length": args.mdx_hop_length, 226 | "segment_size": args.mdx_segment_size, 227 | "overlap": args.mdx_overlap, 228 | "batch_size": args.mdx_batch_size, 229 | "enable_denoise": args.mdx_enable_denoise, 230 | }, 231 | vr_params={ 232 | "batch_size": args.vr_batch_size, 233 | "window_size": args.vr_window_size, 234 | "aggression": args.vr_aggression, 235 | "enable_tta": args.vr_enable_tta, 236 | "enable_post_process": args.vr_enable_post_process, 237 | "post_process_threshold": args.vr_post_process_threshold, 238 | "high_end_process": args.vr_high_end_process, 239 | }, 240 | demucs_params={ 241 | "segment_size": args.demucs_segment_size, 242 | "shifts": args.demucs_shifts, 243 | "overlap": args.demucs_overlap, 244 | "segments_enabled": args.demucs_segments_enabled, 245 | }, 246 | mdxc_params={ 247 | "segment_size": args.mdxc_segment_size, 248 | "batch_size": args.mdxc_batch_size, 249 | "overlap": args.mdxc_overlap, 250 | "override_model_segment_size": args.mdxc_override_model_segment_size, 251 | "pitch_shift": args.mdxc_pitch_shift, 252 | }, 253 | ) 254 | 255 | separator.load_model(model_filename=args.model_filename) 256 | 257 | output_files = separator.separate(args.audio_file) 258 | 259 | logger.info(f"Separation complete! 
Output file(s): {' '.join(output_files)}") 260 | 261 | 262 | if __name__ == "__main__": 263 | main() 264 | -------------------------------------------------------------------------------- /UVR/uvr/__init__.py: -------------------------------------------------------------------------------- 1 | from .separator import Separator 2 | -------------------------------------------------------------------------------- /UVR/uvr/architectures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/UVR/uvr/architectures/__init__.py -------------------------------------------------------------------------------- /UVR/uvr/architectures/demucs_separator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import torch 5 | import numpy as np 6 | from tabs.plugins.installed.UVR.uvr.common_separator import CommonSeparator 7 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5.demucs.apply import apply_model, demucs_segments 8 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5.demucs.hdemucs import HDemucs 9 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5.demucs.pretrained import get_model as get_demucs_model 10 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5 import spec_utils 11 | 12 | DEMUCS_4_SOURCE = ["drums", "bass", "other", "vocals"] 13 | 14 | DEMUCS_2_SOURCE_MAPPER = {CommonSeparator.INST_STEM: 0, CommonSeparator.VOCAL_STEM: 1} 15 | DEMUCS_4_SOURCE_MAPPER = { 16 | CommonSeparator.BASS_STEM: 0, 17 | CommonSeparator.DRUM_STEM: 1, 18 | CommonSeparator.OTHER_STEM: 2, 19 | CommonSeparator.VOCAL_STEM: 3, 20 | } 21 | DEMUCS_6_SOURCE_MAPPER = { 22 | CommonSeparator.BASS_STEM: 0, 23 | CommonSeparator.DRUM_STEM: 1, 24 | CommonSeparator.OTHER_STEM: 2, 25 | CommonSeparator.VOCAL_STEM: 3, 26 | CommonSeparator.GUITAR_STEM: 4, 27 | CommonSeparator.PIANO_STEM: 5, 28 | } 29 | 30 | 31 | class DemucsSeparator(CommonSeparator): 32 | """ 33 | DemucsSeparator is responsible for separating audio sources using Demucs models. 34 | It initializes with configuration parameters and prepares the model for separation tasks. 35 | """ 36 | 37 | def __init__(self, common_config, arch_config): 38 | # Any configuration values which can be shared between architectures should be set already in CommonSeparator, 39 | # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name) 40 | super().__init__(config=common_config) 41 | 42 | # Initializing user-configurable parameters, passed through with an mdx_from the CLI or Separator instance 43 | 44 | # Adjust segments to manage RAM or V-RAM usage: 45 | # - Smaller sizes consume less resources. 46 | # - Bigger sizes consume more resources, but may provide better results. 47 | # - "Default" picks the optimal size. 48 | # DEMUCS_SEGMENTS = (DEF_OPT, '1', '5', '10', '15', '20', 49 | # '25', '30', '35', '40', '45', '50', 50 | # '55', '60', '65', '70', '75', '80', 51 | # '85', '90', '95', '100') 52 | self.segment_size = arch_config.get("segment_size", "Default") 53 | 54 | # Performs multiple predictions with random shifts of the input and averages them. 55 | # The higher number of shifts, the longer the prediction will take. 56 | # Not recommended unless you have a GPU. 
57 | # DEMUCS_SHIFTS = (0, 1, 2, 3, 4, 5, 58 | # 6, 7, 8, 9, 10, 11, 59 | # 12, 13, 14, 15, 16, 17, 60 | # 18, 19, 20) 61 | self.shifts = arch_config.get("shifts", 2) 62 | 63 | # This option controls the amount of overlap between prediction windows. 64 | # - Higher values can provide better results, but will lead to longer processing times. 65 | # - You can choose between 0.001-0.999 66 | # DEMUCS_OVERLAP = (0.25, 0.50, 0.75, 0.99) 67 | self.overlap = arch_config.get("overlap", 0.25) 68 | 69 | # Enables "Segments". Deselecting this option is only recommended for those with powerful PCs. 70 | self.segments_enabled = arch_config.get("segments_enabled", True) 71 | 72 | self.logger.debug( 73 | f"Demucs arch params: segment_size={self.segment_size}, segments_enabled={self.segments_enabled}" 74 | ) 75 | self.logger.debug( 76 | f"Demucs arch params: shifts={self.shifts}, overlap={self.overlap}" 77 | ) 78 | 79 | self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER 80 | 81 | self.audio_file_path = None 82 | self.audio_file_base = None 83 | self.demucs_model_instance = None 84 | 85 | # Add uvr_lib_v5 folder to system path so pytorch serialization can find the demucs module 86 | current_dir = os.path.dirname(__file__) 87 | uvr_lib_v5_path = os.path.join(current_dir, "..", "uvr_lib_v5") 88 | sys.path.insert(0, uvr_lib_v5_path) 89 | 90 | self.logger.info("Demucs Separator initialisation complete") 91 | 92 | def separate(self, audio_file_path): 93 | """ 94 | Separates the audio file into its component stems using the Demucs model. 95 | """ 96 | self.logger.debug("Starting separation process...") 97 | source = None 98 | stem_source = None 99 | inst_source = {} 100 | 101 | self.audio_file_path = audio_file_path 102 | self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] 103 | 104 | # Prepare the mix for processing 105 | self.logger.debug("Preparing mix...") 106 | mix = self.prepare_mix(self.audio_file_path) 107 | 108 | self.logger.debug(f"Mix prepared for demixing. 
Shape: {mix.shape}") 109 | 110 | self.logger.debug("Loading model for demixing...") 111 | 112 | self.demucs_model_instance = HDemucs(sources=DEMUCS_4_SOURCE) 113 | self.demucs_model_instance = get_demucs_model( 114 | name=os.path.splitext(os.path.basename(self.model_path))[0], 115 | repo=Path(os.path.dirname(self.model_path)), 116 | ) 117 | self.demucs_model_instance = demucs_segments( 118 | self.segment_size, self.demucs_model_instance 119 | ) 120 | self.demucs_model_instance.to(self.torch_device) 121 | self.demucs_model_instance.eval() 122 | 123 | self.logger.debug("Model loaded and set to evaluation mode.") 124 | 125 | source = self.demix_demucs(mix) 126 | 127 | del self.demucs_model_instance 128 | self.clear_gpu_cache() 129 | self.logger.debug("Model and GPU cache cleared after demixing.") 130 | 131 | output_files = [] 132 | self.logger.debug("Processing output files...") 133 | 134 | if isinstance(inst_source, np.ndarray): 135 | self.logger.debug("Processing instance source...") 136 | source_reshape = spec_utils.reshape_sources( 137 | inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]], 138 | source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]], 139 | ) 140 | inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]] = ( 141 | source_reshape 142 | ) 143 | source = inst_source 144 | 145 | if isinstance(source, np.ndarray): 146 | source_length = len(source) 147 | self.logger.debug( 148 | f"Processing source array, source length is {source_length}" 149 | ) 150 | if source_length == 2: 151 | self.logger.debug("Setting source map to 2-stem...") 152 | self.demucs_source_map = DEMUCS_2_SOURCE_MAPPER 153 | elif source_length == 6: 154 | self.logger.debug("Setting source map to 6-stem...") 155 | self.demucs_source_map = DEMUCS_6_SOURCE_MAPPER 156 | else: 157 | self.logger.debug("Setting source map to 4-stem...") 158 | self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER 159 | 160 | self.logger.debug("Processing for all stems...") 161 | for stem_name, stem_value in self.demucs_source_map.items(): 162 | if self.output_single_stem is not None: 163 | if stem_name.lower() != self.output_single_stem.lower(): 164 | self.logger.debug( 165 | f"Skipping writing stem {stem_name} as output_single_stem is set to {self.output_single_stem}..." 166 | ) 167 | continue 168 | 169 | stem_path = os.path.join( 170 | f"{self.audio_file_base}_({stem_name})_{self.model_name}.{self.output_format.lower()}" 171 | ) 172 | stem_source = source[stem_value].T 173 | 174 | self.final_process(stem_path, stem_source, stem_name) 175 | output_files.append(stem_path) 176 | 177 | return output_files 178 | 179 | def demix_demucs(self, mix): 180 | """ 181 | Demixes the input mix using the demucs model. 
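        In outline, the code below converts the mix to a float32 tensor,
        normalises it by the mean/std of its channel-averaged reference,
        runs apply_model() with the configured shifts/overlap (and
        split=segments_enabled), then de-normalises the result, swaps the
        first two sources and returns the concatenated stem array.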
182 | """ 183 | self.logger.debug("Starting demixing process in demix_demucs...") 184 | 185 | processed = {} 186 | mix = torch.tensor(mix, dtype=torch.float32) 187 | ref = mix.mean(0) 188 | mix = (mix - ref.mean()) / ref.std() 189 | mix_infer = mix 190 | 191 | with torch.no_grad(): 192 | self.logger.debug("Running model inference...") 193 | sources = apply_model( 194 | model=self.demucs_model_instance, 195 | mix=mix_infer[None], 196 | shifts=self.shifts, 197 | split=self.segments_enabled, 198 | overlap=self.overlap, 199 | static_shifts=1 if self.shifts == 0 else self.shifts, 200 | set_progress_bar=None, 201 | device=self.torch_device, 202 | progress=True, 203 | )[0] 204 | 205 | sources = (sources * ref.std() + ref.mean()).cpu().numpy() 206 | sources[[0, 1]] = sources[[1, 0]] 207 | processed[mix] = sources[:, :, 0:None].copy() 208 | sources = list(processed.values()) 209 | sources = [s[:, :, 0:None] for s in sources] 210 | sources = np.concatenate(sources, axis=-1) 211 | 212 | return sources 213 | -------------------------------------------------------------------------------- /UVR/uvr/common_separator.py: -------------------------------------------------------------------------------- 1 | """ This file contains the CommonSeparator class, common to all architecture-specific Separator classes. """ 2 | 3 | from logging import Logger 4 | import os 5 | import gc 6 | import numpy as np 7 | import librosa 8 | import torch 9 | from pydub import AudioSegment 10 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5 import spec_utils 11 | 12 | 13 | class CommonSeparator: 14 | """ 15 | This class contains the common methods and attributes common to all architecture-specific Separator classes. 16 | """ 17 | 18 | ALL_STEMS = "All Stems" 19 | VOCAL_STEM = "Vocals" 20 | INST_STEM = "Instrumental" 21 | OTHER_STEM = "Other" 22 | BASS_STEM = "Bass" 23 | DRUM_STEM = "Drums" 24 | GUITAR_STEM = "Guitar" 25 | PIANO_STEM = "Piano" 26 | SYNTH_STEM = "Synthesizer" 27 | STRINGS_STEM = "Strings" 28 | WOODWINDS_STEM = "Woodwinds" 29 | BRASS_STEM = "Brass" 30 | WIND_INST_STEM = "Wind Inst" 31 | NO_OTHER_STEM = "No Other" 32 | NO_BASS_STEM = "No Bass" 33 | NO_DRUM_STEM = "No Drums" 34 | NO_GUITAR_STEM = "No Guitar" 35 | NO_PIANO_STEM = "No Piano" 36 | NO_SYNTH_STEM = "No Synthesizer" 37 | NO_STRINGS_STEM = "No Strings" 38 | NO_WOODWINDS_STEM = "No Woodwinds" 39 | NO_WIND_INST_STEM = "No Wind Inst" 40 | NO_BRASS_STEM = "No Brass" 41 | PRIMARY_STEM = "Primary Stem" 42 | SECONDARY_STEM = "Secondary Stem" 43 | LEAD_VOCAL_STEM = "lead_only" 44 | BV_VOCAL_STEM = "backing_only" 45 | LEAD_VOCAL_STEM_I = "with_lead_vocals" 46 | BV_VOCAL_STEM_I = "with_backing_vocals" 47 | LEAD_VOCAL_STEM_LABEL = "Lead Vocals" 48 | BV_VOCAL_STEM_LABEL = "Backing Vocals" 49 | 50 | NON_ACCOM_STEMS = ( 51 | VOCAL_STEM, 52 | OTHER_STEM, 53 | BASS_STEM, 54 | DRUM_STEM, 55 | GUITAR_STEM, 56 | PIANO_STEM, 57 | SYNTH_STEM, 58 | STRINGS_STEM, 59 | WOODWINDS_STEM, 60 | BRASS_STEM, 61 | WIND_INST_STEM, 62 | ) 63 | 64 | def __init__(self, config): 65 | 66 | self.logger: Logger = config.get("logger") 67 | self.log_level: int = config.get("log_level") 68 | 69 | # Inferencing device / acceleration config 70 | self.torch_device = config.get("torch_device") 71 | self.torch_device_cpu = config.get("torch_device_cpu") 72 | self.torch_device_mps = config.get("torch_device_mps") 73 | self.onnx_execution_provider = config.get("onnx_execution_provider") 74 | 75 | # Model data 76 | self.model_name = config.get("model_name") 77 | self.model_path = config.get("model_path") 78 
| self.model_data = config.get("model_data") 79 | 80 | # Output directory and format 81 | self.output_dir = config.get("output_dir") 82 | self.output_format = config.get("output_format") 83 | 84 | # Functional options which are applicable to all architectures and the user may tweak to affect the output 85 | self.normalization_threshold = config.get("normalization_threshold") 86 | self.enable_denoise = config.get("enable_denoise") 87 | self.output_single_stem = config.get("output_single_stem") 88 | self.invert_using_spec = config.get("invert_using_spec") 89 | self.sample_rate = config.get("sample_rate") 90 | 91 | # Model specific properties 92 | self.primary_stem_name = self.model_data.get("primary_stem", "Vocals") 93 | self.secondary_stem_name = ( 94 | "Vocals" if self.primary_stem_name == "Instrumental" else "Instrumental" 95 | ) 96 | self.is_karaoke = self.model_data.get("is_karaoke", False) 97 | self.is_bv_model = self.model_data.get("is_bv_model", False) 98 | self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0) 99 | 100 | self.logger.debug( 101 | f"Common params: model_name={self.model_name}, model_path={self.model_path}" 102 | ) 103 | self.logger.debug( 104 | f"Common params: output_dir={self.output_dir}, output_format={self.output_format}" 105 | ) 106 | self.logger.debug( 107 | f"Common params: normalization_threshold={self.normalization_threshold}" 108 | ) 109 | self.logger.debug( 110 | f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}" 111 | ) 112 | self.logger.debug( 113 | f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}" 114 | ) 115 | 116 | self.logger.debug( 117 | f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}" 118 | ) 119 | self.logger.debug( 120 | f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}" 121 | ) 122 | 123 | # File-specific variables which need to be cleared between processing different audio inputs 124 | self.audio_file_path = None 125 | self.audio_file_base = None 126 | 127 | self.primary_source = None 128 | self.secondary_source = None 129 | 130 | self.primary_stem_output_path = None 131 | self.secondary_stem_output_path = None 132 | 133 | self.cached_sources_map = {} 134 | 135 | def separate(self, audio_file_path): 136 | """ 137 | Placeholder method for separating audio sources. Should be overridden by subclasses. 138 | """ 139 | raise NotImplementedError("This method should be overridden by subclasses.") 140 | 141 | def final_process(self, stem_path, source, stem_name): 142 | """ 143 | Finalizes the processing of a stem by writing the audio to a file and returning the processed source. 144 | """ 145 | self.logger.debug( 146 | f"Finalizing {stem_name} stem processing and writing audio..." 147 | ) 148 | self.write_audio(stem_path, source) 149 | 150 | return {stem_name: source} 151 | 152 | def cached_sources_clear(self): 153 | """ 154 | Clears the cache dictionaries for VR, MDX, and Demucs models. 155 | 156 | This function is essential for ensuring that the cache does not hold outdated or irrelevant data 157 | between different processing sessions or when a new batch of audio files is processed. 158 | It helps in managing memory efficiently and prevents potential errors due to stale data. 
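        The cache is keyed by architecture type (e.g. VR, MDX, Demucs), with
        each entry holding a {model_name: sources} dictionary as populated by
        cached_model_source_holder() below.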
159 | """ 160 | self.cached_sources_map = {} 161 | 162 | def cached_source_callback(self, model_architecture, model_name=None): 163 | """ 164 | Retrieves the model and sources from the cache based on the processing method and model name. 165 | 166 | Args: 167 | model_architecture: The architecture type (VR, MDX, or Demucs) being used for processing. 168 | model_name: The specific model name within the architecture type, if applicable. 169 | 170 | Returns: 171 | A tuple containing the model and its sources if found in the cache; otherwise, None. 172 | 173 | This function is crucial for optimizing performance by avoiding redundant processing. 174 | If the requested model and its sources are already in the cache, they can be reused directly, 175 | saving time and computational resources. 176 | """ 177 | model, sources = None, None 178 | 179 | mapper = self.cached_sources_map[model_architecture] 180 | 181 | for key, value in mapper.items(): 182 | if model_name in key: 183 | model = key 184 | sources = value 185 | 186 | return model, sources 187 | 188 | def cached_model_source_holder(self, model_architecture, sources, model_name=None): 189 | """ 190 | Update the dictionary for the given model_architecture with the new model name and its sources. 191 | Use the model_architecture as a key to access the corresponding cache source mapper dictionary. 192 | """ 193 | self.cached_sources_map[model_architecture] = { 194 | **self.cached_sources_map.get(model_architecture, {}), 195 | **{model_name: sources}, 196 | } 197 | 198 | def prepare_mix(self, mix): 199 | """ 200 | Prepares the mix for processing. This includes loading the audio from a file if necessary, 201 | ensuring the mix is in the correct format, and converting mono to stereo if needed. 202 | """ 203 | # Store the original path or the mix itself for later checks 204 | audio_path = mix 205 | 206 | # Check if the input is a file path (string) and needs to be loaded 207 | if not isinstance(mix, np.ndarray): 208 | self.logger.debug(f"Loading audio from file: {mix}") 209 | mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) 210 | self.logger.debug( 211 | f"Audio loaded. Sample rate: {sr}, Audio shape: {mix.shape}" 212 | ) 213 | else: 214 | # Transpose the mix if it's already an ndarray (expected shape: [channels, samples]) 215 | self.logger.debug("Transposing the provided mix array.") 216 | mix = mix.T 217 | self.logger.debug(f"Transposed mix shape: {mix.shape}") 218 | 219 | # If the original input was a filepath, check if the loaded mix is empty 220 | if isinstance(audio_path, str): 221 | if not np.any(mix): 222 | error_msg = f"Audio file {audio_path} is empty or not valid" 223 | self.logger.error(error_msg) 224 | raise ValueError(error_msg) 225 | else: 226 | self.logger.debug("Audio file is valid and contains data.") 227 | 228 | # Ensure the mix is in stereo format 229 | if mix.ndim == 1: 230 | self.logger.debug("Mix is mono. Converting to stereo.") 231 | mix = np.asfortranarray([mix, mix]) 232 | self.logger.debug("Converted to stereo mix.") 233 | 234 | # Final log indicating successful preparation of the mix 235 | self.logger.debug("Mix preparation completed.") 236 | return mix 237 | 238 | def write_audio(self, stem_path: str, stem_source): 239 | """ 240 | Writes the separated audio source to a file. 
241 | """ 242 | self.logger.debug(f"Entering write_audio with stem_path: {stem_path}") 243 | 244 | stem_source = spec_utils.normalize( 245 | wave=stem_source, max_peak=self.normalization_threshold 246 | ) 247 | 248 | # Check if the numpy array is empty or contains very low values 249 | if np.max(np.abs(stem_source)) < 1e-6: 250 | self.logger.warning("Warning: stem_source array is near-silent or empty.") 251 | return 252 | 253 | # If output_dir is specified, create it and join it with stem_path 254 | if self.output_dir: 255 | os.makedirs(self.output_dir, exist_ok=True) 256 | stem_path = os.path.join(self.output_dir, stem_path) 257 | 258 | self.logger.debug(f"Audio data shape before processing: {stem_source.shape}") 259 | self.logger.debug(f"Data type before conversion: {stem_source.dtype}") 260 | 261 | # Ensure the audio data is in the correct format (e.g., int16) 262 | if stem_source.dtype != np.int16: 263 | stem_source = (stem_source * 32767).astype(np.int16) 264 | self.logger.debug("Converted stem_source to int16.") 265 | 266 | # Correctly interleave stereo channels 267 | stem_source_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16) 268 | stem_source_interleaved[0::2] = stem_source[:, 0] # Left channel 269 | stem_source_interleaved[1::2] = stem_source[:, 1] # Right channel 270 | 271 | self.logger.debug( 272 | f"Interleaved audio data shape: {stem_source_interleaved.shape}" 273 | ) 274 | 275 | # Create a pydub AudioSegment 276 | try: 277 | audio_segment = AudioSegment( 278 | stem_source_interleaved.tobytes(), 279 | frame_rate=self.sample_rate, 280 | sample_width=stem_source.dtype.itemsize, 281 | channels=2, 282 | ) 283 | self.logger.debug("Created AudioSegment successfully.") 284 | except (IOError, ValueError) as e: 285 | self.logger.error(f"Specific error creating AudioSegment: {e}") 286 | return 287 | 288 | # Determine file format based on the file extension 289 | file_format = stem_path.lower().split(".")[-1] 290 | 291 | # For m4a files, specify mp4 as the container format as the extension doesn't match the format name 292 | if file_format == "m4a": 293 | file_format = "mp4" 294 | elif file_format == "mka": 295 | file_format = "matroska" 296 | 297 | # Export using the determined format 298 | try: 299 | audio_segment.export(stem_path, format=file_format) 300 | self.logger.debug(f"Exported audio file successfully to {stem_path}") 301 | except (IOError, ValueError) as e: 302 | self.logger.error(f"Error exporting audio file: {e}") 303 | 304 | def clear_gpu_cache(self): 305 | """ 306 | This method clears the GPU cache to free up memory. 307 | """ 308 | self.logger.debug("Running garbage collection...") 309 | gc.collect() 310 | if self.torch_device == torch.device("mps"): 311 | self.logger.debug("Clearing MPS cache...") 312 | torch.mps.empty_cache() 313 | if self.torch_device == torch.device("cuda"): 314 | self.logger.debug("Clearing CUDA cache...") 315 | torch.cuda.empty_cache() 316 | 317 | def clear_file_specific_paths(self): 318 | """ 319 | Clears the file-specific variables which need to be cleared between processing different audio inputs. 
320 | """ 321 | self.logger.info("Clearing input audio file paths, sources and stems...") 322 | 323 | self.audio_file_path = None 324 | self.audio_file_base = None 325 | 326 | self.primary_source = None 327 | self.secondary_source = None 328 | 329 | self.primary_stem_output_path = None 330 | self.secondary_stem_output_path = None 331 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/UVR/uvr/uvr_lib_v5/__init__.py -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/attend.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from packaging import version 3 | from collections import namedtuple 4 | 5 | import torch 6 | from torch import nn, einsum 7 | import torch.nn.functional as F 8 | 9 | from einops import rearrange, reduce 10 | 11 | # constants 12 | 13 | FlashAttentionConfig = namedtuple( 14 | "FlashAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"] 15 | ) 16 | 17 | # helpers 18 | 19 | 20 | def exists(val): 21 | return val is not None 22 | 23 | 24 | def once(fn): 25 | called = False 26 | 27 | @wraps(fn) 28 | def inner(x): 29 | nonlocal called 30 | if called: 31 | return 32 | called = True 33 | return fn(x) 34 | 35 | return inner 36 | 37 | 38 | print_once = once(print) 39 | 40 | # main class 41 | 42 | 43 | class Attend(nn.Module): 44 | def __init__(self, dropout=0.0, flash=False): 45 | super().__init__() 46 | self.dropout = dropout 47 | self.attn_dropout = nn.Dropout(dropout) 48 | 49 | self.flash = flash 50 | assert not ( 51 | flash and version.parse(torch.__version__) < version.parse("2.0.0") 52 | ), "in order to use flash attention, you must be using pytorch 2.0 or above" 53 | 54 | # determine efficient attention configs for cuda and cpu 55 | 56 | self.cpu_config = FlashAttentionConfig(True, True, True) 57 | self.cuda_config = None 58 | 59 | if not torch.cuda.is_available() or not flash: 60 | return 61 | 62 | device_properties = torch.cuda.get_device_properties(torch.device("cuda")) 63 | 64 | if device_properties.major == 8 and device_properties.minor == 0: 65 | print_once( 66 | "A100 GPU detected, using flash attention if input tensor is on cuda" 67 | ) 68 | self.cuda_config = FlashAttentionConfig(True, False, False) 69 | else: 70 | self.cuda_config = FlashAttentionConfig(False, True, True) 71 | 72 | def flash_attn(self, q, k, v): 73 | _, heads, q_len, _, k_len, is_cuda, device = ( 74 | *q.shape, 75 | k.shape[-2], 76 | q.is_cuda, 77 | q.device, 78 | ) 79 | 80 | # Check if there is a compatible device for flash attention 81 | 82 | config = self.cuda_config if is_cuda else self.cpu_config 83 | 84 | # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale 85 | 86 | with torch.backends.cuda.sdp_kernel(**config._asdict()): 87 | out = F.scaled_dot_product_attention( 88 | q, k, v, dropout_p=self.dropout if self.training else 0.0 89 | ) 90 | 91 | return out 92 | 93 | def forward(self, q, k, v): 94 | """ 95 | einstein notation 96 | b - batch 97 | h - heads 98 | n, i, j - sequence length (base sequence length, source, target) 99 | d - feature dimension 100 | """ 101 | 102 | q_len, k_len, device = q.shape[-2], k.shape[-2], q.device 103 | 104 | scale = q.shape[-1] ** -0.5 105 | 106 | if self.flash: 107 | return 
self.flash_attn(q, k, v) 108 | 109 | # similarity 110 | 111 | sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale 112 | 113 | # attention 114 | 115 | attn = sim.softmax(dim=-1) 116 | attn = self.attn_dropout(attn) 117 | 118 | # aggregate values 119 | 120 | out = einsum(f"b h i j, b h j d -> b h i d", attn, v) 121 | 122 | return out 123 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import json 8 | import os 9 | import sys 10 | import time 11 | from dataclasses import dataclass, field 12 | from fractions import Fraction 13 | 14 | import torch as th 15 | from torch import distributed, nn 16 | from torch.nn.parallel.distributed import DistributedDataParallel 17 | 18 | from .augment import FlipChannels, FlipSign, Remix, Shift 19 | from .compressed import StemsSet, build_musdb_metadata, get_musdb_tracks 20 | from .model import Demucs 21 | from .parser import get_name, get_parser 22 | from .raw import Rawset 23 | from .tasnet import ConvTasNet 24 | from .test import evaluate 25 | from .train import train_model, validate_model 26 | from .utils import human_seconds, load_model, save_model, sizeof_fmt 27 | 28 | 29 | @dataclass 30 | class SavedState: 31 | metrics: list = field(default_factory=list) 32 | last_state: dict = None 33 | best_state: dict = None 34 | optimizer: dict = None 35 | 36 | 37 | def main(): 38 | parser = get_parser() 39 | args = parser.parse_args() 40 | name = get_name(parser, args) 41 | print(f"Experiment {name}") 42 | 43 | if args.musdb is None and args.rank == 0: 44 | print( 45 | "You must provide the path to the MusDB dataset with the --musdb flag. " 46 | "To download the MusDB dataset, see https://sigsep.github.io/datasets/musdb.html.", 47 | file=sys.stderr, 48 | ) 49 | sys.exit(1) 50 | 51 | eval_folder = args.evals / name 52 | eval_folder.mkdir(exist_ok=True, parents=True) 53 | args.logs.mkdir(exist_ok=True) 54 | metrics_path = args.logs / f"{name}.json" 55 | eval_folder.mkdir(exist_ok=True, parents=True) 56 | args.checkpoints.mkdir(exist_ok=True, parents=True) 57 | args.models.mkdir(exist_ok=True, parents=True) 58 | 59 | if args.device is None: 60 | device = "cpu" 61 | if th.cuda.is_available(): 62 | device = "cuda" 63 | else: 64 | device = args.device 65 | 66 | th.manual_seed(args.seed) 67 | # Prevents too many threads to be started when running `museval` as it can be quite 68 | # inefficient on NUMA architectures. 
69 | os.environ["OMP_NUM_THREADS"] = "1" 70 | 71 | if args.world_size > 1: 72 | if device != "cuda" and args.rank == 0: 73 | print( 74 | "Error: distributed training is only available with cuda device", 75 | file=sys.stderr, 76 | ) 77 | sys.exit(1) 78 | th.cuda.set_device(args.rank % th.cuda.device_count()) 79 | distributed.init_process_group( 80 | backend="nccl", 81 | init_method="tcp://" + args.master, 82 | rank=args.rank, 83 | world_size=args.world_size, 84 | ) 85 | 86 | checkpoint = args.checkpoints / f"{name}.th" 87 | checkpoint_tmp = args.checkpoints / f"{name}.th.tmp" 88 | if args.restart and checkpoint.exists(): 89 | checkpoint.unlink() 90 | 91 | if args.test: 92 | args.epochs = 1 93 | args.repeat = 0 94 | model = load_model(args.models / args.test) 95 | elif args.tasnet: 96 | model = ConvTasNet( 97 | audio_channels=args.audio_channels, samplerate=args.samplerate, X=args.X 98 | ) 99 | else: 100 | model = Demucs( 101 | audio_channels=args.audio_channels, 102 | channels=args.channels, 103 | context=args.context, 104 | depth=args.depth, 105 | glu=args.glu, 106 | growth=args.growth, 107 | kernel_size=args.kernel_size, 108 | lstm_layers=args.lstm_layers, 109 | rescale=args.rescale, 110 | rewrite=args.rewrite, 111 | sources=4, 112 | stride=args.conv_stride, 113 | upsample=args.upsample, 114 | samplerate=args.samplerate, 115 | ) 116 | model.to(device) 117 | if args.show: 118 | print(model) 119 | size = sizeof_fmt(4 * sum(p.numel() for p in model.parameters())) 120 | print(f"Model size {size}") 121 | return 122 | 123 | optimizer = th.optim.Adam(model.parameters(), lr=args.lr) 124 | 125 | try: 126 | saved = th.load(checkpoint, map_location="cpu") 127 | except IOError: 128 | saved = SavedState() 129 | else: 130 | model.load_state_dict(saved.last_state) 131 | optimizer.load_state_dict(saved.optimizer) 132 | 133 | if args.save_model: 134 | if args.rank == 0: 135 | model.to("cpu") 136 | model.load_state_dict(saved.best_state) 137 | save_model(model, args.models / f"{name}.th") 138 | return 139 | 140 | if args.rank == 0: 141 | done = args.logs / f"{name}.done" 142 | if done.exists(): 143 | done.unlink() 144 | 145 | if args.augment: 146 | augment = nn.Sequential( 147 | FlipSign(), 148 | FlipChannels(), 149 | Shift(args.data_stride), 150 | Remix(group_size=args.remix_group_size), 151 | ).to(device) 152 | else: 153 | augment = Shift(args.data_stride) 154 | 155 | if args.mse: 156 | criterion = nn.MSELoss() 157 | else: 158 | criterion = nn.L1Loss() 159 | 160 | # Setting number of samples so that all convolution windows are full. 161 | # Prevents hard to debug mistake with the prediction being shifted compared 162 | # to the input mixture. 
163 | samples = model.valid_length(args.samples) 164 | print(f"Number of training samples adjusted to {samples}") 165 | 166 | if args.raw: 167 | train_set = Rawset( 168 | args.raw / "train", 169 | samples=samples + args.data_stride, 170 | channels=args.audio_channels, 171 | streams=[0, 1, 2, 3, 4], 172 | stride=args.data_stride, 173 | ) 174 | 175 | valid_set = Rawset(args.raw / "valid", channels=args.audio_channels) 176 | else: 177 | if not args.metadata.is_file() and args.rank == 0: 178 | build_musdb_metadata(args.metadata, args.musdb, args.workers) 179 | if args.world_size > 1: 180 | distributed.barrier() 181 | metadata = json.load(open(args.metadata)) 182 | duration = Fraction(samples + args.data_stride, args.samplerate) 183 | stride = Fraction(args.data_stride, args.samplerate) 184 | train_set = StemsSet( 185 | get_musdb_tracks(args.musdb, subsets=["train"], split="train"), 186 | metadata, 187 | duration=duration, 188 | stride=stride, 189 | samplerate=args.samplerate, 190 | channels=args.audio_channels, 191 | ) 192 | valid_set = StemsSet( 193 | get_musdb_tracks(args.musdb, subsets=["train"], split="valid"), 194 | metadata, 195 | samplerate=args.samplerate, 196 | channels=args.audio_channels, 197 | ) 198 | 199 | best_loss = float("inf") 200 | for epoch, metrics in enumerate(saved.metrics): 201 | print( 202 | f"Epoch {epoch:03d}: " 203 | f"train={metrics['train']:.8f} " 204 | f"valid={metrics['valid']:.8f} " 205 | f"best={metrics['best']:.4f} " 206 | f"duration={human_seconds(metrics['duration'])}" 207 | ) 208 | best_loss = metrics["best"] 209 | 210 | if args.world_size > 1: 211 | dmodel = DistributedDataParallel( 212 | model, 213 | device_ids=[th.cuda.current_device()], 214 | output_device=th.cuda.current_device(), 215 | ) 216 | else: 217 | dmodel = model 218 | 219 | for epoch in range(len(saved.metrics), args.epochs): 220 | begin = time.time() 221 | model.train() 222 | train_loss = train_model( 223 | epoch, 224 | train_set, 225 | dmodel, 226 | criterion, 227 | optimizer, 228 | augment, 229 | batch_size=args.batch_size, 230 | device=device, 231 | repeat=args.repeat, 232 | seed=args.seed, 233 | workers=args.workers, 234 | world_size=args.world_size, 235 | ) 236 | model.eval() 237 | valid_loss = validate_model( 238 | epoch, 239 | valid_set, 240 | model, 241 | criterion, 242 | device=device, 243 | rank=args.rank, 244 | split=args.split_valid, 245 | world_size=args.world_size, 246 | ) 247 | 248 | duration = time.time() - begin 249 | if valid_loss < best_loss: 250 | best_loss = valid_loss 251 | saved.best_state = { 252 | key: value.to("cpu").clone() 253 | for key, value in model.state_dict().items() 254 | } 255 | saved.metrics.append( 256 | { 257 | "train": train_loss, 258 | "valid": valid_loss, 259 | "best": best_loss, 260 | "duration": duration, 261 | } 262 | ) 263 | if args.rank == 0: 264 | json.dump(saved.metrics, open(metrics_path, "w")) 265 | 266 | saved.last_state = model.state_dict() 267 | saved.optimizer = optimizer.state_dict() 268 | if args.rank == 0 and not args.test: 269 | th.save(saved, checkpoint_tmp) 270 | checkpoint_tmp.rename(checkpoint) 271 | 272 | print( 273 | f"Epoch {epoch:03d}: " 274 | f"train={train_loss:.8f} valid={valid_loss:.8f} best={best_loss:.4f} " 275 | f"duration={human_seconds(duration)}" 276 | ) 277 | 278 | del dmodel 279 | model.load_state_dict(saved.best_state) 280 | if args.eval_cpu: 281 | device = "cpu" 282 | model.to(device) 283 | model.eval() 284 | evaluate( 285 | model, 286 | args.musdb, 287 | eval_folder, 288 | rank=args.rank, 289 | 
world_size=args.world_size, 290 | device=device, 291 | save=args.save, 292 | split=args.split_valid, 293 | shifts=args.shifts, 294 | workers=args.eval_workers, 295 | ) 296 | model.to("cpu") 297 | save_model(model, args.models / f"{name}.th") 298 | if args.rank == 0: 299 | print("done") 300 | done.write_text("done") 301 | 302 | 303 | if __name__ == "__main__": 304 | main() 305 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/apply.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Code to apply a model to a mix. It will handle chunking with overlaps and 8 | inteprolation between chunks, as well as the "shift trick". 9 | """ 10 | from concurrent.futures import ThreadPoolExecutor 11 | import random 12 | import typing as tp 13 | from multiprocessing import Process, Queue, Pipe 14 | 15 | import torch as th 16 | from torch import nn 17 | from torch.nn import functional as F 18 | import tqdm 19 | import tkinter as tk 20 | 21 | from .demucs import Demucs 22 | from .hdemucs import HDemucs 23 | from .utils import center_trim, DummyPoolExecutor 24 | 25 | Model = tp.Union[Demucs, HDemucs] 26 | 27 | progress_bar_num = 0 28 | 29 | 30 | class BagOfModels(nn.Module): 31 | def __init__( 32 | self, 33 | models: tp.List[Model], 34 | weights: tp.Optional[tp.List[tp.List[float]]] = None, 35 | segment: tp.Optional[float] = None, 36 | ): 37 | """ 38 | Represents a bag of models with specific weights. 39 | You should call `apply_model` rather than calling directly the forward here for 40 | optimal performance. 41 | 42 | Args: 43 | models (list[nn.Module]): list of Demucs/HDemucs models. 44 | weights (list[list[float]]): list of weights. If None, assumed to 45 | be all ones, otherwise it should be a list of N list (N number of models), 46 | each containing S floats (S number of sources). 47 | segment (None or float): overrides the `segment` attribute of each model 48 | (this is performed inplace, be careful if you reuse the models passed). 
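        Illustrative sketch (`model_a`, `model_b` and `mix` are hypothetical; in practice
        the bag is usually built for you by the pretrained-model loader):

            bag = BagOfModels([model_a, model_b], weights=[[1.0] * 4, [0.5] * 4])
            estimates = apply_model(bag, mix, shifts=1, split=True, overlap=0.25)

        Both models must expose identical `sources`, `samplerate` and `audio_channels`,
        `mix` is a (batch, channels, samples) tensor, and each inner weights list holds
        one float per source.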
49 | """ 50 | 51 | super().__init__() 52 | assert len(models) > 0 53 | first = models[0] 54 | for other in models: 55 | assert other.sources == first.sources 56 | assert other.samplerate == first.samplerate 57 | assert other.audio_channels == first.audio_channels 58 | if segment is not None: 59 | other.segment = segment 60 | 61 | self.audio_channels = first.audio_channels 62 | self.samplerate = first.samplerate 63 | self.sources = first.sources 64 | self.models = nn.ModuleList(models) 65 | 66 | if weights is None: 67 | weights = [[1.0 for _ in first.sources] for _ in models] 68 | else: 69 | assert len(weights) == len(models) 70 | for weight in weights: 71 | assert len(weight) == len(first.sources) 72 | self.weights = weights 73 | 74 | def forward(self, x): 75 | raise NotImplementedError("Call `apply_model` on this.") 76 | 77 | 78 | class TensorChunk: 79 | def __init__(self, tensor, offset=0, length=None): 80 | total_length = tensor.shape[-1] 81 | assert offset >= 0 82 | assert offset < total_length 83 | 84 | if length is None: 85 | length = total_length - offset 86 | else: 87 | length = min(total_length - offset, length) 88 | 89 | if isinstance(tensor, TensorChunk): 90 | self.tensor = tensor.tensor 91 | self.offset = offset + tensor.offset 92 | else: 93 | self.tensor = tensor 94 | self.offset = offset 95 | self.length = length 96 | self.device = tensor.device 97 | 98 | @property 99 | def shape(self): 100 | shape = list(self.tensor.shape) 101 | shape[-1] = self.length 102 | return shape 103 | 104 | def padded(self, target_length): 105 | delta = target_length - self.length 106 | total_length = self.tensor.shape[-1] 107 | assert delta >= 0 108 | 109 | start = self.offset - delta // 2 110 | end = start + target_length 111 | 112 | correct_start = max(0, start) 113 | correct_end = min(total_length, end) 114 | 115 | pad_left = correct_start - start 116 | pad_right = end - correct_end 117 | 118 | out = F.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right)) 119 | assert out.shape[-1] == target_length 120 | return out 121 | 122 | 123 | def tensor_chunk(tensor_or_chunk): 124 | if isinstance(tensor_or_chunk, TensorChunk): 125 | return tensor_or_chunk 126 | else: 127 | assert isinstance(tensor_or_chunk, th.Tensor) 128 | return TensorChunk(tensor_or_chunk) 129 | 130 | 131 | def apply_model( 132 | model, 133 | mix, 134 | shifts=1, 135 | split=True, 136 | overlap=0.25, 137 | transition_power=1.0, 138 | static_shifts=1, 139 | set_progress_bar=None, 140 | device=None, 141 | progress=False, 142 | num_workers=0, 143 | pool=None, 144 | ): 145 | """ 146 | Apply model to a given mixture. 147 | 148 | Args: 149 | shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec 150 | and apply the oppositve shift to the output. This is repeated `shifts` time and 151 | all predictions are averaged. This effectively makes the model time equivariant 152 | and improves SDR by up to 0.2 points. 153 | split (bool): if True, the input will be broken down in 8 seconds extracts 154 | and predictions will be performed individually on each and concatenated. 155 | Useful for model with large memory footprint like Tasnet. 156 | progress (bool): if True, show a progress bar (requires split=True) 157 | device (torch.device, str, or None): if provided, device on which to 158 | execute the computation, otherwise `mix.device` is assumed. 
159 | When `device` is different from `mix.device`, only local computations will 160 | be on `device`, while the entire tracks will be stored on `mix.device`. 161 | """ 162 | 163 | global fut_length 164 | global bag_num 165 | global prog_bar 166 | 167 | if device is None: 168 | device = mix.device 169 | else: 170 | device = th.device(device) 171 | if pool is None: 172 | if num_workers > 0 and device.type == "cpu": 173 | pool = ThreadPoolExecutor(num_workers) 174 | else: 175 | pool = DummyPoolExecutor() 176 | 177 | kwargs = { 178 | "shifts": shifts, 179 | "split": split, 180 | "overlap": overlap, 181 | "transition_power": transition_power, 182 | "progress": progress, 183 | "device": device, 184 | "pool": pool, 185 | "set_progress_bar": set_progress_bar, 186 | "static_shifts": static_shifts, 187 | } 188 | 189 | if isinstance(model, BagOfModels): 190 | # Special treatment for bag of model. 191 | # We explicitely apply multiple times `apply_model` so that the random shifts 192 | # are different for each model. 193 | 194 | estimates = 0 195 | totals = [0] * len(model.sources) 196 | bag_num = len(model.models) 197 | fut_length = 0 198 | prog_bar = 0 199 | current_model = 0 # (bag_num + 1) 200 | for sub_model, weight in zip(model.models, model.weights): 201 | original_model_device = next(iter(sub_model.parameters())).device 202 | sub_model.to(device) 203 | fut_length += fut_length 204 | current_model += 1 205 | out = apply_model(sub_model, mix, **kwargs) 206 | sub_model.to(original_model_device) 207 | for k, inst_weight in enumerate(weight): 208 | out[:, k, :, :] *= inst_weight 209 | totals[k] += inst_weight 210 | estimates += out 211 | del out 212 | 213 | for k in range(estimates.shape[1]): 214 | estimates[:, k, :, :] /= totals[k] 215 | return estimates 216 | 217 | model.to(device) 218 | model.eval() 219 | assert transition_power >= 1, "transition_power < 1 leads to weird behavior." 220 | batch, channels, length = mix.shape 221 | 222 | if shifts: 223 | kwargs["shifts"] = 0 224 | max_shift = int(0.5 * model.samplerate) 225 | mix = tensor_chunk(mix) 226 | padded_mix = mix.padded(length + 2 * max_shift) 227 | out = 0 228 | for _ in range(shifts): 229 | offset = random.randint(0, max_shift) 230 | shifted = TensorChunk(padded_mix, offset, length + max_shift - offset) 231 | shifted_out = apply_model(model, shifted, **kwargs) 232 | out += shifted_out[..., max_shift - offset :] 233 | out /= shifts 234 | return out 235 | elif split: 236 | kwargs["split"] = False 237 | out = th.zeros(batch, len(model.sources), channels, length, device=mix.device) 238 | sum_weight = th.zeros(length, device=mix.device) 239 | segment = int(model.samplerate * model.segment) 240 | stride = int((1 - overlap) * segment) 241 | offsets = range(0, length, stride) 242 | scale = float(format(stride / model.samplerate, ".2f")) 243 | # We start from a triangle shaped weight, with maximal weight in the middle 244 | # of the segment. Then we normalize and take to the power `transition_power`. 245 | # Large values of transition power will lead to sharper transitions. 246 | weight = th.cat( 247 | [ 248 | th.arange(1, segment // 2 + 1, device=device), 249 | th.arange(segment - segment // 2, 0, -1, device=device), 250 | ] 251 | ) 252 | assert len(weight) == segment 253 | # If the overlap < 50%, this will translate to linear transition when 254 | # transition_power is 1. 
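# Worked example of the weighting above (segment of 6 samples, purely illustrative):
# the raw triangle is [1, 2, 3, 3, 2, 1]; after dividing by its max and raising to
# transition_power p it becomes [(1/3)**p, (2/3)**p, 1, 1, (2/3)**p, (1/3)**p], so a
# larger p concentrates weight in the centre of each chunk and sharpens transitions.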
255 | weight = (weight / weight.max()) ** transition_power 256 | futures = [] 257 | for offset in offsets: 258 | chunk = TensorChunk(mix, offset, segment) 259 | future = pool.submit(apply_model, model, chunk, **kwargs) 260 | futures.append((future, offset)) 261 | offset += segment 262 | if progress: 263 | futures = tqdm.tqdm(futures) 264 | for future, offset in futures: 265 | if set_progress_bar: 266 | fut_length = len(futures) * bag_num * static_shifts 267 | prog_bar += 1 268 | set_progress_bar(0.1, (0.8 / fut_length * prog_bar)) 269 | chunk_out = future.result() 270 | chunk_length = chunk_out.shape[-1] 271 | out[..., offset : offset + segment] += ( 272 | weight[:chunk_length] * chunk_out 273 | ).to(mix.device) 274 | sum_weight[offset : offset + segment] += weight[:chunk_length].to( 275 | mix.device 276 | ) 277 | assert sum_weight.min() > 0 278 | out /= sum_weight 279 | return out 280 | else: 281 | if hasattr(model, "valid_length"): 282 | valid_length = model.valid_length(length) 283 | else: 284 | valid_length = length 285 | mix = tensor_chunk(mix) 286 | padded_mix = mix.padded(valid_length).to(device) 287 | with th.no_grad(): 288 | out = model(padded_mix) 289 | return center_trim(out, length) 290 | 291 | 292 | def demucs_segments(demucs_segment, demucs_model): 293 | 294 | if demucs_segment == "Default": 295 | segment = None 296 | if isinstance(demucs_model, BagOfModels): 297 | if segment is not None: 298 | for sub in demucs_model.models: 299 | sub.segment = segment 300 | else: 301 | if segment is not None: 302 | sub.segment = segment 303 | else: 304 | try: 305 | segment = int(demucs_segment) 306 | if isinstance(demucs_model, BagOfModels): 307 | if segment is not None: 308 | for sub in demucs_model.models: 309 | sub.segment = segment 310 | else: 311 | if segment is not None: 312 | sub.segment = segment 313 | except: 314 | segment = None 315 | if isinstance(demucs_model, BagOfModels): 316 | if segment is not None: 317 | for sub in demucs_model.models: 318 | sub.segment = segment 319 | else: 320 | if segment is not None: 321 | sub.segment = segment 322 | 323 | return demucs_model 324 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import math 8 | 9 | import torch as th 10 | from torch import nn 11 | 12 | from .utils import capture_init, center_trim 13 | 14 | 15 | class BLSTM(nn.Module): 16 | def __init__(self, dim, layers=1): 17 | super().__init__() 18 | self.lstm = nn.LSTM( 19 | bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim 20 | ) 21 | self.linear = nn.Linear(2 * dim, dim) 22 | 23 | def forward(self, x): 24 | x = x.permute(2, 0, 1) 25 | x = self.lstm(x)[0] 26 | x = self.linear(x) 27 | x = x.permute(1, 2, 0) 28 | return x 29 | 30 | 31 | def rescale_conv(conv, reference): 32 | std = conv.weight.std().detach() 33 | scale = (std / reference) ** 0.5 34 | conv.weight.data /= scale 35 | if conv.bias is not None: 36 | conv.bias.data /= scale 37 | 38 | 39 | def rescale_module(module, reference): 40 | for sub in module.modules(): 41 | if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)): 42 | rescale_conv(sub, reference) 43 | 44 | 45 | def upsample(x, stride): 46 | """ 47 | Linear upsampling, the output will be `stride` times longer. 48 | """ 49 | batch, channels, time = x.size() 50 | weight = th.arange(stride, device=x.device, dtype=th.float) / stride 51 | x = x.view(batch, channels, time, 1) 52 | out = x[..., :-1, :] * (1 - weight) + x[..., 1:, :] * weight 53 | return out.reshape(batch, channels, -1) 54 | 55 | 56 | def downsample(x, stride): 57 | """ 58 | Downsample x by decimation. 59 | """ 60 | return x[:, :, ::stride] 61 | 62 | 63 | class Demucs(nn.Module): 64 | @capture_init 65 | def __init__( 66 | self, 67 | sources=4, 68 | audio_channels=2, 69 | channels=64, 70 | depth=6, 71 | rewrite=True, 72 | glu=True, 73 | upsample=False, 74 | rescale=0.1, 75 | kernel_size=8, 76 | stride=4, 77 | growth=2.0, 78 | lstm_layers=2, 79 | context=3, 80 | samplerate=44100, 81 | ): 82 | """ 83 | Args: 84 | sources (int): number of sources to separate 85 | audio_channels (int): stereo or mono 86 | channels (int): first convolution channels 87 | depth (int): number of encoder/decoder layers 88 | rewrite (bool): add 1x1 convolution to each encoder layer 89 | and a convolution to each decoder layer. 90 | For the decoder layer, `context` gives the kernel size. 91 | glu (bool): use glu instead of ReLU 92 | upsample (bool): use linear upsampling with convolutions 93 | Wave-U-Net style, instead of transposed convolutions 94 | rescale (int): rescale initial weights of convolutions 95 | to get their standard deviation closer to `rescale` 96 | kernel_size (int): kernel size for convolutions 97 | stride (int): stride for convolutions 98 | growth (float): multiply (resp divide) number of channels by that 99 | for each layer of the encoder (resp decoder) 100 | lstm_layers (int): number of lstm layers, 0 = no lstm 101 | context (int): kernel size of the convolution in the 102 | decoder before the transposed convolution. If > 1, 103 | will provide some context from neighboring time 104 | steps. 
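        Illustrative construction sketch (the values shown simply repeat the defaults
        above; `mix` is a hypothetical (batch, audio_channels, samples) tensor):

            model = Demucs(sources=4, audio_channels=2, channels=64, depth=6)
            out = model(mix)   # shape (batch, sources, audio_channels, T_out)

        For clean alignment the mixture should first be padded to `valid_length` (see
        below).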
105 | """ 106 | 107 | super().__init__() 108 | self.audio_channels = audio_channels 109 | self.sources = sources 110 | self.kernel_size = kernel_size 111 | self.context = context 112 | self.stride = stride 113 | self.depth = depth 114 | self.upsample = upsample 115 | self.channels = channels 116 | self.samplerate = samplerate 117 | 118 | self.encoder = nn.ModuleList() 119 | self.decoder = nn.ModuleList() 120 | 121 | self.final = None 122 | if upsample: 123 | self.final = nn.Conv1d( 124 | channels + audio_channels, sources * audio_channels, 1 125 | ) 126 | stride = 1 127 | 128 | if glu: 129 | activation = nn.GLU(dim=1) 130 | ch_scale = 2 131 | else: 132 | activation = nn.ReLU() 133 | ch_scale = 1 134 | in_channels = audio_channels 135 | for index in range(depth): 136 | encode = [] 137 | encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()] 138 | if rewrite: 139 | encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation] 140 | self.encoder.append(nn.Sequential(*encode)) 141 | 142 | decode = [] 143 | if index > 0: 144 | out_channels = in_channels 145 | else: 146 | if upsample: 147 | out_channels = channels 148 | else: 149 | out_channels = sources * audio_channels 150 | if rewrite: 151 | decode += [ 152 | nn.Conv1d(channels, ch_scale * channels, context), 153 | activation, 154 | ] 155 | if upsample: 156 | decode += [nn.Conv1d(channels, out_channels, kernel_size, stride=1)] 157 | else: 158 | decode += [ 159 | nn.ConvTranspose1d(channels, out_channels, kernel_size, stride) 160 | ] 161 | if index > 0: 162 | decode.append(nn.ReLU()) 163 | self.decoder.insert(0, nn.Sequential(*decode)) 164 | in_channels = channels 165 | channels = int(growth * channels) 166 | 167 | channels = in_channels 168 | 169 | if lstm_layers: 170 | self.lstm = BLSTM(channels, lstm_layers) 171 | else: 172 | self.lstm = None 173 | 174 | if rescale: 175 | rescale_module(self, reference=rescale) 176 | 177 | def valid_length(self, length): 178 | """ 179 | Return the nearest valid length to use with the model so that 180 | there is no time steps left over in a convolutions, e.g. for all 181 | layers, size of the input - kernel_size % stride = 0. 182 | 183 | If the mixture has a valid length, the estimated sources 184 | will have exactly the same length when context = 1. If context > 1, 185 | the two signals can be center trimmed to match. 186 | 187 | For training, extracts should have a valid length.For evaluation 188 | on full tracks we recommend passing `pad = True` to :method:`forward`. 
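        Illustrative sketch (a hypothetical ten-second mixture at 44.1 kHz; `F` is
        torch.nn.functional and the exact value returned depends on depth, kernel_size,
        stride and context):

            target = model.valid_length(10 * 44100)           # a valid length >= the request
            padded = F.pad(mix, (0, target - mix.shape[-1]))
            sources = model(padded)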
189 | """ 190 | for _ in range(self.depth): 191 | if self.upsample: 192 | length = math.ceil(length / self.stride) + self.kernel_size - 1 193 | else: 194 | length = math.ceil((length - self.kernel_size) / self.stride) + 1 195 | length = max(1, length) 196 | length += self.context - 1 197 | for _ in range(self.depth): 198 | if self.upsample: 199 | length = length * self.stride + self.kernel_size - 1 200 | else: 201 | length = (length - 1) * self.stride + self.kernel_size 202 | 203 | return int(length) 204 | 205 | def forward(self, mix): 206 | x = mix 207 | saved = [x] 208 | for encode in self.encoder: 209 | x = encode(x) 210 | saved.append(x) 211 | if self.upsample: 212 | x = downsample(x, self.stride) 213 | if self.lstm: 214 | x = self.lstm(x) 215 | for decode in self.decoder: 216 | if self.upsample: 217 | x = upsample(x, stride=self.stride) 218 | skip = center_trim(saved.pop(-1), x) 219 | x = x + skip 220 | x = decode(x) 221 | if self.final: 222 | skip = center_trim(saved.pop(-1), x) 223 | x = th.cat([x, skip], dim=1) 224 | x = self.final(x) 225 | 226 | x = x.view(x.size(0), self.sources, self.audio_channels, x.size(-1)) 227 | return x 228 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/model_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | 9 | import julius 10 | from torch import nn 11 | from .tasnet_v2 import ConvTasNet 12 | 13 | from .utils import capture_init, center_trim 14 | 15 | 16 | class BLSTM(nn.Module): 17 | def __init__(self, dim, layers=1): 18 | super().__init__() 19 | self.lstm = nn.LSTM( 20 | bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim 21 | ) 22 | self.linear = nn.Linear(2 * dim, dim) 23 | 24 | def forward(self, x): 25 | x = x.permute(2, 0, 1) 26 | x = self.lstm(x)[0] 27 | x = self.linear(x) 28 | x = x.permute(1, 2, 0) 29 | return x 30 | 31 | 32 | def rescale_conv(conv, reference): 33 | std = conv.weight.std().detach() 34 | scale = (std / reference) ** 0.5 35 | conv.weight.data /= scale 36 | if conv.bias is not None: 37 | conv.bias.data /= scale 38 | 39 | 40 | def rescale_module(module, reference): 41 | for sub in module.modules(): 42 | if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)): 43 | rescale_conv(sub, reference) 44 | 45 | 46 | def auto_load_demucs_model_v2(sources, demucs_model_name): 47 | 48 | if "48" in demucs_model_name: 49 | channels = 48 50 | elif "unittest" in demucs_model_name: 51 | channels = 4 52 | else: 53 | channels = 64 54 | 55 | if "tasnet" in demucs_model_name: 56 | init_demucs_model = ConvTasNet(sources, X=10) 57 | else: 58 | init_demucs_model = Demucs(sources, channels=channels) 59 | 60 | return init_demucs_model 61 | 62 | 63 | class Demucs(nn.Module): 64 | @capture_init 65 | def __init__( 66 | self, 67 | sources, 68 | audio_channels=2, 69 | channels=64, 70 | depth=6, 71 | rewrite=True, 72 | glu=True, 73 | rescale=0.1, 74 | resample=True, 75 | kernel_size=8, 76 | stride=4, 77 | growth=2.0, 78 | lstm_layers=2, 79 | context=3, 80 | normalize=False, 81 | samplerate=44100, 82 | segment_length=4 * 10 * 44100, 83 | ): 84 | """ 85 | Args: 86 | sources (list[str]): list of source names 87 | audio_channels (int): stereo or mono 88 | channels (int): first convolution channels 89 | depth (int): 
number of encoder/decoder layers 90 | rewrite (bool): add 1x1 convolution to each encoder layer 91 | and a convolution to each decoder layer. 92 | For the decoder layer, `context` gives the kernel size. 93 | glu (bool): use glu instead of ReLU 94 | resample_input (bool): upsample x2 the input and downsample /2 the output. 95 | rescale (int): rescale initial weights of convolutions 96 | to get their standard deviation closer to `rescale` 97 | kernel_size (int): kernel size for convolutions 98 | stride (int): stride for convolutions 99 | growth (float): multiply (resp divide) number of channels by that 100 | for each layer of the encoder (resp decoder) 101 | lstm_layers (int): number of lstm layers, 0 = no lstm 102 | context (int): kernel size of the convolution in the 103 | decoder before the transposed convolution. If > 1, 104 | will provide some context from neighboring time 105 | steps. 106 | samplerate (int): stored as meta information for easing 107 | future evaluations of the model. 108 | segment_length (int): stored as meta information for easing 109 | future evaluations of the model. Length of the segments on which 110 | the model was trained. 111 | """ 112 | 113 | super().__init__() 114 | self.audio_channels = audio_channels 115 | self.sources = sources 116 | self.kernel_size = kernel_size 117 | self.context = context 118 | self.stride = stride 119 | self.depth = depth 120 | self.resample = resample 121 | self.channels = channels 122 | self.normalize = normalize 123 | self.samplerate = samplerate 124 | self.segment_length = segment_length 125 | 126 | self.encoder = nn.ModuleList() 127 | self.decoder = nn.ModuleList() 128 | 129 | if glu: 130 | activation = nn.GLU(dim=1) 131 | ch_scale = 2 132 | else: 133 | activation = nn.ReLU() 134 | ch_scale = 1 135 | in_channels = audio_channels 136 | for index in range(depth): 137 | encode = [] 138 | encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()] 139 | if rewrite: 140 | encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation] 141 | self.encoder.append(nn.Sequential(*encode)) 142 | 143 | decode = [] 144 | if index > 0: 145 | out_channels = in_channels 146 | else: 147 | out_channels = len(self.sources) * audio_channels 148 | if rewrite: 149 | decode += [ 150 | nn.Conv1d(channels, ch_scale * channels, context), 151 | activation, 152 | ] 153 | decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)] 154 | if index > 0: 155 | decode.append(nn.ReLU()) 156 | self.decoder.insert(0, nn.Sequential(*decode)) 157 | in_channels = channels 158 | channels = int(growth * channels) 159 | 160 | channels = in_channels 161 | 162 | if lstm_layers: 163 | self.lstm = BLSTM(channels, lstm_layers) 164 | else: 165 | self.lstm = None 166 | 167 | if rescale: 168 | rescale_module(self, reference=rescale) 169 | 170 | def valid_length(self, length): 171 | """ 172 | Return the nearest valid length to use with the model so that 173 | there is no time steps left over in a convolutions, e.g. for all 174 | layers, size of the input - kernel_size % stride = 0. 175 | 176 | If the mixture has a valid length, the estimated sources 177 | will have exactly the same length when context = 1. If context > 1, 178 | the two signals can be center trimmed to match. 179 | 180 | For training, extracts should have a valid length.For evaluation 181 | on full tracks we recommend passing `pad = True` to :method:`forward`. 
182 | """ 183 | if self.resample: 184 | length *= 2 185 | for _ in range(self.depth): 186 | length = math.ceil((length - self.kernel_size) / self.stride) + 1 187 | length = max(1, length) 188 | length += self.context - 1 189 | for _ in range(self.depth): 190 | length = (length - 1) * self.stride + self.kernel_size 191 | 192 | if self.resample: 193 | length = math.ceil(length / 2) 194 | return int(length) 195 | 196 | def forward(self, mix): 197 | x = mix 198 | 199 | if self.normalize: 200 | mono = mix.mean(dim=1, keepdim=True) 201 | mean = mono.mean(dim=-1, keepdim=True) 202 | std = mono.std(dim=-1, keepdim=True) 203 | else: 204 | mean = 0 205 | std = 1 206 | 207 | x = (x - mean) / (1e-5 + std) 208 | 209 | if self.resample: 210 | x = julius.resample_frac(x, 1, 2) 211 | 212 | saved = [] 213 | for encode in self.encoder: 214 | x = encode(x) 215 | saved.append(x) 216 | if self.lstm: 217 | x = self.lstm(x) 218 | for decode in self.decoder: 219 | skip = center_trim(saved.pop(-1), x) 220 | x = x + skip 221 | x = decode(x) 222 | 223 | if self.resample: 224 | x = julius.resample_frac(x, 2, 1) 225 | x = x * std + mean 226 | x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1)) 227 | return x 228 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/pretrained.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Loading pretrained models. 7 | """ 8 | 9 | import logging 10 | from pathlib import Path 11 | import typing as tp 12 | 13 | # from dora.log import fatal 14 | 15 | import logging 16 | 17 | from diffq import DiffQuantizer 18 | import torch.hub 19 | 20 | from .model import Demucs 21 | from .tasnet_v2 import ConvTasNet 22 | from .utils import set_state 23 | 24 | from .hdemucs import HDemucs 25 | from .repo import ( 26 | RemoteRepo, 27 | LocalRepo, 28 | ModelOnlyRepo, 29 | BagOnlyRepo, 30 | AnyModelRepo, 31 | ModelLoadingError, 32 | ) # noqa 33 | 34 | logger = logging.getLogger(__name__) 35 | ROOT_URL = "https://dl.fbaipublicfiles.com/demucs/mdx_final/" 36 | REMOTE_ROOT = Path(__file__).parent / "remote" 37 | 38 | SOURCES = ["drums", "bass", "other", "vocals"] 39 | 40 | 41 | def demucs_unittest(): 42 | model = HDemucs(channels=4, sources=SOURCES) 43 | return model 44 | 45 | 46 | def add_model_flags(parser): 47 | group = parser.add_mutually_exclusive_group(required=False) 48 | group.add_argument("-s", "--sig", help="Locally trained XP signature.") 49 | group.add_argument( 50 | "-n", 51 | "--name", 52 | default="mdx_extra_q", 53 | help="Pretrained model name or signature. 
Default is mdx_extra_q.", 54 | ) 55 | parser.add_argument( 56 | "--repo", 57 | type=Path, 58 | help="Folder containing all pre-trained models for use with -n.", 59 | ) 60 | 61 | 62 | def _parse_remote_files(remote_file_list) -> tp.Dict[str, str]: 63 | root: str = "" 64 | models: tp.Dict[str, str] = {} 65 | for line in remote_file_list.read_text().split("\n"): 66 | line = line.strip() 67 | if line.startswith("#"): 68 | continue 69 | elif line.startswith("root:"): 70 | root = line.split(":", 1)[1].strip() 71 | else: 72 | sig = line.split("-", 1)[0] 73 | assert sig not in models 74 | models[sig] = ROOT_URL + root + line 75 | return models 76 | 77 | 78 | def get_model(name: str, repo: tp.Optional[Path] = None): 79 | """`name` must be a bag of models name or a pretrained signature 80 | from the remote AWS model repo or the specified local repo if `repo` is not None. 81 | """ 82 | if name == "demucs_unittest": 83 | return demucs_unittest() 84 | model_repo: ModelOnlyRepo 85 | if repo is None: 86 | models = _parse_remote_files(REMOTE_ROOT / "files.txt") 87 | model_repo = RemoteRepo(models) 88 | bag_repo = BagOnlyRepo(REMOTE_ROOT, model_repo) 89 | else: 90 | if not repo.is_dir(): 91 | fatal(f"{repo} must exist and be a directory.") 92 | model_repo = LocalRepo(repo) 93 | bag_repo = BagOnlyRepo(repo, model_repo) 94 | any_repo = AnyModelRepo(model_repo, bag_repo) 95 | model = any_repo.get_model(name) 96 | model.eval() 97 | return model 98 | 99 | 100 | def get_model_from_args(args): 101 | """ 102 | Load local model package or pre-trained model. 103 | """ 104 | return get_model(name=args.name, repo=args.repo) 105 | 106 | 107 | logger = logging.getLogger(__name__) 108 | ROOT = "https://dl.fbaipublicfiles.com/demucs/v3.0/" 109 | 110 | PRETRAINED_MODELS = { 111 | "demucs": "e07c671f", 112 | "demucs48_hq": "28a1282c", 113 | "demucs_extra": "3646af93", 114 | "demucs_quantized": "07afea75", 115 | "tasnet": "beb46fac", 116 | "tasnet_extra": "df3777b2", 117 | "demucs_unittest": "09ebc15f", 118 | } 119 | 120 | SOURCES = ["drums", "bass", "other", "vocals"] 121 | 122 | 123 | def get_url(name): 124 | sig = PRETRAINED_MODELS[name] 125 | return ROOT + name + "-" + sig[:8] + ".th" 126 | 127 | 128 | def is_pretrained(name): 129 | return name in PRETRAINED_MODELS 130 | 131 | 132 | def load_pretrained(name): 133 | if name == "demucs": 134 | return demucs(pretrained=True) 135 | elif name == "demucs48_hq": 136 | return demucs(pretrained=True, hq=True, channels=48) 137 | elif name == "demucs_extra": 138 | return demucs(pretrained=True, extra=True) 139 | elif name == "demucs_quantized": 140 | return demucs(pretrained=True, quantized=True) 141 | elif name == "demucs_unittest": 142 | return demucs_unittest(pretrained=True) 143 | elif name == "tasnet": 144 | return tasnet(pretrained=True) 145 | elif name == "tasnet_extra": 146 | return tasnet(pretrained=True, extra=True) 147 | else: 148 | raise ValueError(f"Invalid pretrained name {name}") 149 | 150 | 151 | def _load_state(name, model, quantizer=None): 152 | url = get_url(name) 153 | state = torch.hub.load_state_dict_from_url(url, map_location="cpu", check_hash=True) 154 | set_state(model, quantizer, state) 155 | if quantizer: 156 | quantizer.detach() 157 | 158 | 159 | def demucs_unittest(pretrained=True): 160 | model = Demucs(channels=4, sources=SOURCES) 161 | if pretrained: 162 | _load_state("demucs_unittest", model) 163 | return model 164 | 165 | 166 | def demucs(pretrained=True, extra=False, quantized=False, hq=False, channels=64): 167 | if not pretrained and (extra or 
quantized or hq): 168 | raise ValueError("if extra or quantized is True, pretrained must be True.") 169 | model = Demucs(sources=SOURCES, channels=channels) 170 | if pretrained: 171 | name = "demucs" 172 | if channels != 64: 173 | name += str(channels) 174 | quantizer = None 175 | if sum([extra, quantized, hq]) > 1: 176 | raise ValueError("Only one of extra, quantized, hq, can be True.") 177 | if quantized: 178 | quantizer = DiffQuantizer(model, group_size=8, min_size=1) 179 | name += "_quantized" 180 | if extra: 181 | name += "_extra" 182 | if hq: 183 | name += "_hq" 184 | _load_state(name, model, quantizer) 185 | return model 186 | 187 | 188 | def tasnet(pretrained=True, extra=False): 189 | if not pretrained and extra: 190 | raise ValueError("if extra is True, pretrained must be True.") 191 | model = ConvTasNet(X=10, sources=SOURCES) 192 | if pretrained: 193 | name = "tasnet" 194 | if extra: 195 | name = "tasnet_extra" 196 | _load_state(name, model) 197 | return model 198 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/repo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Represents a model repository, including pre-trained models and bags of models. 7 | A repo can either be the main remote repository stored in AWS, or a local repository 8 | with your own models. 9 | """ 10 | 11 | from hashlib import sha256 12 | from pathlib import Path 13 | import typing as tp 14 | 15 | import torch 16 | import yaml 17 | 18 | from .apply import BagOfModels, Model 19 | from .states import load_model 20 | 21 | 22 | AnyModel = tp.Union[Model, BagOfModels] 23 | 24 | 25 | class ModelLoadingError(RuntimeError): 26 | pass 27 | 28 | 29 | def check_checksum(path: Path, checksum: str): 30 | sha = sha256() 31 | with open(path, "rb") as file: 32 | while True: 33 | buf = file.read(2**20) 34 | if not buf: 35 | break 36 | sha.update(buf) 37 | actual_checksum = sha.hexdigest()[: len(checksum)] 38 | if actual_checksum != checksum: 39 | raise ModelLoadingError( 40 | f"Invalid checksum for file {path}, " 41 | f"expected {checksum} but got {actual_checksum}" 42 | ) 43 | 44 | 45 | class ModelOnlyRepo: 46 | """Base class for all model only repos.""" 47 | 48 | def has_model(self, sig: str) -> bool: 49 | raise NotImplementedError() 50 | 51 | def get_model(self, sig: str) -> Model: 52 | raise NotImplementedError() 53 | 54 | 55 | class RemoteRepo(ModelOnlyRepo): 56 | def __init__(self, models: tp.Dict[str, str]): 57 | self._models = models 58 | 59 | def has_model(self, sig: str) -> bool: 60 | return sig in self._models 61 | 62 | def get_model(self, sig: str) -> Model: 63 | try: 64 | url = self._models[sig] 65 | except KeyError: 66 | raise ModelLoadingError( 67 | f"Could not find a pre-trained model with signature {sig}." 
68 | ) 69 | pkg = torch.hub.load_state_dict_from_url( 70 | url, map_location="cpu", check_hash=True 71 | ) 72 | return load_model(pkg) 73 | 74 | 75 | class LocalRepo(ModelOnlyRepo): 76 | def __init__(self, root: Path): 77 | self.root = root 78 | self.scan() 79 | 80 | def scan(self): 81 | self._models = {} 82 | self._checksums = {} 83 | for file in self.root.iterdir(): 84 | if file.suffix == ".th": 85 | if "-" in file.stem: 86 | xp_sig, checksum = file.stem.split("-") 87 | self._checksums[xp_sig] = checksum 88 | else: 89 | xp_sig = file.stem 90 | if xp_sig in self._models: 91 | print("Whats xp? ", xp_sig) 92 | raise ModelLoadingError( 93 | f"Duplicate pre-trained model exist for signature {xp_sig}. " 94 | "Please delete all but one." 95 | ) 96 | self._models[xp_sig] = file 97 | 98 | def has_model(self, sig: str) -> bool: 99 | return sig in self._models 100 | 101 | def get_model(self, sig: str) -> Model: 102 | try: 103 | file = self._models[sig] 104 | except KeyError: 105 | raise ModelLoadingError( 106 | f"Could not find pre-trained model with signature {sig}." 107 | ) 108 | if sig in self._checksums: 109 | check_checksum(file, self._checksums[sig]) 110 | return load_model(file) 111 | 112 | 113 | class BagOnlyRepo: 114 | """Handles only YAML files containing bag of models, leaving the actual 115 | model loading to some Repo. 116 | """ 117 | 118 | def __init__(self, root: Path, model_repo: ModelOnlyRepo): 119 | self.root = root 120 | self.model_repo = model_repo 121 | self.scan() 122 | 123 | def scan(self): 124 | self._bags = {} 125 | for file in self.root.iterdir(): 126 | if file.suffix == ".yaml": 127 | self._bags[file.stem] = file 128 | 129 | def has_model(self, name: str) -> bool: 130 | return name in self._bags 131 | 132 | def get_model(self, name: str) -> BagOfModels: 133 | try: 134 | yaml_file = self._bags[name] 135 | except KeyError: 136 | raise ModelLoadingError( 137 | f"{name} is neither a single pre-trained model or " "a bag of models." 138 | ) 139 | bag = yaml.safe_load(open(yaml_file)) 140 | signatures = bag["models"] 141 | models = [self.model_repo.get_model(sig) for sig in signatures] 142 | weights = bag.get("weights") 143 | segment = bag.get("segment") 144 | return BagOfModels(models, weights, segment) 145 | 146 | 147 | class AnyModelRepo: 148 | def __init__(self, model_repo: ModelOnlyRepo, bag_repo: BagOnlyRepo): 149 | self.model_repo = model_repo 150 | self.bag_repo = bag_repo 151 | 152 | def has_model(self, name_or_sig: str) -> bool: 153 | return self.model_repo.has_model(name_or_sig) or self.bag_repo.has_model( 154 | name_or_sig 155 | ) 156 | 157 | def get_model(self, name_or_sig: str) -> AnyModel: 158 | # print('name_or_sig: ', name_or_sig) 159 | if self.model_repo.has_model(name_or_sig): 160 | return self.model_repo.get_model(name_or_sig) 161 | else: 162 | return self.bag_repo.get_model(name_or_sig) 163 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/spec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | """Conveniance wrapper to perform STFT and iSTFT""" 7 | 8 | import torch as th 9 | 10 | 11 | def spectro(x, n_fft=512, hop_length=None, pad=0): 12 | *other, length = x.shape 13 | x = x.reshape(-1, length) 14 | 15 | device_type = x.device.type 16 | is_other_gpu = not device_type in ["cuda", "cpu"] 17 | 18 | if is_other_gpu: 19 | x = x.cpu() 20 | z = th.stft( 21 | x, 22 | n_fft * (1 + pad), 23 | hop_length or n_fft // 4, 24 | window=th.hann_window(n_fft).to(x), 25 | win_length=n_fft, 26 | normalized=True, 27 | center=True, 28 | return_complex=True, 29 | pad_mode="reflect", 30 | ) 31 | _, freqs, frame = z.shape 32 | return z.view(*other, freqs, frame) 33 | 34 | 35 | def ispectro(z, hop_length=None, length=None, pad=0): 36 | *other, freqs, frames = z.shape 37 | n_fft = 2 * freqs - 2 38 | z = z.view(-1, freqs, frames) 39 | win_length = n_fft // (1 + pad) 40 | 41 | device_type = z.device.type 42 | is_other_gpu = not device_type in ["cuda", "cpu"] 43 | 44 | if is_other_gpu: 45 | z = z.cpu() 46 | x = th.istft( 47 | z, 48 | n_fft, 49 | hop_length, 50 | window=th.hann_window(win_length).to(z.real), 51 | win_length=win_length, 52 | normalized=True, 53 | length=length, 54 | center=True, 55 | ) 56 | _, length = x.shape 57 | return x.view(*other, length) 58 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/demucs/states.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Utilities to save and load models. 8 | """ 9 | from contextlib import contextmanager 10 | 11 | import functools 12 | import hashlib 13 | import inspect 14 | import io 15 | from pathlib import Path 16 | import warnings 17 | 18 | from diffq import DiffQuantizer, UniformQuantizer, restore_quantized_state 19 | import torch 20 | 21 | 22 | def get_quantizer(model, args, optimizer=None): 23 | """Return the quantizer given the XP quantization args.""" 24 | quantizer = None 25 | if args.diffq: 26 | quantizer = DiffQuantizer( 27 | model, min_size=args.min_size, group_size=args.group_size 28 | ) 29 | if optimizer is not None: 30 | quantizer.setup_optimizer(optimizer) 31 | elif args.qat: 32 | quantizer = UniformQuantizer(model, bits=args.qat, min_size=args.min_size) 33 | return quantizer 34 | 35 | 36 | def load_model(path_or_package, strict=False): 37 | """Load a model from the given serialized model, either given as a dict (already loaded) 38 | or a path to a file on disk.""" 39 | if isinstance(path_or_package, dict): 40 | package = path_or_package 41 | elif isinstance(path_or_package, (str, Path)): 42 | with warnings.catch_warnings(): 43 | warnings.simplefilter("ignore") 44 | path = path_or_package 45 | package = torch.load(path, "cpu") 46 | else: 47 | raise ValueError(f"Invalid type for {path_or_package}.") 48 | 49 | klass = package["klass"] 50 | args = package["args"] 51 | kwargs = package["kwargs"] 52 | 53 | if strict: 54 | model = klass(*args, **kwargs) 55 | else: 56 | sig = inspect.signature(klass) 57 | for key in list(kwargs): 58 | if key not in sig.parameters: 59 | warnings.warn("Dropping inexistant parameter " + key) 60 | del kwargs[key] 61 | model = klass(*args, **kwargs) 62 | 63 | state = package["state"] 64 | 65 | set_state(model, state) 66 | return model 67 | 68 | 69 | def get_state(model, quantizer, half=False): 70 | 
"""Get the state from a model, potentially with quantization applied. 71 | If `half` is True, model are stored as half precision, which shouldn't impact performance 72 | but half the state size.""" 73 | if quantizer is None: 74 | dtype = torch.half if half else None 75 | state = { 76 | k: p.data.to(device="cpu", dtype=dtype) 77 | for k, p in model.state_dict().items() 78 | } 79 | else: 80 | state = quantizer.get_quantized_state() 81 | state["__quantized"] = True 82 | return state 83 | 84 | 85 | def set_state(model, state, quantizer=None): 86 | """Set the state on a given model.""" 87 | if state.get("__quantized"): 88 | if quantizer is not None: 89 | quantizer.restore_quantized_state(model, state["quantized"]) 90 | else: 91 | restore_quantized_state(model, state) 92 | else: 93 | model.load_state_dict(state) 94 | return state 95 | 96 | 97 | def save_with_checksum(content, path): 98 | """Save the given value on disk, along with a sha256 hash. 99 | Should be used with the output of either `serialize_model` or `get_state`.""" 100 | buf = io.BytesIO() 101 | torch.save(content, buf) 102 | sig = hashlib.sha256(buf.getvalue()).hexdigest()[:8] 103 | 104 | path = path.parent / (path.stem + "-" + sig + path.suffix) 105 | path.write_bytes(buf.getvalue()) 106 | 107 | 108 | def copy_state(state): 109 | return {k: v.cpu().clone() for k, v in state.items()} 110 | 111 | 112 | @contextmanager 113 | def swap_state(model, state): 114 | """ 115 | Context manager that swaps the state of a model, e.g: 116 | 117 | # model is in old state 118 | with swap_state(model, new_state): 119 | # model in new state 120 | # model back to old state 121 | """ 122 | old_state = copy_state(model.state_dict()) 123 | model.load_state_dict(state, strict=False) 124 | try: 125 | yield 126 | finally: 127 | model.load_state_dict(old_state) 128 | 129 | 130 | def capture_init(init): 131 | @functools.wraps(init) 132 | def __init__(self, *args, **kwargs): 133 | self._init_args_kwargs = (args, kwargs) 134 | init(self, *args, **kwargs) 135 | 136 | return __init__ 137 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/mdxnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .modules import TFC_TDF 4 | from pytorch_lightning import LightningModule 5 | 6 | dim_s = 4 7 | 8 | 9 | class AbstractMDXNet(LightningModule): 10 | def __init__( 11 | self, 12 | target_name, 13 | lr, 14 | optimizer, 15 | dim_c, 16 | dim_f, 17 | dim_t, 18 | n_fft, 19 | hop_length, 20 | overlap, 21 | ): 22 | super().__init__() 23 | self.target_name = target_name 24 | self.lr = lr 25 | self.optimizer = optimizer 26 | self.dim_c = dim_c 27 | self.dim_f = dim_f 28 | self.dim_t = dim_t 29 | self.n_fft = n_fft 30 | self.n_bins = n_fft // 2 + 1 31 | self.hop_length = hop_length 32 | self.window = nn.Parameter( 33 | torch.hann_window(window_length=self.n_fft, periodic=True), 34 | requires_grad=False, 35 | ) 36 | self.freq_pad = nn.Parameter( 37 | torch.zeros([1, dim_c, self.n_bins - self.dim_f, self.dim_t]), 38 | requires_grad=False, 39 | ) 40 | 41 | def get_optimizer(self): 42 | if self.optimizer == "rmsprop": 43 | return torch.optim.RMSprop(self.parameters(), self.lr) 44 | 45 | if self.optimizer == "adamw": 46 | return torch.optim.AdamW(self.parameters(), self.lr) 47 | 48 | 49 | class ConvTDFNet(AbstractMDXNet): 50 | def __init__( 51 | self, 52 | target_name, 53 | lr, 54 | optimizer, 55 | dim_c, 56 | dim_f, 57 | dim_t, 58 | n_fft, 59 | 
hop_length, 60 | num_blocks, 61 | l, 62 | g, 63 | k, 64 | bn, 65 | bias, 66 | overlap, 67 | ): 68 | 69 | super(ConvTDFNet, self).__init__( 70 | target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap 71 | ) 72 | # self.save_hyperparameters() 73 | 74 | self.num_blocks = num_blocks 75 | self.l = l 76 | self.g = g 77 | self.k = k 78 | self.bn = bn 79 | self.bias = bias 80 | 81 | if optimizer == "rmsprop": 82 | norm = nn.BatchNorm2d 83 | 84 | if optimizer == "adamw": 85 | norm = lambda input: nn.GroupNorm(2, input) 86 | 87 | self.n = num_blocks // 2 88 | scale = (2, 2) 89 | 90 | self.first_conv = nn.Sequential( 91 | nn.Conv2d(in_channels=self.dim_c, out_channels=g, kernel_size=(1, 1)), 92 | norm(g), 93 | nn.ReLU(), 94 | ) 95 | 96 | f = self.dim_f 97 | c = g 98 | self.encoding_blocks = nn.ModuleList() 99 | self.ds = nn.ModuleList() 100 | for i in range(self.n): 101 | self.encoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)) 102 | self.ds.append( 103 | nn.Sequential( 104 | nn.Conv2d( 105 | in_channels=c, 106 | out_channels=c + g, 107 | kernel_size=scale, 108 | stride=scale, 109 | ), 110 | norm(c + g), 111 | nn.ReLU(), 112 | ) 113 | ) 114 | f = f // 2 115 | c += g 116 | 117 | self.bottleneck_block = TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm) 118 | 119 | self.decoding_blocks = nn.ModuleList() 120 | self.us = nn.ModuleList() 121 | for i in range(self.n): 122 | self.us.append( 123 | nn.Sequential( 124 | nn.ConvTranspose2d( 125 | in_channels=c, 126 | out_channels=c - g, 127 | kernel_size=scale, 128 | stride=scale, 129 | ), 130 | norm(c - g), 131 | nn.ReLU(), 132 | ) 133 | ) 134 | f = f * 2 135 | c -= g 136 | 137 | self.decoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)) 138 | 139 | self.final_conv = nn.Sequential( 140 | nn.Conv2d(in_channels=c, out_channels=self.dim_c, kernel_size=(1, 1)), 141 | ) 142 | 143 | def forward(self, x): 144 | 145 | x = self.first_conv(x) 146 | 147 | x = x.transpose(-1, -2) 148 | 149 | ds_outputs = [] 150 | for i in range(self.n): 151 | x = self.encoding_blocks[i](x) 152 | ds_outputs.append(x) 153 | x = self.ds[i](x) 154 | 155 | x = self.bottleneck_block(x) 156 | 157 | for i in range(self.n): 158 | x = self.us[i](x) 159 | x *= ds_outputs[-i - 1] 160 | x = self.decoding_blocks[i](x) 161 | 162 | x = x.transpose(-1, -2) 163 | 164 | x = self.final_conv(x) 165 | 166 | return x 167 | 168 | 169 | class Mixer(nn.Module): 170 | def __init__(self, device, mixer_path): 171 | 172 | super(Mixer, self).__init__() 173 | 174 | self.linear = nn.Linear((dim_s + 1) * 2, dim_s * 2, bias=False) 175 | 176 | self.load_state_dict(torch.load(mixer_path, map_location=device)) 177 | 178 | def forward(self, x): 179 | x = x.reshape(1, (dim_s + 1) * 2, -1).transpose(-1, -2) 180 | x = self.linear(x) 181 | return x.transpose(-1, -2).reshape(dim_s, 2, -1) 182 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/mixer.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IAHispano/Applio-Plugins/b80054bb20ade068aa69fed31bfe48f7dcbc4cad/UVR/uvr/uvr_lib_v5/mixer.ckpt -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class TFC(nn.Module): 6 | def __init__(self, c, l, k, norm): 7 | super(TFC, self).__init__() 8 | 9 | self.H = nn.ModuleList() 10 
| for i in range(l): 11 | self.H.append( 12 | nn.Sequential( 13 | nn.Conv2d( 14 | in_channels=c, 15 | out_channels=c, 16 | kernel_size=k, 17 | stride=1, 18 | padding=k // 2, 19 | ), 20 | norm(c), 21 | nn.ReLU(), 22 | ) 23 | ) 24 | 25 | def forward(self, x): 26 | for h in self.H: 27 | x = h(x) 28 | return x 29 | 30 | 31 | class DenseTFC(nn.Module): 32 | def __init__(self, c, l, k, norm): 33 | super(DenseTFC, self).__init__() 34 | 35 | self.conv = nn.ModuleList() 36 | for i in range(l): 37 | self.conv.append( 38 | nn.Sequential( 39 | nn.Conv2d( 40 | in_channels=c, 41 | out_channels=c, 42 | kernel_size=k, 43 | stride=1, 44 | padding=k // 2, 45 | ), 46 | norm(c), 47 | nn.ReLU(), 48 | ) 49 | ) 50 | 51 | def forward(self, x): 52 | for layer in self.conv[:-1]: 53 | x = torch.cat([layer(x), x], 1) 54 | return self.conv[-1](x) 55 | 56 | 57 | class TFC_TDF(nn.Module): 58 | def __init__(self, c, l, f, k, bn, dense=False, bias=True, norm=nn.BatchNorm2d): 59 | 60 | super(TFC_TDF, self).__init__() 61 | 62 | self.use_tdf = bn is not None 63 | 64 | self.tfc = DenseTFC(c, l, k, norm) if dense else TFC(c, l, k, norm) 65 | 66 | if self.use_tdf: 67 | if bn == 0: 68 | self.tdf = nn.Sequential(nn.Linear(f, f, bias=bias), norm(c), nn.ReLU()) 69 | else: 70 | self.tdf = nn.Sequential( 71 | nn.Linear(f, f // bn, bias=bias), 72 | norm(c), 73 | nn.ReLU(), 74 | nn.Linear(f // bn, f, bias=bias), 75 | norm(c), 76 | nn.ReLU(), 77 | ) 78 | 79 | def forward(self, x): 80 | x = self.tfc(x) 81 | return x + self.tdf(x) if self.use_tdf else x 82 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/playsound.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | 6 | class PlaysoundException(Exception): 7 | pass 8 | 9 | 10 | def _canonicalizePath(path): 11 | """ 12 | Support passing in a pathlib.Path-like object by converting to str. 13 | """ 14 | import sys 15 | 16 | if sys.version_info[0] >= 3: 17 | return str(path) 18 | else: 19 | # On earlier Python versions, str is a byte string, so attempting to 20 | # convert a unicode string to str will fail. Leave it alone in this case. 21 | return path 22 | 23 | 24 | def _playsoundWin(sound, block=True): 25 | """ 26 | Utilizes windll.winmm. Tested and known to work with MP3 and WAVE on 27 | Windows 7 with Python 2.7. Probably works with more file formats. 28 | Probably works on Windows XP thru Windows 10. Probably works with all 29 | versions of Python. 30 | 31 | Inspired by (but not copied from) Michael Gundlach 's mp3play: 32 | https://github.com/michaelgundlach/mp3play 33 | 34 | I never would have tried using windll.winmm without seeing his code. 
35 | """ 36 | sound = '"' + _canonicalizePath(sound) + '"' 37 | 38 | from ctypes import create_unicode_buffer, windll, wintypes 39 | from time import sleep 40 | 41 | windll.winmm.mciSendStringW.argtypes = [ 42 | wintypes.LPCWSTR, 43 | wintypes.LPWSTR, 44 | wintypes.UINT, 45 | wintypes.HANDLE, 46 | ] 47 | windll.winmm.mciGetErrorStringW.argtypes = [ 48 | wintypes.DWORD, 49 | wintypes.LPWSTR, 50 | wintypes.UINT, 51 | ] 52 | 53 | def winCommand(*command): 54 | bufLen = 600 55 | buf = create_unicode_buffer(bufLen) 56 | command = " ".join(command) 57 | errorCode = int( 58 | windll.winmm.mciSendStringW(command, buf, bufLen - 1, 0) 59 | ) # use widestring version of the function 60 | if errorCode: 61 | errorBuffer = create_unicode_buffer(bufLen) 62 | windll.winmm.mciGetErrorStringW( 63 | errorCode, errorBuffer, bufLen - 1 64 | ) # use widestring version of the function 65 | exceptionMessage = ( 66 | "\n Error " + str(errorCode) + " for command:" 67 | "\n " + command + "\n " + errorBuffer.value 68 | ) 69 | logger.error(exceptionMessage) 70 | raise PlaysoundException(exceptionMessage) 71 | return buf.value 72 | 73 | try: 74 | logger.debug("Starting") 75 | winCommand("open {}".format(sound)) 76 | winCommand("play {}{}".format(sound, " wait" if block else "")) 77 | logger.debug("Returning") 78 | finally: 79 | try: 80 | winCommand("close {}".format(sound)) 81 | except PlaysoundException: 82 | logger.warning("Failed to close the file: {}".format(sound)) 83 | # If it fails, there's nothing more that can be done... 84 | pass 85 | 86 | 87 | def _handlePathOSX(sound): 88 | sound = _canonicalizePath(sound) 89 | 90 | if "://" not in sound: 91 | if not sound.startswith("/"): 92 | from os import getcwd 93 | 94 | sound = getcwd() + "/" + sound 95 | sound = "file://" + sound 96 | 97 | try: 98 | # Don't double-encode it. 99 | sound.encode("ascii") 100 | return sound.replace(" ", "%20") 101 | except UnicodeEncodeError: 102 | try: 103 | from urllib.parse import quote # Try the Python 3 import first... 104 | except ImportError: 105 | from urllib import ( 106 | quote, 107 | ) # Try using the Python 2 import before giving up entirely... 108 | 109 | parts = sound.split("://", 1) 110 | return parts[0] + "://" + quote(parts[1].encode("utf-8")).replace(" ", "%20") 111 | 112 | 113 | def _playsoundOSX(sound, block=True): 114 | """ 115 | Utilizes AppKit.NSSound. Tested and known to work with MP3 and WAVE on 116 | OS X 10.11 with Python 2.7. Probably works with anything QuickTime supports. 117 | Probably works on OS X 10.5 and newer. Probably works with all versions of 118 | Python. 119 | 120 | Inspired by (but not copied from) Aaron's Stack Overflow answer here: 121 | http://stackoverflow.com/a/34568298/901641 122 | 123 | I never would have tried using AppKit.NSSound without seeing his code. 124 | """ 125 | try: 126 | from AppKit import NSSound 127 | except ImportError: 128 | logger.warning( 129 | "playsound could not find a copy of AppKit - falling back to using macOS's system copy." 
130 | ) 131 | sys.path.append( 132 | "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC" 133 | ) 134 | from AppKit import NSSound 135 | 136 | from Foundation import NSURL 137 | from time import sleep 138 | 139 | sound = _handlePathOSX(sound) 140 | url = NSURL.URLWithString_(sound) 141 | if not url: 142 | raise PlaysoundException("Cannot find a sound with filename: " + sound) 143 | 144 | for i in range(5): 145 | nssound = NSSound.alloc().initWithContentsOfURL_byReference_(url, True) 146 | if nssound: 147 | break 148 | else: 149 | logger.debug("Failed to load sound, although url was good... " + sound) 150 | else: 151 | raise PlaysoundException( 152 | "Could not load sound with filename, although URL was good... " + sound 153 | ) 154 | nssound.play() 155 | 156 | if block: 157 | sleep(nssound.duration()) 158 | 159 | 160 | def _playsoundNix(sound, block=True): 161 | """Play a sound using GStreamer. 162 | 163 | Inspired by this: 164 | https://gstreamer.freedesktop.org/documentation/tutorials/playback/playbin-usage.html 165 | """ 166 | sound = _canonicalizePath(sound) 167 | 168 | # pathname2url escapes non-URL-safe characters 169 | from os.path import abspath, exists 170 | 171 | try: 172 | from urllib.request import pathname2url 173 | except ImportError: 174 | # python 2 175 | from urllib import pathname2url 176 | 177 | import gi 178 | 179 | gi.require_version("Gst", "1.0") 180 | from gi.repository import Gst 181 | 182 | Gst.init(None) 183 | 184 | playbin = Gst.ElementFactory.make("playbin", "playbin") 185 | if sound.startswith(("http://", "https://")): 186 | playbin.props.uri = sound 187 | else: 188 | path = abspath(sound) 189 | if not exists(path): 190 | raise PlaysoundException("File not found: {}".format(path)) 191 | playbin.props.uri = "file://" + pathname2url(path) 192 | 193 | set_result = playbin.set_state(Gst.State.PLAYING) 194 | if set_result != Gst.StateChangeReturn.ASYNC: 195 | raise PlaysoundException("playbin.set_state returned " + repr(set_result)) 196 | 197 | # FIXME: use some other bus method than poll() with block=False 198 | # https://lazka.github.io/pgi-docs/#Gst-1.0/classes/Bus.html 199 | logger.debug("Starting play") 200 | if block: 201 | bus = playbin.get_bus() 202 | try: 203 | bus.poll(Gst.MessageType.EOS, Gst.CLOCK_TIME_NONE) 204 | finally: 205 | playbin.set_state(Gst.State.NULL) 206 | 207 | logger.debug("Finishing play") 208 | 209 | 210 | def _playsoundAnotherPython(otherPython, sound, block=True, macOS=False): 211 | """ 212 | Mostly written so that when this is run on python3 on macOS, it can invoke 213 | python2 on macOS... but maybe this idea could be useful on linux, too. 214 | """ 215 | from inspect import getsourcefile 216 | from os.path import abspath, exists 217 | from subprocess import check_call 218 | from threading import Thread 219 | 220 | sound = _canonicalizePath(sound) 221 | 222 | class PropogatingThread(Thread): 223 | def run(self): 224 | self.exc = None 225 | try: 226 | self.ret = self._target(*self._args, **self._kwargs) 227 | except BaseException as e: 228 | self.exc = e 229 | 230 | def join(self, timeout=None): 231 | super().join(timeout) 232 | if self.exc: 233 | raise self.exc 234 | return self.ret 235 | 236 | # Check if the file exists... 
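# Doing the check here, in the parent interpreter, raises a clear PlaysoundException up front instead of surfacing later as a non-zero exit code from the spawned python process.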
237 | if not exists(abspath(sound)): 238 | raise PlaysoundException("Cannot find a sound with filename: " + sound) 239 | 240 | playsoundPath = abspath(getsourcefile(lambda: 0)) 241 | t = PropogatingThread( 242 | target=lambda: check_call( 243 | [otherPython, playsoundPath, _handlePathOSX(sound) if macOS else sound] 244 | ) 245 | ) 246 | t.start() 247 | if block: 248 | t.join() 249 | 250 | 251 | from platform import system 252 | 253 | system = system() 254 | 255 | if system == "Windows": 256 | playsound_func = _playsoundWin 257 | elif system == "Darwin": 258 | playsound_func = _playsoundOSX 259 | import sys 260 | 261 | if sys.version_info[0] > 2: 262 | try: 263 | from AppKit import NSSound 264 | except ImportError: 265 | logger.warning( 266 | "playsound is relying on a python 2 subprocess. Please use `pip3 install PyObjC` if you want playsound to run more efficiently." 267 | ) 268 | playsound_func = lambda sound, block=True: _playsoundAnotherPython( 269 | "/System/Library/Frameworks/Python.framework/Versions/2.7/bin/python", 270 | sound, 271 | block, 272 | macOS=True, 273 | ) 274 | else: 275 | playsound_func = _playsoundNix 276 | if ( 277 | __name__ != "__main__" 278 | ): # Ensure we don't infinitely recurse trying to get another python instance. 279 | try: 280 | import gi 281 | 282 | gi.require_version("Gst", "1.0") 283 | from gi.repository import Gst 284 | except: 285 | logger.warning( 286 | "playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently." 287 | ) 288 | playsound_func = lambda sound, block=True: _playsoundAnotherPython( 289 | "/usr/bin/python3", sound, block, macOS=False 290 | ) 291 | 292 | del system 293 | 294 | 295 | def play(audio_filepath): 296 | playsound_func(audio_filepath) 297 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/pyrb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import tempfile 4 | import six 5 | import numpy as np 6 | import soundfile as sf 7 | import sys 8 | 9 | if getattr(sys, "frozen", False): 10 | BASE_PATH_RUB = sys._MEIPASS 11 | else: 12 | BASE_PATH_RUB = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | __all__ = ["time_stretch", "pitch_shift"] 15 | 16 | __RUBBERBAND_UTIL = os.path.join(BASE_PATH_RUB, "rubberband") 17 | 18 | if six.PY2: 19 | DEVNULL = open(os.devnull, "w") 20 | else: 21 | DEVNULL = subprocess.DEVNULL 22 | 23 | 24 | def __rubberband(y, sr, **kwargs): 25 | 26 | assert sr > 0 27 | 28 | # Get the input and output tempfile 29 | fd, infile = tempfile.mkstemp(suffix=".wav") 30 | os.close(fd) 31 | fd, outfile = tempfile.mkstemp(suffix=".wav") 32 | os.close(fd) 33 | 34 | # dump the audio 35 | sf.write(infile, y, sr) 36 | 37 | try: 38 | # Execute rubberband 39 | arguments = [__RUBBERBAND_UTIL, "-q"] 40 | 41 | for key, value in six.iteritems(kwargs): 42 | arguments.append(str(key)) 43 | arguments.append(str(value)) 44 | 45 | arguments.extend([infile, outfile]) 46 | 47 | subprocess.check_call(arguments, stdout=DEVNULL, stderr=DEVNULL) 48 | 49 | # Load the processed audio. 50 | y_out, _ = sf.read(outfile, always_2d=True) 51 | 52 | # make sure that output dimensions matches input 53 | if y.ndim == 1: 54 | y_out = np.squeeze(y_out) 55 | 56 | except OSError as exc: 57 | six.raise_from( 58 | RuntimeError( 59 | "Failed to execute rubberband. " 60 | "Please verify that rubberband-cli " 61 | "is installed." 
62 | ), 63 | exc, 64 | ) 65 | 66 | finally: 67 | # Remove temp files 68 | os.unlink(infile) 69 | os.unlink(outfile) 70 | 71 | return y_out 72 | 73 | 74 | def time_stretch(y, sr, rate, rbargs=None): 75 | if rate <= 0: 76 | raise ValueError("rate must be strictly positive") 77 | 78 | if rate == 1.0: 79 | return y 80 | 81 | if rbargs is None: 82 | rbargs = dict() 83 | 84 | rbargs.setdefault("--tempo", rate) 85 | 86 | return __rubberband(y, sr, **rbargs) 87 | 88 | 89 | def pitch_shift(y, sr, n_steps, rbargs=None): 90 | 91 | if n_steps == 0: 92 | return y 93 | 94 | if rbargs is None: 95 | rbargs = dict() 96 | 97 | rbargs.setdefault("--pitch", n_steps) 98 | 99 | return __rubberband(y, sr, **rbargs) 100 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/results.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Matchering - Audio Matching and Mastering Python Library 5 | Copyright (C) 2016-2022 Sergree 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 19 | """ 20 | 21 | import os 22 | import soundfile as sf 23 | 24 | 25 | class Result: 26 | def __init__( 27 | self, file: str, subtype: str, use_limiter: bool = True, normalize: bool = True 28 | ): 29 | _, file_ext = os.path.splitext(file) 30 | file_ext = file_ext[1:].upper() 31 | if not sf.check_format(file_ext): 32 | raise TypeError(f"{file_ext} format is not supported") 33 | if not sf.check_format(file_ext, subtype): 34 | raise TypeError(f"{file_ext} format does not have {subtype} subtype") 35 | self.file = file 36 | self.subtype = subtype 37 | self.use_limiter = use_limiter 38 | self.normalize = normalize 39 | 40 | 41 | def pcm16(file: str) -> Result: 42 | return Result(file, "PCM_16") 43 | 44 | 45 | def pcm24(file: str) -> Result: 46 | return Result(file, "FLOAT") 47 | 48 | 49 | def save_audiofile(file: str, wav_set="PCM_16") -> Result: 50 | return Result(file, wav_set) 51 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/stft.py: -------------------------------------------------------------------------------- 1 | import torch 2 | cpu_device=torch.device("cpu") 3 | 4 | class STFT: 5 | """ 6 | This class performs the Short-Time Fourier Transform (STFT) and its inverse (ISTFT). 7 | These functions are essential for converting the audio between the time domain and the frequency domain, 8 | which is a crucial aspect of audio processing in neural networks. 9 | """ 10 | 11 | def __init__(self, logger, n_fft, hop_length, dim_f, device): 12 | self.logger = logger 13 | self.n_fft = n_fft 14 | self.hop_length = hop_length 15 | self.dim_f = dim_f 16 | self.device = device 17 | # Create a Hann window tensor for use in the STFT. 
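# periodic=True produces the DFT-periodic form of the window conventionally used with torch.stft; the window is built once here and moved to the input tensor's device on each call.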
18 | self.hann_window = torch.hann_window(window_length=self.n_fft, periodic=True) 19 | 20 | def __call__(self, input_tensor): 21 | # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA). 22 | is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"] 23 | 24 | # If on a non-standard device, temporarily move the tensor to CPU for processing. 25 | if is_non_standard_device: 26 | input_tensor = input_tensor.cpu() 27 | 28 | # Transfer the pre-defined window tensor to the same device as the input tensor. 29 | stft_window = self.hann_window.to(input_tensor.device) 30 | 31 | # Extract batch dimensions (all dimensions except the last two which are channel and time). 32 | batch_dimensions = input_tensor.shape[:-2] 33 | 34 | # Extract channel and time dimensions (last two dimensions of the tensor). 35 | channel_dim, time_dim = input_tensor.shape[-2:] 36 | 37 | # Reshape the tensor to merge batch and channel dimensions for STFT processing. 38 | reshaped_tensor = input_tensor.reshape([-1, time_dim]) 39 | 40 | # Perform the Short-Time Fourier Transform (STFT) on the reshaped tensor. 41 | source_device = reshaped_tensor.device 42 | if reshaped_tensor.device.type == "cuda" and torch.cuda.get_device_name(reshaped_tensor.device.index).endswith("[ZLUDA]"): 43 | reshaped_tensor = reshaped_tensor.to("cpu") 44 | stft_window = stft_window.to("cpu") 45 | 46 | stft_output = torch.stft( 47 | reshaped_tensor, 48 | n_fft=self.n_fft, 49 | hop_length=self.hop_length, 50 | window=stft_window, 51 | center=True, 52 | return_complex=False, 53 | ).to(source_device) 54 | 55 | # Rearrange the dimensions of the STFT output to bring the frequency dimension forward. 56 | permuted_stft_output = stft_output.permute([0, 3, 1, 2]) 57 | 58 | # Reshape the output to restore the original batch and channel dimensions, while keeping the newly formed frequency and time dimensions. 59 | final_output = permuted_stft_output.reshape( 60 | [*batch_dimensions, channel_dim, 2, -1, permuted_stft_output.shape[-1]] 61 | ).reshape( 62 | [*batch_dimensions, channel_dim * 2, -1, permuted_stft_output.shape[-1]] 63 | ) 64 | 65 | # If the original tensor was on a non-standard device, move the processed tensor back to that device. 66 | if is_non_standard_device: 67 | final_output = final_output.to(self.device) 68 | 69 | # Return the transformed tensor, sliced to retain only the required frequency dimension (`dim_f`). 70 | return final_output[..., : self.dim_f, :] 71 | 72 | def pad_frequency_dimension( 73 | self, 74 | input_tensor, 75 | batch_dimensions, 76 | channel_dim, 77 | freq_dim, 78 | time_dim, 79 | num_freq_bins, 80 | ): 81 | """ 82 | Adds zero padding to the frequency dimension of the input tensor. 83 | """ 84 | # Create a padding tensor for the frequency dimension 85 | freq_padding = torch.zeros( 86 | [*batch_dimensions, channel_dim, num_freq_bins - freq_dim, time_dim] 87 | ).to(input_tensor.device) 88 | 89 | # Concatenate the padding to the input tensor along the frequency dimension. 90 | padded_tensor = torch.cat([input_tensor, freq_padding], -2) 91 | 92 | return padded_tensor 93 | 94 | def calculate_inverse_dimensions(self, input_tensor): 95 | # Extract batch dimensions and frequency-time dimensions. 96 | batch_dimensions = input_tensor.shape[:-3] 97 | channel_dim, freq_dim, time_dim = input_tensor.shape[-3:] 98 | 99 | # Calculate the number of frequency bins for the inverse STFT. 
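# (one-sided spectrum: for example, n_fft = 2048 gives 2048 // 2 + 1 = 1025 bins)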
100 | num_freq_bins = self.n_fft // 2 + 1 101 | 102 | return batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins 103 | 104 | def prepare_for_istft( 105 | self, padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim 106 | ): 107 | """ 108 | Prepares the tensor for Inverse Short-Time Fourier Transform (ISTFT) by reshaping 109 | and creating a complex tensor from the real and imaginary parts. 110 | """ 111 | # Reshape the tensor to separate real and imaginary parts and prepare for ISTFT. 112 | reshaped_tensor = padded_tensor.reshape( 113 | [*batch_dimensions, channel_dim // 2, 2, num_freq_bins, time_dim] 114 | ) 115 | 116 | # Flatten batch dimensions and rearrange for ISTFT. 117 | flattened_tensor = reshaped_tensor.reshape([-1, 2, num_freq_bins, time_dim]) 118 | 119 | # Rearrange the dimensions of the tensor to bring the frequency dimension forward. 120 | permuted_tensor = flattened_tensor.permute([0, 2, 3, 1]) 121 | 122 | # Combine real and imaginary parts into a complex tensor. 123 | complex_tensor = permuted_tensor[..., 0] + permuted_tensor[..., 1] * 1.0j 124 | 125 | return complex_tensor 126 | 127 | def inverse(self, input_tensor): 128 | # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA). 129 | is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"] 130 | 131 | # If on a non-standard device, temporarily move the tensor to CPU for processing. 132 | if is_non_standard_device: 133 | input_tensor = input_tensor.cpu() 134 | 135 | # Transfer the pre-defined Hann window tensor to the same device as the input tensor. 136 | stft_window = self.hann_window.to(input_tensor.device) 137 | 138 | batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins = ( 139 | self.calculate_inverse_dimensions(input_tensor) 140 | ) 141 | 142 | padded_tensor = self.pad_frequency_dimension( 143 | input_tensor, 144 | batch_dimensions, 145 | channel_dim, 146 | freq_dim, 147 | time_dim, 148 | num_freq_bins, 149 | ) 150 | 151 | complex_tensor = self.prepare_for_istft( 152 | padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim 153 | ) 154 | 155 | # Perform the Inverse Short-Time Fourier Transform (ISTFT). 156 | source_device = complex_tensor.device 157 | if complex_tensor.device.type == "cuda" and torch.cuda.get_device_name(complex_tensor.device.index).endswith("[ZLUDA]"): 158 | complex_tensor = complex_tensor.to("cpu") 159 | stft_window = stft_window.to(cpu_device) 160 | 161 | istft_result = torch.istft( 162 | complex_tensor, 163 | n_fft=self.n_fft, 164 | hop_length=self.hop_length, 165 | window=stft_window, 166 | center=True, 167 | ).to(source_device) 168 | 169 | # Reshape ISTFT result to restore original batch and channel dimensions. 170 | final_output = istft_result.reshape([*batch_dimensions, 2, -1]) 171 | 172 | # If the original tensor was on a non-standard device, move the processed tensor back to that device. 
173 | if is_non_standard_device: 174 | final_output = final_output.to(self.device) 175 | 176 | return final_output 177 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/tfc_tdf_v3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial 4 | 5 | 6 | class STFT: 7 | def __init__(self, n_fft, hop_length, dim_f, device): 8 | self.n_fft = n_fft 9 | self.hop_length = hop_length 10 | self.window = torch.hann_window(window_length=self.n_fft, periodic=True) 11 | self.dim_f = dim_f 12 | self.device = device 13 | 14 | def __call__(self, x): 15 | 16 | x_is_mps = not x.device.type in ["cuda", "cpu"] 17 | if x_is_mps: 18 | x = x.cpu() 19 | 20 | window = self.window.to(x.device) 21 | batch_dims = x.shape[:-2] 22 | c, t = x.shape[-2:] 23 | x = x.reshape([-1, t]) 24 | x = torch.stft( 25 | x, 26 | n_fft=self.n_fft, 27 | hop_length=self.hop_length, 28 | window=window, 29 | center=True, 30 | return_complex=False, 31 | ) 32 | x = x.permute([0, 3, 1, 2]) 33 | x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape( 34 | [*batch_dims, c * 2, -1, x.shape[-1]] 35 | ) 36 | 37 | if x_is_mps: 38 | x = x.to(self.device) 39 | 40 | return x[..., : self.dim_f, :] 41 | 42 | def inverse(self, x): 43 | 44 | x_is_mps = not x.device.type in ["cuda", "cpu"] 45 | if x_is_mps: 46 | x = x.cpu() 47 | 48 | window = self.window.to(x.device) 49 | batch_dims = x.shape[:-3] 50 | c, f, t = x.shape[-3:] 51 | n = self.n_fft // 2 + 1 52 | f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device) 53 | x = torch.cat([x, f_pad], -2) 54 | x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t]) 55 | x = x.permute([0, 2, 3, 1]) 56 | x = x[..., 0] + x[..., 1] * 1.0j 57 | x = torch.istft( 58 | x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True 59 | ) 60 | x = x.reshape([*batch_dims, 2, -1]) 61 | 62 | if x_is_mps: 63 | x = x.to(self.device) 64 | 65 | return x 66 | 67 | 68 | def get_norm(norm_type): 69 | def norm(c, norm_type): 70 | if norm_type == "BatchNorm": 71 | return nn.BatchNorm2d(c) 72 | elif norm_type == "InstanceNorm": 73 | return nn.InstanceNorm2d(c, affine=True) 74 | elif "GroupNorm" in norm_type: 75 | g = int(norm_type.replace("GroupNorm", "")) 76 | return nn.GroupNorm(num_groups=g, num_channels=c) 77 | else: 78 | return nn.Identity() 79 | 80 | return partial(norm, norm_type=norm_type) 81 | 82 | 83 | def get_act(act_type): 84 | if act_type == "gelu": 85 | return nn.GELU() 86 | elif act_type == "relu": 87 | return nn.ReLU() 88 | elif act_type[:3] == "elu": 89 | alpha = float(act_type.replace("elu", "")) 90 | return nn.ELU(alpha) 91 | else: 92 | raise Exception 93 | 94 | 95 | class Upscale(nn.Module): 96 | def __init__(self, in_c, out_c, scale, norm, act): 97 | super().__init__() 98 | self.conv = nn.Sequential( 99 | norm(in_c), 100 | act, 101 | nn.ConvTranspose2d( 102 | in_channels=in_c, 103 | out_channels=out_c, 104 | kernel_size=scale, 105 | stride=scale, 106 | bias=False, 107 | ), 108 | ) 109 | 110 | def forward(self, x): 111 | return self.conv(x) 112 | 113 | 114 | class Downscale(nn.Module): 115 | def __init__(self, in_c, out_c, scale, norm, act): 116 | super().__init__() 117 | self.conv = nn.Sequential( 118 | norm(in_c), 119 | act, 120 | nn.Conv2d( 121 | in_channels=in_c, 122 | out_channels=out_c, 123 | kernel_size=scale, 124 | stride=scale, 125 | bias=False, 126 | ), 127 | ) 128 | 129 | def forward(self, x): 130 | return self.conv(x) 
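# --- illustrative sketch, not part of the original file ----------------------
# How the factories and scale blocks above fit together. The shapes and
# hyper-parameters below are assumptions for illustration only; `torch` is
# already imported at the top of this module. With scale=(2, 2) a Downscale
# halves the frequency and time axes and the matching Upscale restores them.
#
#     norm_example = get_norm("GroupNorm4")   # partial -> nn.GroupNorm(4, channels)
#     act_example = get_act("gelu")
#     down = Downscale(in_c=32, out_c=64, scale=(2, 2), norm=norm_example, act=act_example)
#     up = Upscale(in_c=64, out_c=32, scale=(2, 2), norm=norm_example, act=act_example)
#     x = torch.randn(1, 32, 128, 256)         # (batch, channels, freq, time)
#     assert up(down(x)).shape == x.shape      # halved to (1, 64, 64, 128), then restored
# ------------------------------------------------------------------------------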
131 | 132 | 133 | class TFC_TDF(nn.Module): 134 | def __init__(self, in_c, c, l, f, bn, norm, act): 135 | super().__init__() 136 | 137 | self.blocks = nn.ModuleList() 138 | for i in range(l): 139 | block = nn.Module() 140 | 141 | block.tfc1 = nn.Sequential( 142 | norm(in_c), 143 | act, 144 | nn.Conv2d(in_c, c, 3, 1, 1, bias=False), 145 | ) 146 | block.tdf = nn.Sequential( 147 | norm(c), 148 | act, 149 | nn.Linear(f, f // bn, bias=False), 150 | norm(c), 151 | act, 152 | nn.Linear(f // bn, f, bias=False), 153 | ) 154 | block.tfc2 = nn.Sequential( 155 | norm(c), 156 | act, 157 | nn.Conv2d(c, c, 3, 1, 1, bias=False), 158 | ) 159 | block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False) 160 | 161 | self.blocks.append(block) 162 | in_c = c 163 | 164 | def forward(self, x): 165 | for block in self.blocks: 166 | s = block.shortcut(x) 167 | x = block.tfc1(x) 168 | x = x + block.tdf(x) 169 | x = block.tfc2(x) 170 | x = x + s 171 | return x 172 | 173 | 174 | class TFC_TDF_net(nn.Module): 175 | def __init__(self, config, device): 176 | super().__init__() 177 | self.config = config 178 | self.device = device 179 | 180 | norm = get_norm(norm_type=config.model.norm) 181 | act = get_act(act_type=config.model.act) 182 | 183 | self.num_target_instruments = ( 184 | 1 if config.training.target_instrument else len(config.training.instruments) 185 | ) 186 | self.num_subbands = config.model.num_subbands 187 | 188 | dim_c = self.num_subbands * config.audio.num_channels * 2 189 | n = config.model.num_scales 190 | scale = config.model.scale 191 | l = config.model.num_blocks_per_scale 192 | c = config.model.num_channels 193 | g = config.model.growth 194 | bn = config.model.bottleneck_factor 195 | f = config.audio.dim_f // self.num_subbands 196 | 197 | self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False) 198 | 199 | self.encoder_blocks = nn.ModuleList() 200 | for i in range(n): 201 | block = nn.Module() 202 | block.tfc_tdf = TFC_TDF(c, c, l, f, bn, norm, act) 203 | block.downscale = Downscale(c, c + g, scale, norm, act) 204 | f = f // scale[1] 205 | c += g 206 | self.encoder_blocks.append(block) 207 | 208 | self.bottleneck_block = TFC_TDF(c, c, l, f, bn, norm, act) 209 | 210 | self.decoder_blocks = nn.ModuleList() 211 | for i in range(n): 212 | block = nn.Module() 213 | block.upscale = Upscale(c, c - g, scale, norm, act) 214 | f = f * scale[1] 215 | c -= g 216 | block.tfc_tdf = TFC_TDF(2 * c, c, l, f, bn, norm, act) 217 | self.decoder_blocks.append(block) 218 | 219 | self.final_conv = nn.Sequential( 220 | nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False), 221 | act, 222 | nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False), 223 | ) 224 | 225 | self.stft = STFT( 226 | config.audio.n_fft, config.audio.hop_length, config.audio.dim_f, self.device 227 | ) 228 | 229 | def cac2cws(self, x): 230 | k = self.num_subbands 231 | b, c, f, t = x.shape 232 | x = x.reshape(b, c, k, f // k, t) 233 | x = x.reshape(b, c * k, f // k, t) 234 | return x 235 | 236 | def cws2cac(self, x): 237 | k = self.num_subbands 238 | b, c, f, t = x.shape 239 | x = x.reshape(b, c // k, k, f, t) 240 | x = x.reshape(b, c // k, f * k, t) 241 | return x 242 | 243 | def forward(self, x): 244 | 245 | x = self.stft(x) 246 | 247 | mix = x = self.cac2cws(x) 248 | 249 | first_conv_out = x = self.first_conv(x) 250 | 251 | x = x.transpose(-1, -2) 252 | 253 | encoder_outputs = [] 254 | for block in self.encoder_blocks: 255 | x = block.tfc_tdf(x) 256 | encoder_outputs.append(x) 257 | x = block.downscale(x) 258 | 259 | x = self.bottleneck_block(x) 
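# Decoder path mirrors the encoder: each step upsamples, concatenates the matching skip connection (encoder_outputs.pop()), then refines it with a TFC_TDF block, U-Net style.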
260 | 261 | for block in self.decoder_blocks: 262 | x = block.upscale(x) 263 | x = torch.cat([x, encoder_outputs.pop()], 1) 264 | x = block.tfc_tdf(x) 265 | 266 | x = x.transpose(-1, -2) 267 | 268 | x = x * first_conv_out # reduce artifacts 269 | 270 | x = self.final_conv(torch.cat([mix, x], 1)) 271 | 272 | x = self.cws2cac(x) 273 | 274 | if self.num_target_instruments > 1: 275 | b, c, f, t = x.shape 276 | x = x.reshape(b, self.num_target_instruments, -1, f, t) 277 | 278 | x = self.stft.inverse(x) 279 | 280 | return x 281 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/__init__.py: -------------------------------------------------------------------------------- 1 | # VR init. 2 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from tabs.plugins.installed.UVR.uvr.uvr_lib_v5 import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | """ 10 | Conv2DBNActiv Class: 11 | This class implements a convolutional layer followed by batch normalization and an activation function. 12 | It is a fundamental building block for constructing neural networks, especially useful in image and audio processing tasks. 13 | The class encapsulates the pattern of applying a convolution, normalizing the output, and then applying a non-linear activation. 14 | """ 15 | 16 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 17 | super(Conv2DBNActiv, self).__init__() 18 | 19 | # Sequential model combining Conv2D, BatchNorm, and activation function into a single module 20 | self.conv = nn.Sequential( 21 | nn.Conv2d( 22 | nin, 23 | nout, 24 | kernel_size=ksize, 25 | stride=stride, 26 | padding=pad, 27 | dilation=dilation, 28 | bias=False, 29 | ), 30 | nn.BatchNorm2d(nout), 31 | activ(), 32 | ) 33 | 34 | def __call__(self, input_tensor): 35 | # Forward pass through the sequential model 36 | return self.conv(input_tensor) 37 | 38 | 39 | class Encoder(nn.Module): 40 | """ 41 | Encoder Class: 42 | This class defines an encoder module typically used in autoencoder architectures. 43 | It consists of two convolutional layers, each followed by batch normalization and an activation function. 44 | """ 45 | 46 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 47 | super(Encoder, self).__init__() 48 | 49 | # First convolutional layer of the encoder 50 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 51 | # Second convolutional layer of the encoder 52 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 53 | 54 | def __call__(self, input_tensor): 55 | # Applying the first and then the second convolutional layers 56 | hidden = self.conv1(input_tensor) 57 | hidden = self.conv2(hidden) 58 | 59 | return hidden 60 | 61 | 62 | class Decoder(nn.Module): 63 | """ 64 | Decoder Class: 65 | This class defines a decoder module, which is the counterpart of the Encoder class in autoencoder architectures. 66 | It applies a convolutional layer followed by batch normalization and an activation function, with an optional dropout layer for regularization. 
67 | """ 68 | 69 | def __init__( 70 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 71 | ): 72 | super(Decoder, self).__init__() 73 | # Convolutional layer with optional dropout for regularization 74 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 75 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 76 | self.dropout = nn.Dropout2d(0.1) if dropout else None 77 | 78 | def __call__(self, input_tensor, skip=None): 79 | # Forward pass through the convolutional layer and optional dropout 80 | input_tensor = F.interpolate( 81 | input_tensor, scale_factor=2, mode="bilinear", align_corners=True 82 | ) 83 | 84 | if skip is not None: 85 | skip = spec_utils.crop_center(skip, input_tensor) 86 | input_tensor = torch.cat([input_tensor, skip], dim=1) 87 | 88 | hidden = self.conv1(input_tensor) 89 | # hidden = self.conv2(hidden) 90 | 91 | if self.dropout is not None: 92 | hidden = self.dropout(hidden) 93 | 94 | return hidden 95 | 96 | 97 | class ASPPModule(nn.Module): 98 | """ 99 | ASPPModule Class: 100 | This class implements the Atrous Spatial Pyramid Pooling (ASPP) module, which is useful for semantic image segmentation tasks. 101 | It captures multi-scale contextual information by applying convolutions at multiple dilation rates. 102 | """ 103 | 104 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 105 | super(ASPPModule, self).__init__() 106 | 107 | # Global context convolution captures the overall context 108 | self.conv1 = nn.Sequential( 109 | nn.AdaptiveAvgPool2d((1, None)), 110 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 111 | ) 112 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 113 | self.conv3 = Conv2DBNActiv( 114 | nin, nout, 3, 1, dilations[0], dilations[0], activ=activ 115 | ) 116 | self.conv4 = Conv2DBNActiv( 117 | nin, nout, 3, 1, dilations[1], dilations[1], activ=activ 118 | ) 119 | self.conv5 = Conv2DBNActiv( 120 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ 121 | ) 122 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 123 | self.dropout = nn.Dropout2d(0.1) if dropout else None 124 | 125 | def forward(self, input_tensor): 126 | _, _, h, w = input_tensor.size() 127 | 128 | # Upsample global context to match input size and combine with local and multi-scale features 129 | feat1 = F.interpolate( 130 | self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True 131 | ) 132 | feat2 = self.conv2(input_tensor) 133 | feat3 = self.conv3(input_tensor) 134 | feat4 = self.conv4(input_tensor) 135 | feat5 = self.conv5(input_tensor) 136 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 137 | out = self.bottleneck(out) 138 | 139 | if self.dropout is not None: 140 | out = self.dropout(out) 141 | 142 | return out 143 | 144 | 145 | class LSTMModule(nn.Module): 146 | """ 147 | LSTMModule Class: 148 | This class defines a module that combines convolutional feature extraction with a bidirectional LSTM for sequence modeling. 149 | It is useful for tasks that require understanding temporal dynamics in data, such as speech and audio processing. 
150 | """ 151 | 152 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 153 | super(LSTMModule, self).__init__() 154 | # Convolutional layer for initial feature extraction 155 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 156 | 157 | # Bidirectional LSTM for capturing temporal dynamics 158 | self.lstm = nn.LSTM( 159 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True 160 | ) 161 | 162 | # Dense layer for output dimensionality matching 163 | self.dense = nn.Sequential( 164 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() 165 | ) 166 | 167 | def forward(self, input_tensor): 168 | N, _, nbins, nframes = input_tensor.size() 169 | 170 | # Extract features and prepare for LSTM 171 | hidden = self.conv(input_tensor)[:, 0] # N, nbins, nframes 172 | hidden = hidden.permute(2, 0, 1) # nframes, N, nbins 173 | hidden, _ = self.lstm(hidden) 174 | 175 | # Apply dense layer and reshape to match expected output format 176 | hidden = self.dense(hidden.reshape(-1, hidden.size()[-1])) # nframes * N, nbins 177 | hidden = hidden.reshape(nframes, N, 1, nbins) 178 | hidden = hidden.permute(1, 2, 3, 0) 179 | 180 | return hidden 181 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | default_param = {} 4 | default_param["bins"] = -1 5 | default_param["unstable_bins"] = -1 # training only 6 | default_param["stable_bins"] = -1 # training only 7 | default_param["sr"] = 44100 8 | default_param["pre_filter_start"] = -1 9 | default_param["pre_filter_stop"] = -1 10 | default_param["band"] = {} 11 | 12 | N_BINS = "n_bins" 13 | 14 | 15 | def int_keys(d): 16 | """ 17 | Converts string keys that represent integers into actual integer keys in a list. 18 | 19 | This function is particularly useful when dealing with JSON data that may represent 20 | integer keys as strings due to the nature of JSON encoding. By converting these keys 21 | back to integers, it ensures that the data can be used in a manner consistent with 22 | its original representation, especially in contexts where the distinction between 23 | string and integer keys is important. 24 | 25 | Args: 26 | input_list (list of tuples): A list of (key, value) pairs where keys are strings 27 | that may represent integers. 28 | 29 | Returns: 30 | dict: A dictionary with keys converted to integers where applicable. 31 | """ 32 | # Initialize an empty dictionary to hold the converted key-value pairs. 33 | result_dict = {} 34 | # Iterate through each key-value pair in the input list. 35 | for key, value in d: 36 | # Check if the key is a digit (i.e., represents an integer). 37 | if key.isdigit(): 38 | # Convert the key from a string to an integer. 39 | key = int(key) 40 | result_dict[key] = value 41 | return result_dict 42 | 43 | 44 | class ModelParameters(object): 45 | """ 46 | A class to manage model parameters, including loading from a configuration file. 47 | 48 | Attributes: 49 | param (dict): Dictionary holding all parameters for the model. 50 | """ 51 | 52 | def __init__(self, config_path=""): 53 | """ 54 | Initializes the ModelParameters object by loading parameters from a JSON configuration file. 55 | 56 | Args: 57 | config_path (str): Path to the JSON configuration file. 58 | """ 59 | 60 | # Load parameters from the given configuration file path. 
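# object_pairs_hook=int_keys (defined above) converts the numeric band keys ("1", "2", ...) back into integers while leaving ordinary string keys untouched.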
61 | with open(config_path, "r") as f: 62 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 63 | 64 | # Ensure certain parameters are set to False if not specified in the configuration. 65 | for k in [ 66 | "mid_side", 67 | "mid_side_b", 68 | "mid_side_b2", 69 | "stereo_w", 70 | "stereo_n", 71 | "reverse", 72 | ]: 73 | if not k in self.param: 74 | self.param[k] = False 75 | 76 | # If 'n_bins' is specified in the parameters, it's used as the value for 'bins'. 77 | if N_BINS in self.param: 78 | self.param["bins"] = self.param[N_BINS] 79 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- 
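The band-parameter files in this directory are consumed through the `ModelParameters` class from `model_param_init.py` shown above. A minimal, hypothetical sketch of that usage follows; the import path and file location are assumptions and should be adjusted to wherever the package actually lives:

    # Assumed import path and relative file location, for illustration only.
    from uvr.uvr_lib_v5.vr_network.model_param_init import ModelParameters

    mp = ModelParameters("modelparams/1band_sr44100_hl512.json")
    print(mp.param["sr"])               # 44100
    print(mp.param["band"][1]["hl"])    # 512 -- band keys arrive as ints via int_keys
    print(mp.param["mid_side"])         # False -- defaulted by the constructor when absent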
/UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_nf1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 512, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 511, 18 | "pre_filter_stop": 512 19 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | 
"1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 
639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | 
"res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | 
"hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 
| "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/4band_v3_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_bins": 672, 3 | "unstable_bins": 8, 4 | "stable_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/nets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from . 
import layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | """ 10 | BaseASPPNet Class: 11 | This class defines the base architecture for an Atrous Spatial Pyramid Pooling (ASPP) network. 12 | It is designed to extract features from input data at multiple scales by using dilated convolutions. 13 | This is particularly useful for tasks that benefit from understanding context at different resolutions, 14 | such as semantic segmentation. The network consists of a series of encoder layers for downsampling and feature extraction, 15 | followed by an ASPP module for multi-scale feature extraction, and finally a series of decoder layers for upsampling. 16 | """ 17 | 18 | def __init__(self, nn_architecture, nin, ch, dilations=(4, 8, 16)): 19 | super(BaseASPPNet, self).__init__() 20 | self.nn_architecture = nn_architecture 21 | 22 | # Encoder layers progressively increase the number of channels while reducing spatial dimensions. 23 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 24 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 25 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 26 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 27 | 28 | # Depending on the network architecture, an additional encoder layer and a specific ASPP module are initialized. 29 | if self.nn_architecture == 129605: 30 | self.enc5 = layers.Encoder(ch * 8, ch * 16, 3, 2, 1) 31 | self.aspp = layers.ASPPModule(nn_architecture, ch * 16, ch * 32, dilations) 32 | self.dec5 = layers.Decoder(ch * (16 + 32), ch * 16, 3, 1, 1) 33 | else: 34 | self.aspp = layers.ASPPModule(nn_architecture, ch * 8, ch * 16, dilations) 35 | 36 | # Decoder layers progressively decrease the number of channels while increasing spatial dimensions. 37 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 38 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 39 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 40 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 41 | 42 | def __call__(self, input_tensor): 43 | # The input tensor is passed through a series of encoder layers. 44 | hidden_state, encoder_output1 = self.enc1(input_tensor) 45 | hidden_state, encoder_output2 = self.enc2(hidden_state) 46 | hidden_state, encoder_output3 = self.enc3(hidden_state) 47 | hidden_state, encoder_output4 = self.enc4(hidden_state) 48 | 49 | # Depending on the network architecture, the hidden state is processed by an additional encoder layer and the ASPP module. 50 | if self.nn_architecture == 129605: 51 | hidden_state, encoder_output5 = self.enc5(hidden_state) 52 | hidden_state = self.aspp(hidden_state) 53 | # The decoder layers use skip connections from the encoder layers for better feature integration. 54 | hidden_state = self.dec5(hidden_state, encoder_output5) 55 | else: 56 | hidden_state = self.aspp(hidden_state) 57 | 58 | # The hidden state is further processed by the decoder layers, using skip connections for feature integration. 59 | hidden_state = self.dec4(hidden_state, encoder_output4) 60 | hidden_state = self.dec3(hidden_state, encoder_output3) 61 | hidden_state = self.dec2(hidden_state, encoder_output2) 62 | hidden_state = self.dec1(hidden_state, encoder_output1) 63 | 64 | return hidden_state 65 | 66 | 67 | def determine_model_capacity(n_fft_bins, nn_architecture): 68 | """ 69 | The determine_model_capacity function is designed to select the appropriate model configuration 70 | based on the frequency bins and network architecture. 
It maps specific architectures to predefined 71 | model capacities, which dictate the structure and parameters of the CascadedASPPNet model. 72 | """ 73 | 74 | # Predefined model architectures categorized by their precision level. 75 | sp_model_arch = [31191, 33966, 129605] 76 | hp_model_arch = [123821, 123812] 77 | hp2_model_arch = [537238, 537227] 78 | 79 | # Mapping network architectures to their corresponding model capacity data. 80 | if nn_architecture in sp_model_arch: 81 | model_capacity_data = [ 82 | (2, 16), 83 | (2, 16), 84 | (18, 8, 1, 1, 0), 85 | (8, 16), 86 | (34, 16, 1, 1, 0), 87 | (16, 32), 88 | (32, 2, 1), 89 | (16, 2, 1), 90 | (16, 2, 1), 91 | ] 92 | 93 | if nn_architecture in hp_model_arch: 94 | model_capacity_data = [ 95 | (2, 32), 96 | (2, 32), 97 | (34, 16, 1, 1, 0), 98 | (16, 32), 99 | (66, 32, 1, 1, 0), 100 | (32, 64), 101 | (64, 2, 1), 102 | (32, 2, 1), 103 | (32, 2, 1), 104 | ] 105 | 106 | if nn_architecture in hp2_model_arch: 107 | model_capacity_data = [ 108 | (2, 64), 109 | (2, 64), 110 | (66, 32, 1, 1, 0), 111 | (32, 64), 112 | (130, 64, 1, 1, 0), 113 | (64, 128), 114 | (128, 2, 1), 115 | (64, 2, 1), 116 | (64, 2, 1), 117 | ] 118 | 119 | # Initializing the CascadedASPPNet model with the selected model capacity data. 120 | cascaded = CascadedASPPNet 121 | model = cascaded(n_fft_bins, model_capacity_data, nn_architecture) 122 | 123 | return model 124 | 125 | 126 | class CascadedASPPNet(nn.Module): 127 | """ 128 | CascadedASPPNet Class: 129 | This class implements a cascaded version of the ASPP network, designed for processing audio signals 130 | for tasks such as vocal removal. It consists of multiple stages, each with its own ASPP network, 131 | to process different frequency bands of the input signal. This allows the model to effectively 132 | handle the full spectrum of audio frequencies by focusing on different frequency bands separately. 133 | """ 134 | 135 | def __init__(self, n_fft, model_capacity_data, nn_architecture): 136 | super(CascadedASPPNet, self).__init__() 137 | # The first stage processes the low and high frequency bands separately. 138 | self.stg1_low_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[0]) 139 | self.stg1_high_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[1]) 140 | 141 | # Bridge layers connect different stages of the network. 142 | self.stg2_bridge = layers.Conv2DBNActiv(*model_capacity_data[2]) 143 | self.stg2_full_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[3]) 144 | 145 | self.stg3_bridge = layers.Conv2DBNActiv(*model_capacity_data[4]) 146 | self.stg3_full_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[5]) 147 | 148 | # Output layers for the final mask prediction and auxiliary outputs. 149 | self.out = nn.Conv2d(*model_capacity_data[6], bias=False) 150 | self.aux1_out = nn.Conv2d(*model_capacity_data[7], bias=False) 151 | self.aux2_out = nn.Conv2d(*model_capacity_data[8], bias=False) 152 | 153 | # Parameters for handling the frequency bins of the input signal. 154 | self.max_bin = n_fft // 2 155 | self.output_bin = n_fft // 2 + 1 156 | 157 | self.offset = 128 158 | 159 | def forward(self, input_tensor): 160 | # The forward pass processes the input tensor through each stage of the network, 161 | # combining the outputs of different frequency bands and stages to produce the final mask. 162 | mix = input_tensor.detach() 163 | input_tensor = input_tensor.clone() 164 | 165 | # Preparing the input tensor by selecting the maximum frequency bin. 
166 | input_tensor = input_tensor[:, :, : self.max_bin] 167 | 168 | # Processing the low and high frequency bands separately in the first stage. 169 | bandwidth = input_tensor.size()[2] // 2 170 | aux1 = torch.cat( 171 | [ 172 | self.stg1_low_band_net(input_tensor[:, :, :bandwidth]), 173 | self.stg1_high_band_net(input_tensor[:, :, bandwidth:]), 174 | ], 175 | dim=2, 176 | ) 177 | 178 | # Combining the outputs of the first stage and passing through the second stage. 179 | hidden_state = torch.cat([input_tensor, aux1], dim=1) 180 | aux2 = self.stg2_full_band_net(self.stg2_bridge(hidden_state)) 181 | 182 | # Further processing the combined outputs through the third stage. 183 | hidden_state = torch.cat([input_tensor, aux1, aux2], dim=1) 184 | hidden_state = self.stg3_full_band_net(self.stg3_bridge(hidden_state)) 185 | 186 | # Applying the final output layer to produce the mask. 187 | mask = torch.sigmoid(self.out(hidden_state)) 188 | 189 | # Padding the mask to match the output frequency bin size. 190 | mask = F.pad( 191 | input=mask, 192 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 193 | mode="replicate", 194 | ) 195 | 196 | # During training, auxiliary outputs are also produced and padded accordingly. 197 | if self.training: 198 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 199 | aux1 = F.pad( 200 | input=aux1, 201 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 202 | mode="replicate", 203 | ) 204 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 205 | aux2 = F.pad( 206 | input=aux2, 207 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 208 | mode="replicate", 209 | ) 210 | return mask * mix, aux1 * mix, aux2 * mix 211 | else: 212 | return mask # * mix 213 | 214 | def predict_mask(self, input_tensor): 215 | # This method predicts the mask for the input tensor by calling the forward method 216 | # and applying any necessary padding adjustments. 217 | mask = self.forward(input_tensor) 218 | 219 | # Adjusting the mask by removing padding offsets if present. 220 | if self.offset > 0: 221 | mask = mask[:, :, :, self.offset : -self.offset] 222 | 223 | return mask 224 | -------------------------------------------------------------------------------- /UVR/uvr/uvr_lib_v5/vr_network/nets_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from . import layers_new as layers 5 | 6 | 7 | class BaseNet(nn.Module): 8 | """ 9 | BaseNet Class: 10 | This class defines the base network architecture for vocal removal. It includes a series of encoders for feature extraction, 11 | an ASPP module for capturing multi-scale context, and a series of decoders for reconstructing the output. Additionally, 12 | it incorporates an LSTM module for capturing temporal dependencies. 13 | """ 14 | 15 | def __init__( 16 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) 17 | ): 18 | super(BaseNet, self).__init__() 19 | # Initialize the encoder layers with increasing output channels for hierarchical feature extraction. 20 | self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1) 21 | self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1) 22 | self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1) 23 | self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1) 24 | self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1) 25 | 26 | # ASPP module for capturing multi-scale features with different dilation rates. 
27 | self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) 28 | 29 | # Decoder layers for upscaling and merging features from different levels of the encoder and ASPP module. 30 | self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) 31 | self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) 32 | self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) 33 | 34 | # LSTM module for capturing temporal dependencies in the sequence of features. 35 | self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm) 36 | self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) 37 | 38 | def __call__(self, input_tensor): 39 | # Sequentially pass the input through the encoder layers. 40 | encoded1 = self.enc1(input_tensor) 41 | encoded2 = self.enc2(encoded1) 42 | encoded3 = self.enc3(encoded2) 43 | encoded4 = self.enc4(encoded3) 44 | encoded5 = self.enc5(encoded4) 45 | 46 | # Pass the deepest encoder output through the ASPP module. 47 | bottleneck = self.aspp(encoded5) 48 | 49 | # Sequentially upscale and merge the features using the decoder layers. 50 | bottleneck = self.dec4(bottleneck, encoded4) 51 | bottleneck = self.dec3(bottleneck, encoded3) 52 | bottleneck = self.dec2(bottleneck, encoded2) 53 | # Concatenate the LSTM module output for temporal feature enhancement. 54 | bottleneck = torch.cat([bottleneck, self.lstm_dec2(bottleneck)], dim=1) 55 | bottleneck = self.dec1(bottleneck, encoded1) 56 | 57 | return bottleneck 58 | 59 | 60 | class CascadedNet(nn.Module): 61 | """ 62 | CascadedNet Class: 63 | This class defines a cascaded network architecture that processes input in multiple stages, each stage focusing on different frequency bands. 64 | It utilizes the BaseNet for processing, and combines outputs from different stages to produce the final mask for vocal removal. 65 | """ 66 | 67 | def __init__(self, n_fft, nn_arch_size=51000, nout=32, nout_lstm=128): 68 | super(CascadedNet, self).__init__() 69 | # Calculate frequency bins based on FFT size. 70 | self.max_bin = n_fft // 2 71 | self.output_bin = n_fft // 2 + 1 72 | self.nin_lstm = self.max_bin // 2 73 | self.offset = 64 74 | # Adjust output channels based on the architecture size. 75 | nout = 64 if nn_arch_size == 218409 else nout 76 | 77 | # print(nout, nout_lstm, n_fft) 78 | 79 | # Initialize the network stages, each focusing on different frequency bands and progressively refining the output. 80 | self.stg1_low_band_net = nn.Sequential( 81 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), 82 | layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), 83 | ) 84 | self.stg1_high_band_net = BaseNet( 85 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 86 | ) 87 | 88 | self.stg2_low_band_net = nn.Sequential( 89 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), 90 | layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), 91 | ) 92 | self.stg2_high_band_net = BaseNet( 93 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 94 | ) 95 | 96 | self.stg3_full_band_net = BaseNet( 97 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm 98 | ) 99 | 100 | # Output layer for generating the final mask. 101 | self.out = nn.Conv2d(nout, 2, 1, bias=False) 102 | # Auxiliary output layer for intermediate supervision during training. 103 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) 104 | 105 | def forward(self, input_tensor): 106 | # Preprocess input tensor to match the maximum frequency bin. 
107 | input_tensor = input_tensor[:, :, : self.max_bin] 108 | 109 | # Split the input into low and high frequency bands. 110 | bandw = input_tensor.size()[2] // 2 111 | l1_in = input_tensor[:, :, :bandw] 112 | h1_in = input_tensor[:, :, bandw:] 113 | 114 | # Process each band through the first stage networks. 115 | l1 = self.stg1_low_band_net(l1_in) 116 | h1 = self.stg1_high_band_net(h1_in) 117 | 118 | # Combine the outputs for auxiliary supervision. 119 | aux1 = torch.cat([l1, h1], dim=2) 120 | 121 | # Prepare inputs for the second stage by concatenating the original and processed bands. 122 | l2_in = torch.cat([l1_in, l1], dim=1) 123 | h2_in = torch.cat([h1_in, h1], dim=1) 124 | 125 | # Process through the second stage networks. 126 | l2 = self.stg2_low_band_net(l2_in) 127 | h2 = self.stg2_high_band_net(h2_in) 128 | 129 | # Combine the outputs for auxiliary supervision. 130 | aux2 = torch.cat([l2, h2], dim=2) 131 | 132 | # Prepare input for the third stage by concatenating all previous outputs with the original input. 133 | f3_in = torch.cat([input_tensor, aux1, aux2], dim=1) 134 | 135 | # Process through the third stage network. 136 | f3 = self.stg3_full_band_net(f3_in) 137 | 138 | # Apply the output layer to generate the final mask and apply sigmoid for normalization. 139 | mask = torch.sigmoid(self.out(f3)) 140 | 141 | # Pad the mask to match the output frequency bin size. 142 | mask = F.pad( 143 | input=mask, 144 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 145 | mode="replicate", 146 | ) 147 | 148 | # During training, generate and pad the auxiliary output for additional supervision. 149 | if self.training: 150 | aux = torch.cat([aux1, aux2], dim=1) 151 | aux = torch.sigmoid(self.aux_out(aux)) 152 | aux = F.pad( 153 | input=aux, 154 | pad=(0, 0, 0, self.output_bin - aux.size()[2]), 155 | mode="replicate", 156 | ) 157 | return mask, aux 158 | else: 159 | return mask 160 | 161 | # Method for predicting the mask given an input tensor. 162 | def predict_mask(self, input_tensor): 163 | mask = self.forward(input_tensor) 164 | 165 | # If an offset is specified, crop the mask to remove edge artifacts. 166 | if self.offset > 0: 167 | mask = mask[:, :, :, self.offset : -self.offset] 168 | assert mask.size()[3] > 0 169 | 170 | return mask 171 | 172 | # Method for applying the predicted mask to the input tensor to obtain the predicted magnitude. 173 | def predict(self, input_tensor): 174 | mask = self.forward(input_tensor) 175 | pred_mag = input_tensor * mask 176 | 177 | # If an offset is specified, crop the predicted magnitude to remove edge artifacts. 178 | if self.offset > 0: 179 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset] 180 | assert pred_mag.size()[3] > 0 181 | 182 | return pred_mag 183 | --------------------------------------------------------------------------------
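The two network files above (`nets.py` and `nets_new.py`) are plain PyTorch modules, so they can be exercised directly. Below is a minimal, hypothetical sketch — not code from this repository — showing roughly how `CascadedNet` could be instantiated and asked for a mask on a dummy spectrogram. The import paths, FFT size, and tensor shape are assumptions inferred from the directory layout and the code above; in practice the separator classes under `UVR/uvr/architectures` (e.g. `vr_separator.py`) appear to handle model construction, parameter selection from the modelparams JSON files, and weight loading.

```python
# Hypothetical usage sketch (assumes the UVR/uvr package directory is on sys.path
# so that the relative imports inside vr_network, such as layers_new, resolve).
import torch

from uvr.uvr_lib_v5.vr_network.nets_new import CascadedNet
from uvr.uvr_lib_v5.vr_network.nets import determine_model_capacity

n_fft = 2048                      # assumed FFT size; real values come from a modelparams JSON
model = CascadedNet(n_fft=n_fft)  # defaults: nn_arch_size=51000, nout=32, nout_lstm=128
model.eval()                      # outside training, forward() returns only the mask

# Dummy magnitude spectrogram: (batch, channels, freq_bins, time_frames).
# The network keeps only the first n_fft // 2 frequency bins internally.
spectrogram = torch.rand(1, 2, n_fft // 2 + 1, 256)

with torch.no_grad():
    mask = model.predict_mask(spectrogram)  # self.offset frames are cropped from each end

print(mask.shape)  # torch.Size([1, 2, 1025, 128]) with the default offset of 64

# The older VR architecture in nets.py is assembled the same way, but through
# determine_model_capacity, which maps an nn_architecture id to preset layer widths.
hp_model = determine_model_capacity(n_fft_bins=2048, nn_architecture=123821)
```

The sketch only illustrates the tensor shapes the networks expect; loading a pretrained `.pth` checkpoint from `models/download_checks.json` and converting audio to and from spectrograms is left to the surrounding UVR plugin code.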