├── .gitignore ├── README.md ├── audio.aac ├── download.sh ├── requirements.txt ├── separate.py ├── uvr5_pack ├── lib_v5 │ ├── dataset.py │ ├── layers.py │ ├── layers_123812KB .py │ ├── layers_123821KB.py │ ├── layers_33966KB.py │ ├── layers_537227KB.py │ ├── layers_537238KB.py │ ├── model_param_init.py │ ├── modelparams │ │ ├── 1band_sr16000_hl512.json │ │ ├── 1band_sr32000_hl512.json │ │ ├── 1band_sr33075_hl384.json │ │ ├── 1band_sr44100_hl1024.json │ │ ├── 1band_sr44100_hl256.json │ │ ├── 1band_sr44100_hl512.json │ │ ├── 1band_sr44100_hl512_cut.json │ │ ├── 2band_32000.json │ │ ├── 2band_44100_lofi.json │ │ ├── 2band_48000.json │ │ ├── 3band_44100.json │ │ ├── 3band_44100_mid.json │ │ ├── 3band_44100_msb2.json │ │ ├── 4band_44100.json │ │ ├── 4band_44100_mid.json │ │ ├── 4band_44100_msb.json │ │ ├── 4band_44100_msb2.json │ │ ├── 4band_44100_reverse.json │ │ ├── 4band_44100_sw.json │ │ ├── 4band_v2.json │ │ ├── 4band_v2_sn.json │ │ └── ensemble.json │ ├── nets.py │ ├── nets_123812KB.py │ ├── nets_123821KB.py │ ├── nets_33966KB.py │ ├── nets_537227KB.py │ ├── nets_537238KB.py │ ├── nets_61968KB.py │ └── spec_utils.py └── utils.py └── uvr5_weights └── .gitkeep /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | *.pth 163 | # General 164 | .DS_Store 165 | .AppleDouble 166 | .LSOverride 167 | 168 | # Icon must end with two \r 169 | Icon 170 | 171 | 172 | # Thumbnails 173 | ._* 174 | 175 | # Files that might appear in the root of a volume 176 | .DocumentRevisions-V100 177 | .fseventsd 178 | .Spotlight-V100 179 | .TemporaryItems 180 | .Trashes 181 | .VolumeIcon.icns 182 | .com.apple.timemachine.donotpresent 183 | 184 | # Directories potentially created on remote AFP share 185 | .AppleDB 186 | .AppleDesktop 187 | Network Trash Folder 188 | Temporary Items 189 | .apdisk 190 | 191 | opt/ 192 | models/ 193 | models.zip -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 📦 Check out my new library called [Vocal](https://github.com/seanghay/vocal) for **Vocal/Speech Separation** built with simplicity in mind. 2 | 3 | --- 4 | 5 | # Ultimate Vocal Remover CLI 6 | 7 | [[Colab]](https://colab.research.google.com/drive/1VDncdndceKanFrs2LU-LM4Odv8tnPkzD?usp=sharing) 8 | 9 | ⚠️ Before running this project, make sure you have installed `torch`, `torchaudio`. Please check out the PyTorch documentation. 10 | 11 | ⚠️ Also make sure you have `libsndfile` and `ffmpeg` installed. 12 | 13 | ⚠️ This project currently works on CUDA. 
#!/usr/bin/env bash
# Download the UVR5 model weights that separate.py loads from
# uvr5_weights/2_HP-UVR.pth. Does nothing when the file is already present.
set -euo pipefail

WEIGHTS_DIR="uvr5_weights"
WEIGHTS_FILE="${WEIGHTS_DIR}/2_HP-UVR.pth"
WEIGHTS_URL="https://huggingface.co/fastrolling/uvr/resolve/main/Main_Models/2_HP-UVR.pth"

if [ ! -f "${WEIGHTS_FILE}" ]; then
  echo "Download the model weights"
  mkdir -p "${WEIGHTS_DIR}"

  # Fetch into a temporary name first: `wget -O` creates the output file even
  # when the transfer fails, and a truncated file under the final name would
  # satisfy the `-f` check above on the next run.
  # NOTE: the previous command also passed a bare `2_HP-UVR.pth` argument,
  # which wget interprets as an additional URL to fetch; it has been removed.
  if ! wget -q -O "${WEIGHTS_FILE}.part" "${WEIGHTS_URL}"; then
    rm -f "${WEIGHTS_FILE}.part"
    echo "Failed to download the model weights from ${WEIGHTS_URL}" >&2
    exit 1
  fi
  mv "${WEIGHTS_FILE}.part" "${WEIGHTS_FILE}"
fi

echo "The model weights have been downloaded"
= '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size))) 33 | nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None) 34 | model_hash = hashlib.md5(open(model_path,'rb').read()).hexdigest() 35 | param_name ,model_params_d = _get_name_params(model_path , model_hash) 36 | 37 | mp = ModelParameters(model_params_d) 38 | model = nets.CascadedASPPNet(mp.param['bins'] * 2) 39 | cpk = torch.load( model_path , map_location='cpu') 40 | model.load_state_dict(cpk) 41 | model.eval() 42 | if(is_half==True):model = model.half().to(device) 43 | else:model = model.to(device) 44 | 45 | self.mp = mp 46 | self.model = model 47 | 48 | def _path_audio_(self, music_file ,ins_root=None,vocal_root=None): 49 | if(ins_root is None and vocal_root is None):return "No save root." 50 | name=os.path.basename(music_file) 51 | if(ins_root is not None):os.makedirs(ins_root, exist_ok=True) 52 | if(vocal_root is not None):os.makedirs(vocal_root , exist_ok=True) 53 | X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} 54 | bands_n = len(self.mp.param['band']) 55 | # print(bands_n) 56 | for d in range(bands_n, 0, -1): 57 | bp = self.mp.param['band'][d] 58 | if d == bands_n: # high-end band 59 | X_wave[d], _ = librosa.core.load( 60 | music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type']) 61 | if X_wave[d].ndim == 1: 62 | X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) 63 | else: # lower bands 64 | X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type']) 65 | # Stft of wave source 66 | X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse']) 67 | # pdb.set_trace() 68 | if d == bands_n and self.data['high_end_process'] != 'none': 69 | input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + ( self.mp.param['pre_filter_stop'] - 
self.mp.param['pre_filter_start']) 70 | input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :] 71 | 72 | X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) 73 | aggresive_set = float(self.data['agg']/100) 74 | aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']} 75 | with torch.no_grad(): 76 | pred, X_mag, X_phase = inference(X_spec_m,self.device,self.model, aggressiveness,self.data) 77 | # Postprocess 78 | if self.data['postprocess']: 79 | pred_inv = np.clip(X_mag - pred, 0, np.inf) 80 | pred = spec_utils.mask_silence(pred, pred_inv) 81 | y_spec_m = pred * X_phase 82 | v_spec_m = X_spec_m - y_spec_m 83 | 84 | if (ins_root is not None): 85 | if self.data['high_end_process'].startswith('mirroring'): 86 | input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp) 87 | wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp,input_high_end_h, input_high_end_) 88 | else: 89 | wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) 90 | print ('%s instruments done'%name) 91 | wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype("int16")) # 92 | if (vocal_root is not None): 93 | if self.data['high_end_process'].startswith('mirroring'): 94 | input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp) 95 | wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_) 96 | else: 97 | wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) 98 | print ('%s vocals done'%name) 99 | wavfile.write(os.path.join(vocal_root , 'vocal_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype("int16")) 100 | 101 | if __name__ == '__main__': 102 | device = 'cuda' 103 | is_half=True 104 | model_path='uvr5_weights/2_HP-UVR.pth' 105 | pre_fun 
def make_pair(mix_dir, inst_dir):
    """Pair mixture files with instrumental files by sorted order.

    Both directories are listed, filtered to the supported audio extensions,
    sorted, and zipped together.

    NOTE(review): pairing is purely positional; if the two directories hold a
    different number of matching files, ``zip`` silently drops the surplus.

    Returns:
        A list of ``(mixture_path, instrument_path)`` tuples.
    """
    input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac']

    def _audio_files(directory):
        return sorted(
            os.path.join(directory, fname)
            for fname in os.listdir(directory)
            if os.path.splitext(fname)[1] in input_exts)

    return list(zip(_audio_files(mix_dir), _audio_files(inst_dir)))


def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
    """Split a dataset directory into training and validation file pairs.

    Args:
        dataset_dir: Root directory of the dataset.
        split_mode: ``'random'`` reads ``mixtures``/``instruments`` under
            ``dataset_dir`` and shuffles them; ``'subdirs'`` reads the
            ``training`` and ``validation`` sub-directories as-is.
        val_rate: Fraction of the pairs used for validation in ``'random'``
            mode when ``val_filelist`` is empty.
        val_filelist: Pre-selected validation pairs. In ``'random'`` mode any
            pair found in it is excluded from training and the list itself is
            returned unchanged. Must be empty in ``'subdirs'`` mode.

    Returns:
        ``(train_filelist, val_filelist)``.

    Raises:
        ValueError: If ``val_filelist`` is given in ``'subdirs'`` mode, or if
            ``split_mode`` is not one of the supported values.
    """
    if split_mode == 'random':
        filelist = make_pair(
            os.path.join(dataset_dir, 'mixtures'),
            os.path.join(dataset_dir, 'instruments'))

        random.shuffle(filelist)

        if len(val_filelist) == 0:
            val_size = int(len(filelist) * val_rate)
            # Slice with a positive index: ``filelist[:-0]`` is empty and
            # ``filelist[-0:]`` is the whole list, so negative slicing would
            # move every pair into validation whenever val_size rounds to 0.
            split_at = len(filelist) - val_size
            train_filelist = filelist[:split_at]
            val_filelist = filelist[split_at:]
        else:
            train_filelist = [
                pair for pair in filelist
                if list(pair) not in val_filelist]
    elif split_mode == 'subdirs':
        if len(val_filelist) != 0:
            raise ValueError('The `val_filelist` option is not available in `subdirs` mode')

        train_filelist = make_pair(
            os.path.join(dataset_dir, 'training/mixtures'),
            os.path.join(dataset_dir, 'training/instruments'))

        val_filelist = make_pair(
            os.path.join(dataset_dir, 'validation/mixtures'),
            os.path.join(dataset_dir, 'validation/instruments'))
    else:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError('Unknown split_mode: {!r}'.format(split_mode))

    return train_filelist, val_filelist
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
    """Cut every validation pair into fixed-width patches cached on disk.

    Patches are stored as ``.npz`` files (keys ``X`` and ``y``) in a directory
    named after the patch parameters, relative to the current working
    directory. A patch file that already exists is reused, not rewritten.

    Args:
        filelist: Iterable of ``(mixture_path, instrument_path)`` pairs.
        cropsize: Width of each saved patch along the last axis.
        sr, hop_length, n_fft: Forwarded to ``spec_utils.cache_or_load``.
        offset: Left padding passed to ``make_padding``.

    Returns:
        A ``VocalRemoverValidationSet`` over the patch file paths.
    """
    patch_list = []
    patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset)
    os.makedirs(patch_dir, exist_ok=True)

    for X_path, y_path in tqdm(filelist):
        basename = os.path.splitext(os.path.basename(X_path))[0]

        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
        # Normalise both spectrograms by their shared peak magnitude.
        coef = np.max([np.abs(X).max(), np.abs(y).max()])
        if coef > 0:
            X, y = X / coef, y / coef
        # else: both inputs are all zeros; dividing would store NaN patches,
        # so they are left as zeros.

        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')

        # Patches start every ``roi_size`` frames but are ``cropsize`` wide.
        len_dataset = int(np.ceil(X.shape[2] / roi_size))
        for j in range(len_dataset):
            outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j))
            start = j * roi_size
            if not os.path.exists(outpath):
                np.savez(
                    outpath,
                    X=X_pad[:, :, start:start + cropsize],
                    y=y_pad[:, :, start:start + cropsize])
            patch_list.append(outpath)

    return VocalRemoverValidationSet(patch_list)
class Conv2DBNActiv(nn.Module):
    """Conv2d (no bias) -> BatchNorm2d -> activation."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        # The attribute name ``conv`` is part of the state_dict keys, so it
        # must not be renamed if existing checkpoints are to keep loading.
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin, nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False),
            nn.BatchNorm2d(nout),
            activ()
        )

    # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__ still
    # runs and registered hooks fire; ``module(x)`` works exactly as before.
    def forward(self, x):
        return self.conv(x)


class SeperableConv2DBNActiv(nn.Module):
    """Depthwise Conv2d -> 1x1 pointwise Conv2d -> BatchNorm2d -> activation."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            # groups=nin makes this a per-channel (depthwise) convolution.
            nn.Conv2d(
                nin, nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False),
            nn.Conv2d(
                nin, nout,
                kernel_size=1,
                bias=False),
            nn.BatchNorm2d(nout),
            activ()
        )

    def forward(self, x):
        return self.conv(x)


class Encoder(nn.Module):
    """Two stacked Conv2DBNActiv layers; only the second applies ``stride``."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the strided output and the pre-stride features."""
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip


class Decoder(nn.Module):
    """2x bilinear upsampling, optional skip concatenation, then Conv2DBNActiv."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
        super(Decoder, self).__init__()
        # NOTE: ``stride`` is accepted but not used; the convolution always
        # runs with stride 1.
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
        if skip is not None:
            # presumably crops ``skip`` to match ``x`` before concatenation -
            # see spec_utils.crop_center
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h


class ASPPModule(nn.Module):
    """Five parallel branches concatenated on the channel axis, then a 1x1 bottleneck.

    Branches: pooled context (``conv1``), a 1x1 convolution (``conv2``) and
    three dilated separable convolutions (``conv3``-``conv5``), one per entry
    of ``dilations``.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            # Averages over the height axis only; width is left unchanged.
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # pad == dilation with a 3x3 kernel keeps the spatial size unchanged.
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # Stretch the pooled branch back to the input's height and width.
        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
        bottle = self.bottleneck(out)
        return bottle
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise Conv2d -> 1x1 pointwise Conv2d -> BatchNorm2d -> activation."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        # The attribute name ``conv`` is part of the state_dict keys, so it
        # must not be renamed if existing checkpoints are to keep loading.
        self.conv = nn.Sequential(
            # groups=nin makes this a per-channel (depthwise) convolution.
            nn.Conv2d(
                nin, nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False),
            nn.Conv2d(
                nin, nout,
                kernel_size=1,
                bias=False),
            nn.BatchNorm2d(nout),
            activ()
        )

    # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__ still
    # runs and registered hooks fire; ``module(x)`` works exactly as before.
    def forward(self, x):
        return self.conv(x)


class Encoder(nn.Module):
    """Two stacked Conv2DBNActiv layers; only the second applies ``stride``."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the strided output and the pre-stride features."""
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip


class Decoder(nn.Module):
    """2x bilinear upsampling, optional skip concatenation, then Conv2DBNActiv."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
        super(Decoder, self).__init__()
        # NOTE: ``stride`` is accepted but not used; the convolution always
        # runs with stride 1.
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
        if skip is not None:
            # presumably crops ``skip`` to match ``x`` before concatenation -
            # see spec_utils.crop_center
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h


class ASPPModule(nn.Module):
    """Five parallel branches concatenated on the channel axis, then a 1x1 bottleneck.

    Branches: pooled context (``conv1``), a 1x1 convolution (``conv2``) and
    three dilated separable convolutions (``conv3``-``conv5``), one per entry
    of ``dilations``.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            # Averages over the height axis only; width is left unchanged.
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # pad == dilation with a 3x3 kernel keeps the spatial size unchanged.
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # Stretch the pooled branch back to the input's height and width.
        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
        bottle = self.bottleneck(out)
        return bottle
class Encoder(nn.Module):
    """Two stacked Conv2DBNActiv layers; only the second applies ``stride``."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__ still
    # runs and registered hooks fire; ``module(x)`` works exactly as before.
    def forward(self, x):
        """Return ``(h, skip)``: the strided output and the pre-stride features."""
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip


class Decoder(nn.Module):
    """2x bilinear upsampling, optional skip concatenation, then Conv2DBNActiv."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
        super(Decoder, self).__init__()
        # NOTE: ``stride`` is accepted but not used; the convolution always
        # runs with stride 1.
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
        if skip is not None:
            # presumably crops ``skip`` to match ``x`` before concatenation -
            # see spec_utils.crop_center
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
97 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 98 | self.conv4 = SeperableConv2DBNActiv( 99 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 102 | self.bottleneck = nn.Sequential( 103 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), 104 | nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True) 110 | feat2 = self.conv2(x) 111 | feat3 = self.conv3(x) 112 | feat4 = self.conv4(x) 113 | feat5 = self.conv5(x) 114 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 115 | bottle = self.bottleneck(out) 116 | return bottle 117 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from uvr5_pack.lib_v5 import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | 10 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 11 | super(Conv2DBNActiv, self).__init__() 12 | self.conv = nn.Sequential( 13 | nn.Conv2d( 14 | nin, nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False), 20 | nn.BatchNorm2d(nout), 21 | activ() 22 | ) 23 | 24 | def __call__(self, x): 25 | return self.conv(x) 26 | 27 | 28 | class SeperableConv2DBNActiv(nn.Module): 29 | 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, nin, 35 | kernel_size=ksize, 36 | stride=stride, 37 | padding=pad, 38 | dilation=dilation, 39 | groups=nin, 40 | bias=False), 41 | nn.Conv2d( 42 | nin, nout, 43 | 
kernel_size=1, 44 | bias=False), 45 | nn.BatchNorm2d(nout), 46 | activ() 47 | ) 48 | 49 | def __call__(self, x): 50 | return self.conv(x) 51 | 52 | 53 | class Encoder(nn.Module): 54 | 55 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 56 | super(Encoder, self).__init__() 57 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 58 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 59 | 60 | def __call__(self, x): 61 | skip = self.conv1(x) 62 | h = self.conv2(skip) 63 | 64 | return h, skip 65 | 66 | 67 | class Decoder(nn.Module): 68 | 69 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 70 | super(Decoder, self).__init__() 71 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 72 | self.dropout = nn.Dropout2d(0.1) if dropout else None 73 | 74 | def __call__(self, x, skip=None): 75 | x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) 76 | if skip is not None: 77 | skip = spec_utils.crop_center(skip, x) 78 | x = torch.cat([x, skip], dim=1) 79 | h = self.conv(x) 80 | 81 | if self.dropout is not None: 82 | h = self.dropout(h) 83 | 84 | return h 85 | 86 | 87 | class ASPPModule(nn.Module): 88 | 89 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 90 | super(ASPPModule, self).__init__() 91 | self.conv1 = nn.Sequential( 92 | nn.AdaptiveAvgPool2d((1, None)), 93 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | ) 95 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 96 | self.conv3 = SeperableConv2DBNActiv( 97 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 98 | self.conv4 = SeperableConv2DBNActiv( 99 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 102 | self.conv6 = SeperableConv2DBNActiv( 103 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 104 | self.conv7 
class Conv2DBNActiv(nn.Module):
    """Bias-free ``Conv2d`` followed by ``BatchNorm2d`` and an activation.

    Args:
        nin: number of input channels.
        nout: number of output channels.
        ksize: convolution kernel size.
        stride: convolution stride.
        pad: convolution padding.
        dilation: convolution dilation.
        activ: activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin, nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False),
            nn.BatchNorm2d(nout),
            activ()
        )

    # Implemented as ``forward`` (not ``__call__``): overriding ``__call__``
    # bypasses nn.Module's call machinery, so registered hooks never ran and
    # ``module.forward(x)`` raised NotImplementedError.  ``module(x)`` still
    # returns the same tensor as before.
    def forward(self, x):
        """Apply convolution, batch normalisation and activation to ``x``."""
        return self.conv(x)
class ASPPModule(nn.Module):
    """Seven parallel branches over the same input, fused by a 1x1 bottleneck.

    Branches (all producing ``nin`` channels):
      * ``conv1`` - adaptive average pooling to height 1 (width kept), then a
        1x1 Conv-BN-activation; the result is resized back to the input size.
      * ``conv2`` - plain 1x1 Conv-BN-activation.
      * ``conv3`` .. ``conv7`` - 3x3 separable convolutions whose padding
        equals their dilation.

    Only ``dilations[0]``, ``dilations[1]`` and ``dilations[2]`` are read;
    ``conv5``, ``conv6`` and ``conv7`` all use ``dilations[2]``.

    The concatenated ``nin * 7`` channels are reduced to ``nout`` by a 1x1
    Conv-BN-activation followed by ``Dropout2d(0.1)``.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()

        pooled_head = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), pooled_head)
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)

        # Attribute names and creation order are kept so parameter names and
        # initialisation order stay the same.
        branch_rates = (
            ('conv3', dilations[0]),
            ('conv4', dilations[1]),
            ('conv5', dilations[2]),
            ('conv6', dilations[2]),
            ('conv7', dilations[2]),
        )
        for attr_name, rate in branch_rates:
            setattr(
                self,
                attr_name,
                SeperableConv2DBNActiv(nin, nin, 3, 1, rate, rate, activ=activ),
            )

        fuse = Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ)
        self.bottleneck = nn.Sequential(fuse, nn.Dropout2d(0.1))

    def forward(self, x):
        """Run every branch on ``x`` and return the fused bottleneck output."""
        _, _, height, width = x.size()

        # The pooled branch has height 1; bring it back to the input size.
        pooled = F.interpolate(
            self.conv1(x),
            size=(height, width),
            mode='bilinear',
            align_corners=True,
        )

        features = [pooled]
        for branch in (self.conv2, self.conv3, self.conv4,
                       self.conv5, self.conv6, self.conv7):
            features.append(branch(x))

        return self.bottleneck(torch.cat(features, dim=1))
class Decoder(nn.Module):
    """Upsample by 2, optionally merge a skip tensor, then Conv-BN-activation.

    Args:
        nin: number of channels entering the convolution (after the optional
            concatenation with the skip tensor).
        nout: number of output channels.
        ksize: convolution kernel size.
        stride: accepted for signature compatibility; the convolution is
            always built with stride 1.
        pad: convolution padding.
        activ: activation class passed to Conv2DBNActiv.
        dropout: when true, ``Dropout2d(0.1)`` is applied to the output.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    # Implemented as ``forward`` (not ``__call__``) so nn.Module's own
    # ``__call__`` runs: hooks fire and ``module.forward(...)`` works.
    # ``decoder(x, skip)`` keeps returning the same tensor as before.
    def forward(self, x, skip=None):
        """Return the decoded tensor for ``x`` and an optional ``skip`` tensor."""
        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
        if skip is not None:
            # Crop the skip tensor against the upsampled x before stacking
            # the two along the channel dimension.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
# Built-in two-band parameter set, used when no config file is given.
default_param = {}
default_param['bins'] = 768
default_param['unstable_bins'] = 9  # training only
default_param['reduction_bins'] = 762  # training only
default_param['sr'] = 44100
default_param['pre_filter_start'] = 757
default_param['pre_filter_stop'] = 768
default_param['band'] = {}


default_param['band'][1] = {
    'sr': 11025,
    'hl': 128,
    'n_fft': 960,
    'crop_start': 0,
    'crop_stop': 245,
    'lpf_start': 61,  # inference only
    'res_type': 'polyphase'
}

default_param['band'][2] = {
    'sr': 44100,
    'hl': 512,
    'n_fft': 1536,
    'crop_start': 24,
    'crop_stop': 547,
    'hpf_start': 81,  # inference only
    'res_type': 'sinc_best'
}


def int_keys(d):
    """Build a dict from key/value pairs, turning digit-only keys into ints.

    Used as ``object_pairs_hook`` for ``json.loads``, so ``d`` is the list of
    ``(key, value)`` pairs of one JSON object.  Keys such as ``"1"`` become
    ``1``; every other key is kept as is.
    """
    r = {}
    for k, v in d:
        if k.isdigit():
            k = int(k)
        r[k] = v
    return r


class ModelParameters(object):
    """Model parameters loaded from a config file or from the defaults.

    The result is stored in ``self.param``:
      * ``*.pth``  - read from the ``param.json`` member of the zip archive;
      * ``*.json`` - read from the JSON file;
      * anything else (including the default ``''``) - a private copy of
        ``default_param``.

    The boolean flags ``mid_side``, ``mid_side_b``, ``mid_side_b2``,
    ``stereo_w``, ``stereo_n`` and ``reverse`` are set to ``False`` when the
    loaded parameters do not define them.
    """

    def __init__(self, config_path=''):
        suffix = pathlib.Path(config_path).suffix
        if suffix == '.pth':
            import zipfile

            with zipfile.ZipFile(config_path, 'r') as archive:
                self.param = json.loads(archive.read('param.json'), object_pairs_hook=int_keys)
        elif suffix == '.json':
            with open(config_path, 'r', encoding='utf-8') as f:
                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
        else:
            import copy

            # Deep copy: the flags added below (and any later edits made by
            # the caller) must not leak into the shared module-level defaults
            # or into other instances.
            self.param = copy.deepcopy(default_param)

        for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
            if k not in self.param:
                self.param[k] = False
-------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 
| "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/2band_48000.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 
11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | 
"reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 
| "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | 
"lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 
| "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 
| "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/modelparams/ensemble.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from uvr5_pack.lib_v5 import layers 6 | from uvr5_pack.lib_v5 import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | 11 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 12 | super(BaseASPPNet, self).__init__() 13 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 14 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 15 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 16 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 17 | 18 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 19 | 20 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 21 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 22 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 23 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 24 | 25 | def __call__(self, x): 26 | h, e1 = 
class CascadedASPPNet(nn.Module):
    """Three cascaded BaseASPPNet stages that predict a mask for the input.

    Stage 1 runs separate networks on the lower and upper halves of dim 2 and
    stitches the results back together.  Stage 2 sees the input plus the
    stage-1 features (2 + 16 = 18 channels, bridged to 8).  Stage 3 sees the
    input plus both earlier feature maps (2 + 16 + 16 = 34 channels, bridged
    to 16).  The final mask is a sigmoid over a 1x1 convolution of the
    stage-3 output and is multiplied with the detached input.

    Args:
        n_fft: sets ``max_bin = n_fft // 2`` (how much of dim 2 is fed to the
            networks) and ``output_bin = n_fft // 2 + 1`` (the size masks are
            padded back to along dim 2).
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)
        self.stg1_high_band_net = BaseASPPNet(2, 16)

        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(8, 16)

        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(16, 32)

        self.out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)

        half = n_fft // 2
        self.max_bin = half
        self.output_bin = half + 1

        # Number of positions trimmed from each end of dim 3 by predict().
        self.offset = 128

    def _masked_head(self, head, features):
        """Apply ``head`` and a sigmoid, then pad dim 2 up to ``output_bin``."""
        activated = torch.sigmoid(head(features))
        missing = self.output_bin - activated.size()[2]
        return F.pad(input=activated, pad=(0, 0, 0, missing), mode='replicate')

    def forward(self, x, aggressiveness=None):
        """Return the masked input.

        In training mode a 3-tuple is returned: the final masked input and
        the two auxiliary masked inputs from stages 1 and 2.  In eval mode
        only the final masked input is returned; a truthy ``aggressiveness``
        dict (keys ``'split_bin'`` and ``'value'``) sharpens the mask with a
        different exponent below and from ``split_bin`` on along dim 2.
        """
        mix = x.detach()
        cropped = x.clone()[:, :, :self.max_bin]

        # Stage 1: one network per half of dim 2, re-joined along dim 2.
        half_width = cropped.size()[2] // 2
        low = self.stg1_low_band_net(cropped[:, :, :half_width])
        high = self.stg1_high_band_net(cropped[:, :, half_width:])
        aux1 = torch.cat([low, high], dim=2)

        # Stage 2: input + stage-1 features.
        stage2_in = torch.cat([cropped, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        # Stage 3: input + both earlier feature maps.
        stage3_in = torch.cat([cropped, aux1, aux2], dim=1)
        final = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = self._masked_head(self.out, final)

        if self.training:
            aux1_mask = self._masked_head(self.aux1_out, aux1)
            aux2_mask = self._masked_head(self.aux2_out, aux2)
            return mask * mix, aux1_mask * mix, aux2_mask * mix

        if aggressiveness:
            split = aggressiveness['split_bin']
            value = aggressiveness['value']
            # In-place update of the two parts of the mask along dim 2.
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + value / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + value)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``offset`` positions from both ends of dim 3."""
        result = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            result = result[:, :, :, self.offset:-self.offset]
            assert result.size()[3] > 0

        return result
super(CascadedASPPNet, self).__init__() 44 | self.stg1_low_band_net = BaseASPPNet(2, 32) 45 | self.stg1_high_band_net = BaseASPPNet(2, 32) 46 | 47 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 48 | self.stg2_full_band_net = BaseASPPNet(16, 32) 49 | 50 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 51 | self.stg3_full_band_net = BaseASPPNet(32, 64) 52 | 53 | self.out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 55 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 56 | 57 | self.max_bin = n_fft // 2 58 | self.output_bin = n_fft // 2 + 1 59 | 60 | self.offset = 128 61 | 62 | def forward(self, x, aggressiveness=None): 63 | mix = x.detach() 64 | x = x.clone() 65 | 66 | x = x[:, :, :self.max_bin] 67 | 68 | bandw = x.size()[2] // 2 69 | aux1 = torch.cat([ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]) 72 | ], dim=2) 73 | 74 | h = torch.cat([x, aux1], dim=1) 75 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 76 | 77 | h = torch.cat([x, aux1, aux2], dim=1) 78 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 79 | 80 | mask = torch.sigmoid(self.out(h)) 81 | mask = F.pad( 82 | input=mask, 83 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 84 | mode='replicate') 85 | 86 | if self.training: 87 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 88 | aux1 = F.pad( 89 | input=aux1, 90 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 91 | mode='replicate') 92 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 93 | aux2 = F.pad( 94 | input=aux2, 95 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 96 | mode='replicate') 97 | return mask * mix, aux1 * mix, aux2 * mix 98 | else: 99 | if aggressiveness: 100 | mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) 101 | mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) 102 | 103 | 
class BaseASPPNet(nn.Module):
    """Four-level encoder/decoder network with an ASPP bottleneck.

    Every encoder returns the feature map handed to the next level together
    with a skip tensor; every decoder receives the running feature map and
    the skip tensor of the matching encoder level.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        """Create the layers.

        Args:
            nin: Number of input channels.
            ch: Base channel width; deeper levels use multiples of it.
            dilations: Dilation rates forwarded to ``layers.ASPPModule``.
        """
        super().__init__()
        # Trailing (3, 2, 1) / (3, 1, 1) are presumably kernel size, stride
        # and padding -- TODO confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are wider because the skip tensor is passed along.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Run the encoder stack, the ASPP bottleneck and the decoder stack.

        Implemented as ``forward`` (not ``__call__``) so that calling the
        module goes through ``nn.Module.__call__`` and registered hooks run.
        ``net(x)`` keeps working exactly as before.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h


class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models producing a mask.

    Stage 1 processes the lower and upper half of the frequency axis with
    separate networks; stages 2 and 3 see the input concatenated with the
    outputs of the earlier stages.
    """

    def __init__(self, n_fft):
        """Create the cascade for spectrograms computed with ``n_fft``."""
        super().__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Number of frames trimmed from each side of the last axis in predict().
        self.offset = 128

    def _mask_from(self, head, features):
        """Apply ``head`` + sigmoid and pad dim 2 up to ``output_bin`` rows."""
        mask = torch.sigmoid(head(features))
        return F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode='replicate')

    def forward(self, x, aggressiveness=None):
        """Estimate the masked input.

        Args:
            x: 4-D input tensor; presumably (batch, 2, bins, frames) --
                TODO confirm against the caller.
            aggressiveness: Optional mapping with ``'split_bin'`` and
                ``'value'``; only used in eval mode, where it raises the mask
                to a power below / above ``split_bin``.

        Returns:
            In training mode a tuple ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise only ``mask * mix``.
        """
        mix = x.detach()
        x = x.clone()

        # The networks work on max_bin rows; the missing rows are padded back
        # onto the masks afterwards.
        x = x[:, :, :self.max_bin]

        bandw = x.size()[2] // 2
        aux1 = torch.cat([
            self.stg1_low_band_net(x[:, :, :bandw]),
            self.stg1_high_band_net(x[:, :, bandw:])
        ], dim=2)

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = self._mask_from(self.out, h)

        if self.training:
            aux1 = self._mask_from(self.aux1_out, aux1)
            aux2 = self._mask_from(self.aux2_out, aux2)
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split_bin = aggressiveness['split_bin']
            value = aggressiveness['value']
            mask[:, :, :split_bin] = torch.pow(mask[:, :, :split_bin], 1 + value / 3)
            mask[:, :, split_bin:] = torch.pow(mask[:, :, split_bin:], 1 + value)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``offset`` frames from both ends of dim 3."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset:-self.offset]
            assert h.size()[3] > 0

        return h
class BaseASPPNet(nn.Module):
    """Four-level encoder/decoder network with an ASPP bottleneck.

    Every encoder returns the feature map handed to the next level together
    with a skip tensor; every decoder receives the running feature map and
    the skip tensor of the matching encoder level.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
        """Create the layers.

        Args:
            nin: Number of input channels.
            ch: Base channel width; deeper levels use multiples of it.
            dilations: Dilation rates forwarded to ``layers.ASPPModule``.
        """
        super().__init__()
        # Trailing (3, 2, 1) / (3, 1, 1) are presumably kernel size, stride
        # and padding -- TODO confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are wider because the skip tensor is passed along.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Run the encoder stack, the ASPP bottleneck and the decoder stack.

        Implemented as ``forward`` (not ``__call__``) so that calling the
        module goes through ``nn.Module.__call__`` and registered hooks run.
        ``net(x)`` keeps working exactly as before.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h


class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models producing a mask.

    Stage 1 processes the lower and upper half of the frequency axis with
    separate networks; stages 2 and 3 see the input concatenated with the
    outputs of the earlier stages.
    """

    def __init__(self, n_fft):
        """Create the cascade for spectrograms computed with ``n_fft``."""
        super().__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)
        self.stg1_high_band_net = BaseASPPNet(2, 16)

        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(8, 16)

        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(16, 32)

        self.out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)

        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Number of frames trimmed from each side of the last axis in predict().
        self.offset = 128

    def _mask_from(self, head, features):
        """Apply ``head`` + sigmoid and pad dim 2 up to ``output_bin`` rows."""
        mask = torch.sigmoid(head(features))
        return F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode='replicate')

    def forward(self, x, aggressiveness=None):
        """Estimate the masked input.

        Args:
            x: 4-D input tensor; presumably (batch, 2, bins, frames) --
                TODO confirm against the caller.
            aggressiveness: Optional mapping with ``'split_bin'`` and
                ``'value'``; only used in eval mode, where it raises the mask
                to a power below / above ``split_bin``.

        Returns:
            In training mode a tuple ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise only ``mask * mix``.
        """
        mix = x.detach()
        x = x.clone()

        # The networks work on max_bin rows; the missing rows are padded back
        # onto the masks afterwards.
        x = x[:, :, :self.max_bin]

        bandw = x.size()[2] // 2
        aux1 = torch.cat([
            self.stg1_low_band_net(x[:, :, :bandw]),
            self.stg1_high_band_net(x[:, :, bandw:])
        ], dim=2)

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = self._mask_from(self.out, h)

        if self.training:
            aux1 = self._mask_from(self.aux1_out, aux1)
            aux2 = self._mask_from(self.aux2_out, aux2)
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split_bin = aggressiveness['split_bin']
            value = aggressiveness['value']
            mask[:, :, :split_bin] = torch.pow(mask[:, :, :split_bin], 1 + value / 3)
            mask[:, :, split_bin:] = torch.pow(mask[:, :, split_bin:], 1 + value)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``offset`` frames from both ends of dim 3."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset:-self.offset]
            assert h.size()[3] > 0

        return h
class BaseASPPNet(nn.Module):
    """Four-level encoder/decoder network with an ASPP bottleneck.

    Every encoder returns the feature map handed to the next level together
    with a skip tensor; every decoder receives the running feature map and
    the skip tensor of the matching encoder level.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        """Create the layers.

        Args:
            nin: Number of input channels.
            ch: Base channel width; deeper levels use multiples of it.
            dilations: Dilation rates forwarded to ``layers.ASPPModule``.
        """
        super().__init__()
        # Trailing (3, 2, 1) / (3, 1, 1) are presumably kernel size, stride
        # and padding -- TODO confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are wider because the skip tensor is passed along.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Run the encoder stack, the ASPP bottleneck and the decoder stack.

        Implemented as ``forward`` (not ``__call__``) so that calling the
        module goes through ``nn.Module.__call__`` and registered hooks run.
        ``net(x)`` keeps working exactly as before.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h


class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models producing a mask.

    Stage 1 processes the lower and upper half of the frequency axis with
    separate networks; stages 2 and 3 see the input concatenated with the
    outputs of the earlier stages.
    """

    def __init__(self, n_fft):
        """Create the cascade for spectrograms computed with ``n_fft``."""
        super().__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        self.out = nn.Conv2d(128, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Number of frames trimmed from each side of the last axis in predict().
        self.offset = 128

    def _mask_from(self, head, features):
        """Apply ``head`` + sigmoid and pad dim 2 up to ``output_bin`` rows."""
        mask = torch.sigmoid(head(features))
        return F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode='replicate')

    def forward(self, x, aggressiveness=None):
        """Estimate the masked input.

        Args:
            x: 4-D input tensor; presumably (batch, 2, bins, frames) --
                TODO confirm against the caller.
            aggressiveness: Optional mapping with ``'split_bin'`` and
                ``'value'``; only used in eval mode, where it raises the mask
                to a power below / above ``split_bin``.

        Returns:
            In training mode a tuple ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise only ``mask * mix``.
        """
        mix = x.detach()
        x = x.clone()

        # The networks work on max_bin rows; the missing rows are padded back
        # onto the masks afterwards.
        x = x[:, :, :self.max_bin]

        bandw = x.size()[2] // 2
        aux1 = torch.cat([
            self.stg1_low_band_net(x[:, :, :bandw]),
            self.stg1_high_band_net(x[:, :, bandw:])
        ], dim=2)

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = self._mask_from(self.out, h)

        if self.training:
            aux1 = self._mask_from(self.aux1_out, aux1)
            aux2 = self._mask_from(self.aux2_out, aux2)
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split_bin = aggressiveness['split_bin']
            value = aggressiveness['value']
            mask[:, :, :split_bin] = torch.pow(mask[:, :, :split_bin], 1 + value / 3)
            mask[:, :, split_bin:] = torch.pow(mask[:, :, split_bin:], 1 + value)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``offset`` frames from both ends of dim 3."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset:-self.offset]
            assert h.size()[3] > 0

        return h
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 16 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 17 | 18 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 19 | 20 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 21 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 22 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 23 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 24 | 25 | def __call__(self, x): 26 | h, e1 = self.enc1(x) 27 | h, e2 = self.enc2(h) 28 | h, e3 = self.enc3(h) 29 | h, e4 = self.enc4(h) 30 | 31 | h = self.aspp(h) 32 | 33 | h = self.dec4(h, e4) 34 | h = self.dec3(h, e3) 35 | h = self.dec2(h, e2) 36 | h = self.dec1(h, e1) 37 | 38 | return h 39 | 40 | 41 | class CascadedASPPNet(nn.Module): 42 | 43 | def __init__(self, n_fft): 44 | super(CascadedASPPNet, self).__init__() 45 | self.stg1_low_band_net = BaseASPPNet(2, 64) 46 | self.stg1_high_band_net = BaseASPPNet(2, 64) 47 | 48 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg2_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 52 | self.stg3_full_band_net = BaseASPPNet(64, 128) 53 | 54 | self.out = nn.Conv2d(128, 2, 1, bias=False) 55 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 56 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 57 | 58 | self.max_bin = n_fft // 2 59 | self.output_bin = n_fft // 2 + 1 60 | 61 | self.offset = 128 62 | 63 | def forward(self, x, aggressiveness=None): 64 | mix = x.detach() 65 | x = x.clone() 66 | 67 | x = x[:, :, :self.max_bin] 68 | 69 | bandw = x.size()[2] // 2 70 | aux1 = torch.cat([ 71 | self.stg1_low_band_net(x[:, :, :bandw]), 72 | self.stg1_high_band_net(x[:, :, bandw:]) 73 | ], dim=2) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | 
mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode='replicate') 86 | 87 | if self.training: 88 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 89 | aux1 = F.pad( 90 | input=aux1, 91 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 92 | mode='replicate') 93 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 94 | aux2 = F.pad( 95 | input=aux2, 96 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 97 | mode='replicate') 98 | return mask * mix, aux1 * mix, aux2 * mix 99 | else: 100 | if aggressiveness: 101 | mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) 102 | mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) 103 | 104 | return mask * mix 105 | 106 | def predict(self, x_mag, aggressiveness=None): 107 | h = self.forward(x_mag, aggressiveness) 108 | 109 | if self.offset > 0: 110 | h = h[:, :, :, self.offset:-self.offset] 111 | assert h.size()[3] > 0 112 | 113 | return h 114 | -------------------------------------------------------------------------------- /uvr5_pack/lib_v5/nets_61968KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from uvr5_pack.lib_v5 import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = 
layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | 42 | def __init__(self, n_fft): 43 | super(CascadedASPPNet, self).__init__() 44 | self.stg1_low_band_net = BaseASPPNet(2, 32) 45 | self.stg1_high_band_net = BaseASPPNet(2, 32) 46 | 47 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 48 | self.stg2_full_band_net = BaseASPPNet(16, 32) 49 | 50 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 51 | self.stg3_full_band_net = BaseASPPNet(32, 64) 52 | 53 | self.out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 55 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 56 | 57 | self.max_bin = n_fft // 2 58 | self.output_bin = n_fft // 2 + 1 59 | 60 | self.offset = 128 61 | 62 | def forward(self, x, aggressiveness=None): 63 | mix = x.detach() 64 | x = x.clone() 65 | 66 | x = x[:, :, :self.max_bin] 67 | 68 | bandw = x.size()[2] // 2 69 | aux1 = torch.cat([ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]) 72 | ], dim=2) 73 | 74 | h = torch.cat([x, aux1], dim=1) 75 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 76 | 77 | h = torch.cat([x, aux1, aux2], dim=1) 78 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 79 | 80 | mask = torch.sigmoid(self.out(h)) 81 | mask = F.pad( 82 | input=mask, 83 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 84 | mode='replicate') 85 | 86 | if self.training: 87 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 88 | aux1 = F.pad( 89 | input=aux1, 90 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 91 | mode='replicate') 92 | aux2 = 
def crop_center(h1, h2):
    """Crop ``h1`` symmetrically along dim 3 so its width matches ``h2``.

    Args:
        h1: 4-D tensor to crop (anything exposing ``size()`` and slicing).
        h2: 4-D tensor whose dim-3 size is the target width.

    Returns:
        ``h1`` itself when the widths already match, otherwise a centred
        slice of ``h1`` with ``h2``'s width.

    Raises:
        ValueError: If ``h1`` is narrower than ``h2``.
    """
    h1_width = h1.size()[3]
    h2_width = h2.size()[3]

    if h1_width == h2_width:
        return h1
    if h1_width < h2_width:
        raise ValueError('h1_shape[3] must be greater than h2_shape[3]')

    s_time = (h1_width - h2_width) // 2
    e_time = s_time + h2_width

    return h1[:, :, :, s_time:e_time]


def _encode_stereo(wave, mid_side, mid_side_b2, reverse):
    """Return the two channel signals that are fed to the STFT.

    The flags are checked in the order reverse, mid_side, mid_side_b2; the
    first one that is truthy decides the encoding.
    """
    if reverse:
        return (np.flip(np.asfortranarray(wave[0])),
                np.flip(np.asfortranarray(wave[1])))
    if mid_side:
        return (np.asfortranarray(np.add(wave[0], wave[1]) / 2),
                np.asfortranarray(np.subtract(wave[0], wave[1])))
    if mid_side_b2:
        return (np.asfortranarray(np.add(wave[1], wave[0] * .5)),
                np.asfortranarray(np.subtract(wave[0], wave[1] * .5)))
    return np.asfortranarray(wave[0]), np.asfortranarray(wave[1])


def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
    """Compute the two-channel STFT of ``wave``.

    Args:
        wave: Indexable holding two channel signals (``wave[0]``, ``wave[1]``).
        hop_length: STFT hop length.
        n_fft: STFT window size.
        mid_side, mid_side_b2, reverse: Channel encoding applied before the
            transform (see ``_encode_stereo``).

    Returns:
        Fortran-ordered array stacking both channel spectrograms.
    """
    wave_left, wave_right = _encode_stereo(wave, mid_side, mid_side_b2, reverse)

    # n_fft is passed by keyword: newer librosa releases reject it positionally.
    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

    return np.asfortranarray([spec_left, spec_right])


def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
    """Same result as ``wave_to_spectrogram``, one channel per thread.

    The first channel is transformed on a worker thread while the second is
    transformed on the calling thread. The worker's result is returned
    through a future rather than a module-level global, so an exception in
    the worker is re-raised here instead of silently reusing a stale result
    from an earlier call.
    """
    from concurrent.futures import ThreadPoolExecutor

    wave_left, wave_right = _encode_stereo(wave, mid_side, mid_side_b2, reverse)

    with ThreadPoolExecutor(max_workers=1) as pool:
        left_job = pool.submit(librosa.stft, y=wave_left, n_fft=n_fft, hop_length=hop_length)
        spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
        spec_left = left_job.result()

    return np.asfortranarray([spec_left, spec_right])
def spectrogram_to_image(spec, mode='magnitude'):
    """Render a spectrogram as an 8-bit image array.

    Args:
        spec: Real or complex array. For ``'magnitude'`` the log power of
            ``|spec|`` is used; for ``'phase'`` the angle of a complex input,
            or the values themselves when the input is real.
        mode: ``'magnitude'`` or ``'phase'``.

    Returns:
        ``uint8`` array scaled to 0..255. For 3-D input the first axis is
        moved last and a channel holding the per-pixel maximum is prepended
        on that axis.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == 'magnitude':
        y = np.abs(spec) if np.iscomplexobj(spec) else spec
        y = np.log10(y ** 2 + 1e-8)
    elif mode == 'phase':
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
            # Work on a floating-point copy: the normalisation below is done
            # in place and must not modify the caller's array.
            arr = np.asarray(spec)
            y = arr.astype(np.result_type(arr.dtype, np.float32))
    else:
        raise ValueError("mode must be 'magnitude' or 'phase', got {!r}".format(mode))

    y -= y.min()
    peak = y.max()
    if peak > 0:
        # A constant input has peak == 0; skip the scaling instead of
        # dividing by zero and leave the image all zeros.
        y *= 255 / peak
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([
            np.max(img, axis=2, keepdims=True), img
        ], axis=2)

    return img


def reduce_vocal_aggressively(X, y, softmask):
    """Attenuate ``y`` where the residual ``X - y`` is louder than ``y``.

    Args:
        X: Complex array.
        y: Complex array, same shape as ``X``.
        softmask: Scale applied to the residual magnitude that is subtracted.

    Returns:
        Complex array with the reduced magnitude and the phase of ``y``.
    """
    residual_mag = np.abs(X - y)
    target_mag = np.abs(y)

    # Only bins in which the residual dominates are reduced.
    dominated = residual_mag > target_mag
    reduced_mag = np.clip(target_mag - residual_mag * dominated * softmask, 0, np.inf)

    return reduced_mag * np.exp(1.j * np.angle(y))
def align_wave_head_and_tail(a, b):
    """Trim two multi-channel waves to their common length.

    Args:
        a: 2-D array; the length is taken from ``a[0].size`` and the
            trimming is applied to axis 1.
        b: 2-D array laid out like ``a``.

    Returns:
        Tuple ``(a, b)`` with axis 1 cut to the shorter of the two lengths.
        Axis 0 is left untouched: the previous ``[:l, :l]`` slicing also
        truncated it whenever the common length was smaller than the number
        of rows.
    """
    common_len = min(a[0].size, b[0].size)

    return a[:, :common_len], b[:, :common_len]
def _decode_stereo(wave_left, wave_right, mid_side, mid_side_b2, reverse):
    """Turn the two inverse-STFT signals into the output channel pair.

    The flags are checked in the order reverse, mid_side, mid_side_b2; the
    first one that is truthy decides the decoding.
    """
    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    if mid_side:
        return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
    if mid_side_b2:
        return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
    return np.asfortranarray([wave_left, wave_right])


def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    """Invert a two-channel spectrogram back to a two-channel wave.

    Args:
        spec: Indexable holding two channel spectrograms.
        hop_length: Hop length passed to ``librosa.istft``.
        mid_side, mid_side_b2, reverse: Channel decoding applied after the
            inverse transform (see ``_decode_stereo``).

    Returns:
        Fortran-ordered array stacking both output channels.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hop_length)
    wave_right = librosa.istft(spec_right, hop_length=hop_length)

    return _decode_stereo(wave_left, wave_right, mid_side, mid_side_b2, reverse)


def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
    """Same result as ``spectrogram_to_wave``, one channel per thread.

    NOTE: the positional order of the last two parameters is
    ``reverse, mid_side_b2`` here, the opposite of ``spectrogram_to_wave``;
    it is kept for existing callers.

    The first channel is inverted on a worker thread while the second is
    inverted on the calling thread. The worker's result is returned through
    a future rather than a module-level global, so an exception in the
    worker is re-raised here instead of silently reusing a stale result from
    an earlier call.
    """
    from concurrent.futures import ThreadPoolExecutor

    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    with ThreadPoolExecutor(max_workers=1) as pool:
        left_job = pool.submit(librosa.istft, stft_matrix=spec_left, hop_length=hop_length)
        wave_right = librosa.istft(spec_right, hop_length=hop_length)
        wave_left = left_job.result()

    return _decode_stereo(wave_left, wave_right, mid_side, mid_side_b2, reverse)
| wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])) 301 | else: 302 | sr = mp.param['band'][d+1]['sr'] 303 | if d == 1: # lower 304 | spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop']) 305 | wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type="sinc_fastest") 306 | else: # mid 307 | spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1) 308 | spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop']) 309 | wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])) 310 | # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest") 311 | wave = librosa.core.resample(wave2, bp['sr'], sr,res_type='scipy') 312 | 313 | return wave.T 314 | 315 | 316 | def fft_lp_filter(spec, bin_start, bin_stop): 317 | g = 1.0 318 | for b in range(bin_start, bin_stop): 319 | g -= 1 / (bin_stop - bin_start) 320 | spec[:, b, :] = g * spec[:, b, :] 321 | 322 | spec[:, bin_stop:, :] *= 0 323 | 324 | return spec 325 | 326 | 327 | def fft_hp_filter(spec, bin_start, bin_stop): 328 | g = 1.0 329 | for b in range(bin_start, bin_stop, -1): 330 | g -= 1 / (bin_start - bin_stop) 331 | spec[:, b, :] = g * spec[:, b, :] 332 | 333 | spec[:, 0:bin_stop+1, :] *= 0 334 | 335 | return spec 336 | 337 | 338 | def mirroring(a, spec_m, input_high_end, mp): 339 | if 'mirroring' == a: 340 | mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1) 341 | mirror = mirror * np.exp(1.j * np.angle(input_high_end)) 342 | 343 | return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) 344 | 345 | if 'mirroring2' == a: 346 | mirror = np.flip(np.abs(spec_m[:, 
def mirroring(a, spec_m, input_high_end, mp):
    """Limit ``input_high_end`` using magnitudes mirrored from ``spec_m``.

    The reference is the magnitude of the ``input_high_end.shape[1]`` bins of
    ``spec_m`` that end 10 bins below ``mp.param['pre_filter_start']``,
    flipped along axis 1.

    Args:
        a: ``'mirroring'`` or ``'mirroring2'``; any other value yields ``None``.
        spec_m: Spectrogram the mirrored bins are taken from.
        input_high_end: High-end bins to be limited.
        mp: Object exposing ``param['pre_filter_start']``.

    Returns:
        For each element, ``input_high_end`` where its magnitude does not
        exceed the reference and the reference otherwise; ``None`` for an
        unknown ``a``.
    """
    if a not in ('mirroring', 'mirroring2'):
        return None

    stop = mp.param['pre_filter_start'] - 10
    start = stop - input_high_end.shape[1]
    mirror = np.flip(np.abs(spec_m[:, start:stop, :]), 1)

    if a == 'mirroring':
        # Mirrored magnitude carrying the phase of the input high end.
        reference = mirror * np.exp(1.j * np.angle(input_high_end))
    else:
        reference = np.multiply(mirror, input_high_end * 1.7)

    return np.where(np.abs(input_high_end) <= np.abs(reference), input_high_end, reference)


def ensembling(a, specs):
    """Fold several spectrograms into one by per-element magnitude selection.

    Args:
        a: ``'min_mag'`` keeps the element with the smaller magnitude,
            ``'max_mag'`` the larger one. For any other value the first
            spectrogram is only truncated to the shortest length seen.
        specs: Spectrograms addressable as ``specs[0] .. specs[len - 1]``
            (a list, or a dict keyed by consecutive integers). It is not
            modified.

    Returns:
        The combined spectrogram, cut along axis 2 to the shortest input.
        A single input is returned as is.

    Raises:
        ValueError: If ``specs`` is empty.
    """
    if len(specs) == 0:
        raise ValueError('ensembling() needs at least one spectrogram.')

    spec = specs[0]
    for i in range(1, len(specs)):
        other = specs[i]
        ln = min(spec.shape[2], other.shape[2])
        spec = spec[:, :, :ln]
        other = other[:, :, :ln]

        if a == 'min_mag':
            spec = np.where(np.abs(other) <= np.abs(spec), other, spec)
        elif a == 'max_mag':
            spec = np.where(np.abs(other) >= np.abs(spec), other, spec)

    return spec


def stft(wave, nfft, hl):
    """Return the stacked STFTs of the two channels ``wave[0]`` and ``wave[1]``.

    ``nfft`` and ``hl`` are forwarded to ``librosa.stft`` as ``n_fft`` and
    ``hop_length``.
    """
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])
    # ``n_fft`` is passed by keyword; recent librosa releases reject it as a
    # positional argument.
    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)

    return np.asfortranarray([spec_left, spec_right])


def istft(spec, hl):
    """Return the stacked inverse STFTs of ``spec[0]`` and ``spec[1]``.

    ``hl`` is forwarded to ``librosa.istft`` as ``hop_length``.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)

    return np.asfortranarray([wave_left, wave_right])
if __name__ == "__main__":
    # Command-line entry point: load two or more audio files, convert each to
    # a combined multi-band spectrogram and combine them with the selected
    # algorithm, writing the result(s) to disk.
    import argparse
    import subprocess
    import sys

    import cv2
    from model_param_init import ModelParameters

    p = argparse.ArgumentParser()
    p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag')
    p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json'))
    p.add_argument('--output_name', '-o', type=str, default='output')
    p.add_argument('--vocals_only', '-v', action='store_true')
    p.add_argument('input', nargs='+')
    args = p.parse_args()

    is_invert = args.algorithm.startswith('invert')

    if is_invert and len(args.input) != 2:
        raise ValueError('There should be two input files.')

    if not is_invert and len(args.input) < 2:
        raise ValueError('There must be at least two input files.')

    wave, specs = {}, {}
    mp = ModelParameters(args.model_params)
    n_bands = len(mp.param['band'])

    for i, path in enumerate(args.input):
        spec = {}

        # Highest band first: it is loaded from disk, every lower band is
        # resampled from the band above it.
        for d in range(n_bands, 0, -1):
            bp = mp.param['band'][d]

            if d == n_bands:  # high-end band
                # Keyword arguments: recent librosa releases no longer accept
                # sr / mono positionally.
                wave[d], _ = librosa.load(
                    path, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])

                if wave[d].ndim == 1:  # mono to stereo
                    wave[d] = np.array([wave[d], wave[d]])
            else:  # lower bands
                wave[d] = librosa.resample(wave[d+1], orig_sr=mp.param['band'][d+1]['sr'], target_sr=bp['sr'], res_type=bp['res_type'])

            spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])

        specs[i] = combine_spectrograms(spec, mp)

    del wave

    if args.algorithm == 'deep':
        # Compare the two combined spectrograms; ``spec`` (singular) is the
        # per-band dict of the last file and is not comparable here.
        d_spec = np.where(np.abs(specs[0]) <= np.abs(specs[1]), specs[0], specs[1])
        v_spec = d_spec - specs[1]
        sf.write('{}.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])

    if is_invert:
        ln = min([specs[0].shape[2], specs[1].shape[2]])
        specs[0] = specs[0][:, :, :ln]
        specs[1] = specs[1][:, :, :ln]

        if 'invert_p' == args.algorithm:
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
            v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
        else:
            specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
            v_spec = specs[0] - specs[1]

            if not args.vocals_only:
                X_mag = np.abs(specs[0])
                y_mag = np.abs(specs[1])
                v_mag = np.abs(v_spec)

                X_image = spectrogram_to_image(X_mag)
                y_image = spectrogram_to_image(y_mag)
                v_image = spectrogram_to_image(v_mag)

                cv2.imwrite('{}_X.png'.format(args.output_name), X_image)
                cv2.imwrite('{}_y.png'.format(args.output_name), y_image)
                cv2.imwrite('{}_v.png'.format(args.output_name), v_image)

                sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr'])
                sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr'])

        sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
    else:
        if not args.algorithm == 'deep':
            # The output directory is not guaranteed to exist yet.
            os.makedirs('ensembled', exist_ok=True)
            sf.write(os.path.join('ensembled', '{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr'])

    if args.algorithm == 'align':
        track_pairs = [(args.input[0], args.input[1])]

        for file1, file2 in tqdm(track_pairs, desc="Performing Alignment..."):
            # Argument list instead of a shell string: file names containing
            # quotes or shell metacharacters are passed through untouched.
            subprocess.run([sys.executable, 'lib/align_tracks.py', file1, file2])
def make_padding(width, cropsize, offset):
    """Compute the padding needed to cut ``width`` frames into windows.

    Args:
        width: Number of frames to cover.
        cropsize: Window size.
        offset: Margin on each side of a window.

    Returns:
        Tuple ``(left, right, roi_size)``: ``left`` equals ``offset``,
        ``roi_size`` is ``cropsize - 2 * offset`` (or ``cropsize`` when that
        difference is 0) and ``right`` is
        ``roi_size - width % roi_size + left``.
    """
    left = offset
    roi_size = cropsize - left * 2
    if roi_size == 0:
        roi_size = cropsize
    right = roi_size - (width % roi_size) + left

    return left, right, roi_size


def inference(X_spec, device, model, aggressiveness, data):
    """Run ``model`` over a spectrogram window by window.

    Args:
        X_spec: Complex spectrogram; axis 2 is treated as the frame axis.
        device: Torch device each window is moved to.
        model: Object providing ``eval()``, ``state_dict()``, ``offset`` and
            ``predict(window, aggressiveness)``.
        aggressiveness: Forwarded unchanged to ``model.predict``.
        data: Config dict; ``data['window_size']`` is the window length and
            ``data['tta']`` enables a second, half-window-shifted pass whose
            result is averaged with the first.

    Returns:
        Tuple ``(pred, X_mag, phase)``: the prediction rescaled by the peak
        input magnitude, the input magnitude, and ``exp(1j * angle(X_spec))``.
    """
    def _execute(X_mag_pad, roi_size, n_window, is_half):
        # Predict every window and join the outputs along the frame axis.
        model.eval()
        with torch.no_grad():
            preds = []

            for i in tqdm(range(n_window)):
                start = i * roi_size
                X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']]
                X_mag_window = torch.from_numpy(X_mag_window)
                if is_half:
                    X_mag_window = X_mag_window.half()
                X_mag_window = X_mag_window.to(device)

                pred = model.predict(X_mag_window, aggressiveness)

                pred = pred.detach().cpu().numpy()
                preds.append(pred[0])

            return np.concatenate(preds, axis=2)

    def _pad(pad_l, pad_r):
        return np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')

    X_mag = np.abs(X_spec)
    X_phase = np.angle(X_spec)

    coef = X_mag.max()
    # An all-zero input has a peak of 0; skip the normalisation then instead
    # of dividing by zero and propagating NaN into the result.
    X_mag_pre = X_mag / coef if coef > 0 else X_mag

    n_frame = X_mag_pre.shape[2]
    pad_l, pad_r, roi_size = make_padding(n_frame, data['window_size'], model.offset)
    n_window = int(np.ceil(n_frame / roi_size))

    # Feed half-precision windows when the first entry of the model's state
    # dict is stored as float16.
    is_half = next(iter(model.state_dict().values())).dtype == torch.float16

    pred = _execute(_pad(pad_l, pad_r), roi_size, n_window, is_half)
    pred = pred[:, :, :n_frame]

    if data['tta']:
        pad_l += roi_size // 2
        pad_r += roi_size // 2
        n_window += 1

        pred_tta = _execute(_pad(pad_l, pad_r), roi_size, n_window, is_half)
        pred_tta = pred_tta[:, :, roi_size // 2:]
        pred_tta = pred_tta[:, :, :n_frame]

        return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.j * X_phase)
    else:
        return pred * coef, X_mag, np.exp(1.j * X_phase)
# Directory holding the bundled model-parameter JSON files.
_MODEL_PARAMS_DIR = 'uvr5_pack/lib_v5/modelparams/'

# Known model hash -> parameter-set name reported to the caller. Some names
# carry a '.json' suffix; they are reported exactly as written here.
_PARAM_NAME_BY_HASH = {
    '47939caf0cfe52a0e81442b85b971dfd': '4band_44100',
    '4e4ecb9764c50a8c414fee6e10395bbe': '4band_v2',
    'ca106edd563e034bde0bdec4bb7a4b36': '4band_v2',
    'e60a1e84803ce4efc0a6551206cc4b71': '4band_44100',
    'a82f14e75892e55e994376edbf0c8435': '4band_44100',
    '6dd9eaa6f0420af9f1d403aaafa4cc06': '4band_v2_sn',
    '08611fb99bd59eaa79ad27c58d137727': '4band_v2_sn',
    '5c7bbca45a187e81abbbd351606164e5': '3band_44100_msb2',
    'd6b2cb685a058a091e5e7098192d3233': '3band_44100_msb2',
    'c1b9f38170a7c90e96f027992eb7c62b': '4band_44100',
    'c3448ec923fa0edf3d03a19e633faa53': '4band_44100',
    '68aa2c8093d0080704b200d140f59e54': '3band_44100.json',
    'fdc83be5b798e4bd29fe00fe6600e147': '3band_44100_mid.json',
    '2ce34bc92fd57f55db16b7a4def3d745': '3band_44100_mid.json',
    '52fdca89576f06cf4340b74a4730ee5f': '4band_44100.json',
    '41191165b05d38fc77f072fa9e8e8a30': '4band_44100.json',
    '89e83b511ad474592689e562d5b1f80e': '2band_32000.json',
    '0b954da81d453b716b114d6d7c95177f': '2band_32000.json',
    # v4 Models
    '6a00461c51c2920fd68937d4609ed6c8': '1band_sr16000_hl512',
    '0ab504864d20f1bd378fe9c81ef37140': '1band_sr32000_hl512',
    '7dd21065bf91c10f7fccb57d7d83b07f': '1band_sr32000_hl512',
    '80ab74d65e515caa3622728d2de07d23': '1band_sr32000_hl512',
    'edc115e7fc523245062200c00caa847f': '1band_sr33075_hl384',
    '28063e9f6ab5b341c5f6d3c67f2045b7': '1band_sr33075_hl384',
    'b58090534c52cbc3e9b5104bad666ef2': '1band_sr44100_hl512',
    '0cdab9947f1b0928705f518f3c78ea8f': '1band_sr44100_hl512',
    'ae702fed0238afb5346db8356fe25f13': '1band_sr44100_hl1024',
}

# Parameter-set names looked for inside the model path (user models). Order
# matters: the last match wins, so a longer name must come after any shorter
# name it contains.
_PARAM_NAMES_IN_PATH = (
    # 1 Band
    '1band_sr16000_hl512',
    '1band_sr32000_hl512',
    '1band_sr33075_hl384',
    '1band_sr44100_hl256',
    '1band_sr44100_hl512',
    '1band_sr44100_hl1024',
    # 2 Band
    '2band_44100_lofi',
    '2band_32000',
    '2band_48000',
    # 3 Band
    '3band_44100',
    '3band_44100_mid',
    '3band_44100_msb2',
    # 4 Band
    '4band_44100',
    '4band_44100_mid',
    '4band_44100_msb',
    '4band_44100_msb2',
    '4band_44100_reverse',
    '4band_44100_sw',
    '4band_v2',
    '4band_v2_sn',
)


def _get_name_params(model_path, model_hash):
    """Pick the model-parameter set for a model from its hash or its path.

    A known ``model_hash`` selects a parameter set first. A parameter-set
    name contained in ``model_path`` overrides the hash, the last matching
    name taking precedence, and a path containing ``'tmodelparam'``
    overrides everything.

    Args:
        model_path: Path (or name) of the model file.
        model_hash: Hash string identifying a known model.

    Returns:
        Tuple ``(param_name_auto, model_params_auto)``: the parameter-set
        name and the path of its JSON file.

    Raises:
        ValueError: If neither the hash nor the path identifies a parameter
            set.
    """
    param_name_auto = _PARAM_NAME_BY_HASH.get(model_hash)
    stem = None
    if param_name_auto is not None:
        # The JSON file is always named after the bare parameter-set name.
        if param_name_auto.endswith('.json'):
            stem = param_name_auto[:-len('.json')]
        else:
            stem = param_name_auto

    for candidate in _PARAM_NAMES_IN_PATH:
        if candidate in model_path:
            param_name_auto = stem = candidate

    if 'tmodelparam' in model_path:
        param_name_auto = 'User Model Param Set'
        stem = 'tmodelparam'

    if stem is None:
        raise ValueError(
            'Cannot determine model parameters for {!r} (hash {!r}).'.format(model_path, model_hash))

    model_params_auto = _MODEL_PARAMS_DIR + stem + '.json'
    return param_name_auto, model_params_auto