├── .flake8 ├── .gitignore ├── .gitmodules ├── COLAB_USAGE.md ├── LICENSE ├── LICENSE.old ├── README.md ├── Setup DLAS.bat ├── Start DLAS.cmd ├── Start Training Monitor.cmd ├── codes ├── NOTES.md ├── configuration_gui.py ├── data │ ├── README.md │ ├── __init__.py │ ├── audio │ │ ├── audio_with_noise_dataset.py │ │ ├── fast_paired_dataset.py │ │ ├── fast_paired_dataset_with_phonemes.py │ │ ├── gpt_tts_dataset.py │ │ ├── gpt_tts_tokenizer.json │ │ ├── grand_conjoined_dataset.py │ │ ├── nv_tacotron_dataset.py │ │ ├── paired_voice_audio_dataset.py │ │ ├── preprocessed_mel_dataset.py │ │ ├── unsupervised_audio_dataset.py │ │ ├── voice_tokenizer.py │ │ └── wav_aug.py │ ├── combined_dataset.py │ ├── data_sampler.py │ ├── images │ │ ├── __init__.py │ │ ├── base_unsupervised_image_dataset.py │ │ ├── byol_attachment.py │ │ ├── chunk_with_reference.py │ │ ├── cifar.py │ │ ├── full_image_dataset.py │ │ ├── image_corruptor.py │ │ ├── image_folder_dataset.py │ │ ├── image_label_parser.py │ │ ├── image_pair_with_corresponding_points_dataset.py │ │ ├── multi_frame_dataset.py │ │ ├── multiscale_dataset.py │ │ ├── paired_frame_dataset.py │ │ ├── random_dataset.py │ │ ├── single_image_dataset.py │ │ ├── stylegan2_dataset.py │ │ └── zip_file_dataset.py │ ├── text │ │ └── hf_datasets_wrapper.py │ ├── torch_dataset.py │ ├── util.py │ └── zero_pad_dict_collate.py ├── maybe_bnb.py ├── models │ ├── __init__.py │ ├── arch_util.py │ ├── audio │ │ ├── __init__.py │ │ ├── asr │ │ │ ├── __init__.py │ │ │ └── w2v_wrapper.py │ │ ├── audio_resnet.py │ │ ├── mel2vec.py │ │ ├── music │ │ │ ├── __init__.py │ │ │ ├── cheater_gen_ar.py │ │ │ ├── diffwave.py │ │ │ ├── encoders.py │ │ │ ├── flat_diffusion.py │ │ │ ├── gpt_music.py │ │ │ ├── gpt_music2.py │ │ │ ├── instrument_quantizer.py │ │ │ ├── m2v_code_to_mel.py │ │ │ ├── mel2vec_codes_gpt.py │ │ │ ├── music_quantizer.py │ │ │ ├── music_quantizer2.py │ │ │ ├── tfdpc_v5.py │ │ │ ├── transformer_diffusion12.py │ │ │ ├── transformer_diffusion13.py │ │ │ ├── transformer_diffusion14.py │ │ │ ├── unet_diffusion_music_codes.py │ │ │ ├── unet_diffusion_waveform_gen.py │ │ │ ├── unet_diffusion_waveform_gen3.py │ │ │ └── unet_diffusion_waveform_gen_simple.py │ │ ├── tts │ │ │ ├── __init__.py │ │ │ ├── autoregressive_codegen.py │ │ │ ├── autoregressive_codegen2.py │ │ │ ├── ctc_code_generator.py │ │ │ ├── diffusion_encoder.py │ │ │ ├── lucidrains_dvae.py │ │ │ ├── mini_encoder.py │ │ │ ├── random_latent_converter.py │ │ │ ├── tacotron2 │ │ │ │ ├── LICENSE │ │ │ │ ├── __init__.py │ │ │ │ ├── audio_processing.py │ │ │ │ ├── hparams.py │ │ │ │ ├── layers.py │ │ │ │ ├── loss.py │ │ │ │ ├── stft.py │ │ │ │ ├── taco_utils.py │ │ │ │ ├── tacotron2.py │ │ │ │ ├── text │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cleaners.py │ │ │ │ │ ├── cmudict.py │ │ │ │ │ ├── numbers.py │ │ │ │ │ └── symbols.py │ │ │ │ └── wave_tacotron.py │ │ │ ├── transformer_builders.py │ │ │ ├── transformer_diffusion_tts.py │ │ │ ├── transformer_diffusion_tts2.py │ │ │ ├── unet_diffusion_tts7.py │ │ │ ├── unet_diffusion_tts9.py │ │ │ ├── unet_diffusion_tts_flat.py │ │ │ ├── unet_diffusion_vocoder.py │ │ │ ├── unet_diffusion_vocoder_with_ref.py │ │ │ ├── unified_voice2.py │ │ │ ├── unified_voice3.py │ │ │ ├── unified_voice4.py │ │ │ ├── voice_voice_clip.py │ │ │ └── w2v_matcher.py │ │ └── vocoders │ │ │ ├── __init__.py │ │ │ ├── univnet │ │ │ ├── __init__.py │ │ │ ├── generator.py │ │ │ └── lvcnet.py │ │ │ └── waveglow │ │ │ ├── __init__.py │ │ │ ├── denoiser.py │ │ │ └── waveglow.py │ ├── classifiers │ 
│ ├── __init__.py │ │ ├── cifar_resnet.py │ │ ├── resnet_with_checkpointing.py │ │ ├── torch_models.py │ │ ├── twin_cifar_resnet.py │ │ ├── weighted_conv_resnet.py │ │ └── wide_kernel_vgg.py │ ├── clip │ │ ├── __init__.py │ │ ├── clip.py │ │ ├── clvp.py │ │ ├── contrastive_audio.py │ │ ├── cvvp.py │ │ ├── mel_text_clip.py │ │ ├── text_cond_clip.py │ │ └── text_voice_clip.py │ ├── composable │ │ ├── README.md │ │ └── __init__.py │ ├── diffusion │ │ ├── __init__.py │ │ ├── fp16_util.py │ │ ├── gaussian_diffusion.py │ │ ├── losses.py │ │ ├── nn.py │ │ ├── resample.py │ │ ├── respace.py │ │ ├── rrdb_diffusion.py │ │ ├── unet_diffusion.py │ │ └── unet_latent_guide.py │ ├── image_generation │ │ ├── RRDBNet_arch.py │ │ ├── ResGen_arch.py │ │ ├── __init__.py │ │ ├── discriminator_vgg_arch.py │ │ ├── glean │ │ │ ├── __init__.py │ │ │ ├── glean.py │ │ │ └── stylegan2_latent_bank.py │ │ ├── srflow │ │ │ ├── FlowActNorms.py │ │ │ ├── FlowAffineCouplingsAblation.py │ │ │ ├── FlowStep.py │ │ │ ├── FlowUpsamplerNet.py │ │ │ ├── Permutations.py │ │ │ ├── RRDBNet_arch.py │ │ │ ├── SRFlowNet_arch.py │ │ │ ├── Split.py │ │ │ ├── __init__.py │ │ │ ├── flow.py │ │ │ ├── glow_arch.py │ │ │ ├── module_util.py │ │ │ └── thops.py │ │ └── stylegan │ │ │ ├── Discriminator_StyleGAN.py │ │ │ ├── __init__.py │ │ │ ├── stylegan2_lucidrains.py │ │ │ └── stylegan2_rosinality.py │ ├── image_latents │ │ ├── __init__.py │ │ ├── byol │ │ │ ├── __init__.py │ │ │ ├── byol_model_wrapper.py │ │ │ └── byol_structural.py │ │ ├── fixup_resnet │ │ │ ├── DiscriminatorResnet_arch.py │ │ │ └── __init__.py │ │ ├── spinenet_arch.py │ │ └── vit_latent.py │ ├── lucidrains │ │ ├── dalle │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── reversible.py │ │ │ └── transformer.py │ │ ├── performer │ │ │ ├── __init__.py │ │ │ ├── autoregressive_wrapper.py │ │ │ ├── performer_enc_dec.py │ │ │ ├── performer_pytorch.py │ │ │ └── reversible.py │ │ ├── vq.py │ │ └── x_transformers.py │ ├── optical_flow │ │ └── PWCNet.py │ └── vqvae │ │ ├── __init__.py │ │ ├── dvae.py │ │ ├── gumbel_quantizer.py │ │ ├── scaled_weight_conv.py │ │ ├── vector_quantizer.py │ │ └── vqvae.py ├── multi_modal_train.py ├── process_video.py ├── requirements.laxed.txt ├── requirements.txt ├── requirements_frozen_only_use_if_something_broken.txt ├── scripts │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── gen │ │ │ ├── __init__.py │ │ │ ├── ctc_codes.py │ │ │ ├── music_joiner.py │ │ │ ├── speech_synthesis_utils.py │ │ │ ├── use_diffuse_tts.py │ │ │ ├── use_diffuse_voice_translation.py │ │ │ ├── use_discrete_vocoder.py │ │ │ ├── use_discrete_vocoder_one_way.py │ │ │ ├── use_gpt_tts.py │ │ │ ├── use_mel2vec_codes.py │ │ │ └── w2v_patcher.py │ │ ├── gen_mel.py │ │ ├── mel_bin_norm_compute.py │ │ ├── play_with_spectral_representations.py │ │ ├── prep_music │ │ │ ├── demucs_notes.txt │ │ │ ├── generate_long_cheaters.py │ │ │ ├── generate_long_mels.py │ │ │ └── phase_1_split_files.py │ │ ├── preparation │ │ │ ├── __init__.py │ │ │ ├── combine_phonetic_and_text.py │ │ │ ├── filter_clips_with_no_hifreq_data.py │ │ │ ├── gen_dvae_codes.py │ │ │ ├── phase_1_split_files.py │ │ │ ├── phase_2_sample_and_filter.py │ │ │ ├── phase_3_generate_similarities.py │ │ │ ├── pipeline.py │ │ │ ├── process_spleeter_filter_outputs.py │ │ │ ├── save_mels_to_disk.py │ │ │ ├── spleeter_filter_noisy_clips.py │ │ │ ├── spleeter_utils │ │ │ │ ├── __init__.py │ │ │ │ └── spleeter_dataset.py │ │ │ └── split_on_silence.py │ │ ├── random_mp3_splitter.py │ │ ├── spleeter_split_voice_and_background.py │ │ 
├── test_audio_gen.py │ │ ├── test_audio_segmentor.py │ │ ├── test_audio_similarity.py │ │ ├── test_audio_speech_recognition.py │ │ ├── use_vocoder.py │ │ └── word_error_rate.py │ ├── byol │ │ ├── byol_extract_wrapped_model.py │ │ ├── byol_resnet_playground.py │ │ ├── byol_segformer_playground.py │ │ ├── byol_spinenet_playground.py │ │ └── tsne_torch.py │ ├── classify_into_folders.py │ ├── diffusion │ │ ├── diffusion_correction_surfer.py │ │ ├── diffusion_inference.py │ │ ├── diffusion_noise_surfer.py │ │ ├── diffusion_recursive_sampler.py │ │ └── diffusion_spacing_surfer.py │ ├── do_to_files.py │ ├── extract_square_images.py │ ├── extract_subimages.py │ ├── extract_subimages_with_ref.py │ ├── extract_temporal_squares.py │ ├── find_faulty_files.py │ ├── folderize_imagenet_val.py │ ├── gen_kmeans_clusters.py │ ├── hugging_face_hub_upload.py │ ├── post_install.py │ ├── srflow_latent_space_playground.py │ ├── start_tensorboard.py │ ├── stitch_images.py │ ├── stylegan2 │ │ ├── convert_weights_rosinality.py │ │ └── dnnlib │ │ │ └── tflib │ │ │ └── network.py │ ├── ui │ │ ├── image_labeler │ │ │ ├── image_labeler_ui.py │ │ │ ├── label_editor.py │ │ │ ├── pretrained_image_patch_classifier.py │ │ │ └── test_image_patch_classifier.py │ │ └── image_pair_labeler │ │ │ └── image_pair_ui.py │ ├── use_generator_as_filter.py │ └── validate_data.py ├── sweep.py ├── test.py ├── train.py ├── trainer │ ├── ExtensibleTrainer.py │ ├── README.md │ ├── __init__.py │ ├── base_model.py │ ├── batch_size_optimizer.py │ ├── custom_training_components │ │ ├── __init__.py │ │ ├── progressive_zoom.py │ │ ├── stereoscopic.py │ │ └── tecogan_losses.py │ ├── eval │ │ ├── __init__.py │ │ ├── audio_diffusion_fid.py │ │ ├── eval_wer.py │ │ ├── evaluator.py │ │ ├── fid.py │ │ ├── flow_gaussian_nll.py │ │ ├── mel_evaluator.py │ │ ├── music_diffusion_fid.py │ │ ├── single_point_pair_contrastive_eval.py │ │ ├── sr_diffusion_fid.py │ │ ├── sr_fid.py │ │ └── sr_style.py │ ├── experiments │ │ ├── __init__.py │ │ └── experiments.py │ ├── feature_model.py │ ├── inject.py │ ├── injectors │ │ ├── __init__.py │ │ ├── audio_injectors.py │ │ ├── base_injectors.py │ │ ├── gaussian_diffusion_injector.py │ │ └── spec_augment.py │ ├── loss.py │ ├── losses.py │ ├── lr_scheduler.py │ ├── networks.py │ ├── optimizers │ │ ├── lamb.py │ │ ├── larc.py │ │ └── sgd.py │ └── steps.py ├── use_discriminator_as_filter.py └── utils │ ├── BASE_gpt.yaml │ ├── UI_icon.png │ ├── __init__.py │ ├── audio.py │ ├── audio_resampler.py │ ├── colors.py │ ├── convert_model.py │ ├── distributed_checkpont.py │ ├── gpu_mem_track.py │ ├── kmeans.py │ ├── loss_accumulator.py │ ├── music_utils.py │ ├── numeric_stability.py │ ├── options.py │ ├── util.py │ └── weight_scheduler.py ├── environment.yaml ├── experiments ├── EXAMPLE_diff.yml ├── EXAMPLE_gpt.yml ├── bpe_lowercase_asr_256.json ├── clips_mel_norms.pth ├── train_diffusion_vocoder_22k_level.yml └── train_gpt_tts_unified.yml ├── param_calc.py ├── recipes ├── byol │ ├── README.md │ └── train_div2k_byol.yml ├── diffusion │ ├── README.md │ ├── test_diffusion_unet.yml │ └── train_ddpm_unet.yml ├── esrgan │ ├── README.md │ ├── rrdb_process_video.yml │ ├── train_div2k_esrgan.yml │ └── train_div2k_esrgan_reference.yml ├── glean │ ├── README.md │ └── train_ffhq_glean.yml ├── segformer │ └── train_byol_segformer.yml ├── srflow │ ├── README.md │ ├── convert_official_weights.py │ ├── train_div2k_rrdb_psnr.yml │ └── train_div2k_srflow.yml ├── stylegan │ └── README.md ├── tacotron2 │ ├── test_tacotron2_lj.yml │ └── 
train_tacotron2_lj.yml └── vqvae2 │ ├── README.md │ └── train_imgnet_vqvae_stage1.yml ├── resources └── bitsandbytes_windows │ ├── cextension.py │ ├── libbitsandbytes_cpu.dll │ ├── libbitsandbytes_cuda116.dll │ └── main.py ├── sandbox.py ├── static ├── drive_copied_file_tree.png ├── export_to_gdrive.png ├── file_directory.png ├── good_gpu.png ├── hyperparam_dataset.png ├── ljspeech.png ├── notebook_header.png ├── params.png ├── runtime_type.png ├── settings_options.png ├── stop_training.png ├── training_button.png ├── very_long_process.png ├── very_recent_save.png ├── warning.png └── yml_file.png └── voice_samples ├── kk_500 ├── kk_0_0.wav ├── kk_0_1.wav └── kk_0_2.wav ├── kk_500_emma ├── emma_0_0.wav ├── emma_0_1.wav └── emma_0_2.wav └── kk_orig ├── kk_0_0.wav ├── kk_0_1.wav └── kk_0_2.wav /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | # Too many leading '#' for block comment (E266) 4 | E266 5 | 6 | max-line-length=100 -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "flownet2"] 2 | path = flownet2 3 | url = https://github.com/NVIDIA/flownet2-pytorch.git 4 | [submodule "codes/models/flownet2"] 5 | path = codes/models/flownet2 6 | url = https://github.com/neonbjb/flownet2-pytorch.git 7 | -------------------------------------------------------------------------------- /Setup DLAS.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/). 3 | :: 4 | :: Copyright 2022 Sygil-Dev team. 5 | :: This program is free software: you can redistribute it and/or modify 6 | :: it under the terms of the GNU Affero General Public License as published by 7 | :: the Free Software Foundation, either version 3 of the License, or 8 | :: (at your option) any later version. 9 | :: 10 | :: This program is distributed in the hope that it will be useful, 11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | :: GNU Affero General Public License for more details. 14 | :: 15 | :: You should have received a copy of the GNU Affero General Public License 16 | :: along with this program. If not, see . 17 | :: Run all commands using this script's directory as the working directory 18 | cd %~dp0 19 | 20 | :: copy over the first line from environment.yaml, e.g. 
name: ldm, and take the second word after splitting by ":" delimiter 21 | for /F "tokens=2 delims=: " %%i in (environment.yaml) DO ( 22 | set v_conda_env_name=%%i 23 | goto EOL 24 | ) 25 | :EOL 26 | 27 | echo Environment name is set as %v_conda_env_name% as per environment.yaml 28 | 29 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path 30 | IF EXIST custom-conda-path.txt ( 31 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i 32 | ) 33 | 34 | set INSTALL_ENV_DIR=%cd%\installer_files\env 35 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH% 36 | 37 | set v_paths=%INSTALL_ENV_DIR% 38 | set v_paths=%v_paths%;%ProgramData%\miniconda3 39 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3 40 | set v_paths=%v_paths%;%ProgramData%\anaconda3 41 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3 42 | 43 | for %%a in (%v_paths%) do ( 44 | IF NOT "%v_custom_path%"=="" ( 45 | set v_paths=%v_custom_path%;%v_paths% 46 | ) 47 | ) 48 | 49 | for %%a in (%v_paths%) do ( 50 | if EXIST "%%a\Scripts\activate.bat" ( 51 | SET v_conda_path=%%a 52 | echo anaconda3/miniconda3 detected in %%a 53 | goto :CONDA_FOUND 54 | ) 55 | ) 56 | 57 | IF "%v_conda_path%"=="" ( 58 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html 59 | pause 60 | exit /b 1 61 | ) 62 | 63 | :CONDA_FOUND 64 | echo Found Anaconda 65 | 66 | :SKIP_RESTORE 67 | call "%v_conda_path%\Scripts\activate.bat" 68 | echo %v_conda_env_name% 69 | 70 | call conda env create --name "%v_conda_env_name%" -f environment.yaml 71 | 72 | 73 | 74 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%" 75 | 76 | :PROMPT 77 | python codes/scripts/post_install.py 78 | pause -------------------------------------------------------------------------------- /Start DLAS.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/). 3 | :: 4 | :: Copyright 2022 Sygil-Dev team. 5 | :: This program is free software: you can redistribute it and/or modify 6 | :: it under the terms of the GNU Affero General Public License as published by 7 | :: the Free Software Foundation, either version 3 of the License, or 8 | :: (at your option) any later version. 9 | :: 10 | :: This program is distributed in the hope that it will be useful, 11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | :: GNU Affero General Public License for more details. 14 | :: 15 | :: You should have received a copy of the GNU Affero General Public License 16 | :: along with this program. If not, see . 17 | :: Run all commands using this script's directory as the working directory 18 | cd %~dp0 19 | 20 | :: copy over the first line from environment.yaml, e.g. 
name: ldm, and take the second word after splitting by ":" delimiter 21 | set v_conda_env_name="DLAS" 22 | 23 | 24 | echo Environment name is set as %v_conda_env_name% as per environment.yaml 25 | 26 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path 27 | IF EXIST custom-conda-path.txt ( 28 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i 29 | ) 30 | 31 | set INSTALL_ENV_DIR=%cd%\installer_files\env 32 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH% 33 | 34 | set v_paths=%INSTALL_ENV_DIR% 35 | set v_paths=%v_paths%;%ProgramData%\miniconda3 36 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3 37 | set v_paths=%v_paths%;%ProgramData%\anaconda3 38 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3 39 | 40 | for %%a in (%v_paths%) do ( 41 | IF NOT "%v_custom_path%"=="" ( 42 | set v_paths=%v_custom_path%;%v_paths% 43 | ) 44 | ) 45 | 46 | for %%a in (%v_paths%) do ( 47 | if EXIST "%%a\Scripts\activate.bat" ( 48 | SET v_conda_path=%%a 49 | echo anaconda3/miniconda3 detected in %%a 50 | goto :CONDA_FOUND 51 | ) 52 | ) 53 | 54 | IF "%v_conda_path%"=="" ( 55 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html 56 | pause 57 | exit /b 1 58 | ) 59 | 60 | :CONDA_FOUND 61 | echo Starting conda environment %v_conda_env_name% from %v_conda_path% 62 | 63 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%" 64 | 65 | :START_GUI 66 | python codes/configuration_gui.py 67 | 68 | ::cmd /k 69 | -------------------------------------------------------------------------------- /Start Training Monitor.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/). 3 | :: 4 | :: Copyright 2022 Sygil-Dev team. 5 | :: This program is free software: you can redistribute it and/or modify 6 | :: it under the terms of the GNU Affero General Public License as published by 7 | :: the Free Software Foundation, either version 3 of the License, or 8 | :: (at your option) any later version. 9 | :: 10 | :: This program is distributed in the hope that it will be useful, 11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | :: GNU Affero General Public License for more details. 14 | :: 15 | :: You should have received a copy of the GNU Affero General Public License 16 | :: along with this program. If not, see . 17 | :: Run all commands using this script's directory as the working directory 18 | cd %~dp0 19 | 20 | :: copy over the first line from environment.yaml, e.g. 
name: ldm, and take the second word after splitting by ":" delimiter 21 | set v_conda_env_name="DLAS" 22 | 23 | 24 | echo Environment name is set as %v_conda_env_name% as per environment.yaml 25 | 26 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path 27 | IF EXIST custom-conda-path.txt ( 28 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i 29 | ) 30 | 31 | set INSTALL_ENV_DIR=%cd%\installer_files\env 32 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH% 33 | 34 | set v_paths=%INSTALL_ENV_DIR% 35 | set v_paths=%v_paths%;%ProgramData%\miniconda3 36 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3 37 | set v_paths=%v_paths%;%ProgramData%\anaconda3 38 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3 39 | 40 | for %%a in (%v_paths%) do ( 41 | IF NOT "%v_custom_path%"=="" ( 42 | set v_paths=%v_custom_path%;%v_paths% 43 | ) 44 | ) 45 | 46 | for %%a in (%v_paths%) do ( 47 | if EXIST "%%a\Scripts\activate.bat" ( 48 | SET v_conda_path=%%a 49 | echo anaconda3/miniconda3 detected in %%a 50 | goto :CONDA_FOUND 51 | ) 52 | ) 53 | 54 | IF "%v_conda_path%"=="" ( 55 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html 56 | pause 57 | exit /b 1 58 | ) 59 | 60 | :CONDA_FOUND 61 | echo Starting conda environment %v_conda_env_name% from %v_conda_path% 62 | 63 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%" 64 | 65 | :START_GUI 66 | python codes/scripts/start_tensorboard.py 67 | 68 | ::cmd /k 69 | -------------------------------------------------------------------------------- /codes/data/audio/preprocessed_mel_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import torchaudio 9 | import torchvision 10 | from tqdm import tqdm 11 | 12 | from utils.util import opt_get 13 | 14 | 15 | class PreprocessedMelDataset(torch.utils.data.Dataset): 16 | 17 | def __init__(self, opt): 18 | path = opt['path'] 19 | cache_path = opt['cache_path'] # Will fail when multiple paths specified, must be specified in this case. 
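# The cache at cache_path is just a torch-saved list of every *.npz path found under 'path'; it is built once on the first run and then trusted as-is, so delete (or re-point) the cache file whenever the contents of the mel directory change.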
20 | if os.path.exists(cache_path): 21 | self.paths = torch.load(cache_path) 22 | else: 23 | print("Building cache..") 24 | path = Path(path) 25 | self.paths = [str(p) for p in path.rglob("*.npz")] 26 | torch.save(self.paths, cache_path) 27 | self.pad_to = opt_get(opt, ['pad_to_samples'], 10336) 28 | self.squeeze = opt_get(opt, ['should_squeeze'], False) 29 | 30 | def __getitem__(self, index): 31 | with np.load(self.paths[index]) as npz_file: 32 | mel = torch.tensor(npz_file['arr_0']) 33 | assert mel.shape[-1] <= self.pad_to 34 | if self.squeeze: 35 | mel = mel.squeeze() 36 | padding_needed = self.pad_to - mel.shape[-1] 37 | mask = torch.zeros_like(mel) 38 | if padding_needed > 0: 39 | mel = F.pad(mel, (0,padding_needed)) 40 | mask = F.pad(mask, (0,padding_needed), value=1) 41 | 42 | output = { 43 | 'mel': mel, 44 | 'mel_lengths': torch.tensor(mel.shape[-1]), 45 | 'mask': mask, 46 | 'mask_lengths': torch.tensor(mask.shape[-1]), 47 | 'path': self.paths[index], 48 | } 49 | return output 50 | 51 | def __len__(self): 52 | return len(self.paths) 53 | 54 | 55 | if __name__ == '__main__': 56 | params = { 57 | 'mode': 'preprocessed_mel', 58 | 'path': 'Y:\\separated\\large_mel_cheaters', 59 | 'cache_path': 'Y:\\separated\\large_mel_cheaters_win.pth', 60 | 'pad_to_samples': 646, 61 | 'phase': 'train', 62 | 'n_workers': 0, 63 | 'batch_size': 16, 64 | } 65 | from data import create_dataset, create_dataloader 66 | 67 | ds = create_dataset(params) 68 | dl = create_dataloader(ds, params) 69 | i = 0 70 | for b in tqdm(dl): 71 | #pass 72 | torchvision.utils.save_image((b['mel'].unsqueeze(1)+1)/2, f'{i}.png') 73 | i += 1 74 | if i > 20: 75 | break 76 | -------------------------------------------------------------------------------- /codes/data/audio/wav_aug.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | import torchaudio.sox_effects 5 | 6 | from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch 7 | 8 | 9 | # Returns random double on [l,h] as a string 10 | def rdstr(l=0,h=1): 11 | assert h > l 12 | i=h-l 13 | return str(random.random() * i + l) 14 | 15 | 16 | # Returns a randint on [s,e] as a string 17 | def rdi(e, s=0): 18 | return str(random.randint(s,e)) 19 | 20 | 21 | class WavAugmentor: 22 | def __init__(self): 23 | pass 24 | 25 | def augment(self, wav, sample_rate): 26 | speed_effect = ['speed', rdstr(.8, 1)] 27 | ''' 28 | Band effects are disabled until I can audit them better. 
29 | band_effects = [ 30 | ['reverb', '-w'], 31 | ['reverb'], 32 | ['band', rdi(8000, 3000), rdi(1000, 100)], 33 | ['bandpass', rdi(8000, 3000), rdi(1000, 100)], 34 | ['bass', rdi(20,-20)], 35 | ['treble', rdi(20,-20)], 36 | ['dither'], 37 | ['equalizer', rdi(3000, 100), rdi(1000, 100), rdi(10, -10)], 38 | ['hilbert'], 39 | ['sinc', '3k'], 40 | ['sinc', '-4k'], 41 | ['sinc', '3k-4k'] 42 | ] 43 | band_effect = random.choice(band_effects) 44 | ''' 45 | volume_effects = [ 46 | ['loudness', rdi(10,-2)], 47 | ['overdrive', rdi(20,0), rdi(20,0)], 48 | ] 49 | vol_effect = random.choice(volume_effects) 50 | effects = [speed_effect, vol_effect] 51 | out, sr = torchaudio.sox_effects.apply_effects_tensor(wav, sample_rate, effects) 52 | # Add a variable amount of noise 53 | out = out + torch.rand_like(out) * random.random() * .03 54 | return out 55 | 56 | 57 | if __name__ == '__main__': 58 | sample, _ = load_wav_to_torch('obama1.wav') 59 | sample = sample / 32768.0 60 | aug = WavAugmentor() 61 | for j in range(10): 62 | out = aug.augment(sample, 24000) 63 | torchaudio.save(f'out{j}.wav', out, 24000) -------------------------------------------------------------------------------- /codes/data/combined_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from data import create_dataset 3 | 4 | 5 | # Simple composite dataset that combines multiple other datasets. 6 | # Assumes that the datasets output dicts. 7 | class CombinedDataset(torch.utils.data.Dataset): 8 | def __init__(self, opt): 9 | self.datasets = {} 10 | for k, v in opt.items(): 11 | if not isinstance(v, dict): 12 | continue 13 | # Scale&phase gets injected by options.py.. 14 | v['scale'] = opt['scale'] 15 | v['phase'] = opt['phase'] 16 | self.datasets[k] = create_dataset(v) 17 | self.items_fetched = 0 18 | 19 | def __getitem__(self, i): 20 | self.items_fetched += 1 21 | output = {} 22 | for name, dataset in self.datasets.items(): 23 | prefix = "" 24 | # 'default' dataset gets no prefix, other ones get `key_` 25 | if name != 'default': 26 | prefix = name + "_" 27 | 28 | data = dataset[i % len(dataset)] 29 | for k, v in data.items(): 30 | output[prefix + k] = v 31 | return output 32 | 33 | def __len__(self): 34 | return max(len(d) for d in self.datasets.values()) -------------------------------------------------------------------------------- /codes/data/data_sampler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modified from torch.utils.data.distributed.DistributedSampler 3 | Support enlarging the dataset for *iteration-oriented* training, for saving time when restart the 4 | dataloader after each epoch 5 | """ 6 | import math 7 | import torch 8 | from torch.utils.data.sampler import Sampler 9 | import torch.distributed as dist 10 | 11 | 12 | class DistIterSampler(Sampler): 13 | """Sampler that restricts data loading to a subset of the dataset. 14 | 15 | It is especially useful in conjunction with 16 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 17 | process can pass a DistributedSampler instance as a DataLoader sampler, 18 | and load a subset of the original dataset that is exclusive to it. 19 | 20 | .. note:: 21 | Dataset is assumed to be of constant size. 22 | 23 | Arguments: 24 | dataset: Dataset used for sampling. 25 | num_replicas (optional): Number of processes participating in 26 | distributed training. 27 | rank (optional): Rank of the current process within num_replicas. 
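ratio (optional): Virtual enlargement factor for the dataset. Each replica draws ceil(len(dataset) * ratio / num_replicas) indices per epoch, wrapping indices modulo the true dataset length, so a single sampler epoch covers the dataset roughly ratio times in total across replicas.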
28 | """ 29 | 30 | def __init__(self, dataset, num_replicas=None, rank=None, ratio=100): 31 | if num_replicas is None: 32 | if not dist.is_available(): 33 | raise RuntimeError("Requires distributed package to be available") 34 | num_replicas = dist.get_world_size() 35 | if rank is None: 36 | if not dist.is_available(): 37 | raise RuntimeError("Requires distributed package to be available") 38 | rank = dist.get_rank() 39 | self.dataset = dataset 40 | self.num_replicas = num_replicas 41 | self.rank = rank 42 | self.epoch = 0 43 | self.num_samples = int(math.ceil(len(self.dataset) * ratio / self.num_replicas)) 44 | self.total_size = self.num_samples * self.num_replicas 45 | 46 | def __iter__(self): 47 | # deterministically shuffle based on epoch 48 | g = torch.Generator() 49 | g.manual_seed(self.epoch) 50 | indices = torch.randperm(self.total_size, generator=g).tolist() 51 | 52 | dsize = len(self.dataset) 53 | indices = [v % dsize for v in indices] 54 | 55 | # subsample 56 | indices = indices[self.rank:self.total_size:self.num_replicas] 57 | assert len(indices) == self.num_samples 58 | 59 | return iter(indices) 60 | 61 | def __len__(self): 62 | return self.num_samples 63 | 64 | def set_epoch(self, epoch): 65 | self.epoch = epoch 66 | -------------------------------------------------------------------------------- /codes/data/images/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/data/images/__init__.py -------------------------------------------------------------------------------- /codes/data/images/chunk_with_reference.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from data import util 3 | import torch 4 | import numpy as np 5 | 6 | # Iterable that reads all the images in a directory that contains a reference image, tile images and center coordinates. 7 | from utils.util import opt_get 8 | 9 | 10 | class ChunkWithReference: 11 | def __init__(self, opt, path): 12 | self.path = path.path 13 | self.tiles, _ = util.find_files_of_type('img', self.path) 14 | self.need_metadata = opt_get(opt, ['strict'], False) or opt_get(opt, ['needs_metadata'], False) 15 | self.need_ref = opt_get(opt, ['need_ref'], False) 16 | if 'ignore_first' in opt.keys(): 17 | self.tiles = self.tiles[opt['ignore_first']:] 18 | 19 | # Odd failures occur at times. Rather than crashing, report the error and just return zeros. 20 | def read_image_or_get_zero(self, img_path): 21 | img = util.read_img(None, img_path, rgb=True) 22 | if img is None: 23 | return np.zeros(128, 128, 3) 24 | return img 25 | 26 | def __getitem__(self, item): 27 | tile = self.read_image_or_get_zero(self.tiles[item]) 28 | if self.need_ref and osp.exists(osp.join(self.path, "ref.jpg")): 29 | tile_id = int(osp.splitext(osp.basename(self.tiles[item]))[0]) 30 | ref = self.read_image_or_get_zero(osp.join(self.path, "ref.jpg")) 31 | if self.need_metadata: 32 | centers = torch.load(osp.join(self.path, "centers.pt")) 33 | if tile_id in centers.keys(): 34 | center, tile_width = centers[tile_id] 35 | else: 36 | print("Could not find the given tile id in the accompanying centers.pt. This generally means that " 37 | "centers.pt was overwritten at some point e.g. by duplicate data. If you don't care about tile " 38 | "centers, consider passing strict=false to the dataset options. 
(Note: you must re-build your" 39 | "caches for this setting change to take effect.)") 40 | raise FileNotFoundError(tile_id, self.tiles[item]) 41 | else: 42 | center = torch.tensor([128, 128], dtype=torch.long) 43 | tile_width = 256 44 | mask = np.full(tile.shape[:2] + (1,), fill_value=.1, dtype=tile.dtype) 45 | mask[center[0] - tile_width // 2:center[0] + tile_width // 2, center[1] - tile_width // 2:center[1] + tile_width // 2] = 1 46 | else: 47 | ref = np.zeros_like(tile) 48 | mask = np.zeros(tile.shape[:2] + (1,)) 49 | center = (0,0) 50 | 51 | return tile, ref, center, mask, self.tiles[item] 52 | 53 | def __len__(self): 54 | return len(self.tiles) 55 | -------------------------------------------------------------------------------- /codes/data/images/random_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | # Dataset that feeds random data into the state. Can be useful for testing or demo purposes without actual data. 6 | class RandomDataset(Dataset): 7 | def __init__(self, opt): 8 | self.hq_shape = tuple(opt['hq_shape']) 9 | self.lq_shape = tuple(opt['lq_shape']) 10 | 11 | def __getitem__(self, item): 12 | return {'lq': torch.rand(self.lq_shape), 'hq': torch.rand(self.hq_shape), 13 | 'LQ_path': '', 'GT_path': ''} 14 | 15 | def __len__(self): 16 | # Arbitrary 17 | return 1024 * 1024 18 | -------------------------------------------------------------------------------- /codes/data/images/zip_file_dataset.py: -------------------------------------------------------------------------------- 1 | import PIL.Image 2 | import zipfile 3 | import torch 4 | import torchvision 5 | from torch.utils.data import DataLoader 6 | from torchvision.transforms import Compose, ToTensor, Normalize, Resize 7 | 8 | 9 | class ZipFileDataset(torch.utils.data.Dataset): 10 | def __init__(self, opt): 11 | self.path = opt['path'] 12 | zip = zipfile.ZipFile(self.path) 13 | self.all_files = list(zip.namelist()) 14 | self.resolution = opt['resolution'] 15 | self.paired_mode = opt['paired_mode'] 16 | self.transforms = Compose([ToTensor(), 17 | Resize(self.resolution), 18 | Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) 19 | ]) 20 | self.zip = None 21 | 22 | def __len__(self): 23 | return len(self.all_files) 24 | 25 | # Loaded on the fly because ZipFile does not tolerate pickling. 26 | def get_zip(self): 27 | if self.zip is None: 28 | self.zip = zipfile.ZipFile(self.path) 29 | return self.zip 30 | 31 | def load_image(self, path): 32 | file = self.get_zip().open(path, 'r') 33 | pilimg = PIL.Image.open(file) 34 | tensor = self.transforms(pilimg) 35 | return tensor 36 | 37 | def __getitem__(self, i): 38 | try: 39 | fname = self.all_files[i] 40 | out = { 41 | 'hq': self.load_image(fname), 42 | 'HQ_path': fname, 43 | 'has_alt': self.paired_mode 44 | } 45 | if self.paired_mode: 46 | if fname.endswith('0.jpg'): 47 | aname = fname.replace('0.jpg', '1.jpg') 48 | else: 49 | aname = fname.replace('1.jpg', '0.jpg') 50 | out['alt_hq'] = self.load_image(aname) 51 | except: 52 | print(f"Error loading {fname} from zipfile. 
Attempting to recover by loading next element.") 53 | return self[i+1] 54 | return out 55 | 56 | if __name__ == '__main__': 57 | opt = { 58 | 'path': 'E:\\4k6k\\datasets\\images\\youtube-imagenet-paired\\output.zip', 59 | 'resolution': 224, 60 | 'paired_mode': True 61 | } 62 | dataset = ZipFileDataset(opt) 63 | print(len(dataset)) 64 | loader = DataLoader(dataset, shuffle=True) 65 | for i, d in enumerate(loader): 66 | torchvision.utils.save_image(d['hq'], f'{i}_hq.png') 67 | torchvision.utils.save_image(d['alt_hq'], f'{i}_althq.png') 68 | 69 | -------------------------------------------------------------------------------- /codes/data/text/hf_datasets_wrapper.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import datasets 3 | 4 | 5 | class HfDataset(Dataset): 6 | """ 7 | Simple wrapper for a HuggingFace dataset that can re-map keys if desired. 8 | """ 9 | def __init__(self, corpi, cache_path=None, key_maps=None, dataset_spec_key='train'): 10 | self.hfd = [] 11 | for corpus in corpi: 12 | dataset_name, config = corpus 13 | if config == '' or config == 'None': 14 | config = None 15 | self.hfd.append(datasets.load_dataset(dataset_name, config, cache_dir=cache_path)[dataset_spec_key]) 16 | self.key_maps = key_maps 17 | 18 | def __getitem__(self, item): 19 | for dataset in self.hfd: 20 | if item < len(dataset): 21 | val = dataset[item] 22 | if self.key_maps is None: 23 | return val 24 | else: 25 | return {k: val[v] for k, v in self.key_maps.items()} 26 | else: 27 | item -= len(dataset) 28 | raise IndexError() 29 | 30 | def __len__(self): 31 | return sum([len(h) for h in self.hfd]) 32 | 33 | 34 | if __name__ == '__main__': 35 | d = HfDataset([['wikipedia', '20200501.en'], ['bookcorpus', '']], dataset_spec_key='train', cache_path='Z:\\huggingface_datasets\\cache') 36 | print(d[5]) 37 | -------------------------------------------------------------------------------- /codes/data/torch_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import torchvision.transforms as T 3 | from torchvision import datasets 4 | 5 | # Wrapper for basic pytorch datasets which re-wraps them into a format usable by ExtensibleTrainer. 
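# Expected opt keys, per the code below: 'dataset' (one of mnist, fmnist, cifar10, cifar100, imagenet, imagefolder), 'image_size', and 'kwargs' (splatted into the selected dataset constructor); 'random_crop', 'fixed_len' and 'offset' are optional.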
6 | from data.images.cifar import CIFAR100, CIFAR10 7 | from utils.util import opt_get 8 | 9 | 10 | class TorchDataset(Dataset): 11 | def __init__(self, opt): 12 | DATASET_MAP = { 13 | "mnist": datasets.MNIST, 14 | "fmnist": datasets.FashionMNIST, 15 | "cifar10": CIFAR10, 16 | "cifar100": CIFAR100, 17 | "imagenet": datasets.ImageNet, 18 | "imagefolder": datasets.ImageFolder 19 | } 20 | normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 21 | if opt_get(opt, ['random_crop'], False): 22 | transforms = [ 23 | T.RandomResizedCrop(opt['image_size']), 24 | T.RandomHorizontalFlip(), 25 | T.ToTensor(), 26 | normalize, 27 | ] 28 | else: 29 | transforms = [ 30 | T.Resize(opt['image_size']), 31 | T.CenterCrop(opt['image_size']), 32 | T.RandomHorizontalFlip(), 33 | T.ToTensor(), 34 | normalize, 35 | ] 36 | transforms = T.Compose(transforms) 37 | self.dataset = DATASET_MAP[opt['dataset']](transform=transforms, **opt['kwargs']) 38 | self.len = opt_get(opt, ['fixed_len'], len(self.dataset)) 39 | self.offset = opt_get(opt, ['offset'], 0) 40 | 41 | def __getitem__(self, item): 42 | item = self.dataset[item+self.offset] 43 | if len(item) == 2: 44 | underlying_item, lbl = item 45 | coarselbl = None 46 | elif len(item) == 3: 47 | underlying_item, lbl, coarselbl = item 48 | else: 49 | raise NotImplementedError 50 | return {'lq': underlying_item, 'hq': underlying_item, 'labels': lbl, 'coarse_labels': coarselbl, 51 | 'LQ_path': str(item), 'GT_path': str(item)} 52 | 53 | def __len__(self): 54 | return self.len-self.offset 55 | 56 | if __name__ == '__main__': 57 | opt = { 58 | 'flip': True, 59 | 'crop_sz': None, 60 | 'dataset': 'cifar100', 61 | 'image_size': 32, 62 | 'normalize': True, 63 | 'kwargs': { 64 | 'root': 'E:\\4k6k\\datasets\\images\\cifar100', 65 | 'download': True 66 | } 67 | } 68 | set = TorchDataset(opt) 69 | j = set[0] 70 | -------------------------------------------------------------------------------- /codes/data/zero_pad_dict_collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class ZeroPadDictCollate(): 6 | """ 7 | Given a list of dictionary outputs with torch.Tensors from a Dataset, iterates through each one, finds the longest 8 | tensor, and zero pads all the other tensors together. 9 | """ 10 | def collate_tensors(self, batch, key): 11 | result = [] 12 | largest_dims = [0 for _ in range(len(batch[0][key].shape))] 13 | for elem in batch: 14 | result.append(elem[key]) 15 | largest_dims = [max(current_largest, new_consideration) for current_largest, new_consideration in zip(largest_dims, elem[key].shape)] 16 | # Now pad each tensor by the largest dimension. 
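# F.pad takes (left, right) pairs ordered from the *last* dimension backwards, so each dimension's pair is prepended below to keep the tuple in that reversed order; only trailing (right-side) padding is ever applied.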
17 | for i in range(len(result)): 18 | padding_tuple = () 19 | for d in range(len(largest_dims)): 20 | padding_needed = largest_dims[d] - result[i].shape[d] 21 | assert padding_needed >= 0 22 | padding_tuple = (0, padding_needed) + padding_tuple 23 | result[i] = F.pad(result[i], padding_tuple) 24 | 25 | return torch.stack(result, dim=0) 26 | 27 | 28 | def collate_into_list(self, batch, key): 29 | result = [] 30 | for elem in batch: 31 | result.append(elem[key]) 32 | return result 33 | 34 | def __call__(self, batch): 35 | first_dict = batch[0] 36 | collated = {} 37 | for key in first_dict.keys(): 38 | if isinstance(first_dict[key], torch.Tensor): 39 | if len(first_dict[key].shape) > 0: 40 | collated[key] = self.collate_tensors(batch, key) 41 | else: 42 | collated[key] = torch.stack([b[key] for b in batch]) 43 | else: 44 | collated[key] = self.collate_into_list(batch, key) 45 | return collated -------------------------------------------------------------------------------- /codes/maybe_bnb.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Optional,Literal 3 | 4 | # EXPLICITLY leave these empty; ensure that an attributeerror is raised if these are not initialised properly. 5 | class nn: pass 6 | class optim: pass 7 | 8 | def populate(adam=True, adamw=True, linear=False, embedding: Optional[Literal["STABLE", "NORMAL"]]="NORMAL"): 9 | nn.Linear = torch.nn.Linear 10 | nn.Embedding = torch.nn.Embedding 11 | optim.Adam = torch.optim.Adam # this does nothing tbh 12 | optim.AdamW = torch.optim.AdamW 13 | # 14 | try: 15 | import bitsandbytes as bnb 16 | except ImportError: 17 | return print("WARNING: bnb was missing, not using 8bit for anything!") 18 | # 19 | if adam: optim.Adam = bnb.optim.Adam8bit 20 | if adamw: optim.AdamW = bnb.optim.AdamW8bit 21 | if linear: nn.Linear = bnb.nn.Linear8bitLt 22 | if embedding: 23 | nn.Embedding = bnb.nn.StableEmbedding if embedding == 'STABLE' else bnb.nn.modules.Embedding 24 | 25 | -------------------------------------------------------------------------------- /codes/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/asr/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/music/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/music/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/music/encoders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn.functional as F 3 | from torch import nn 4 | from transformers import GPT2Config, GPT2Model 5 | 6 | from models.arch_util import AttentionBlock, ResBlock 7 | from models.audio.tts.lucidrains_dvae import DiscreteVAE 8 | from trainer.networks import register_model 9 | from utils.util import opt_get, ceil_multiple, print_network 10 | 11 | 12 | class ResEncoder16x(nn.Module): 13 | def __init__(self, 14 | spec_dim, 15 | hidden_dim, 16 | embedding_dim, 17 | checkpointing_enabled=True, 18 | ): 19 | super().__init__() 20 | attn = [] 21 | def edim(m): 22 | dd = min(spec_dim + m * 128, hidden_dim) 23 | return ceil_multiple(dd, 8) 24 | self.downsampler = nn.Sequential( 25 | ResBlock(spec_dim, out_channels=edim(2), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled), 26 | ResBlock(edim(2), out_channels=edim(3), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled), 27 | ResBlock(edim(3), out_channels=edim(3), use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 28 | ResBlock(edim(3), out_channels=edim(4), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled), 29 | ResBlock(edim(4), out_channels=edim(4), use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 30 | ResBlock(edim(4), out_channels=hidden_dim, use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled)) 31 | self.encoder = nn.Sequential( 32 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 33 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 34 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 35 | nn.GroupNorm(8, hidden_dim), 36 | nn.SiLU(), 37 | nn.Conv1d(hidden_dim, embedding_dim, 1), 38 | nn.Tanh(), 39 | ) 40 | 41 | def forward(self, x): 42 | h = self.downsampler(x) 43 | h = self.encoder(h) 44 | return h 45 | -------------------------------------------------------------------------------- /codes/models/audio/music/m2v_code_to_mel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from models.arch_util import ResBlock, AttentionBlock 6 | from models.audio.music.flat_diffusion import MultiGroupEmbedding 7 | from trainer.networks import register_model 8 | from utils.util import checkpoint 9 | 10 | 11 | class Code2Mel(nn.Module): 12 | def __init__(self, out_dim=256, base_dim=1024, num_tokens=16, num_groups=4, dropout=.1): 13 | super().__init__() 14 | self.emb = MultiGroupEmbedding(num_tokens, num_groups, base_dim) 15 | self.base_blocks = nn.Sequential(ResBlock(base_dim, dropout, dims=1), 16 | AttentionBlock(base_dim, num_heads=base_dim//64), 17 | ResBlock(base_dim, dropout, dims=1)) 18 | l2dim = base_dim-256 19 | self.l2_up_block = nn.Conv1d(base_dim, l2dim, kernel_size=5, padding=2) 20 | self.l2_blocks = nn.Sequential(ResBlock(l2dim, dropout, kernel_size=5, dims=1), 21 | AttentionBlock(l2dim, num_heads=base_dim//64), 22 | ResBlock(l2dim, dropout, kernel_size=5, dims=1), 23 | AttentionBlock(l2dim, num_heads=base_dim//64), 24 | ResBlock(l2dim, dropout, dims=1), 25 | ResBlock(l2dim, dropout, dims=1)) 26 | l3dim = l2dim-256 27 | self.l3_up_block = nn.Conv1d(l2dim, l3dim, kernel_size=5, padding=2) 28 | self.l3_blocks = nn.Sequential(ResBlock(l3dim, dropout, kernel_size=5, dims=1), 29 | 
AttentionBlock(l3dim, num_heads=base_dim//64), 30 | ResBlock(l3dim, dropout, kernel_size=5, dims=1), 31 | ResBlock(l3dim, dropout, dims=1)) 32 | self.final_block = nn.Conv1d(l3dim, out_dim, kernel_size=3, padding=1) 33 | 34 | def forward(self, codes, target): 35 | with torch.autocast(codes.device.type): 36 | h = self.emb(codes).permute(0,2,1) 37 | h = checkpoint(self.base_blocks, h) 38 | h = F.interpolate(h, scale_factor=2, mode='linear') 39 | h = self.l2_up_block(h) 40 | h = checkpoint(self.l2_blocks, h) 41 | h = F.interpolate(h, size=target.shape[-1], mode='linear') 42 | h = self.l3_up_block(h) 43 | h = checkpoint(self.l3_blocks, h.float()) 44 | pred = self.final_block(h) 45 | return F.mse_loss(pred, target), pred 46 | 47 | 48 | @register_model 49 | def register_code2mel(opt_net, opt): 50 | return Code2Mel(**opt_net['kwargs']) 51 | 52 | 53 | if __name__ == '__main__': 54 | model = Code2Mel() 55 | codes = torch.randint(0,16, (2,200,4)) 56 | target = torch.randn(2,256,804) 57 | model(codes, target) -------------------------------------------------------------------------------- /codes/models/audio/music/mel2vec_codes_gpt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | import maybe_bnb as mbnb 5 | from transformers import GPT2Config, GPT2Model 6 | 7 | from trainer.networks import register_model 8 | from utils.util import opt_get 9 | 10 | 11 | class Mel2VecCodesGpt(nn.Module): 12 | def __init__(self, dim, layers, num_groups=8, num_vectors=8): 13 | super().__init__() 14 | 15 | self.num_groups = num_groups 16 | 17 | self.config = GPT2Config(vocab_size=1, n_positions=8192, n_embd=dim, n_layer=layers, n_head=dim//64, 18 | n_inner=dim*2) 19 | self.gpt = GPT2Model(self.config) 20 | del self.gpt.wte # Unused, we'll do our own embeddings. 
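# Each of the num_groups code groups gets its own embedding table of width dim//num_groups; forward() concatenates the group embeddings back up to dim and hands them to GPT-2 via inputs_embeds, so the stock token-embedding table is never needed.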
21 | # nn.Embedding 22 | self.embeddings = nn.ModuleList([mbnb.nn.Embedding(num_vectors, dim//num_groups) for _ in range(num_groups)]) 23 | self.heads = nn.ModuleList([mbnb.nn.Linear(dim, num_vectors) for _ in range(num_groups)]) 24 | 25 | def forward(self, codes): 26 | assert codes.shape[-1] == self.num_groups 27 | 28 | inputs = codes[:, :-1] 29 | targets = codes[:, 1:] 30 | 31 | h = [embedding(inputs[:, :, i]) for i, embedding in enumerate(self.embeddings)] 32 | h = torch.cat(h, dim=-1) 33 | h = self.gpt(inputs_embeds=h, return_dict=True).last_hidden_state 34 | 35 | losses = 0 36 | for i, head in enumerate(self.heads): 37 | logits = head(h).permute(0,2,1) 38 | loss = F.cross_entropy(logits, targets[:,:,i]) 39 | losses = losses + loss 40 | 41 | return losses / self.num_groups 42 | 43 | 44 | @register_model 45 | def register_music_gpt(opt_net, opt): 46 | return Mel2VecCodesGpt(**opt_get(opt_net, ['kwargs'], {})) 47 | 48 | 49 | if __name__ == '__main__': 50 | model = Mel2VecCodesGpt(512, 8) 51 | codes = torch.randint(0,8, (2,300,8)) 52 | model(codes) -------------------------------------------------------------------------------- /codes/models/audio/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/tts/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/tts/random_latent_converter.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import maybe_bnb as mbnb 7 | 8 | from trainer.networks import register_model 9 | from utils.util import opt_get 10 | 11 | 12 | def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5): 13 | if bias is not None: 14 | rest_dim = [1] * (input.ndim - bias.ndim - 1) 15 | return ( 16 | F.leaky_relu( 17 | input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope 18 | ) 19 | * scale 20 | ) 21 | else: 22 | return F.leaky_relu(input, negative_slope=0.2) * scale 23 | 24 | 25 | class EqualLinear(nn.Module): 26 | def __init__( 27 | self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1 28 | ): 29 | super().__init__() 30 | self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) 31 | if bias: 32 | self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) 33 | else: 34 | self.bias = None 35 | self.scale = (1 / math.sqrt(in_dim)) * lr_mul 36 | self.lr_mul = lr_mul 37 | 38 | def forward(self, input): 39 | out = F.linear(input, self.weight * self.scale) 40 | out = fused_leaky_relu(out, self.bias * self.lr_mul) 41 | return out 42 | 43 | 44 | class RandomLatentConverter(nn.Module): 45 | def __init__(self, channels): 46 | super().__init__() 47 | self.layers = nn.Sequential(*[EqualLinear(channels, channels, lr_mul=.1) for _ in range(5)], 48 | mbnb.nn.Linear(channels, channels)) 49 | self.channels = channels 50 | 51 | def forward(self, ref): 52 | r = torch.randn(ref.shape[0], self.channels, device=ref.device) 53 | y = self.layers(r) 54 | return y 55 | 56 | 57 | @register_model 58 | def register_random_latent_converter(opt_net, opt): 59 | return RandomLatentConverter(**opt_get(opt_net, ['kwargs'], {})) 60 | 61 | 62 | if __name__ == '__main__': 63 | model = RandomLatentConverter(512) 64 | model(torch.randn(5,512)) 
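# The model files in this section share a convention: a small @register_model factory receives a
# network-options dict and builds the module from its 'kwargs' entry (see register_random_latent_converter
# above). A minimal sketch of what that amounts to when driven by hand -- the options dict here is
# illustrative only, and codes/ is assumed to be on the import path:
#
#     import torch
#     from models.audio.tts.random_latent_converter import RandomLatentConverter
#     opt_net = {'kwargs': {'channels': 512}}   # hypothetical network-options fragment
#     model = RandomLatentConverter(**opt_net['kwargs'])
#     latents = model(torch.randn(4, 512))      # the reference tensor only supplies batch size and device -> (4, 512)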
-------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/LICENSE: -------------------------------------------------------------------------------- 1 | This directory contains works with the below licenses, which should be considered in addition 2 | to the base repository license. 3 | 4 | BSD 3-Clause License 5 | 6 | Copyright (c) 2018, NVIDIA Corporation 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | * Redistributions of source code must retain the above copyright notice, this 13 | list of conditions and the following disclaimer. 14 | 15 | * Redistributions in binary form must reproduce the above copyright notice, 16 | this list of conditions and the following disclaimer in the documentation 17 | and/or other materials provided with the distribution. 18 | 19 | * Neither the name of the copyright holder nor the names of its 20 | contributors may be used to endorse or promote products derived from 21 | this software without specific prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | from models.audio.tts.tacotron2.taco_utils import * 2 | from models.audio.tts.tacotron2.text import * 3 | from models.audio.tts.tacotron2.tacotron2 import * 4 | from models.audio.tts.tacotron2.stft import * 5 | from models.audio.tts.tacotron2.layers import * 6 | from models.audio.tts.tacotron2.loss import * -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | import librosa.util as librosa_util 5 | 6 | 7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 8 | n_fft=800, dtype=np.float32, norm=None): 9 | """ 10 | # from librosa 0.6 11 | Compute the sum-square envelope of a window function at a given hop length. 12 | 13 | This is used to estimate modulation effects induced by windowing 14 | observations in short-time fourier transforms. 
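Concretely, with w the window (normalized according to norm and zero-padded to n_fft), this returns wss[n] = sum_i w[n - i * hop_length]**2 over all frame offsets i, evaluated for n in [0, n_fft + hop_length * (n_frames - 1)).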
15 | 16 | Parameters 17 | ---------- 18 | window : string, tuple, number, callable, or list-like 19 | Window specification, as in `get_window` 20 | 21 | n_frames : int > 0 22 | The number of analysis frames 23 | 24 | hop_length : int > 0 25 | The number of samples to advance between frames 26 | 27 | win_length : [optional] 28 | The length of the window function. By default, this matches `n_fft`. 29 | 30 | n_fft : int > 0 31 | The length of each analysis frame. 32 | 33 | dtype : np.dtype 34 | The data type of the output 35 | 36 | Returns 37 | ------- 38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 39 | The sum-squared envelope of the window function 40 | """ 41 | if win_length is None: 42 | win_length = n_fft 43 | 44 | n = n_fft + hop_length * (n_frames - 1) 45 | x = np.zeros(n, dtype=dtype) 46 | 47 | # Compute the squared window at the desired length 48 | win_sq = get_window(window, win_length, fftbins=True) 49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 50 | win_sq = librosa_util.pad_center(win_sq, n_fft) 51 | 52 | # Fill the envelope 53 | for i in range(n_frames): 54 | sample = i * hop_length 55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 56 | return x 57 | 58 | 59 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 60 | """ 61 | PARAMS 62 | ------ 63 | magnitudes: spectrogram magnitudes 64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 65 | """ 66 | 67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 68 | angles = angles.astype(np.float32) 69 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 71 | 72 | for i in range(n_iters): 73 | _, angles = stft_fn.transform(signal) 74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 75 | return signal 76 | 77 | 78 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 79 | """ 80 | PARAMS 81 | ------ 82 | C: compression factor 83 | """ 84 | return torch.log(torch.clamp(x, min=clip_val) * C) 85 | 86 | 87 | def dynamic_range_decompression(x, C=1): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor used to compress 92 | """ 93 | return torch.exp(x) / C -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/loss.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from trainer.losses import ConfigurableLoss 4 | 5 | 6 | class Tacotron2Loss(ConfigurableLoss): 7 | def __init__(self, opt_loss, env): 8 | super().__init__(opt_loss, env) 9 | self.mel_target_key = opt_loss['mel_target_key'] 10 | self.mel_output_key = opt_loss['mel_output_key'] 11 | self.mel_output_postnet_key = opt_loss['mel_output_postnet_key'] 12 | self.gate_target_key = opt_loss['gate_target_key'] 13 | self.gate_output_key = opt_loss['gate_output_key'] 14 | self.last_mel_loss = 0 15 | self.last_gate_loss = 0 16 | 17 | def forward(self, _, state): 18 | mel_target, gate_target = state[self.mel_target_key], state[self.gate_target_key] 19 | mel_target.requires_grad = False 20 | gate_target.requires_grad = False 21 | gate_target = gate_target.view(-1, 1) 22 | 23 | mel_out, mel_out_postnet, gate_out = state[self.mel_output_key], state[self.mel_output_postnet_key], state[self.gate_output_key] 24 | gate_out = gate_out.view(-1, 1) 25 | mel_loss = nn.MSELoss()(mel_out, mel_target) + \ 26 | nn.MSELoss()(mel_out_postnet, mel_target) 27 | gate_loss = 
nn.BCEWithLogitsLoss()(gate_out, gate_target) 28 | self.last_mel_loss = mel_loss.detach().clone().mean().item() 29 | self.last_gate_loss = gate_loss.detach().clone().mean().item() 30 | return mel_loss + gate_loss 31 | 32 | def extra_metrics(self): 33 | return { 34 | 'mel_loss': self.last_mel_loss, 35 | 'gate_loss': self.last_gate_loss 36 | } 37 | 38 | 39 | class Tacotron2LossRaw(nn.Module): 40 | def __init__(self): 41 | super().__init__() 42 | self.last_mel_loss = 0 43 | self.last_gate_loss = 0 44 | 45 | def forward(self, model_output, targets): 46 | mel_target, gate_target = targets[0], targets[1] 47 | mel_target.requires_grad = False 48 | gate_target.requires_grad = False 49 | gate_target = gate_target.view(-1, 1) 50 | 51 | mel_out, mel_out_postnet, gate_out, _ = model_output 52 | gate_out = gate_out.view(-1, 1) 53 | mel_loss = nn.MSELoss()(mel_out, mel_target) + \ 54 | nn.MSELoss()(mel_out_postnet, mel_target) 55 | gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target) 56 | self.last_mel_loss = mel_loss.detach().clone().mean().item() 57 | self.last_gate_loss = gate_loss.detach().clone().mean().item() 58 | return mel_loss + gate_loss 59 | 60 | def extra_metrics(self): 61 | return { 62 | 'mel_loss': self.last_mel_loss, 63 | 'gate_loss': self.last_gate_loss 64 | } -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/taco_utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import numpy as np 4 | import torch 5 | from scipy.io.wavfile import read 6 | 7 | 8 | def get_mask_from_lengths(lengths, max_len=None): 9 | if max_len is None: 10 | max_len = torch.max(lengths).item() 11 | ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)).to(lengths.device) 12 | mask = (ids < lengths.unsqueeze(1)).bool() 13 | return mask 14 | 15 | 16 | def load_wav_to_torch(full_path): 17 | sampling_rate, data = read(full_path) 18 | if data.dtype == np.int32: 19 | norm_fix = 2 ** 31 20 | elif data.dtype == np.int16: 21 | norm_fix = 2 ** 15 22 | elif data.dtype == np.float16 or data.dtype == np.float32: 23 | norm_fix = 1. 
24 | else: 25 | raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}") 26 | return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate) 27 | 28 | 29 | def load_filepaths_and_text_type(filename, type, split="|"): 30 | with open(filename, encoding='utf-8') as f: 31 | filepaths_and_text = [list(line.strip().split(split)) + [type] for line in f] 32 | base = os.path.dirname(filename) 33 | for j in range(len(filepaths_and_text)): 34 | filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0]) 35 | return filepaths_and_text 36 | 37 | def load_filepaths_and_text(filename, split="|"): 38 | with open(filename, encoding='utf-8') as f: 39 | filepaths_and_text = [line.strip().split(split) for line in f] 40 | base = os.path.dirname(filename) 41 | for j in range(len(filepaths_and_text)): 42 | filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0]) 43 | return filepaths_and_text 44 | 45 | 46 | def to_gpu(x): 47 | x = x.contiguous() 48 | 49 | if torch.cuda.is_available(): 50 | x = x.cuda(non_blocking=True) 51 | return torch.autograd.Variable(x) -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | 4 | import torch 5 | 6 | from models.audio.tts.tacotron2.text import cleaners 7 | from models.audio.tts.tacotron2.text.symbols import symbols 8 | 9 | 10 | # Mappings from symbol to numeric ID and vice versa: 11 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 12 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 13 | 14 | # Regular expression matching text enclosed in curly braces: 15 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 16 | 17 | 18 | def text_to_sequence(text, cleaner_names=['english_cleaners']): 19 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 20 | 21 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 22 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
23 | 24 | Args: 25 | text: string to convert to a sequence 26 | cleaner_names: names of the cleaner functions to run the text through 27 | 28 | Returns: 29 | List of integers corresponding to the symbols in the text 30 | ''' 31 | sequence = [] 32 | 33 | # Check for curly braces and treat their contents as ARPAbet: 34 | while len(text): 35 | m = _curly_re.match(text) 36 | if not m: 37 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 38 | break 39 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 40 | sequence += _arpabet_to_sequence(m.group(2)) 41 | text = m.group(3) 42 | 43 | return sequence 44 | 45 | 46 | def sequence_to_text(sequence): 47 | '''Converts a sequence of IDs back to a string''' 48 | result = '' 49 | for symbol_id in sequence: 50 | if isinstance(symbol_id, torch.Tensor): 51 | symbol_id = symbol_id.item() 52 | if symbol_id in _id_to_symbol: 53 | s = _id_to_symbol[symbol_id] 54 | # Enclose ARPAbet back in curly braces: 55 | if len(s) > 1 and s[0] == '@': 56 | s = '{%s}' % s[1:] 57 | result += s 58 | return result.replace('}{', ' ') 59 | 60 | 61 | def tacotron_symbols(): 62 | return list(_symbol_to_id.keys()) 63 | 64 | 65 | def tacotron_symbol_mapping(): 66 | return _symbol_to_id.copy() 67 | 68 | 69 | def _clean_text(text, cleaner_names): 70 | for name in cleaner_names: 71 | cleaner = getattr(cleaners, name) 72 | if not cleaner: 73 | raise Exception('Unknown cleaner: %s' % name) 74 | text = cleaner(text) 75 | return text 76 | 77 | 78 | def _symbols_to_sequence(symbols): 79 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 80 | 81 | 82 | def _arpabet_to_sequence(text): 83 | return _symbols_to_sequence(['@' + s for s in text.split()]) 84 | 85 | 86 | def _should_keep_symbol(s): 87 | return s in _symbol_to_id and s != '_' and s != '~' 88 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from .numbers import normalize_numbers 18 | 19 | 20 | # Regular expression matching whitespace: 21 | _whitespace_re = re.compile(r'\s+') 22 | 23 | # List of (regular expression, replacement) pairs for abbreviations: 24 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 25 | ('mrs', 'misess'), 26 | ('mr', 'mister'), 27 | ('dr', 'doctor'), 28 | ('st', 'saint'), 29 | ('co', 'company'), 30 | ('jr', 'junior'), 31 | ('maj', 'major'), 32 | ('gen', 'general'), 33 | ('drs', 'doctors'), 34 | ('rev', 'reverend'), 35 | ('lt', 'lieutenant'), 36 | ('hon', 'honorable'), 37 | ('sgt', 'sergeant'), 38 | ('capt', 'captain'), 39 | ('esq', 'esquire'), 40 | ('ltd', 'limited'), 41 | ('col', 'colonel'), 42 | ('ft', 'fort'), 43 | ]] 44 | 45 | 46 | def expand_abbreviations(text): 47 | for regex, replacement in _abbreviations: 48 | text = re.sub(regex, replacement, text) 49 | return text 50 | 51 | 52 | def expand_numbers(text): 53 | return normalize_numbers(text) 54 | 55 | 56 | def lowercase(text): 57 | return text.lower() 58 | 59 | 60 | def collapse_whitespace(text): 61 | return re.sub(_whitespace_re, ' ', text) 62 | 63 | 64 | def convert_to_ascii(text): 65 | return unidecode(text) 66 | 67 | 68 | def basic_cleaners(text): 69 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 70 | text = lowercase(text) 71 | text = collapse_whitespace(text) 72 | return text 73 | 74 | 75 | def transliteration_cleaners(text): 76 | '''Pipeline for non-English text that transliterates to ASCII.''' 77 | text = convert_to_ascii(text) 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def english_cleaners(text): 84 | '''Pipeline for English text, including number and abbreviation expansion.''' 85 | text = convert_to_ascii(text) 86 | text = lowercase(text) 87 | text = expand_numbers(text) 88 | text = expand_abbreviations(text) 89 | text = collapse_whitespace(text) 90 | text = text.replace('"', '') 91 | return text 92 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | def __init__(self, file_or_path, keep_ambiguous=True): 22 | if isinstance(file_or_path, str): 23 | with open(file_or_path, encoding='latin-1') as f: 24 | entries = _parse_cmudict(f) 25 | else: 26 | entries = _parse_cmudict(file_or_path) 27 | if not keep_ambiguous: 28 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 29 | self._entries = entries 30 | 31 | 32 | def __len__(self): 33 | return len(self._entries) 34 | 35 | 36 | def lookup(self, word): 37 | '''Returns list of ARPAbet pronunciations of the given word.''' 38 | return self._entries.get(word.upper()) 39 | 40 | 41 | 42 | _alt_re = re.compile(r'\([0-9]+\)') 43 | 44 | 45 | def _parse_cmudict(file): 46 | cmudict = {} 47 | for line in file: 48 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 49 | parts = line.split(' ') 50 | word = re.sub(_alt_re, '', parts[0]) 51 | pronunciation = _get_pronunciation(parts[1]) 52 | if pronunciation: 53 | if word in cmudict: 54 | cmudict[word].append(pronunciation) 55 | else: 56 | cmudict[word] = [pronunciation] 57 | return cmudict 58 | 59 | 60 | def _get_pronunciation(s): 61 | parts = s.strip().split(' ') 62 | for part in parts: 63 | if part not in _valid_symbol_set: 64 | return None 65 | return ' '.join(parts) 66 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 
65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 | text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | from models.audio.tts.tacotron2.text import cmudict 8 | 9 | _pad = '_' 10 | _punctuation = '!\'(),.:;? ' 11 | _special = '-' 12 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet 19 | -------------------------------------------------------------------------------- /codes/models/audio/vocoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/vocoders/univnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/univnet/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/vocoders/waveglow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/waveglow/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/vocoders/waveglow/denoiser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from models.audio.tts.tacotron2.stft import STFT 4 | 5 | sys.path.append('tacotron2') 6 | import torch 7 | 8 | 9 | class Denoiser(torch.nn.Module): 10 | """ Removes model bias from audio produced with waveglow """ 11 | 12 | def __init__(self, waveglow, filter_length=1024, n_overlap=4, 13 | win_length=1024, mode='zeros'): 14 | super(Denoiser, self).__init__() 15 | self.stft = STFT(filter_length=filter_length, 16 | hop_length=int(filter_length/n_overlap), 17 | win_length=win_length).cuda() 18 | if mode == 'zeros': 19 | mel_input = torch.zeros( 20 | (1, 80, 88), 21 | dtype=waveglow.upsample.weight.dtype, 22 | device=waveglow.upsample.weight.device) 23 | elif mode == 'normal': 24 | mel_input = torch.randn( 25 | (1, 80, 88), 26 | dtype=waveglow.upsample.weight.dtype, 27 | device=waveglow.upsample.weight.device) 28 | else: 29 | raise Exception("Mode {} if 
not supported".format(mode)) 30 | 31 | with torch.no_grad(): 32 | bias_audio = waveglow.infer(mel_input, sigma=0.0).float() 33 | bias_spec, _ = self.stft.transform(bias_audio) 34 | 35 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) 36 | 37 | def forward(self, audio, strength=0.1): 38 | audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) 39 | audio_spec_denoised = audio_spec - self.bias_spec * strength 40 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 41 | audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) 42 | return audio_denoised 43 | -------------------------------------------------------------------------------- /codes/models/classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/classifiers/__init__.py -------------------------------------------------------------------------------- /codes/models/classifiers/torch_models.py: -------------------------------------------------------------------------------- 1 | from torchvision.models import vgg16 2 | 3 | from trainer.networks import register_model 4 | from utils.util import opt_get 5 | 6 | 7 | @register_model 8 | def register_torch_vgg16(opt_net, opt): 9 | """ return a ResNet 18 object 10 | """ 11 | return vgg16(**opt_get(opt_net, ['kwargs'], {})) 12 | -------------------------------------------------------------------------------- /codes/models/clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/clip/__init__.py -------------------------------------------------------------------------------- /codes/models/clip/clip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from trainer.networks import register_model 4 | from utils.util import opt_get 5 | 6 | 7 | def encoder_for_type(type, master_dim, enc_kwargs): 8 | from x_clip.x_clip import VisionTransformer, TextTransformer 9 | if type == 'image': 10 | # xclip_kwargs: image_size, patch_size, channels, depth, heads 11 | return VisionTransformer(dim=master_dim, **enc_kwargs) 12 | elif type == 'tokens': 13 | # xclip_kwargs: num_tokens, max_seq_len, depth, heads 14 | return TextTransformer(dim=master_dim, **enc_kwargs) 15 | raise NotImplementedError() 16 | 17 | 18 | class XClipWrapper(nn.Module): 19 | def __init__(self, 20 | master_dim=512, 21 | enc1_type='vision', 22 | enc1_kwargs={}, 23 | enc2_type='text', 24 | enc2_kwargs={}, 25 | mask_seq1_percentage=0, 26 | mask_seq2_percentage=0, 27 | **xclip_kwargs): 28 | super().__init__() 29 | self.mask_seq1_percentage = mask_seq1_percentage 30 | self.mask_seq2_percentage = mask_seq2_percentage 31 | enc1 = encoder_for_type(enc1_type, master_dim, enc1_kwargs) 32 | enc2 = encoder_for_type(enc2_type, master_dim, enc2_kwargs) 33 | xclip_kwargs['dim_text'] = master_dim 34 | xclip_kwargs['dim_image'] = master_dim 35 | xclip_kwargs['dim_latent'] = master_dim 36 | xclip_kwargs['text_encoder'] = enc1 # The first argument of forward 37 | xclip_kwargs['image_encoder'] = enc2 38 | # xclip_kwargs: 39 | # use_all_token_embeds 40 | # downsample_image_embeds 41 | # decoupled_contrastive_learning 42 | # extra_latent_projection 43 | # use_mlm 44 | from x_clip import CLIP 45 | self.clip = CLIP(**xclip_kwargs) 
46 | 47 | def forward(self, seq1, seq2, return_loss=False): 48 | seq1_mask = torch.rand_like(seq1.float()) > self.mask_seq1_percentage 49 | # TODO: add support for seq2 mask.. 50 | #seq2_mask = torch.rand_like(seq2.float()) > self.mask_seq2_percentage 51 | return self.clip(seq1, seq2, seq1_mask, return_loss=return_loss) 52 | 53 | 54 | @register_model 55 | def register_clip(opt_net, opt): 56 | return XClipWrapper(**opt_get(opt_net, ['kwargs'], {})) 57 | 58 | if __name__ == '__main__': 59 | model = XClipWrapper(enc1_type='tokens', enc2_type='tokens', 60 | enc1_kwargs={'num_tokens': 256, 'max_seq_len': 200, 'depth': 8, 'heads': 8}, 61 | enc2_kwargs={'num_tokens': 8192, 'max_seq_len': 250, 'depth': 8, 'heads': 8}) 62 | loss = model(torch.randint(0,256, (2,200)), torch.randint(0,8192, (2,250)), True) 63 | print(loss) -------------------------------------------------------------------------------- /codes/models/composable/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/composable/__init__.py -------------------------------------------------------------------------------- /codes/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/diffusion/__init__.py -------------------------------------------------------------------------------- /codes/models/diffusion/losses.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for various likelihood-based losses. These are ported from the original 3 | Ho et al. diffusion models codebase: 4 | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py 5 | """ 6 | 7 | import numpy as np 8 | 9 | import torch as th 10 | 11 | 12 | def normal_kl(mean1, logvar1, mean2, logvar2): 13 | """ 14 | Compute the KL divergence between two gaussians. 15 | 16 | Shapes are automatically broadcasted, so batches can be compared to 17 | scalars, among other use cases. 18 | """ 19 | tensor = None 20 | for obj in (mean1, logvar1, mean2, logvar2): 21 | if isinstance(obj, th.Tensor): 22 | tensor = obj 23 | break 24 | assert tensor is not None, "at least one argument must be a Tensor" 25 | 26 | # Force variances to be Tensors. Broadcasting helps convert scalars to 27 | # Tensors, but it does not work for th.exp(). 28 | logvar1, logvar2 = [ 29 | x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) 30 | for x in (logvar1, logvar2) 31 | ] 32 | 33 | return 0.5 * ( 34 | -1.0 35 | + logvar2 36 | - logvar1 37 | + th.exp(logvar1 - logvar2) 38 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2) 39 | ) 40 | 41 | 42 | def approx_standard_normal_cdf(x): 43 | """ 44 | A fast approximation of the cumulative distribution function of the 45 | standard normal. 46 | """ 47 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 48 | 49 | 50 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 51 | """ 52 | Compute the log-likelihood of a Gaussian distribution discretizing to a 53 | given image. 54 | 55 | :param x: the target images. It is assumed that this was uint8 values, 56 | rescaled to the range [-1, 1]. 57 | :param means: the Gaussian mean Tensor. 58 | :param log_scales: the Gaussian log stddev Tensor. 
59 | :return: a tensor like x of log probabilities (in nats). 60 | """ 61 | assert x.shape == means.shape == log_scales.shape 62 | centered_x = x - means 63 | inv_stdv = th.exp(-log_scales) 64 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 65 | cdf_plus = approx_standard_normal_cdf(plus_in) 66 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 67 | cdf_min = approx_standard_normal_cdf(min_in) 68 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 69 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 70 | cdf_delta = cdf_plus - cdf_min 71 | log_probs = th.where( 72 | x < -0.999, 73 | log_cdf_plus, 74 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 75 | ) 76 | assert log_probs.shape == x.shape 77 | return log_probs 78 | -------------------------------------------------------------------------------- /codes/models/image_generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/__init__.py -------------------------------------------------------------------------------- /codes/models/image_generation/glean/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/glean/__init__.py -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/Permutations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn as nn 4 | from torch.nn import functional as F 5 | 6 | from models.image_generation.srflow import thops 7 | 8 | 9 | class InvertibleConv1x1(nn.Module): 10 | def __init__(self, num_channels, LU_decomposed=False): 11 | super().__init__() 12 | w_shape = [num_channels, num_channels] 13 | w_init = np.linalg.qr(np.random.randn(*w_shape))[0].astype(np.float32) 14 | self.register_parameter("weight", nn.Parameter(torch.Tensor(w_init))) 15 | self.w_shape = w_shape 16 | self.LU = LU_decomposed 17 | 18 | def get_weight(self, input, reverse): 19 | w_shape = self.w_shape 20 | pixels = thops.pixels(input) 21 | dlogdet = torch.slogdet(self.weight)[1] * pixels 22 | if not reverse: 23 | weight = self.weight.view(w_shape[0], w_shape[1], 1, 1) 24 | else: 25 | weight = torch.inverse(self.weight.double()).float() \ 26 | .view(w_shape[0], w_shape[1], 1, 1) 27 | return weight, dlogdet 28 | def forward(self, input, logdet=None, reverse=False): 29 | """ 30 | log-det = log|abs(|W|)| * pixels 31 | """ 32 | weight, dlogdet = self.get_weight(input, reverse) 33 | if not reverse: 34 | z = F.conv2d(input, weight) 35 | if logdet is not None: 36 | logdet = logdet + dlogdet 37 | return z, logdet 38 | else: 39 | z = F.conv2d(input, weight) 40 | if logdet is not None: 41 | logdet = logdet - dlogdet 42 | return z, logdet 43 | -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/Split.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn as nn 3 | 4 | from models.image_generation.srflow import thops 5 | from models.image_generation.srflow.flow import Conv2dZeros, GaussianDiag 6 | from utils.util import opt_get 7 | 8 | 9 | class Split2d(nn.Module): 10 | def 
__init__(self, num_channels, logs_eps=0, cond_channels=0, position=None, consume_ratio=0.5, opt=None): 11 | super().__init__() 12 | 13 | self.num_channels_consume = int(round(num_channels * consume_ratio)) 14 | self.num_channels_pass = num_channels - self.num_channels_consume 15 | 16 | self.conv = Conv2dZeros(in_channels=self.num_channels_pass + cond_channels, 17 | out_channels=self.num_channels_consume * 2) 18 | self.logs_eps = logs_eps 19 | self.position = position 20 | self.gaussian_nll_weight = opt_get(opt, ['networks', 'generator', 'flow', 'gaussian_loss_weight'], 1) 21 | 22 | def split2d_prior(self, z, ft): 23 | if ft is not None: 24 | z = torch.cat([z, ft], dim=1) 25 | h = self.conv(z) 26 | return thops.split_feature(h, "cross") 27 | 28 | def exp_eps(self, logs): 29 | return torch.exp(logs) + self.logs_eps 30 | 31 | def forward(self, input, logdet=0., reverse=False, eps_std=None, eps=None, ft=None, y_onehot=None): 32 | if not reverse: 33 | # self.input = input 34 | z1, z2 = self.split_ratio(input) 35 | mean, logs = self.split2d_prior(z1, ft) 36 | 37 | eps = (z2 - mean) / self.exp_eps(logs) 38 | 39 | logdet = logdet + self.get_logdet(logs, mean, z2) 40 | 41 | # print(logs.shape, mean.shape, z2.shape) 42 | # self.eps = eps 43 | # print('split, enc eps:', eps) 44 | return z1, logdet, eps 45 | else: 46 | z1 = input 47 | mean, logs = self.split2d_prior(z1, ft) 48 | 49 | if eps is None: 50 | #print("WARNING: eps is None, generating eps untested functionality!") 51 | eps = GaussianDiag.sample(mean, logs, eps_std) 52 | #eps = GaussianDiag.sample_eps(mean.shape, eps_std) 53 | 54 | eps = eps.to(mean.device) 55 | z2 = mean + self.exp_eps(logs) * eps 56 | z = thops.cat_feature(z1, z2) 57 | 58 | logdet = logdet - self.get_logdet(logs, mean, z2) 59 | 60 | return z, logdet 61 | # return z, logdet, eps 62 | 63 | def get_logdet(self, logs, mean, z2): 64 | logdet_diff = GaussianDiag.logp(mean, logs, z2) 65 | return logdet_diff * self.gaussian_nll_weight 66 | 67 | def split_ratio(self, input): 68 | z1, z2 = input[:, :self.num_channels_pass, ...], input[:, self.num_channels_pass:, ...] 
69 | return z1, z2 -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/srflow/__init__.py -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/glow_arch.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def f_conv2d_bias(in_channels, out_channels): 5 | def padding_same(kernel, stride): 6 | return [((k - 1) * s + 1) // 2 for k, s in zip(kernel, stride)] 7 | 8 | padding = padding_same([3, 3], [1, 1]) 9 | assert padding == [1, 1], padding 10 | return nn.Sequential( 11 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=[3, 3], stride=1, padding=1, 12 | bias=True)) 13 | -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/module_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | import maybe_bnb as mbnb 6 | 7 | 8 | def initialize_weights(net_l, scale=1): 9 | if not isinstance(net_l, list): 10 | net_l = [net_l] 11 | for net in net_l: 12 | for m in net.modules(): 13 | if isinstance(m, nn.Conv2d): 14 | init.kaiming_normal_(m.weight, a=0, mode='fan_in') 15 | m.weight.data *= scale # for residual block 16 | if m.bias is not None: 17 | m.bias.data.zero_() 18 | elif isinstance(m, mbnb.nn.Linear): 19 | init.kaiming_normal_(m.weight, a=0, mode='fan_in') 20 | m.weight.data *= scale 21 | if m.bias is not None: 22 | m.bias.data.zero_() 23 | elif isinstance(m, nn.BatchNorm2d): 24 | init.constant_(m.weight, 1) 25 | init.constant_(m.bias.data, 0.0) 26 | 27 | 28 | def make_layer(block, n_layers): 29 | layers = [] 30 | for _ in range(n_layers): 31 | layers.append(block()) 32 | return nn.Sequential(*layers) 33 | 34 | 35 | class ResidualBlock_noBN(nn.Module): 36 | '''Residual block w/o BN 37 | ---Conv-ReLU-Conv-+- 38 | |________________| 39 | ''' 40 | 41 | def __init__(self, nf=64): 42 | super(ResidualBlock_noBN, self).__init__() 43 | self.conv1 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True) 44 | self.conv2 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True) 45 | 46 | # initialization 47 | initialize_weights([self.conv1, self.conv2], 0.1) 48 | 49 | def forward(self, x): 50 | identity = x 51 | out = F.relu(self.conv1(x), inplace=True) 52 | out = self.conv2(out) 53 | return identity + out 54 | 55 | 56 | def flow_warp(x, flow, interp_mode='bilinear', padding_mode='zeros'): 57 | """Warp an image or feature map with optical flow 58 | Args: 59 | x (Tensor): size (N, C, H, W) 60 | flow (Tensor): size (N, H, W, 2), normal value 61 | interp_mode (str): 'nearest' or 'bilinear' 62 | padding_mode (str): 'zeros' or 'border' or 'reflection' 63 | 64 | Returns: 65 | Tensor: warped image or feature map 66 | """ 67 | assert x.size()[-2:] == flow.size()[1:3] 68 | B, C, H, W = x.size() 69 | # mesh grid 70 | grid_y, grid_x = torch.meshgrid(torch.arange(0, H), torch.arange(0, W)) 71 | grid = torch.stack((grid_x, grid_y), 2).float() # W(x), H(y), 2 72 | grid.requires_grad = False 73 | grid = grid.type_as(x) 74 | vgrid = grid + flow 75 | # scale grid to [-1,1] 76 | vgrid_x = 2.0 * vgrid[:, :, :, 
0] / max(W - 1, 1) - 1.0 77 | vgrid_y = 2.0 * vgrid[:, :, :, 1] / max(H - 1, 1) - 1.0 78 | vgrid_scaled = torch.stack((vgrid_x, vgrid_y), dim=3) 79 | output = F.grid_sample(x, vgrid_scaled, mode=interp_mode, padding_mode=padding_mode) 80 | return output 81 | -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/thops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def sum(tensor, dim=None, keepdim=False): 5 | if dim is None: 6 | # sum up all dim 7 | return torch.sum(tensor) 8 | else: 9 | if isinstance(dim, int): 10 | dim = [dim] 11 | dim = sorted(dim) 12 | for d in dim: 13 | tensor = tensor.sum(dim=d, keepdim=True) 14 | if not keepdim: 15 | for i, d in enumerate(dim): 16 | tensor.squeeze_(d-i) 17 | return tensor 18 | 19 | 20 | def mean(tensor, dim=None, keepdim=False): 21 | if dim is None: 22 | # mean all dim 23 | return torch.mean(tensor) 24 | else: 25 | if isinstance(dim, int): 26 | dim = [dim] 27 | dim = sorted(dim) 28 | for d in dim: 29 | tensor = tensor.mean(dim=d, keepdim=True) 30 | if not keepdim: 31 | for i, d in enumerate(dim): 32 | tensor.squeeze_(d-i) 33 | return tensor 34 | 35 | 36 | def split_feature(tensor, type="split"): 37 | """ 38 | type = ["split", "cross"] 39 | """ 40 | C = tensor.size(1) 41 | if type == "split": 42 | return tensor[:, :C // 2, ...], tensor[:, C // 2:, ...] 43 | elif type == "cross": 44 | return tensor[:, 0::2, ...], tensor[:, 1::2, ...] 45 | 46 | 47 | def cat_feature(tensor_a, tensor_b): 48 | return torch.cat((tensor_a, tensor_b), dim=1) 49 | 50 | 51 | def pixels(tensor): 52 | return int(tensor.size(2) * tensor.size(3)) -------------------------------------------------------------------------------- /codes/models/image_generation/stylegan/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | def create_stylegan2_loss(opt_loss, env): 3 | type = opt_loss['type'] 4 | if type == 'stylegan2_divergence': 5 | import models.image_generation.stylegan.stylegan2_lucidrains as stylegan2 6 | return stylegan2.StyleGan2DivergenceLoss(opt_loss, env) 7 | elif type == 'stylegan2_pathlen': 8 | import models.image_generation.stylegan.stylegan2_lucidrains as stylegan2 9 | return stylegan2.StyleGan2PathLengthLoss(opt_loss, env) 10 | else: 11 | raise NotImplementedError -------------------------------------------------------------------------------- /codes/models/image_latents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/__init__.py -------------------------------------------------------------------------------- /codes/models/image_latents/byol/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/byol/__init__.py -------------------------------------------------------------------------------- /codes/models/image_latents/fixup_resnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/fixup_resnet/__init__.py -------------------------------------------------------------------------------- 
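A minimal usage sketch for the `thops.py` helpers above, which the other srflow modules import as `from models.image_generation.srflow import thops` (see `Split.py` and `Permutations.py`). This is not a file from the repository; it assumes the repository's `codes/` directory is on the import path, and the tensor shape is an arbitrary assumption chosen only to illustrate how the "split" and "cross" modes of `split_feature` partition channels and how `cat_feature` recombines them:

```
# Illustrative sketch (assumed shapes, not from any repo config): channel-splitting
# behavior of thops.split_feature and recombination via thops.cat_feature.
import torch

from models.image_generation.srflow import thops

x = torch.randn(2, 8, 16, 16)  # (N, C, H, W) with an arbitrary C=8

# "split" returns two contiguous halves along the channel dim: channels 0-3 and 4-7.
z_a, z_b = thops.split_feature(x, "split")
assert z_a.shape == z_b.shape == (2, 4, 16, 16)

# "cross" interleaves instead: even channels (0,2,4,6) and odd channels (1,3,5,7).
z_even, z_odd = thops.split_feature(x, "cross")
assert z_even.shape == z_odd.shape == (2, 4, 16, 16)

# cat_feature is a plain channel-dim concatenation, so the "split" halves round-trip exactly.
assert torch.equal(thops.cat_feature(z_a, z_b), x)

# pixels() reports H * W, which InvertibleConv1x1 uses to scale its log-determinant.
assert thops.pixels(x) == 16 * 16
```

In the code above, `Split2d` uses the "cross" mode to separate its prior convolution's output into mean and log-scale halves, while `split_ratio`/`cat_feature` work with the contiguous channel layout.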
/codes/models/lucidrains/dalle/__init__.py: -------------------------------------------------------------------------------- 1 | # This directory contains some useful code from https://github.com/lucidrains/DALLE-pytorch/tree/main/dalle_pytorch -------------------------------------------------------------------------------- /codes/models/lucidrains/performer/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /codes/models/vqvae/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/vqvae/__init__.py -------------------------------------------------------------------------------- /codes/models/vqvae/gumbel_quantizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import einsum 5 | 6 | from utils.weight_scheduler import LinearDecayWeightScheduler 7 | import maybe_bnb as mbnb 8 | 9 | 10 | class GumbelQuantizer(nn.Module): 11 | def __init__(self, inp_dim, codebook_dim, num_tokens, straight_through=False): 12 | super().__init__() 13 | self.to_logits = nn.Conv1d(inp_dim, num_tokens, 1) 14 | # nn.Embedding 15 | self.codebook = mbnb.nn.Embedding(num_tokens, codebook_dim) 16 | self.straight_through = straight_through 17 | self.temperature_scheduler = LinearDecayWeightScheduler(10, 5000, .9, 2000) 18 | self.step = 0 19 | self.norm = SwitchNorm(num_tokens) 20 | 21 | def get_temperature(self, step): 22 | self.step = step # VERY POOR DESIGN. WHEN WILL HE EVER LEARN??? 
23 | return self.temperature_scheduler.get_weight_for_step(step) 24 | 25 | def embed_code(self, codes): 26 | return self.codebook(codes) 27 | 28 | def gumbel_softmax(self, logits, tau, dim, hard): 29 | gumbels = torch.rand_like(logits) 30 | gumbels = -torch.log(-torch.log(gumbels + 1e-8) + 1e-8) 31 | logits = (logits + gumbels) / tau # ~Gumbel(logits,tau) 32 | y_soft = F.softmax(logits, dim=dim) 33 | 34 | if hard: 35 | index = y_soft.max(dim, keepdim=True)[1] 36 | y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0) 37 | ret = y_hard - y_soft.detach() + y_soft 38 | else: 39 | ret = y_soft 40 | return ret 41 | 42 | def forward(self, h): 43 | h = h.permute(0,2,1) 44 | logits = self.to_logits(h) 45 | logits = self.gumbel_softmax(logits, tau=self.temperature_scheduler.get_weight_for_step(self.step), dim=1, hard=self.straight_through) 46 | logits = self.norm(logits) 47 | codes = logits.argmax(dim=1).flatten(1) 48 | sampled = einsum('b n l, n d -> b d l', logits, self.codebook.weight) 49 | return sampled.permute(0,2,1), 0, codes 50 | 51 | if __name__ == '__main__': 52 | j = torch.randn(8,40,1024) 53 | m = GumbelQuantizer(1024, 1024, 4096) 54 | m2 = DiscreteDecoder(1024, (512, 256), 2) 55 | l=m2(m(j)[0].permute(0,2,1)) 56 | mean = 0 57 | for ls in l: 58 | mean = mean + ls.mean() 59 | mean.backward() -------------------------------------------------------------------------------- /codes/requirements.laxed.txt: -------------------------------------------------------------------------------- 1 | # Fundamentals 2 | numpy 3 | pyyaml 4 | tb-nightly 5 | future 6 | scp 7 | tqdm 8 | matplotlib 9 | scipy 10 | munch 11 | tqdm 12 | scp 13 | tensorboard 14 | orjson 15 | einops 16 | lambda-networks 17 | mup 18 | 19 | #UI 20 | customtkinter 21 | ruamel.yaml 22 | # For image generation stuff 23 | opencv-python 24 | kornia 25 | pytorch_ssim 26 | gsa-pytorch 27 | pytorch_fid 28 | 29 | # For audio generation stuff 30 | inflect 31 | librosa 32 | Unidecode 33 | tgt 34 | pyworld 35 | audio2numpy 36 | SoundFile 37 | 38 | # For text stuff 39 | transformers 40 | tokenizers 41 | jiwer # calculating WER 42 | omegaconf 43 | 44 | # lucidrains stuff 45 | vector_quantize_pytorch 46 | linear_attention_transformer 47 | rotary-embedding-torch 48 | axial_positional_embedding 49 | g-mlp-pytorch 50 | x-clip 51 | x_transformers==1.0.4 52 | 53 | # bitsandbytes 54 | bitsandbytes 55 | lion-pytorch==0.0.7 56 | # triton==2.0.0a2 57 | -------------------------------------------------------------------------------- /codes/requirements.txt: -------------------------------------------------------------------------------- 1 | # Fundamentals 2 | numpy 3 | pyyaml 4 | tb-nightly 5 | future 6 | scp 7 | tqdm 8 | matplotlib 9 | scipy 10 | munch 11 | tqdm 12 | scp 13 | tensorboard 14 | orjson 15 | einops 16 | lambda-networks 17 | mup 18 | 19 | # For image generation stuff 20 | opencv-python 21 | kornia 22 | pytorch_ssim 23 | gsa-pytorch 24 | pytorch_fid==0.1.1 25 | 26 | # For audio generation stuff 27 | inflect==0.2.5 28 | librosa==0.6.0 29 | Unidecode==1.0.22 30 | tgt == 1.4.4 31 | pyworld == 0.2.10 32 | audio2numpy 33 | SoundFile 34 | 35 | # For text stuff 36 | transformers 37 | tokenizers 38 | jiwer # calculating WER 39 | omegaconf 40 | 41 | # lucidrains stuff 42 | vector_quantize_pytorch 43 | linear_attention_transformer 44 | rotary-embedding-torch 45 | axial_positional_embedding 46 | g-mlp-pytorch 47 | x-clip 48 | x_transformers 49 | 50 | bitsandbytes 51 | lion-pytorch==0.0.7 52 | # 
triton==2.0.0a2 53 | -------------------------------------------------------------------------------- /codes/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/gen/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/gen/use_discrete_vocoder_one_way.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import torchaudio 5 | 6 | from data.audio.unsupervised_audio_dataset import load_audio 7 | from scripts.audio.gen.speech_synthesis_utils import do_spectrogram_diffusion, \ 8 | load_discrete_vocoder_diffuser, wav_to_mel, convert_mel_to_codes 9 | from utils.audio import plot_spectrogram 10 | from utils.util import load_model_from_config 11 | 12 | 13 | def roundtrip_vocoding(dvae, vocoder, diffuser, clip, cond=None, plot_spec=False): 14 | clip = clip.unsqueeze(0) 15 | if cond is None: 16 | cond = clip 17 | else: 18 | cond = cond.unsqueeze(0) 19 | mel = wav_to_mel(clip) 20 | if plot_spec: 21 | plot_spectrogram(mel[0].cpu()) 22 | codes = convert_mel_to_codes(dvae, mel) 23 | return 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='X:\\dlas\\experiments\\train_diffusion_vocoder_with_cond_new_dvae.yml') 29 | parser.add_argument('-diffusion_model_name', type=str, help='Name of the diffusion model in opt.', default='generator') 30 | parser.add_argument('-diffusion_model_path', type=str, help='Name of the diffusion model in opt.', default='X:\\dlas\\experiments\\train_diffusion_vocoder_with_cond_new_dvae_full\\models\\6100_generator_ema.pth') 31 | parser.add_argument('-dvae_model_name', type=str, help='Name of the DVAE model in opt.', default='dvae') 32 | parser.add_argument('-input_file', type=str, help='Path to the input torch save file.', default='speech_forward_mels.pth') 33 | parser.add_argument('-cond', type=str, help='Path to the conditioning input audio file.', default='Z:\\clips\\books1\\3042_18_Holden__000000000\\00037.wav') 34 | args = parser.parse_args() 35 | 36 | print("Loading DVAE..") 37 | dvae = load_model_from_config(args.opt, args.dvae_model_name) 38 | print("Loading Diffusion Model..") 39 | diffusion = load_model_from_config(args.opt, args.diffusion_model_name, also_load_savepoint=False, load_path=args.diffusion_model_path) 40 | 41 | print("Loading data..") 42 | cond = load_audio(args.cond, 22050) 43 | if cond.shape[-1] > 44100+10000: 44 | cond = cond[:,10000:54100] 45 | cond = cond.unsqueeze(0).cuda() 46 | 47 | diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=20) 48 | inp = 
torch.load(args.input_file) 49 | codes = inp 50 | 51 | print("Performing inference..") 52 | for i, cb in enumerate(codes): 53 | roundtripped = do_spectrogram_diffusion(diffusion, dvae, diffuser, cb.unsqueeze(0).cuda(), cond, spectrogram_compression_factor=128, plt_spec=False) 54 | torchaudio.save(f'vocoded_output_sp_{i}.wav', roundtripped.squeeze(0).cpu(), 11025) -------------------------------------------------------------------------------- /codes/scripts/audio/gen/use_mel2vec_codes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | 4 | from models.audio.mel2vec import ContrastiveTrainingWrapper 5 | from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector, normalize_mel 6 | from utils.util import load_audio 7 | 8 | def collapse_codegroups(codes): 9 | codes = codes.clone() 10 | groups = codes.shape[-1] 11 | for k in range(groups): 12 | codes[:,:,k] = codes[:,:,k] * groups ** k 13 | codes = codes.sum(-1) 14 | return codes 15 | 16 | 17 | def recover_codegroups(codes, groups): 18 | codes = codes.clone() 19 | output = torch.LongTensor(codes.shape[0], codes.shape[1], groups, device=codes.device) 20 | for k in range(groups): 21 | output[:,:,k] = codes % groups 22 | codes = codes // groups 23 | return output 24 | 25 | 26 | if __name__ == '__main__': 27 | model = ContrastiveTrainingWrapper(mel_input_channels=256, inner_dim=1024, layers=24, dropout=0, mask_time_prob=0, 28 | mask_time_length=6, num_negatives=100, codebook_size=16, codebook_groups=4, 29 | disable_custom_linear_init=True, feature_producer_type='standard', 30 | freq_mask_percent=0, do_reconstruction_loss=True) 31 | model.load_state_dict(torch.load("../experiments/m2v_music2.pth")) 32 | model.eval() 33 | 34 | wav = load_audio("Y:/separated/bt-music-1/100 Hits - Running Songs 2014 CD 2/100 Hits - Running Songs 2014 Cd2 - 02 - 7Th Heaven - Ain't Nothin' Goin' On But The Rent/00001/no_vocals.wav", 22050) 35 | mel = TorchMelSpectrogramInjector({'n_mel_channels': 256, 'mel_fmax': 11000, 'filter_length': 16000, 36 | 'normalize': True, 'in': 'in', 'out': 'out'}, {})({'in': wav.unsqueeze(0)})['out'] 37 | codes = model.get_codes(mel) 38 | reconstruction = model.reconstruct(mel) 39 | 40 | torchvision.utils.save_image((normalize_mel(mel).unsqueeze(1)+1)/2, 'mel.png') 41 | torchvision.utils.save_image((normalize_mel(reconstruction).unsqueeze(1)+1)/2, 'reconstructed.png') 42 | 43 | collapsed = collapse_codegroups(codes) 44 | recovered = recover_codegroups(collapsed, 4) 45 | 46 | print(codes) -------------------------------------------------------------------------------- /codes/scripts/audio/gen/w2v_patcher.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from utils.util import load_model_from_config 4 | 5 | if __name__ == '__main__': 6 | config = "D:\\dlas\\options\\train_wav2vec_matcher.yml" 7 | model_name = "generator" 8 | model_path = "D:\dlas\experiments\train_wav2vec_matcher\models" 9 | wav_dump_path = "FIXME" 10 | 11 | model = load_model_from_config(config, model_name, also_load_savepoint=False, load_path=model_path, device='cuda').eval() 12 | w2v_logits, audio_samples = torch.load(wav_dump_path) 13 | 14 | w2v_logits_chunked = torch.chunk(w2v_logits, 32) 15 | for chunk in w2v_logits_chunked: 16 | -------------------------------------------------------------------------------- /codes/scripts/audio/gen_mel.py: -------------------------------------------------------------------------------- 
1 | import os 2 | 3 | import torch 4 | 5 | from data.util import find_files_of_type, is_audio_file 6 | from trainer.injectors.audio_injectors import MelSpectrogramInjector 7 | from utils.util import load_audio 8 | 9 | if __name__ == '__main__': 10 | path = 'C:\\Users\\jbetk\\Documents\\tmp\\some_audio' 11 | 12 | inj = MelSpectrogramInjector({'in': 'wav', 'out': 'mel', 13 | 'mel_fmax': 12000, 'sampling_rate': 22050, 'n_mel_channels': 100 14 | },{}) 15 | audio = find_files_of_type('img', path, qualifier=is_audio_file)[0] 16 | for clip in audio: 17 | if not clip.endswith('.wav'): 18 | continue 19 | wav = load_audio(clip, 24000) 20 | mel = inj({'wav': wav.unsqueeze(0)})['mel'] 21 | torch.save(mel, clip.replace('.wav', '.mel')) -------------------------------------------------------------------------------- /codes/scripts/audio/mel_bin_norm_compute.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import yaml 5 | from tqdm import tqdm 6 | 7 | from data import create_dataset, create_dataloader 8 | from scripts.audio.gen.speech_synthesis_utils import wav_to_univnet_mel 9 | from utils.options import Loader 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='D:\\dlas\\options\\train_diffusion_tts9.yml') 14 | parser.add_argument('-key', type=str, help='Key where audio data is stored', default='wav') 15 | parser.add_argument('-num_batches', type=int, help='Number of batches to collect to compute the norm', default=50000) 16 | args = parser.parse_args() 17 | 18 | with open(args.opt, mode='r') as f: 19 | opt = yaml.load(f, Loader=Loader) 20 | dopt = opt['datasets']['train'] 21 | dopt['phase'] = 'train' 22 | dataset, collate = create_dataset(dopt, return_collate=True) 23 | dataloader = create_dataloader(dataset, dopt, collate_fn=collate, shuffle=True) 24 | 25 | mel_means = [] 26 | mel_max = -999999999 27 | mel_min = 999999999 28 | mel_stds = [] 29 | mel_vars = [] 30 | for batch in tqdm(dataloader): 31 | if len(mel_means) > args.num_batches: 32 | break 33 | clip = batch[args.key].cuda() 34 | for b in range(clip.shape[0]): 35 | wav = clip[b].unsqueeze(0) 36 | wav = wav[:, :, :batch[f'{args.key}_lengths'][b]] 37 | mel = wav_to_univnet_mel(clip) # Caution: make sure this isn't already normed. 
38 | mel_means.append(mel.mean((0,2)).cpu()) 39 | mel_max = max(mel.max().item(), mel_max) 40 | mel_min = min(mel.min().item(), mel_min) 41 | mel_stds.append(mel.std((0,2)).cpu()) 42 | mel_vars.append(mel.var((0,2)).cpu()) 43 | mel_means = torch.stack(mel_means).mean(0) 44 | mel_stds = torch.stack(mel_stds).mean(0) 45 | mel_vars = torch.stack(mel_vars).mean(0) 46 | torch.save((mel_means,mel_max,mel_min,mel_stds,mel_vars), 'univnet_mel_norms.pth') -------------------------------------------------------------------------------- /codes/scripts/audio/play_with_spectral_representations.py: -------------------------------------------------------------------------------- 1 | import torchvision.utils 2 | 3 | from utils.music_utils import music2mel, music2cqt 4 | from utils.util import load_audio 5 | 6 | if __name__ == '__main__': 7 | clip = load_audio('Y:\\split\\yt-music-eval\\00001.wav', 22050) 8 | mel = music2mel(clip) 9 | cqt = music2cqt(clip) 10 | torchvision.utils.save_image((mel.unsqueeze(1) + 1) / 2, 'mel.png') 11 | torchvision.utils.save_image((cqt.unsqueeze(1) + 1) / 2, 'cqt.png') 12 | -------------------------------------------------------------------------------- /codes/scripts/audio/prep_music/demucs_notes.txt: -------------------------------------------------------------------------------- 1 | My custom demucs library is used for batch source separation: 2 | https://github.com/neonbjb/demucs 3 | 4 | ``` 5 | conda activate demucs 6 | python setup.py install 7 | CUDA_VISIBLE_DEVICES=0 python -m demucs /y/split/bt-music-5 --out=/y/separated/bt-music-5 --num_workers=2 --device cuda --two-stems=vocals 8 | ``` 9 | 10 | Example usage of generate_long_cheaters and generate_long_mels, post demucs: 11 | 12 | ``` 13 | CUDA_VISIBLE_DEVICES=0 python generate_long_mels.py --path=/y/separated/mpm/1 --progress_file=/y/separated/large_mels/mpm/already_processed.txt \ 14 | --output_path=/y/separated/large_mels/mpm/1 --num_threads=2 15 | 16 | CUDA_VISIBLE_DEVICES=2 python generate_long_cheaters.py --path=/y/separated/large_mels/mpm/3 --progress_file=/y/separated/large_mel_cheaters/mpm/already_processed.txt \ 17 | --output_path=/y/separated/large_mel_cheaters/mpm/3 --num_threads=1 18 | ``` -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/preparation/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/combine_phonetic_and_text.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | if __name__ == '__main__': 4 | basepath = 'Y:\\bigasr_dataset\\hifi_tts' 5 | 6 | english_file = os.path.join(basepath, 'transcribed-oco-realtext.tsv') 7 | if not os.path.exists(english_file): 8 | english_file = os.path.join(basepath, 'transcribed-oco.tsv') 9 | phoneme_file = os.path.join(basepath, 'transcribed-phoneme-oco.tsv') 10 | 11 | texts = {} 12 | with open(english_file, 'r', encoding='utf-8') as f: 13 | for line in f.readlines(): 14 | spl = line.split('\t') 15 | if len(spl) == 3: 16 | text, p, _ = spl 17 | texts[p] = text 18 | else: 19 | print(f'Error processing line {line}') 20 | 21 | with open(phoneme_file, 'r', encoding='utf-8') as f: 22 | wf = open(os.path.join(basepath, 'transcribed-phoneme-english-oco.tsv'), 'w', 
encoding='utf-8') 23 | for line in f.readlines(): 24 | spl = line.split('\t') 25 | if len(spl) == 3: 26 | _, p, codes = spl 27 | codes = codes.strip() 28 | if p not in texts: 29 | print(f'Could not find the text for {p}') 30 | continue 31 | wf.write(f'{texts[p]}\t{p}\t{codes}\n') 32 | wf.close() 33 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/filter_clips_with_no_hifreq_data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | 4 | from data.audio.unsupervised_audio_dataset import load_audio 5 | from scripts.do_to_files import do_to_files 6 | 7 | 8 | def get_spec_mags(clip): 9 | stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True) 10 | stft = stft[0, -2000:, :] 11 | return (stft.real ** 2 + stft.imag ** 2).sqrt() 12 | 13 | 14 | def filter_no_hifreq_data(path, output_path): 15 | clip = load_audio(path, 22050) 16 | if clip.shape[-1] < 22050: 17 | return 18 | stft = get_spec_mags(clip) 19 | if stft.mean() < .08: 20 | with open(output_path, 'a') as o: 21 | o.write(f'{path}\n') 22 | 23 | if __name__ == '__main__': 24 | do_to_files(filter_no_hifreq_data) -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/gen_dvae_codes.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from tqdm import tqdm 5 | 6 | from scripts.audio.gen.speech_synthesis_utils import load_speech_dvae, wav_to_mel 7 | 8 | if __name__ == '__main__': 9 | input_folder = 'C:\\Users\\James\\Downloads\\lex2\\lexfridman_training_mp3' 10 | output_folder = 'C:\\Users\\James\\Downloads\\lex2\\quantized' 11 | 12 | params = { 13 | 'mode': 'unsupervised_audio', 14 | 'path': [input_folder], 15 | 'cache_path': f'{input_folder}/cache.pth', 16 | 'sampling_rate': 22050, 17 | 'pad_to_samples': 441000, 18 | 'resample_clip': False, 19 | 'extra_samples': 0, 20 | 'phase': 'train', 21 | 'n_workers': 2, 22 | 'batch_size': 64, 23 | } 24 | from data import create_dataset, create_dataloader 25 | os.makedirs(output_folder, exist_ok=True) 26 | 27 | ds = create_dataset(params) 28 | dl = create_dataloader(ds, params) 29 | 30 | dvae = load_speech_dvae().cuda() 31 | with torch.no_grad(): 32 | for batch in tqdm(dl): 33 | audio = batch['clip'].cuda() 34 | mel = wav_to_mel(audio) 35 | codes = dvae.get_codebook_indices(mel) 36 | for i in range(audio.shape[0]): 37 | c = codes[i, :batch['clip_lengths'][i]//1024+4] # +4 seems empirically to be a good clipping point - it seems to preserve the termination codes. 
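                # Rough arithmetic behind the slice above (inferred from the expression itself, not from the dVAE config):
                # the dVAE emits on the order of one code per 1024 audio samples, so a 1-second clip at 22050Hz keeps
                # 22050//1024 + 4 = 25 codes; the +4 padding retains the trailing termination codes mentioned above.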
38 | fn = batch['path'][i] 39 | outp = os.path.join(output_folder, os.path.relpath(fn, input_folder) + ".pth") 40 | os.makedirs(os.path.dirname(outp), exist_ok=True) 41 | torch.save(c.tolist(), outp) 42 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | from subprocess import Popen 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--path', type=str, help='Path to search for files') 9 | parser.add_argument('--output_path', type=str, help='Path for output files') 10 | args = parser.parse_args() 11 | 12 | cmds = [ 13 | f"scripts/audio/preparation/phase_1_split_files.py --path={args.path} --progress_file={args.output_path}_t1/progress.txt --num_threads=6 --output_path={args.output_path}_t1", 14 | f"scripts/audio/preparation/phase_2_sample_and_filter.py --path={args.output_path}_t1 --progress_file={args.output_path}/progress.txt --num_threads=6 --output_path={args.output_path}", 15 | f"scripts/audio/preparation/phase_3_generate_similarities.py --path={args.output_path} --num_workers=4", 16 | ] 17 | os.makedirs(args.output_path, exist_ok=True) 18 | os.makedirs(args.output_path + "_t1", exist_ok=True) 19 | 20 | for cmd in cmds: 21 | p = Popen("python " + cmd, shell=True) 22 | p.wait() 23 | 24 | shutil.rmtree(args.output_path + "_t1") 25 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/process_spleeter_filter_outputs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('input', metavar='in', type=str) 9 | parser.add_argument('basis', metavar='basis', type=str) 10 | parser.add_argument('garbage', metavar='garbage', type=str) 11 | args = parser.parse_args() 12 | print(f"Moving files from {args.input} to {args.garbage}") 13 | os.makedirs(args.garbage, exist_ok=True) 14 | 15 | with open(args.input) as f: 16 | lines = f.readlines() 17 | for line in tqdm(lines): 18 | line = line.strip() 19 | assert args.basis in line 20 | movefile = os.path.join(args.garbage, line.replace(args.basis, '')[1:]) 21 | print(f'{line} -> {movefile}') 22 | os.makedirs(os.path.dirname(movefile), exist_ok=True) 23 | shutil.move(line, movefile) 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/save_mels_to_disk.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy 5 | import torch 6 | from spleeter.audio.adapter import AudioAdapter 7 | from tqdm import tqdm 8 | 9 | from data.util import find_audio_files 10 | # Uses pydub to process a directory of audio files, splitting them into clips at points where it detects a small amount 11 | # of silence. 
12 | from trainer.injectors.base_injectors import MelSpectrogramInjector 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--path') 18 | args = parser.parse_args() 19 | files = find_audio_files(args.path, include_nonwav=True) 20 | mel_inj = MelSpectrogramInjector({'in':'in', 'out':'out'}, {}) 21 | audio_loader = AudioAdapter.default() 22 | for e, wav_file in enumerate(tqdm(files)): 23 | if e < 0: 24 | continue 25 | print(f"Processing {wav_file}..") 26 | outfile = f'{wav_file}.npz' 27 | if os.path.exists(outfile): 28 | continue 29 | 30 | try: 31 | wave, sample_rate = audio_loader.load(wav_file, sample_rate=22050) 32 | wave = torch.tensor(wave)[:,0].unsqueeze(0) 33 | wave = wave / wave.abs().max() 34 | except: 35 | print(f"Error with {wav_file}") 36 | continue 37 | 38 | inj = mel_inj({'in': wave}) 39 | numpy.savez_compressed(outfile, inj['out'].numpy()) 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | from spleeter.separator import Separator 5 | from torch.utils.data import DataLoader 6 | from tqdm import tqdm 7 | 8 | from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--path') 14 | parser.add_argument('--out') 15 | parser.add_argument('--resume', default=None) 16 | parser.add_argument('--partition_size', default=None) 17 | parser.add_argument('--partition', default=None) 18 | args = parser.parse_args() 19 | 20 | src_dir = args.path 21 | out_file = args.out 22 | output_sample_rate=22050 23 | resume_file = args.resume 24 | 25 | loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate, 26 | max_duration=10, partition=args.partition, partition_size=args.partition_size, 27 | resume=resume_file), batch_size=1, num_workers=1) 28 | 29 | separator = Separator('spleeter:2stems') 30 | unacceptable_files = open(out_file, 'a') 31 | for batch in tqdm(loader): 32 | audio, files, ends = batch['audio'], batch['files'], batch['ends'] 33 | sep = separator.separate(audio.squeeze(0).numpy()) 34 | vocals = sep['vocals'] 35 | bg = sep['accompaniment'] 36 | start = 0 37 | for path, end in zip(files, ends): 38 | vmax = np.abs(vocals[start:end]).mean() 39 | bmax = np.abs(bg[start:end]).mean() 40 | start = end 41 | 42 | # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough. 
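            # vmax/bmax is the mean vocal magnitude over the mean accompaniment magnitude for this clip. Unlike the
            # splitter scripts later in this directory (which keep clips with ratio >= 25), this filter records the
            # *failures*: any clip whose ratio falls below the threshold is appended to the unacceptable-files list.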
43 | ratio = vmax / (bmax+.0000001) 44 | if ratio < 18: # These values were derived empirically 45 | unacceptable_files.write(f'{path[0]}\n') 46 | unacceptable_files.flush() 47 | 48 | unacceptable_files.close() 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/spleeter_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/preparation/spleeter_utils/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/spleeter_utils/spleeter_dataset.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | 3 | import numpy as np 4 | 5 | from spleeter.audio.adapter import AudioAdapter 6 | from torch.utils.data import Dataset 7 | 8 | from data.util import find_audio_files 9 | 10 | 11 | class SpleeterDataset(Dataset): 12 | def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None): 13 | self.batch_sz = batch_sz 14 | self.max_duration = max_duration 15 | self.files = find_audio_files(src_dir, include_nonwav=True) 16 | self.sample_rate = sample_rate 17 | 18 | # Partition files if needed. 19 | if partition_size is not None: 20 | psz = int(partition_size) 21 | prt = int(partition) 22 | self.files = self.files[prt * psz:(prt + 1) * psz] 23 | 24 | # Find the resume point and carry on from there. 25 | if resume is not None: 26 | for i, f in enumerate(self.files): 27 | if resume in f: 28 | break 29 | assert i < len(self.files) 30 | self.files = self.files[i:] 31 | self.loader = AudioAdapter.default() 32 | 33 | def __len__(self): 34 | return ceil(len(self.files) / self.batch_sz) 35 | 36 | def __getitem__(self, item): 37 | item = item * self.batch_sz 38 | wavs = None 39 | files = [] 40 | ends = [] 41 | for k in range(self.batch_sz): 42 | ind = k+item 43 | if ind >= len(self.files): 44 | break 45 | 46 | #try: 47 | wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate) 48 | assert sr == 22050 49 | # Get rid of all channels except one. 50 | if wav.shape[1] > 1: 51 | wav = wav[:, 0] 52 | 53 | if wavs is None: 54 | wavs = wav 55 | else: 56 | wavs = np.concatenate([wavs, wav]) 57 | ends.append(wavs.shape[0]) 58 | files.append(self.files[ind]) 59 | #except: 60 | # print(f'Error loading {self.files[ind]}') 61 | return { 62 | 'audio': wavs, 63 | 'files': files, 64 | 'ends': ends 65 | } -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/split_on_silence.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | from pydub import AudioSegment 5 | from pydub.exceptions import CouldntDecodeError 6 | from pydub.silence import split_on_silence 7 | from data.util import find_audio_files 8 | from tqdm import tqdm 9 | 10 | 11 | # Uses pydub to process a directory of audio files, splitting them into clips at points where it detects a small amount 12 | # of silence. 
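# The pydub knobs used below: min_silence_len (ms of silence required before a cut), silence_thresh (dBFS level
# treated as silence), seek_step and keep_silence. Chunks shorter than 2 seconds or longer than 20 seconds are
# discarded, and survivors are exported as mono mp3s into one output folder per source file.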
13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--path') 16 | parser.add_argument('--out') 17 | args = parser.parse_args() 18 | minimum_duration = 2 19 | maximum_duration = 20 20 | files = find_audio_files(args.path, include_nonwav=True) 21 | for e, wav_file in enumerate(tqdm(files)): 22 | print(f"Processing {wav_file}..") 23 | outdir = os.path.join(args.out, f'{e}_{os.path.basename(wav_file[:-4])}').replace('.', '').strip() 24 | os.makedirs(outdir, exist_ok=True) 25 | 26 | try: 27 | speech = AudioSegment.from_file(wav_file) 28 | except CouldntDecodeError as e: 29 | print(e) 30 | continue 31 | chunks = split_on_silence(speech, min_silence_len=400, silence_thresh=-40, 32 | seek_step=100, keep_silence=50) 33 | 34 | for i in range(0, len(chunks)): 35 | if chunks[i].duration_seconds < minimum_duration or chunks[i].duration_seconds > maximum_duration: 36 | continue 37 | chunks[i].export(f"{outdir}/{i:05d}.mp3", format='mp3', parameters=["-ac", "1"]) 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /codes/scripts/audio/random_mp3_splitter.py: -------------------------------------------------------------------------------- 1 | from scipy.io import wavfile 2 | from spleeter.separator import Separator 3 | from tqdm import tqdm 4 | 5 | from data.util import find_audio_files 6 | import os.path as osp 7 | from spleeter.audio.adapter import AudioAdapter 8 | import numpy as np 9 | 10 | 11 | if __name__ == '__main__': 12 | src_dir = 'P:\\Audiobooks-Podcasts' 13 | #src_dir = 'E:\\audio\\books' 14 | output_dir = 'D:\\data\\audio\\misc-split' 15 | output_dir_lq = 'D:\\data\\audio\\misc-split-with-bg' 16 | output_dir_garbage = 'D:\\data\\audio\\misc-split-garbage' 17 | #output_dir = 'E:\\audio\\books-clips' 18 | clip_length = 5 # In seconds 19 | sparsity = .1 # Only this proportion of the total clips are extracted as wavs. 20 | output_sample_rate=22050 21 | 22 | audio_loader = AudioAdapter.default() 23 | separator = Separator('spleeter:2stems') 24 | files = find_audio_files(src_dir, include_nonwav=True) 25 | for e, file in enumerate(tqdm(files)): 26 | if e < 1092: 27 | continue 28 | file_basis = osp.relpath(file, src_dir)\ 29 | .replace('/', '_')\ 30 | .replace('\\', '_')\ 31 | .replace('.', '_')\ 32 | .replace(' ', '_')\ 33 | .replace('!', '_')\ 34 | .replace(',', '_') 35 | if len(file_basis) > 100: 36 | file_basis = file_basis[:100] 37 | try: 38 | wave, sample_rate = audio_loader.load(file, sample_rate=output_sample_rate) 39 | except: 40 | print(f"Error with {file}") 41 | continue 42 | 43 | #if len(wave.shape) < 2: 44 | # continue 45 | 46 | # Calculate how much data we need to extract for each clip. 47 | clip_sz = sample_rate * clip_length 48 | interval = int(sample_rate * (clip_length / sparsity)) 49 | i = 0 50 | while (i+clip_sz) < wave.shape[0]: 51 | clip = wave[i:i+clip_sz] 52 | sep = separator.separate(clip) 53 | vocals = sep['vocals'] 54 | bg = sep['accompaniment'] 55 | vmax = np.abs(vocals).mean() 56 | bmax = np.abs(bg).mean() 57 | 58 | # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough. 59 | ratio = vmax / (bmax+.0000001) 60 | if ratio >= 25: # These values were derived empirically 61 | od = output_dir 62 | os = clip 63 | elif ratio >= 1: 64 | od = output_dir_lq 65 | os = vocals 66 | else: 67 | od = output_dir_garbage 68 | os = vocals 69 | 70 | # Strip out channels. 
71 | if len(os.shape) > 1: 72 | os = os[:, 0] # Just use the first channel. 73 | 74 | wavfile.write(osp.join(od, f'{e}_{file_basis}_{i}.wav'), output_sample_rate, os) 75 | i = i + interval 76 | -------------------------------------------------------------------------------- /codes/scripts/audio/spleeter_split_voice_and_background.py: -------------------------------------------------------------------------------- 1 | from scipy.io import wavfile 2 | from spleeter.separator import Separator 3 | from tqdm import tqdm 4 | ''' 5 | Uses a model configuration to load a classifier and iterate through a dataset, binning each class into it's own 6 | folder. 7 | ''' 8 | 9 | from data.util import find_audio_files 10 | import os 11 | import os.path as osp 12 | from spleeter.audio.adapter import AudioAdapter 13 | import numpy as np 14 | 15 | 16 | # Uses spleeter_utils to divide audio clips into one of two bins: 17 | # 1. Audio has little to no background noise, saved to "output_dir" 18 | # 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg" 19 | if __name__ == '__main__': 20 | src_dir = 'F:\\split\\joe_rogan' 21 | output_dir = 'F:\\split\\cleaned\\joe_rogan' 22 | output_dir_bg = 'F:\\split\\background-noise\\joe_rogan' 23 | output_sample_rate=22050 24 | 25 | os.makedirs(output_dir_bg, exist_ok=True) 26 | os.makedirs(output_dir, exist_ok=True) 27 | 28 | audio_loader = AudioAdapter.default() 29 | separator = Separator('spleeter:2stems') 30 | files = find_audio_files(src_dir, include_nonwav=True) 31 | for e, file in enumerate(tqdm(files)): 32 | #if e < 406500: 33 | # continue 34 | file_basis = osp.relpath(file, src_dir)\ 35 | .replace('/', '_')\ 36 | .replace('\\', '_')\ 37 | .replace('.', '_')\ 38 | .replace(' ', '_')\ 39 | .replace('!', '_')\ 40 | .replace(',', '_') 41 | if len(file_basis) > 100: 42 | file_basis = file_basis[:100] 43 | try: 44 | wave, sample_rate = audio_loader.load(file, sample_rate=output_sample_rate) 45 | except: 46 | print(f"Error with {file}") 47 | continue 48 | 49 | sep = separator.separate(wave) 50 | vocals = sep['vocals'] 51 | bg = sep['accompaniment'] 52 | vmax = np.abs(vocals).mean() 53 | bmax = np.abs(bg).mean() 54 | 55 | # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough. 56 | ratio = vmax / (bmax+.0000001) 57 | if ratio >= 25: # These values were derived empirically 58 | od = output_dir 59 | os = wave 60 | elif ratio <= 1: 61 | od = output_dir_bg 62 | os = bg 63 | else: 64 | continue 65 | 66 | # Strip out channels. 67 | if len(os.shape) > 1: 68 | os = os[:, 0] # Just use the first channel. 
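        # The name `os` is being reused here for the selected waveform, which shadows the os module imported above
        # (tolerable only because the module is not needed again inside this loop). Note also that the wavfile.write
        # call below targets a per-clip subdirectory (od/<file_basis>/) that the os.makedirs calls at the top never
        # create, so that folder must already exist; random_mp3_splitter.py sidesteps this by folding file_basis
        # into a flat filename instead.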
69 | 70 | wavfile.write(osp.join(od, file_basis, f'{e}.wav'), output_sample_rate, os) 71 | -------------------------------------------------------------------------------- /codes/scripts/audio/test_audio_similarity.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from data.util import is_wav_file, find_files_of_type 7 | from models.audio.audio_resnet import resnet50 8 | from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch 9 | from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict 10 | 11 | if __name__ == '__main__': 12 | window = 48000 13 | root_path = 'D:\\tmp\\clips' 14 | paths = find_files_of_type('img', root_path, qualifier=is_wav_file)[0] 15 | clips = [] 16 | for path in paths: 17 | clip, sr = load_wav_to_torch(os.path.join(root_path, path)) 18 | if len(clip.shape) > 1: 19 | clip = clip[:,0] 20 | clip = clip[:window].unsqueeze(0) 21 | clip = clip / 32768.0 # Normalize 22 | #clip = clip + torch.rand_like(clip) * .03 # Noise (this is how the model was trained) 23 | assert sr == 24000 24 | clips.append(clip) 25 | clips = torch.stack(clips, dim=0) 26 | 27 | resnet = resnet50() 28 | sd = torch.load('../experiments/train_byol_audio_clips/models/8000_generator.pth') 29 | sd = extract_byol_model_from_state_dict(sd) 30 | resnet.load_state_dict(sd) 31 | embedding = resnet(clips, return_pool=True) 32 | 33 | for i, path in enumerate(paths): 34 | print(f'Using a baseline of {path}..') 35 | for j, cpath in enumerate(paths): 36 | if i == j: 37 | continue 38 | l2 = F.mse_loss(embedding[j], embedding[i]) 39 | print(f'Compared to {cpath}: {l2}') 40 | 41 | -------------------------------------------------------------------------------- /codes/scripts/audio/test_audio_speech_recognition.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import logging 3 | import random 4 | import argparse 5 | 6 | import utils 7 | import utils.options as option 8 | import utils.util as util 9 | from models.audio.tts.tacotron2 import sequence_to_text 10 | from trainer.ExtensibleTrainer import ExtensibleTrainer 11 | from data import create_dataset, create_dataloader 12 | from tqdm import tqdm 13 | import torch 14 | import numpy as np 15 | from scipy.io import wavfile 16 | 17 | 18 | def forward_pass(model, data, output_dir, opt, b): 19 | with torch.no_grad(): 20 | model.feed_data(data, 0) 21 | model.test() 22 | 23 | if 'real_text' in opt['eval'].keys(): 24 | real = data[opt['eval']['real_text']][0] 25 | print(f'{b} Real text: "{real}"') 26 | 27 | pred_seq = model.eval_state[opt['eval']['gen_text']][0] 28 | pred_text = [sequence_to_text(ts) for ts in pred_seq] 29 | audio = model.eval_state[opt['eval']['audio']][0].cpu().numpy() 30 | wavfile.write(osp.join(output_dir, f'{b}_clip.wav'), 22050, audio) 31 | for i, text in enumerate(pred_text): 32 | print(f'{b} Predicted text {i}: "{text}"') 33 | 34 | 35 | if __name__ == "__main__": 36 | # Set seeds 37 | torch.manual_seed(5555) 38 | random.seed(5555) 39 | np.random.seed(5555) 40 | 41 | #### options 42 | torch.backends.cudnn.benchmark = True 43 | want_metrics = False 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_gpt_asr_mass.yml') 46 | opt = option.parse(parser.parse_args().opt, is_train=False) 47 | opt = option.dict_to_nonedict(opt) 48 | utils.util.loaded_options 
= opt 49 | 50 | util.mkdirs( 51 | (path for key, path in opt['path'].items() 52 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) 53 | util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO, 54 | screen=True, tofile=True) 55 | logger = logging.getLogger('base') 56 | logger.info(option.dict2str(opt)) 57 | 58 | test_loaders = [] 59 | for phase, dataset_opt in sorted(opt['datasets'].items()): 60 | test_set, collate_fn = create_dataset(dataset_opt, return_collate=True) 61 | test_loader = create_dataloader(test_set, dataset_opt, collate_fn=collate_fn) 62 | logger.info('Number of test texts in [{:s}]: {:d}'.format(dataset_opt['name'], len(test_set))) 63 | test_loaders.append(test_loader) 64 | 65 | model = ExtensibleTrainer(opt) 66 | 67 | batch = 0 68 | for test_loader in test_loaders: 69 | dataset_dir = opt['path']['results_root'] 70 | util.mkdir(dataset_dir) 71 | 72 | tq = tqdm(test_loader) 73 | for data in tq: 74 | forward_pass(model, data, dataset_dir, opt, batch) 75 | batch += 1 76 | 77 | -------------------------------------------------------------------------------- /codes/scripts/audio/use_vocoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from scipy.io import wavfile 3 | 4 | from models.audio.vocoders.waveglow.waveglow import WaveGlow 5 | 6 | 7 | class Vocoder: 8 | def __init__(self): 9 | self.model = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8, n_early_size=2, n_early_every=4, WN_config={'n_layers': 8, 'n_channels': 256, 'kernel_size': 3}) 10 | sd = torch.load('../experiments/waveglow_256channels_universal_v5.pth') 11 | self.model.load_state_dict(sd) 12 | self.model = self.model.cpu() 13 | self.model.eval() 14 | 15 | def transform_mel_to_audio(self, mel): 16 | if len(mel.shape) == 2: # Assume it's missing the batch dimension and fix that. 17 | mel = mel.unsqueeze(0) 18 | with torch.no_grad(): 19 | return self.model.infer(mel) 20 | 21 | 22 | if __name__ == '__main__': 23 | vocoder = Vocoder() 24 | m = torch.load('C:\\Users\\jbetk\\Documents\\tmp\\some_audio\\00008.mel').cpu() 25 | wav = vocoder.transform_mel_to_audio(m) 26 | wavfile.write(f'0.wav', 22050, wav[0].cpu().numpy()) -------------------------------------------------------------------------------- /codes/scripts/audio/word_error_rate.py: -------------------------------------------------------------------------------- 1 | import Levenshtein 2 | from jiwer import wer, compute_measures 3 | import torch 4 | from tqdm import tqdm 5 | 6 | from data.audio.voice_tokenizer import VoiceBpeTokenizer 7 | 8 | 9 | def load_truths(file): 10 | niltok = VoiceBpeTokenizer(None) 11 | out = {} 12 | with open(file, 'r', encoding='utf-8') as f: 13 | for line in f.readlines(): 14 | spl = line.split('|') 15 | if len(spl) != 2: 16 | print(spl) 17 | continue 18 | path, truth = spl 19 | #path = path.replace('wav/', '') 20 | # This preprocesses the truth data in the same way that training data is processed: removing punctuation, all lowercase, removing unnecessary 21 | # whitespace, and applying "english cleaners", which convert words like "mrs" to "missus" and such. 
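            # Illustrative only (the exact output depends on the tokenizer's cleaner rules): a truth line such as
            # "Mrs. Smith said,  'Hello!'" would come out roughly as "missus smith said hello".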
22 | truth = niltok.preprocess_text(truth) 23 | out[path] = truth 24 | return out 25 | 26 | 27 | if __name__ == '__main__': 28 | inference_tsv = 'results.tsv' 29 | libri_base = 'y:\\bigasr_dataset/librispeech/test_clean/test_clean.txt' 30 | 31 | # Pre-process truth values 32 | truths = load_truths(libri_base) 33 | 34 | niltok = VoiceBpeTokenizer(None) 35 | ground_truths = [] 36 | hypotheses = [] 37 | with open(inference_tsv, 'r') as tsv_file: 38 | tsv = tsv_file.read().splitlines() 39 | for line in tqdm(tsv): 40 | sentence_pred, wav = line.split('\t') 41 | hypotheses.append(niltok.preprocess_text(sentence_pred)) 42 | ground_truths.append(truths[wav]) 43 | wer = wer(ground_truths, hypotheses)*100 44 | print(f"WER: {wer}") 45 | -------------------------------------------------------------------------------- /codes/scripts/byol/byol_extract_wrapped_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def extract_byol_model_from_state_dict(sd): 5 | wrap_key = 'online_encoder.net.' 6 | sdo = {} 7 | for k,v in sd.items(): 8 | if wrap_key in k: 9 | sdo[k.replace(wrap_key, '')] = v 10 | return sdo 11 | 12 | if __name__ == '__main__': 13 | pretrained_path = '../../../experiments/uresnet_pixpro4_imgset.pth' 14 | output_path = '../../../experiments/uresnet_pixpro4_imgset.pth' 15 | 16 | sd = torch.load(pretrained_path) 17 | sd = extract_byol_model_from_state_dict(sd) 18 | 19 | #model = SpineNet('49', in_channels=3, use_input_norm=True).to('cuda') 20 | #model.load_state_dict(sdo, strict=True) 21 | 22 | print("Validation succeeded, dumping state dict to output path.") 23 | torch.save(sdo, output_path) -------------------------------------------------------------------------------- /codes/scripts/do_to_files.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os 4 | import pathlib 5 | from multiprocessing.pool import ThreadPool 6 | 7 | from tqdm import tqdm 8 | 9 | 10 | ''' 11 | Helper function for scripts that iterate over large sets of files. Defines command-line arguments 12 | for operating over a large set of files, then handles setting up a worker queue system to operate 13 | on those files. You need to provide your own process_file_fn. 
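A typical invocation of a script built on this helper (paths are hypothetical):

    python filter_clips_with_no_hifreq_data.py --path /data/clips --glob "*.wav" --out flagged.txt --num_workers 4

--num_workers must always be supplied, since it is cast with int() unconditionally below.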
14 | 15 | process_file_fn expected signature: 16 | (path, output_path) 17 | ''' 18 | def do_to_files(process_file_fn): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--path') 21 | parser.add_argument('--glob') 22 | parser.add_argument('--out') 23 | parser.add_argument('--resume') 24 | parser.add_argument('--num_workers') 25 | 26 | args = parser.parse_args() 27 | src = args.path 28 | glob = args.glob 29 | out = args.out 30 | resume = args.resume 31 | num_workers = int(args.num_workers) 32 | 33 | path = pathlib.Path(src) 34 | files = path.rglob(glob) 35 | files = [str(f) for f in files] 36 | files = files[resume:] 37 | pfn = functools.partial(process_file_fn, output_path=out) 38 | if num_workers > 0: 39 | with ThreadPool(num_workers) as pool: 40 | list(tqdm(pool.imap(pfn, files), total=len(files))) 41 | else: 42 | for f in tqdm(files): 43 | pfn(f) 44 | -------------------------------------------------------------------------------- /codes/scripts/folderize_imagenet_val.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | 3 | import torch 4 | import os 5 | import shutil 6 | 7 | if __name__ == '__main__': 8 | index_map_file = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\imagenet_index_to_train_folder_name_map.pth' 9 | ground_truth = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\validation_ground_truth.txt' 10 | val_path = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\val' 11 | 12 | index_map = torch.load(index_map_file) 13 | 14 | for folder in index_map.values(): 15 | os.makedirs(os.path.join(val_path, folder), exist_ok=True) 16 | 17 | gtfile = open(ground_truth, 'r') 18 | gtids = [] 19 | for line in gtfile: 20 | gtids.append(int(line.strip())) 21 | gtfile.close() 22 | 23 | for i, img_file in enumerate(glob(os.path.join(val_path, "*.JPEG"))): 24 | shutil.move(img_file, os.path.join(val_path, index_map[gtids[i]], 25 | os.path.basename(img_file))) 26 | print("Done!") 27 | -------------------------------------------------------------------------------- /codes/scripts/hugging_face_hub_upload.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | """ 3 | Utility script for uploading model weights to the HF hub 4 | """ 5 | 6 | ''' 7 | model = Wav2VecWrapper(vocab_size=148, basis_model='facebook/wav2vec2-large-robust-ft-libri-960h', freeze_transformer=True, checkpointing_enabled=False) 8 | weights = torch.load('D:\\dlas\\experiments\\train_wav2vec_mass_large2\\models\\22500_wav2vec.pth') 9 | model.load_state_dict(weights) 10 | model.w2v.save_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli", push_to_hub=True) 11 | ''' 12 | 13 | # Build tokenizer vocab 14 | #mapping = tacotron_symbol_mapping() 15 | #print(json.dumps(mapping)) -------------------------------------------------------------------------------- /codes/scripts/start_tensorboard.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | #script to find the latest directory in a directory and start tensorboard from there 4 | 5 | 6 | def get_latest_dir(path): 7 | dirs = os.listdir(path) 8 | dirs = [os.path.join(path, d) for d in dirs] 9 | dirs = [d for d in dirs if os.path.isdir(d)] 10 | return max(dirs, key=os.path.getmtime) 11 | 12 | def start_tensorboard(path): 13 | latest_dir = get_latest_dir(path) 14 | os.path.join(latest_dir, 'tb_logger') 15 | os.system('tensorboard --logdir ' + latest_dir) 16 | 17 | if __name__ == '__main__': 18 | 
#process experiments folder 19 | print('Starting tensorboard from latest experiment folder:' + get_latest_dir('experiments') + '...') 20 | start_tensorboard('experiments') -------------------------------------------------------------------------------- /codes/scripts/stitch_images.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | import torch 4 | import torchvision 5 | from PIL import Image 6 | from torchvision.transforms import ToTensor 7 | 8 | if __name__ == '__main__': 9 | imfolder = 'F:\\dlas\\results\\test_diffusion_unet\\imgset5' 10 | cols, rows = 10, 5 11 | images = glob.glob(f'{imfolder}/*.png') 12 | output = None 13 | for r in range(rows): 14 | for c in range(cols): 15 | im = ToTensor()(Image.open(next(images))) 16 | if output is None: 17 | c, h, w = im.shape 18 | output = torch.zeros(c, h * rows, w * cols) 19 | output[:,r*h:(r+1)*h,c*w:(c+1)*w] = im 20 | torchvision.utils.save_image(output, "out.png") -------------------------------------------------------------------------------- /codes/scripts/stylegan2/dnnlib/tflib/network.py: -------------------------------------------------------------------------------- 1 | # Pretends to be the stylegan2 Network class for intercepting pickle load requests. 2 | # Horrible hack. Please don't judge me. 3 | 4 | # Globals for storing these networks because I have no idea how pickle is doing this internally. 5 | generator, discriminator, gen_ema = {}, {}, {} 6 | 7 | class Network: 8 | def __setstate__(self, state: dict) -> None: 9 | global generator, discriminator, gen_ema 10 | name = state['name'] 11 | if name in ['G_synthesis', 'G_mapping', 'G', 'G_main']: 12 | if name != 'G' and name not in generator.keys(): 13 | generator[name] = state 14 | else: 15 | gen_ema[name] = state 16 | elif name in ['D']: 17 | discriminator[name] = state 18 | -------------------------------------------------------------------------------- /codes/scripts/ui/image_labeler/label_editor.py: -------------------------------------------------------------------------------- 1 | import orjson 2 | 3 | from data.images.image_label_parser import VsNetImageLabeler 4 | 5 | 6 | # Translates from the label JSON output of the VS.NET UI to something more compact and usable. 7 | def convert_from_vsnet_labels(): 8 | labeler = VsNetImageLabeler(['F:\\4k6k\datasets\\ns_images\\512_unsupervised\\categories.json', 9 | 'F:\\4k6k\datasets\\ns_images\\512_unsupervised\\categories_new.json', 10 | 'F:\\4k6k\datasets\\ns_images\\512_unsupervised\\categories_new_new.json']) 11 | # Proposed format: 12 | # 'config': { 'dim' } 13 | # 'labels': [{ 'label', 'key'}] <- ordered by label index. 
14 | # 'images': {'file': [{ 'lid', 'top', 'left' }} 15 | # 'labelMap' {} 16 | out_dict = { 17 | 'config': { 18 | 'dim': next(iter(labeler.labeled_images.values()))[0]['patch_width'] 19 | }, 20 | 'labels': [{'label': cat['label'], 'key': cat['keyBinding']} for cat in labeler.categories.values()], 21 | } 22 | out_dict['labelMap'] = {} 23 | for i, lbl in enumerate(out_dict['labels']): 24 | out_dict['labelMap'][lbl['label']] = i 25 | out_dict['images'] = {} 26 | for fname, ilbls in labeler.labeled_images.items(): 27 | out_dict['images'][fname] = [{'lid': out_dict['labelMap'][il['label']], 'top': il['patch_top'], 'left': il['patch_left']} for il in ilbls] 28 | with open("label_editor.json", 'wb') as fout: 29 | fout.write(orjson.dumps(out_dict)) 30 | 31 | 32 | if __name__ == '__main__': 33 | convert_from_vsnet_labels() -------------------------------------------------------------------------------- /codes/scripts/ui/image_labeler/pretrained_image_patch_classifier.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os.path as osp 3 | 4 | import utils 5 | import utils.options as option 6 | import utils.util as util 7 | from data import create_dataset, create_dataloader 8 | from trainer.ExtensibleTrainer import ExtensibleTrainer 9 | 10 | 11 | class PretrainedImagePatchClassifier: 12 | def __init__(self, cfg): 13 | self.cfg = cfg 14 | 15 | opt = option.parse(cfg, is_train=False) 16 | opt = option.dict_to_nonedict(opt) 17 | utils.util.loaded_options = opt 18 | 19 | util.mkdirs( 20 | (path for key, path in opt['path'].items() 21 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) 22 | util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO, 23 | screen=True, tofile=True) 24 | logger = logging.getLogger('base') 25 | logger.info(option.dict2str(opt)) 26 | 27 | #### Create test dataset and dataloader 28 | dataset_opt = list(opt['datasets'].values())[0] 29 | # Remove labeling features from the dataset config and wrappers. 
30 | if 'dataset' in dataset_opt.keys(): 31 | if 'labeler' in dataset_opt['dataset'].keys(): 32 | dataset_opt['dataset']['includes_labels'] = False 33 | del dataset_opt['dataset']['labeler'] 34 | test_set = create_dataset(dataset_opt) 35 | if hasattr(test_set, 'wrapped_dataset'): 36 | test_set = test_set.wrapped_dataset 37 | else: 38 | test_set = create_dataset(dataset_opt) 39 | logger.info('Number of test images: {:d}'.format(len(test_set))) 40 | self.test_loader = create_dataloader(test_set, dataset_opt, opt) 41 | self.model = ExtensibleTrainer(opt) 42 | self.gen = self.model.netsG['generator'] 43 | self.dataset_dir = osp.join(opt['path']['results_root'], opt['name']) 44 | util.mkdir(self.dataset_dir) 45 | 46 | def get_next_sample(self): 47 | 48 | for data in self.test_loader: 49 | hq = data['hq'].to('cuda') 50 | res = self.gen(hq) 51 | yield hq, res, data 52 | 53 | -------------------------------------------------------------------------------- /codes/scripts/ui/image_labeler/test_image_patch_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | import torchvision 6 | 7 | import utils.options as option 8 | from scripts.ui.image_labeler.pretrained_image_patch_classifier import PretrainedImagePatchClassifier 9 | 10 | if __name__ == "__main__": 11 | #### options 12 | torch.backends.cudnn.benchmark = True 13 | want_metrics = False 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/train_imgset_structural_classifier.yml') 16 | 17 | classifier = PretrainedImagePatchClassifier(parser.parse_args().opt) 18 | label_to_search_for = 4 19 | step = 1 20 | for hq, res in classifier.get_next_sample(): 21 | res = torch.nn.functional.interpolate(res, size=hq.shape[2:], mode="nearest") 22 | res_lbl = res[:, label_to_search_for, :, :].unsqueeze(1) 23 | res_lbl_mask = (1.0 * (res_lbl > .5))*.5 + .5 24 | hq = hq * res_lbl_mask 25 | torchvision.utils.save_image(hq, os.path.join(classifier.dataset_dir, "%i.png" % (step,))) 26 | step += 1 27 | -------------------------------------------------------------------------------- /codes/scripts/use_generator_as_filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | from torch.utils.data import DataLoader 5 | 6 | from data.images.single_image_dataset import SingleImageDataset 7 | from tqdm import tqdm 8 | import torch 9 | 10 | from models.vqvae.vqvae_no_conv_transpose import VQVAE 11 | 12 | if __name__ == "__main__": 13 | bin_path = "f:\\binned" 14 | good_path = "f:\\good" 15 | os.makedirs(bin_path, exist_ok=True) 16 | os.makedirs(good_path, exist_ok=True) 17 | 18 | torch.backends.cudnn.benchmark = True 19 | 20 | model = VQVAE().cuda() 21 | model.load_state_dict(torch.load('../experiments/nvqvae_imgset.pth')) 22 | ds = SingleImageDataset({ 23 | 'name': 'amalgam', 24 | 'paths': ['F:\\4k6k\\datasets\\ns_images\\imagesets\\256_with_ref_v5'], 25 | 'weights': [1], 26 | 'target_size': 128, 27 | 'force_multiple': 32, 28 | 'scale': 1, 29 | 'eval': False 30 | }) 31 | dl = DataLoader(ds, batch_size=256, num_workers=1) 32 | 33 | means = [] 34 | model.eval() 35 | with torch.no_grad(): 36 | for i, data in enumerate(tqdm(dl)): 37 | hq = data['hq'].cuda() 38 | gen = model(hq)[0] 39 | l2 = torch.mean(torch.square(hq - gen), dim=[1,2,3]) 40 | for b in range(len(l2)): 41 | if l2[b] > .0004: 42 | shutil.copy(data['GT_path'][b], 
good_path) 43 | #else: 44 | # shutil.copy(data['GT_path'][b], bin_path) 45 | 46 | 47 | #means.append(l2.cpu()) 48 | #if i % 10 == 0: 49 | # print(torch.stack(means, dim=0).mean()) 50 | -------------------------------------------------------------------------------- /codes/scripts/validate_data.py: -------------------------------------------------------------------------------- 1 | # This script iterates through all the data with no worker threads and performs whatever transformations are prescribed. 2 | # The idea is to find bad/corrupt images. 3 | 4 | import math 5 | import argparse 6 | import random 7 | import torch 8 | from utils import util, options as option 9 | from data import create_dataloader, create_dataset 10 | from tqdm import tqdm 11 | from skimage import io 12 | 13 | def main(): 14 | #### options 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../../options/train_prog_mi1_rrdb_6bypass.yml') 17 | parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', 18 | help='job launcher') 19 | parser.add_argument('--local_rank', type=int, default=0) 20 | args = parser.parse_args() 21 | opt = option.parse(args.opt, is_train=True) 22 | 23 | #### distributed training settings 24 | opt['dist'] = False 25 | rank = -1 26 | 27 | # convert to NoneDict, which returns None for missing keys 28 | opt = option.dict_to_nonedict(opt) 29 | 30 | #### random seed 31 | seed = opt['train']['manual_seed'] 32 | if seed is None: 33 | seed = random.randint(1, 10000) 34 | util.set_random_seed(seed) 35 | 36 | torch.backends.cudnn.benchmark = True 37 | # torch.backends.cudnn.deterministic = True 38 | 39 | #### create train and val dataloader 40 | for phase, dataset_opt in opt['datasets'].items(): 41 | if phase == 'train': 42 | train_set = create_dataset(dataset_opt) 43 | train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) 44 | total_iters = int(opt['train']['niter']) 45 | total_epochs = int(math.ceil(total_iters / train_size)) 46 | dataset_opt['n_workers'] = 0 # Force num_workers=0 to make dataloader work in process. 
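            # Running the dataset in the main process means any exception raised by a corrupt image surfaces with
            # a stack trace that points directly at the offending sample rather than at a dataloader worker.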
47 | train_loader = create_dataloader(train_set, dataset_opt, opt, None) 48 | if rank <= 0: 49 | print('Number of training data elements: {:,d}, iters: {:,d}'.format( 50 | len(train_set), train_size)) 51 | assert train_loader is not None 52 | 53 | ''' 54 | tq_ldr = tqdm(train_set.get_paths()) 55 | for path in tq_ldr: 56 | try: 57 | _ = io.imread(path) 58 | # Do stuff with img 59 | except Exception as e: 60 | print("Error with %s" % (path,)) 61 | print(e) 62 | ''' 63 | tq_ldr = tqdm(train_set) 64 | for ds in tq_ldr: 65 | pass 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /codes/sweep.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import functools 3 | import os 4 | from multiprocessing.pool import ThreadPool 5 | 6 | import torch 7 | 8 | from train import Trainer 9 | from utils import options as option 10 | import collections.abc 11 | 12 | 13 | def deep_update(d, u): 14 | for k, v in u.items(): 15 | if isinstance(v, collections.abc.Mapping): 16 | d[k] = deep_update(d.get(k, {}), v) 17 | else: 18 | d[k] = v 19 | return d 20 | 21 | 22 | def launch_trainer(opt, opt_path, rank): 23 | os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) 24 | print('export CUDA_VISIBLE_DEVICES=' + str(rank)) 25 | trainer = Trainer() 26 | opt['dist'] = False 27 | trainer.rank = -1 28 | trainer.init(opt_path, opt, 'none') 29 | trainer.do_training() 30 | 31 | 32 | if __name__ == '__main__': 33 | """ 34 | Ad-hoc script (hard coded; no command-line parameters) that spawns multiple separate trainers from a single options 35 | file, with a hard-coded set of modifications. 36 | """ 37 | base_opt = '../experiments/sweep_music_mel2vec.yml' 38 | modifications = { 39 | 'baseline': {}, 40 | 'lr1e3': {'steps': {'generator': {'optimizer_params': {'lr': {.001}}}}}, 41 | 'lr1e5': {'steps': {'generator': {'optimizer_params': {'lr': {.00001}}}}}, 42 | 'no_warmup': {'train': {'warmup_steps': 0}}, 43 | } 44 | base_rank = 4 45 | opt = option.parse(base_opt, is_train=True) 46 | all_opts = [] 47 | for i, (mod, mod_dict) in enumerate(modifications.items()): 48 | nd = copy.deepcopy(opt) 49 | deep_update(nd, mod_dict) 50 | nd['name'] = f'{nd["name"]}_{mod}' 51 | nd['wandb_run_name'] = mod 52 | base_path = nd['path']['log'] 53 | for k, p in nd['path'].items(): 54 | if isinstance(p, str) and base_path in p: 55 | nd['path'][k] = p.replace(base_path, f'{base_path}/{mod}') 56 | all_opts.append(nd) 57 | 58 | for i in range(1,len(modifications)): 59 | pid = os.fork() 60 | if pid == 0: 61 | rank = i 62 | break 63 | else: 64 | rank = 0 65 | launch_trainer(all_opts[rank], base_opt, rank+base_rank) 66 | -------------------------------------------------------------------------------- /codes/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/__init__.py -------------------------------------------------------------------------------- /codes/trainer/custom_training_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/custom_training_components/__init__.py -------------------------------------------------------------------------------- /codes/trainer/custom_training_components/stereoscopic.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.cuda.amp import autocast 3 | from models.flownet2.networks import Resample2d 4 | from models.flownet2 import flow2img 5 | from trainer.inject import Injector 6 | 7 | 8 | def create_stereoscopic_injector(opt, env): 9 | type = opt['type'] 10 | if type == 'stereoscopic_resample': 11 | return ResampleInjector(opt, env) 12 | elif type == 'stereoscopic_flow2image': 13 | return Flow2Image(opt, env) 14 | return None 15 | 16 | 17 | class ResampleInjector(Injector): 18 | def __init__(self, opt, env): 19 | super(ResampleInjector, self).__init__(opt, env) 20 | self.resample = Resample2d() 21 | self.flow = opt['flowfield'] 22 | 23 | def forward(self, state): 24 | with autocast(enabled=False): 25 | return {self.output: self.resample(state[self.input], state[self.flow])} 26 | 27 | 28 | # Converts a flowfield to an image representation for viewing purposes. 29 | # Uses flownet's implementation to do so. Which really sucks. TODO: just do my own implementation in the future. 30 | # Note: this is not differentiable and is only usable for debugging purposes. 31 | class Flow2Image(Injector): 32 | def __init__(self, opt, env): 33 | super(Flow2Image, self).__init__(opt, env) 34 | 35 | def forward(self, state): 36 | with torch.no_grad(): 37 | flo = state[self.input].cpu() 38 | bs, c, h, w = flo.shape 39 | flo = flo.permute(0, 2, 3, 1) # flow2img works in numpy space for some reason.. 40 | imgs = torch.empty_like(flo) 41 | flo = flo.numpy() 42 | for b in range(bs): 43 | img = flow2img(flo[b]) # Note that this returns the image in an integer format. 44 | img = torch.tensor(img, dtype=torch.float) / 255 45 | imgs[b] = img 46 | imgs = imgs.permute(0, 3, 1, 2) 47 | return {self.output: imgs} 48 | -------------------------------------------------------------------------------- /codes/trainer/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/eval/__init__.py -------------------------------------------------------------------------------- /codes/trainer/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | # Base class for an evaluator, which is responsible for feeding test data through a model and evaluating the response. 2 | import importlib 3 | import inspect 4 | import pkgutil 5 | import re 6 | import sys 7 | 8 | 9 | class Evaluator: 10 | def __init__(self, model, opt_eval, env, uses_all_ddp=True): 11 | self.model = model.module if hasattr(model, 'module') else model 12 | self.opt = opt_eval 13 | self.env = env 14 | self.uses_all_ddp = uses_all_ddp 15 | 16 | def perform_eval(self): 17 | return {} 18 | 19 | 20 | def format_evaluator_name(name): 21 | # Formats by converting from CamelCase to snake_case and removing trailing "_evaluator" 22 | name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 23 | name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() 24 | return name.replace("_evaluator", "") 25 | 26 | 27 | # Works by loading all python modules in the eval/ directory and sniffing out subclasses of Evaluator. 
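# As a purely hypothetical illustration of the naming rule: a subclass called MusicFidEvaluator placed in a module
# under trainer/eval/ would be discovered by the walk below and referenced from an options file as type 'music_fid',
# because format_evaluator_name() converts CamelCase to snake_case and strips the trailing "_evaluator".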
28 | def find_registered_evaluators(base_path="trainer/eval"): 29 | module_iter = pkgutil.walk_packages([base_path]) 30 | results = {} 31 | for mod in module_iter: 32 | if mod.ispkg: 33 | EXCLUSION_LIST = [] 34 | if mod.name not in EXCLUSION_LIST: 35 | results.update(find_registered_evaluators(f'{base_path}/{mod.name}')) 36 | else: 37 | mod_name = f'{base_path}/{mod.name}'.replace('/', '.') 38 | if 'eval_wer' in mod.name: continue # TODO: this causes an import error for PyCtcDecode. get rid of this if there's a need to use that evaluator. 39 | importlib.import_module(mod_name) 40 | classes = inspect.getmembers(sys.modules[mod_name], inspect.isclass) 41 | for name, obj in classes: 42 | if 'Evaluator' in [mro.__name__ for mro in inspect.getmro(obj)]: 43 | results[format_evaluator_name(name)] = obj 44 | return results 45 | 46 | 47 | class CreateEvaluatorError(Exception): 48 | def __init__(self, name, available): 49 | super().__init__(f'Could not find the specified evaluator name: {name}. Available evaluators:' 50 | f'{available}') 51 | 52 | 53 | def create_evaluator(model, opt_eval, env): 54 | evaluators = find_registered_evaluators() 55 | type = opt_eval['type'] 56 | if type not in evaluators.keys(): 57 | raise CreateEvaluatorError(type, list(evaluators.keys())) 58 | return evaluators[opt_eval['type']](model, opt_eval, env) 59 | -------------------------------------------------------------------------------- /codes/trainer/eval/fid.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import os.path as osp 5 | import torchvision 6 | import trainer.eval.evaluator as evaluator 7 | from pytorch_fid import fid_score 8 | from utils.util import opt_get 9 | 10 | # Evaluator that generates uniform noise to feed into a generator, then calculates a FID score on the results. 11 | class StyleTransferEvaluator(evaluator.Evaluator): 12 | def __init__(self, model, opt_eval, env): 13 | super().__init__(model, opt_eval, env, uses_all_ddp=False) 14 | self.batches_per_eval = opt_eval['batches_per_eval'] 15 | self.batch_sz = opt_eval['batch_size'] 16 | self.im_sz = opt_eval['image_size'] 17 | self.fid_real_samples = opt_eval['real_fid_path'] 18 | self.gen_output_index = opt_eval['gen_index'] if 'gen_index' in opt_eval.keys() else 0 19 | self.noise_type = opt_get(opt_eval, ['noise_type'], 'imgnoise') 20 | self.latent_dim = opt_get(opt_eval, ['latent_dim'], 512) # Not needed if using 'imgnoise' input. 
21 | self.image_norm_range = tuple(opt_get(env['opt'], ['image_normalization_range'], [0,1])) 22 | 23 | def perform_eval(self): 24 | fid_fake_path = osp.join(self.env['base_path'], "../", "fid", str(self.env["step"])) 25 | os.makedirs(fid_fake_path, exist_ok=True) 26 | counter = 0 27 | self.model.eval() 28 | for i in range(self.batches_per_eval): 29 | if self.noise_type == 'imgnoise': 30 | batch = torch.FloatTensor(self.batch_sz, 3, self.im_sz, self.im_sz).uniform_(0., 1.).to(self.env['device']) 31 | elif self.noise_type == 'stylenoise': 32 | batch = torch.randn(self.batch_sz, self.latent_dim).to(self.env['device']) 33 | gen = self.model(batch) 34 | if not isinstance(gen, list) and not isinstance(gen, tuple): 35 | gen = [gen] 36 | gen = gen[self.gen_output_index] 37 | gen = (gen - self.image_norm_range[0]) / (self.image_norm_range[1]-self.image_norm_range[0]) 38 | for b in range(self.batch_sz): 39 | torchvision.utils.save_image(gen[b], osp.join(fid_fake_path, "%i_.png" % (counter))) 40 | counter += 1 41 | self.model.train() 42 | 43 | print("Got all images, computing fid") 44 | return {"fid": fid_score.calculate_fid_given_paths([self.fid_real_samples, fid_fake_path], self.batch_sz, True, 45 | 2048)} 46 | -------------------------------------------------------------------------------- /codes/trainer/eval/flow_gaussian_nll.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from tqdm import tqdm 4 | 5 | import trainer.eval.evaluator as evaluator 6 | 7 | # Evaluate how close to true Gaussian a flow network predicts in a "normal" pass given a LQ/HQ image pair. 8 | from data.images.image_folder_dataset import ImageFolderDataset 9 | from models.image_generation.srflow.flow import GaussianDiag 10 | 11 | 12 | class FlowGaussianNll(evaluator.Evaluator): 13 | def __init__(self, model, opt_eval, env): 14 | super().__init__(model, opt_eval, env, uses_all_ddp=False) 15 | self.batch_sz = opt_eval['batch_size'] 16 | self.dataset = ImageFolderDataset(opt_eval['dataset']) 17 | self.dataloader = DataLoader(self.dataset, self.batch_sz) 18 | 19 | def perform_eval(self): 20 | total_zs = 0 21 | z_loss = 0 22 | self.model.eval() 23 | with torch.no_grad(): 24 | print("Evaluating FlowGaussianNll..") 25 | for batch in tqdm(self.dataloader): 26 | dev = self.env['device'] 27 | z, _, _ = self.model(gt=batch['hq'].to(dev), 28 | lr=batch['lq'].to(dev), 29 | epses=[], 30 | reverse=False, 31 | add_gt_noise=False) 32 | for z_ in z: 33 | z_loss += GaussianDiag.logp(None, None, z_).mean() 34 | total_zs += 1 35 | self.model.train() 36 | return {"gaussian_diff": z_loss / total_zs} 37 | -------------------------------------------------------------------------------- /codes/trainer/eval/mel_evaluator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import trainer.eval.evaluator as evaluator 4 | 5 | from data import create_dataset 6 | from data.audio.nv_tacotron_dataset import TextMelCollate 7 | from models.audio.tts.tacotron2 import Tacotron2LossRaw 8 | from torch.utils.data import DataLoader 9 | from tqdm import tqdm 10 | 11 | 12 | # Evaluates the performance of a MEL spectrogram predictor. 
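# Registered under the type name 'mel' (per format_evaluator_name), and the opt_eval dict it receives needs at least
# 'batch_size' and a 'dataset' spec, e.g. roughly {'type': 'mel', 'batch_size': 16, 'dataset': {...}} -- values here
# are illustrative; how the dict is nested inside the training YAML is not shown in this file.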
13 | class MelEvaluator(evaluator.Evaluator): 14 | def __init__(self, model, opt_eval, env): 15 | super().__init__(model, opt_eval, env, uses_all_ddp=True) 16 | self.batch_sz = opt_eval['batch_size'] 17 | self.dataset = create_dataset(opt_eval['dataset']) 18 | assert self.batch_sz is not None 19 | self.dataloader = DataLoader(self.dataset, self.batch_sz, shuffle=False, num_workers=1, collate_fn=TextMelCollate(n_frames_per_step=1)) 20 | self.criterion = Tacotron2LossRaw() 21 | 22 | def perform_eval(self): 23 | counter = 0 24 | total_error = 0 25 | self.model.eval() 26 | for batch in tqdm(self.dataloader): 27 | model_params = { 28 | 'text_inputs': 'padded_text', 29 | 'text_lengths': 'input_lengths', 30 | 'mels': 'padded_mel', 31 | 'output_lengths': 'output_lengths', 32 | } 33 | params = {k: batch[v].to(self.env['device']) for k, v in model_params.items()} 34 | with torch.no_grad(): 35 | pred = self.model(**params) 36 | 37 | targets = ['padded_mel', 'padded_gate'] 38 | targets = [batch[t].to(self.env['device']) for t in targets] 39 | total_error += self.criterion(pred, targets).item() 40 | counter += 1 41 | self.model.train() 42 | 43 | return {"validation-score": total_error / counter} 44 | 45 | -------------------------------------------------------------------------------- /codes/trainer/eval/sr_diffusion_fid.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import os.path as osp 4 | import torchvision 5 | from torch.nn.functional import interpolate 6 | from tqdm import tqdm 7 | 8 | import trainer.eval.evaluator as evaluator 9 | 10 | from pytorch_fid import fid_score 11 | from data import create_dataset 12 | from torch.utils.data import DataLoader, DistributedSampler, SequentialSampler 13 | 14 | from trainer.injectors.gaussian_diffusion_injector import GaussianDiffusionInferenceInjector 15 | from utils.util import opt_get 16 | 17 | 18 | # Performs a FID evaluation on a diffusion network 19 | class SrDiffusionFidEvaluator(evaluator.Evaluator): 20 | def __init__(self, model, opt_eval, env): 21 | super().__init__(model, opt_eval, env) 22 | self.batch_sz = opt_eval['batch_size'] 23 | self.fid_batch_size = opt_get(opt_eval, ['fid_batch_size'], 64) 24 | assert self.batch_sz is not None 25 | self.dataset = create_dataset(opt_eval['dataset']) 26 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 27 | self.sampler = DistributedSampler(self.dataset, shuffle=False, drop_last=True) 28 | else: 29 | self.sampler = SequentialSampler(self.dataset) 30 | self.fid_real_samples = opt_eval['dataset']['paths'] # This is assumed to exist for the given dataset. 31 | assert isinstance(self.fid_real_samples, str) 32 | self.gd = GaussianDiffusionInferenceInjector(opt_eval['diffusion_params'], env) 33 | self.out_key = opt_eval['diffusion_params']['out'] 34 | 35 | def perform_eval(self): 36 | # Attempt to make the dataset deterministic. 37 | self.dataset.reset_random() 38 | dataloader = DataLoader(self.dataset, self.batch_sz, sampler=self.sampler, num_workers=0) 39 | 40 | fid_fake_path = osp.join(self.env['base_path'], "..", "fid", str(self.env["step"])) 41 | os.makedirs(fid_fake_path, exist_ok=True) 42 | counter = 0 43 | for batch in tqdm(dataloader): 44 | batch = {k: v.to(self.env['device']) if isinstance(v, torch.Tensor) else v for k, v in batch.items()} 45 | gen = self.gd(batch)[self.out_key] 46 | 47 | # All gather if we're in distributed mode. 
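            # Each rank generated images for its own shard of the sampler, so gather every rank's tensor before
            # rank 0 (below) writes the images to disk and computes FID over the combined set.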
48 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 49 | gather_list = [torch.zeros_like(gen) for _ in range(torch.distributed.get_world_size())] 50 | torch.distributed.all_gather(gather_list, gen) 51 | gen = torch.cat(gather_list, dim=0) 52 | 53 | if self.env['rank'] <= 0: 54 | for g in gen: 55 | torchvision.utils.save_image(g, osp.join(fid_fake_path, f"{counter}.png")) 56 | counter += 1 57 | 58 | if self.env['rank'] <= 0: 59 | return {"fid": fid_score.calculate_fid_given_paths([self.fid_real_samples, fid_fake_path], self.fid_batch_size, 60 | True, 2048)} 61 | else: 62 | return {} 63 | -------------------------------------------------------------------------------- /codes/trainer/experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/experiments/__init__.py -------------------------------------------------------------------------------- /codes/trainer/inject.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | import pkgutil 4 | import re 5 | import sys 6 | 7 | import torch.nn 8 | 9 | 10 | # Base class for all other injectors. 11 | class Injector(torch.nn.Module): 12 | def __init__(self, opt, env): 13 | super(Injector, self).__init__() 14 | self.opt = opt 15 | self.env = env 16 | if 'in' in opt.keys(): 17 | self.input = opt['in'] 18 | if 'out' in opt.keys(): 19 | self.output = opt['out'] 20 | 21 | # This should return a dict of new state variables. 22 | def forward(self, state): 23 | raise NotImplementedError 24 | 25 | 26 | def format_injector_name(name): 27 | # Formats by converting from CamelCase to snake_case and removing trailing "_injector" 28 | name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 29 | name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() 30 | return name.replace("_injector", "") 31 | 32 | 33 | # Works by loading all python modules in the injectors/ directory and sniffing out subclasses of Injector. 34 | # field will be properly populated. 35 | def find_registered_injectors(base_path="trainer/injectors"): 36 | module_iter = pkgutil.walk_packages([base_path]) 37 | results = {} 38 | for mod in module_iter: 39 | if mod.ispkg: 40 | EXCLUSION_LIST = [] 41 | if mod.name not in EXCLUSION_LIST: 42 | results.update(find_registered_injectors(f'{base_path}/{mod.name}')) 43 | else: 44 | mod_name = f'{base_path}/{mod.name}'.replace('/', '.') 45 | importlib.import_module(mod_name) 46 | classes = inspect.getmembers(sys.modules[mod_name], inspect.isclass) 47 | for name, obj in classes: 48 | if 'Injector' in [mro.__name__ for mro in inspect.getmro(obj)]: 49 | results[format_injector_name(name)] = obj 50 | return results 51 | 52 | 53 | class CreateInjectorError(Exception): 54 | def __init__(self, name, available): 55 | super().__init__(f'Could not find the specified injector name: {name}. Available injectors:' 56 | f'{available}') 57 | 58 | 59 | # Injectors are a way to synthesize data within a step that can then be used (and reused) by loss functions. 
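# Illustrative sketch only (not part of the original file): a minimal Injector
# subclass. Assuming the step config supplies 'in', 'out' and a hypothetical
# 'scale' key, it reads state[self.input], scales it, and publishes the result
# under state[self.output]. Real injectors live under trainer/injectors/ so
# that find_registered_injectors() can discover them.
class ExampleScaleInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.scale = opt.get('scale', 1.0)

    def forward(self, state):
        return {self.output: state[self.input] * self.scale}
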
60 | def create_injector(opt_inject, env): 61 | injectors = find_registered_injectors() 62 | type = opt_inject['type'] 63 | if type not in injectors.keys(): 64 | raise CreateInjectorError(type, list(injectors.keys())) 65 | return injectors[opt_inject['type']](opt_inject, env) 66 | -------------------------------------------------------------------------------- /codes/trainer/injectors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/injectors/__init__.py -------------------------------------------------------------------------------- /codes/trainer/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from utils.colors import rgb2ycbcr 5 | 6 | 7 | class CharbonnierLoss(nn.Module): 8 | """Charbonnier Loss (L1)""" 9 | 10 | def __init__(self, eps=1e-6): 11 | super(CharbonnierLoss, self).__init__() 12 | self.eps = eps 13 | 14 | def forward(self, x, y): 15 | diff = x - y 16 | loss = torch.sum(torch.sqrt(diff * diff + self.eps)) 17 | return loss 18 | 19 | 20 | class ZeroSpreadLoss(nn.Module): 21 | def __init__(self): 22 | super(ZeroSpreadLoss, self).__init__() 23 | 24 | def forward(self, x, _): 25 | return 2 * torch.nn.functional.sigmoid(1 / torch.abs(torch.mean(x))) - 1 26 | 27 | 28 | # Define GAN loss: [vanilla | lsgan] 29 | class GANLoss(nn.Module): 30 | def __init__(self, gan_type, real_label_val=1.0, fake_label_val=0.0): 31 | super(GANLoss, self).__init__() 32 | self.gan_type = gan_type.lower() 33 | self.real_label_val = real_label_val 34 | self.fake_label_val = fake_label_val 35 | 36 | if self.gan_type in ['gan', 'ragan', 'pixgan', 'pixgan_fea', 'crossgan', 'crossgan_lrref']: 37 | self.loss = nn.BCEWithLogitsLoss() 38 | elif self.gan_type == 'lsgan': 39 | self.loss = nn.MSELoss() 40 | elif self.gan_type == 'max_spread': 41 | self.loss = ZeroSpreadLoss() 42 | else: 43 | raise NotImplementedError('GAN type [{:s}] is not found'.format(self.gan_type)) 44 | 45 | def get_target_label(self, input, target_is_real): 46 | if target_is_real: 47 | return torch.empty_like(input).fill_(self.real_label_val) 48 | else: 49 | return torch.empty_like(input).fill_(self.fake_label_val) 50 | 51 | def forward(self, input, target_is_real): 52 | if self.gan_type in ['pixgan', 'pixgan_fea', 'crossgan', 'crossgan_lrref'] and not isinstance(target_is_real, bool): 53 | target_label = target_is_real 54 | else: 55 | target_label = self.get_target_label(input, target_is_real) 56 | loss = self.loss(input.float(), target_label.float()) 57 | return loss 58 | -------------------------------------------------------------------------------- /codes/trainer/optimizers/sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Optimizer 3 | 4 | 5 | class SGDNoBiasMomentum(Optimizer): 6 | r""" 7 | Copy of pytorch implementation of SGD with a modification which turns off momentum for params marked 8 | with `is_norm` or `is_bias`. 
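Mark a parameter by setting `p.is_norm = True` or `p.is_bias = True` before constructing the optimizer; unmarked parameters are updated exactly as in torch.optim.SGD.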
9 | """ 10 | 11 | def __init__(self, params, lr, momentum=0, dampening=0, 12 | weight_decay=0, nesterov=False): 13 | if lr < 0.0: 14 | raise ValueError("Invalid learning rate: {}".format(lr)) 15 | if momentum < 0.0: 16 | raise ValueError("Invalid momentum value: {}".format(momentum)) 17 | if weight_decay < 0.0: 18 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 19 | 20 | defaults = dict(lr=lr, momentum=momentum, dampening=dampening, 21 | weight_decay=weight_decay, nesterov=nesterov) 22 | if nesterov and (momentum <= 0 or dampening != 0): 23 | raise ValueError("Nesterov momentum requires a momentum and zero dampening") 24 | super().__init__(params, defaults) 25 | 26 | def __setstate__(self, state): 27 | super().__setstate__(state) 28 | for group in self.param_groups: 29 | group.setdefault('nesterov', False) 30 | 31 | @torch.no_grad() 32 | def step(self, closure=None): 33 | """Performs a single optimization step. 34 | 35 | Arguments: 36 | closure (callable, optional): A closure that reevaluates the model 37 | and returns the loss. 38 | """ 39 | loss = None 40 | if closure is not None: 41 | with torch.enable_grad(): 42 | loss = closure() 43 | 44 | for group in self.param_groups: 45 | weight_decay = group['weight_decay'] 46 | momentum = group['momentum'] 47 | dampening = group['dampening'] 48 | nesterov = group['nesterov'] 49 | 50 | for p in group['params']: 51 | if p.grad is None: 52 | continue 53 | d_p = p.grad 54 | if weight_decay != 0: 55 | d_p = d_p.add(p, alpha=weight_decay) 56 | # **this is the only modification over standard torch.optim.SGD: 57 | is_bn_or_bias = (hasattr(p, 'is_norm') and p.is_norm) or (hasattr(p, 'is_bias') and p.is_bias) 58 | if not is_bn_or_bias and momentum != 0: 59 | param_state = self.state[p] 60 | if 'momentum_buffer' not in param_state: 61 | buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() 62 | else: 63 | buf = param_state['momentum_buffer'] 64 | buf.mul_(momentum).add_(d_p, alpha=1 - dampening) 65 | if nesterov: 66 | d_p = d_p.add(buf, alpha=momentum) 67 | else: 68 | d_p = buf 69 | 70 | p.add_(d_p, alpha=-group['lr']) 71 | 72 | return loss 73 | -------------------------------------------------------------------------------- /codes/use_discriminator_as_filter.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import logging 3 | import time 4 | import argparse 5 | 6 | import os 7 | 8 | from torchvision.transforms import CenterCrop 9 | 10 | from trainer.ExtensibleTrainer import ExtensibleTrainer 11 | from utils import options as option 12 | import utils.util as util 13 | from data import create_dataset, create_dataloader 14 | from tqdm import tqdm 15 | import torch 16 | import torchvision 17 | 18 | 19 | if __name__ == "__main__": 20 | bin_path = "f:\\tmp\\binned" 21 | good_path = "f:\\tmp\\good" 22 | os.makedirs(bin_path, exist_ok=True) 23 | os.makedirs(good_path, exist_ok=True) 24 | 25 | 26 | torch.backends.cudnn.benchmark = True 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/train_quality_detectors/train_resnet_jpeg.yml') 29 | opt = option.parse(parser.parse_args().opt, is_train=False) 30 | opt = option.dict_to_nonedict(opt) 31 | opt['dist'] = False 32 | 33 | util.mkdirs( 34 | (path for key, path in opt['path'].items() 35 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) 36 | util.setup_logger('base', opt['path']['log'], 
'test_' + opt['name'], level=logging.INFO, 37 | screen=True, tofile=True) 38 | logger = logging.getLogger('base') 39 | logger.info(option.dict2str(opt)) 40 | 41 | #### Create test dataset and dataloader 42 | test_loaders = [] 43 | for phase, dataset_opt in sorted(opt['datasets'].items()): 44 | test_set = create_dataset(dataset_opt) 45 | test_loader = create_dataloader(test_set, dataset_opt, opt=opt) 46 | logger.info('Number of test images in [{:s}]: {:d}'.format(dataset_opt['name'], len(test_set))) 47 | test_loaders.append(test_loader) 48 | 49 | model = ExtensibleTrainer(opt) 50 | fea_loss = 0 51 | for test_loader in test_loaders: 52 | test_set_name = test_loader.dataset.opt['name'] 53 | logger.info('\nTesting [{:s}]...'.format(test_set_name)) 54 | test_start_time = time.time() 55 | dataset_dir = osp.join(opt['path']['results_root'], test_set_name) 56 | util.mkdir(dataset_dir) 57 | 58 | tq = tqdm(test_loader) 59 | removed = 0 60 | means = [] 61 | for k, data in enumerate(tq): 62 | model.feed_data(data, k) 63 | model.test() 64 | results = torch.argmax(torch.nn.functional.softmax(model.eval_state['logits'][0], dim=-1), dim=1) 65 | for i in range(results.shape[0]): 66 | if results[i] == 0: 67 | imname = osp.basename(data['HQ_path'][i]) 68 | # For VERIFICATION: 69 | #torchvision.utils.save_image(data['hq'][i], osp.join(bin_path, imname)) 70 | # 4 REALZ: 71 | os.remove(data['HQ_path'][i]) 72 | removed += 1 73 | 74 | print("Removed %i/%i images" % (removed, len(test_set))) -------------------------------------------------------------------------------- /codes/utils/UI_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/utils/UI_icon.png -------------------------------------------------------------------------------- /codes/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/utils/__init__.py -------------------------------------------------------------------------------- /codes/utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None): 6 | fig, axs = plt.subplots(1, 1) 7 | axs.set_title(title or "Spectrogram (db)") 8 | axs.set_ylabel(ylabel) 9 | axs.set_xlabel("frame") 10 | im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect) 11 | if xmax: 12 | axs.set_xlim((0, xmax)) 13 | fig.colorbar(im, ax=axs) 14 | plt.show(block=False) -------------------------------------------------------------------------------- /codes/utils/convert_model.py: -------------------------------------------------------------------------------- 1 | # Tool that can be used to add a new layer into an existing model save file. Primarily useful for "progressive" 2 | # models which can be trained piecemeal. 
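# Edit the hard-coded option-file paths in the __main__ block below, then run this script directly; it writes converted_g.pth, converted_d.pth and converted_state.pth to the current working directory.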
3 | 4 | from utils import options as option 5 | from models import create_model 6 | import torch 7 | import os 8 | 9 | def get_model_for_opt_file(filename): 10 | opt = option.parse(filename, is_train=True) 11 | opt = option.dict_to_nonedict(opt) 12 | model = create_model(opt) 13 | return model, opt 14 | 15 | def copy_state_dict_list(l_from, l_to): 16 | for i, v in enumerate(l_from): 17 | if isinstance(v, list): 18 | copy_state_dict_list(v, l_to[i]) 19 | elif isinstance(v, dict): 20 | copy_state_dict(v, l_to[i]) 21 | else: 22 | l_to[i] = v 23 | 24 | def copy_state_dict(dict_from, dict_to): 25 | for k in dict_from.keys(): 26 | if k == 'optimizers': 27 | for j in range(len(dict_from[k][0]['param_groups'])): 28 | for p in dict_to[k][0]['param_groups'][j]['params']: 29 | del dict_to[k][0]['state'] 30 | dict_to[k][0]['param_groups'][j] = dict_from[k][0]['param_groups'][j] 31 | dict_to[k][0]['state'].update(dict_from[k][0]['state']) 32 | print(len(dict_from[k][0].keys()), dict_from[k][0].keys()) 33 | print(len(dict_to[k][0].keys()), dict_to[k][0].keys()) 34 | assert k in dict_to.keys() 35 | if isinstance(dict_from[k], dict): 36 | copy_state_dict(dict_from[k], dict_to[k]) 37 | elif isinstance(dict_from[k], list): 38 | copy_state_dict_list(dict_from[k], dict_to[k]) 39 | else: 40 | dict_to[k] = dict_from[k] 41 | return dict_to 42 | 43 | if __name__ == "__main__": 44 | os.chdir("..") 45 | model_from, opt_from = get_model_for_opt_file("../options/train_imgset_pixgan_progressive_srg2.yml") 46 | model_to, _ = get_model_for_opt_file("../options/train_imgset_pixgan_progressive_srg2_.yml") 47 | 48 | ''' 49 | model_to.netG.module.update_for_step(1000000000000) 50 | l = torch.nn.MSELoss() 51 | o, _ = model_to.netG(torch.randn(1, 3, 64, 64)) 52 | l(o, torch.randn_like(o)).backward() 53 | model_to.optimizer_G.step() 54 | o = model_to.netD(torch.randn(1, 3, 128, 128)) 55 | l(o, torch.randn_like(o)).backward() 56 | model_to.optimizer_D.step() 57 | ''' 58 | 59 | torch.save(copy_state_dict(model_from.netG.state_dict(), model_to.netG.state_dict()), "converted_g.pth") 60 | torch.save(copy_state_dict(model_from.netD.state_dict(), model_to.netD.state_dict()), "converted_d.pth") 61 | 62 | # Also convert the state. 63 | resume_state_from = torch.load(opt_from['path']['resume_state']) 64 | resume_state_to = model_to.save_training_state({}, return_state=True) 65 | resume_state_from['optimizers'][0]['param_groups'].append(resume_state_to['optimizers'][0]['param_groups'][-1]) 66 | torch.save(resume_state_from, "converted_state.pth") 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /codes/utils/distributed_checkpont.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import warnings 3 | 4 | 5 | def detach_variable(inputs): 6 | if isinstance(inputs, tuple): 7 | out = [] 8 | for inp in inputs: 9 | x = inp.detach() 10 | x.requires_grad = inp.requires_grad 11 | out.append(x) 12 | return tuple(out) 13 | else: 14 | raise RuntimeError( 15 | "Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__) 16 | 17 | 18 | def check_backward_validity(inputs): 19 | if not any(inp.requires_grad for inp in inputs): 20 | warnings.warn("None of the inputs have requires_grad=True. 
Gradients will be None") 21 | 22 | 23 | class CheckpointFunction(torch.autograd.Function): 24 | @staticmethod 25 | def forward(ctx, run_function, length, *args): 26 | ctx.run_function = run_function 27 | ctx.input_tensors = list(args[:length]) 28 | ctx.input_params = list(args[length:]) 29 | with torch.no_grad(): 30 | output_tensors = ctx.run_function(*ctx.input_tensors) 31 | return output_tensors 32 | 33 | @staticmethod 34 | def backward(ctx, *output_grads): 35 | for i in range(len(ctx.input_tensors)): 36 | temp = ctx.input_tensors[i] 37 | ctx.input_tensors[i] = temp.detach() 38 | ctx.input_tensors[i].requires_grad = temp.requires_grad 39 | with torch.enable_grad(): 40 | output_tensors = ctx.run_function(*ctx.input_tensors) 41 | input_grads = torch.autograd.grad(output_tensors, ctx.input_tensors + ctx.input_params, output_grads, allow_unused=True) 42 | return (None, None) + input_grads 43 | 44 | 45 | def checkpoint(module, *params): 46 | differentiable_params = tuple(filter(lambda p: p.requires_grad, module.parameters())) 47 | if len(differentiable_params) > 0: 48 | args = params + differentiable_params 49 | return CheckpointFunction.apply(module, len(params), *args) 50 | else: 51 | return module(*params) -------------------------------------------------------------------------------- /codes/utils/weight_scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | from matplotlib import pyplot as plt 3 | 4 | # Base class for weight schedulers. Holds weight at a fixed initial value. 5 | class WeightScheduler: 6 | def __init__(self, initial_weight): 7 | self.initial_weight = initial_weight 8 | 9 | def get_weight_for_step(self, step): 10 | return self.initial_weight 11 | 12 | 13 | class LinearDecayWeightScheduler(WeightScheduler): 14 | def __init__(self, initial_weight, steps_to_decay, lower_bound, initial_step=0): 15 | super(LinearDecayWeightScheduler, self).__init__(initial_weight) 16 | self.steps_to_decay = steps_to_decay 17 | self.lower_bound = lower_bound 18 | self.initial_step = initial_step 19 | self.decrease_per_step = (initial_weight - lower_bound) / self.steps_to_decay 20 | 21 | def get_weight_for_step(self, step): 22 | step = step - self.initial_step 23 | if step < 0: 24 | return self.initial_weight 25 | return max(self.lower_bound, self.initial_weight - step * self.decrease_per_step) 26 | 27 | 28 | class SinusoidalWeightScheduler(WeightScheduler): 29 | def __init__(self, upper_weight, lower_weight, period_steps, initial_step=0): 30 | super(SinusoidalWeightScheduler, self).__init__(upper_weight) 31 | self.center = (upper_weight + lower_weight) / 2 32 | self.amplitude = (upper_weight - lower_weight) / 2 33 | self.period = period_steps 34 | self.initial_step = initial_step 35 | 36 | def get_weight_for_step(self, step): 37 | step = step - self.initial_step 38 | if step < 0: 39 | return self.initial_weight 40 | # Use cosine because it starts at y=1 for x=0. 
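# i.e. weight(step) = center + amplitude * cos(2 * pi * step / period), so step 0 yields upper_weight.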
41 | return math.cos(step * math.pi * 2 / self.period) * self.amplitude + self.center 42 | 43 | 44 | def get_scheduler_for_opt(opt): 45 | if opt['type'] == 'fixed': 46 | return WeightScheduler(opt['weight']) 47 | elif opt['type'] == 'linear_decay': 48 | return LinearDecayWeightScheduler(opt['initial_weight'], opt['steps'], opt['lower_bound'], opt['start_step']) 49 | elif opt['type'] == 'sinusoidal': 50 | return SinusoidalWeightScheduler(opt['upper_weight'], opt['lower_weight'], opt['period'], opt['start_step']) 51 | else: 52 | raise NotImplementedError 53 | 54 | 55 | # Do some testing. 56 | if __name__ == "__main__": 57 | #sched = SinusoidalWeightScheduler(1, .1, 50, 10) 58 | sched = LinearDecayWeightScheduler(10, 5000, .9, 2000) 59 | 60 | x = [] 61 | y = [] 62 | for s in range(8000): 63 | x.append(s) 64 | y.append(sched.get_weight_for_step(s)) 65 | plt.plot(x, y) 66 | plt.show() -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: DLAS 2 | channels: 3 | - conda-forge 4 | - nvidia 5 | - pytorch 6 | dependencies: 7 | - nvidia::cudatoolkit 8 | - git 9 | - numpy 10 | - pip 11 | - python=3.10.0 12 | - torchvision 13 | - torchaudio 14 | - pytorch::pytorch 15 | - pip: 16 | - -r codes/requirements.laxed.txt -------------------------------------------------------------------------------- /experiments/clips_mel_norms.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/experiments/clips_mel_norms.pth -------------------------------------------------------------------------------- /experiments/train_diffusion_vocoder_22k_level.yml: -------------------------------------------------------------------------------- 1 | path: 2 | pretrain_model_dvae: '../experiments/dvae.pth' 3 | strict_load: true 4 | #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state. 5 | networks: 6 | dvae: 7 | type: generator 8 | which_model_G: lucidrains_dvae 9 | kwargs: 10 | channels: 80 11 | codebook_dim: 512 12 | hidden_dim: 512 13 | kernel_size: 3 14 | num_layers: 2 15 | num_resnet_blocks: 3 16 | num_tokens: 8192 17 | positional_dims: 1 18 | use_transposed_convs: false 19 | -------------------------------------------------------------------------------- /experiments/train_gpt_tts_unified.yml: -------------------------------------------------------------------------------- 1 | path: 2 | #pretrain_model_dvae: '../experiments/dvae.pth' 3 | strict_load: true 4 | #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state. 
5 | networks: 6 | gpt: 7 | type: generator 8 | which_model_G: unified_voice2 9 | kwargs: 10 | layers: 30 # WAS 8 11 | model_dim: 1024 # WAS 512 12 | heads: 16 # WAS 8 13 | max_text_tokens: 402 # WAS 120 14 | max_mel_tokens: 604 # WAS 250 15 | max_conditioning_inputs: 2 # WAS 1 16 | mel_length_compression: 1024 17 | number_text_tokens: 256 # supposed to be 255 for newer unified_voice files 18 | number_mel_codes: 8194 19 | start_mel_token: 8192 20 | stop_mel_token: 8193 21 | start_text_token: 255 22 | train_solo_embeddings: False # missing in uv3/4 23 | use_mel_codes_as_input: True # ditto 24 | checkpointing: True 25 | freeze_everything_but_position_embeddings: True 26 | tortoise_compat: True 27 | -------------------------------------------------------------------------------- /recipes/byol/README.md: -------------------------------------------------------------------------------- 1 | # Working with BYOL in DLAS 2 | 3 | [BYOL](https://arxiv.org/abs/2006.07733) is a technique for pretraining an arbitrary image processing 4 | neural network. It is built upon previous self-supervised architectures like SimCLR. 5 | 6 | BYOL in DLAS is adapted from an implementation written by [lucidrains](https://github.com/lucidrains/byol-pytorch). 7 | It is implemented via two wrappers: 8 | 9 | 1. A Dataset wrapper that augments the LQ and HQ inputs from a typical DLAS dataset. Since differentiable 10 | augmentations don't actually matter for BYOL, it makes more sense (to me) to do this on the CPU at the 11 | dataset layer, so your GPU can focus on processing gradients. 12 | 1. A model wrapper that attaches a small MLP to the end of your input network to produce a fixed 13 | size latent. This latent is used to produce the BYOL loss which trains the master weights from 14 | your network. 15 | 16 | Thanks to the excellent implementation from lucidrains, this wrapping process makes training your 17 | network on unsupervised datasets extremely easy. 18 | 19 | The DLAS version improves on lucidrains implementation adding some important training details, such as 20 | a custom LARS optimizer implementation that aligns with the recommendations from the paper. By moving augmentation 21 | to the dataset level, additional augmentation options are unlocked - like being able to take two similar video frames 22 | as the image pair. 23 | 24 | # Training BYOL 25 | 26 | In this directory, you will find a sample training config for training BYOL on DIV2K. You will 27 | likely want to insert your own model architecture first. 28 | 29 | Run the trainer by: 30 | 31 | `python train.py -opt train_div2k_byol.yml` 32 | 33 | BYOL is data hungry, as most unsupervised training methods are. If you're providing your own dataset, make sure it is 34 | the hundreds of K-images or more! 35 | 36 | ## Using your own model 37 | 38 | Training your own model on this BYOL implementation is trivial: 39 | 1. Add your nn.Module model implementation to the models/ directory. 40 | 2. Register your model with `trainer/networks.py` as a generator. This file tells DLAS how to build your model from 41 | a set of configuration options. 42 | 3. Copy the sample training config. Change the `subnet` and `hidden_layer` params. 43 | 4. Run your config with `python train.py -opt `. 
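For orientation, here is a minimal, hypothetical subnet sketch (module and layer names are illustrative, not taken from this repo): the `subnet` option points at a registered network such as this one, and `hidden_layer` names the layer whose activations the BYOL wrapper hooks (here, `tail`).

```python
import torch.nn as nn

class ToyEncoder(nn.Module):
    """Illustrative only: any image network with a nameable hidden layer will do."""
    def __init__(self, nf=64):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, nf, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(nf, nf * 2, 3, stride=2, padding=1), nn.ReLU(),
        )
        self.tail = nn.AdaptiveAvgPool2d(1)  # <-- referenced by `hidden_layer: tail`

    def forward(self, x):
        return self.tail(self.features(x)).flatten(1)
```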
44 | 45 | *hint: Your network architecture (including layer names) is printed out when running train.py 46 | against your network.* -------------------------------------------------------------------------------- /recipes/diffusion/README.md: -------------------------------------------------------------------------------- 1 | # Working with Gaussian Diffusion models in DLAS 2 | 3 | Diffusion Models are a method of generating structured data using a gradual de-noising process. This process allows a 4 | simple network training regime. 5 | 6 | This implementation of Gaussian Diffusion is largely based on the work done by OpenAI in their papers ["Diffusion Models 7 | Beat GANs on Image Synthesis"](https://arxiv.org/pdf/2105.05233.pdf) and ["Improved Denoising Diffusion Probabilistic 8 | Models"](https://arxiv.org/pdf/2102.09672). 9 | 10 | OpenAI open-sourced their reference implementations [here](https://github.com/openai/guided-diffusion). The diffusion 11 | model that DLAS trains uses the [gaussian_diffusion.py](https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/gaussian_diffusion.py) 12 | script from that repo for training and inference with these models. We also include the UNet from that repo as a model 13 | that can be used to train a diffusion network. 14 | 15 | Diffusion networks can be re-purposed to pretty much any image generation task, including super-resolution. Even though 16 | they are trained with MSE losses, they produce incredibly crisp images with FID scores competitive with the best GANs. 17 | More importantly, it is easy to track training progress since diffusion networks use a "normal" loss. 18 | 19 | Diffusion networks are unique in that, during inference, they perform multiple forward passes to generate a single image. 20 | During training, these networks are trained to denoise images over 4000 steps. At inference, this sample rate can be 21 | adjusted. For the purposes of super-resolution, I have found images sampled in 50 steps to be of very good quality. 22 | This still means that a diffusion generator is 50x slower than generators trained in other ways. 23 | 24 | What's more, I have found that diffusion networks can be trained with the tiled methodology used by ESRGAN: instead 25 | of training on whole images, you can train on tiles of larger images. At inference time, the network can be applied to 26 | larger images than it was initially trained on. I have found this works well on inference images within ~3x 27 | the training size. I have not tried larger, because the size of the UNet model means that inference at ultra-high 28 | resolutions is impossible (I run out of GPU memory). 29 | 30 | I have provided a reference configuration for training a diffusion model in this manner. The config performs a 2x 31 | upsampling to 256px, de-blurs the result and removes JPEG artifacts. The deblurring and image repairs are done on a configurable 32 | scale. The scale is a value in [0,1] passed to the model as `corruption_entropy`; `1` represents the maximum correction factor. 33 | You can try reducing the target size to 128px for faster training. It should work fine. 34 | 35 | Diffusion models also have a fairly arcane inference method. To help you along, I've provided an inference configuration 36 | that can be used with models trained in DLAS.
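To make the sampling-speed tradeoff concrete, here is a small sketch of the timestep-respacing idea behind the `respaced_timestep_spacing` option in the inference config that follows (an illustration only, not DLAS's or guided-diffusion's implementation): the sampler walks a subset of the 4000 training timesteps, so 50 evenly spaced steps means 50 forward passes per image instead of 4000.

```python
import numpy as np

def respaced_timesteps(n_train: int = 4000, n_sample: int = 50):
    """Pick n_sample evenly spaced timesteps out of the n_train steps the model
    was trained with, in descending order (pure noise -> clean image).
    Hypothetical helper for illustration only."""
    return np.linspace(0, n_train - 1, n_sample).round().astype(int)[::-1]

# Prints something like [3999 3917 3836 3754 3673]; the denoising loop then
# runs one model forward pass per entry.
print(respaced_timesteps()[:5])
```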
-------------------------------------------------------------------------------- /recipes/diffusion/test_diffusion_unet.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: test_diffusion_unet 3 | use_tb_logger: true 4 | model: extensibletrainer 5 | scale: 1 6 | gpu_ids: [0] 7 | start_step: -1 8 | checkpointing_enabled: true 9 | fp16: false 10 | wandb: false 11 | 12 | datasets: 13 | train: 14 | name: my_inference_images 15 | n_workers: 0 16 | batch_size: 1 17 | mode: imagefolder 18 | rgb_n1_to_1: true 19 | disable_flip: true 20 | force_square: false 21 | paths: 22 | scale: 1 23 | skip_lq: true 24 | fixed_parameters: 25 | # Specify correction factors here. For networks trained with the paired training configuration, the first number 26 | # is a JPEG correction factor, and the second number is a deblurring factor. Testing shows that if you attempt to 27 | # deblur too far, you get extremely distorted images. It's actually pretty cool - the network clearly knows how 28 | # much deblurring is appropriate. 29 | corruption_entropy: [.2, .5] 30 | 31 | networks: 32 | generator: 33 | type: generator 34 | which_model_G: unet_diffusion 35 | args: 36 | image_size: 256 37 | in_channels: 3 38 | num_corruptions: 2 39 | model_channels: 192 40 | out_channels: 6 41 | num_res_blocks: 2 42 | attention_resolutions: [8,16] 43 | dropout: 0 44 | channel_mult: [1,1,2,2,4,4] 45 | num_heads: 4 46 | num_heads_upsample: -1 47 | use_scale_shift_norm: true 48 | 49 | #### path 50 | path: 51 | pretrain_model_generator: 52 | strict_load: true 53 | 54 | steps: 55 | generator: 56 | training: generator 57 | injectors: 58 | visual_debug: 59 | type: gaussian_diffusion_inference 60 | generator: generator 61 | output_batch_size: 1 62 | output_scale_factor: 2 63 | respaced_timestep_spacing: 50 # This can be tweaked to perform inference faster or slower. 50-200 seems to be the sweet spot. At 4000 steps, the quality is actually worse often. 64 | undo_n1_to_1: true 65 | beta_schedule: 66 | schedule_name: linear 67 | num_diffusion_timesteps: 4000 68 | diffusion_args: 69 | model_mean_type: epsilon 70 | model_var_type: learned_range 71 | loss_type: mse 72 | model_input_keys: 73 | low_res: hq 74 | corruption_factor: corruption_entropy 75 | out: sample 76 | 77 | eval: 78 | output_state: sample -------------------------------------------------------------------------------- /recipes/esrgan/rrdb_process_video.yml: -------------------------------------------------------------------------------- 1 | name: video_process 2 | suffix: ~ # add suffix to saved images 3 | model: extensibletrainer 4 | scale: 4 5 | gpu_ids: [0] 6 | fp16: true 7 | minivid_crf: 12 # Defines the 'crf' output video quality parameter fed to FFMPEG 8 | frames_per_mini_vid: 360 # How many frames to process before generating a small video segment. Used to reduce number of images you must store to convert an entire video. 9 | minivid_start_no: 360 10 | recurrent_mode: false 11 | 12 | dataset: 13 | n_workers: 1 14 | name: myvideo 15 | video_file: # <-- Path to your video file here. any format supported by ffmpeg works. 16 | frame_rate: 30 # Set to the frame rate of your video. 17 | start_at_seconds: 0 # Set this if you want to start somewhere other than the beginning of the video. 18 | end_at_seconds: 5000 # Set to the time you want to stop at. 19 | batch_size: 1 # Set to the number of frames to convert at once. Larger batches provide a modest performance increase. 
20 | vertical_splits: 1 # Used for 3d binocular videos. Leave at 1. 21 | force_multiple: 1 22 | 23 | #### network structures 24 | networks: 25 | generator: 26 | type: generator 27 | which_model_G: RRDBNet 28 | in_nc: 3 29 | out_nc: 3 30 | initial_stride: 1 31 | nf: 64 32 | nb: 23 33 | scale: 4 34 | blocks_per_checkpoint: 3 35 | 36 | #### path 37 | path: 38 | pretrain_model_generator: # <-- Set your generator path here. 39 | 40 | steps: 41 | generator: 42 | training: generator 43 | generator: generator 44 | 45 | # Optimizer params. Not used, but currently required to initialize ExtensibleTrainer, even in eval mode. 46 | lr: !!float 5e-6 47 | weight_decay: 0 48 | beta1: 0.9 49 | beta2: 0.99 50 | 51 | injectors: 52 | gen_inj: 53 | type: generator 54 | generator: generator 55 | in: lq 56 | out: gen 57 | 58 | # Train section is required, even though we are just evaluating. 59 | train: 60 | niter: 500000 61 | warmup_iter: -1 62 | mega_batch_factor: 1 63 | val_freq: 500 64 | default_lr_scheme: MultiStepLR 65 | gen_lr_steps: [20000, 40000, 80000, 100000, 140000, 180000] 66 | lr_gamma: 0.5 67 | 68 | eval: 69 | output_state: gen -------------------------------------------------------------------------------- /recipes/glean/README.md: -------------------------------------------------------------------------------- 1 | # GLEAN 2 | 3 | DLAS contains an attempt at implementing [GLEAN](https://ckkelvinchan.github.io/papers/glean.pdf), which performs image 4 | super-resolution guided by pretrained StyleGAN networks. Since this paper is currently closed-source, it was 5 | implemented entirely on what information I could glean from the paper. 6 | 7 | ## Training 8 | 9 | GLEAN requires a pre-trained StyleGAN network to operate. DLAS currently only has support for StyleGAN2 models, so 10 | you will need to use one of those. The pre-eminent StyleGAN 2 model is the one trained on FFHQ faces, so I will use 11 | that in this training example. 12 | 13 | 1. Download the ffhq model from [nVidias Drive](https://drive.google.com/drive/folders/1yanUI9m4b4PWzR0eurKNq6JR1Bbfbh6L). 14 | This repo currently only supports the "-f.pkl" files without further modifications, so choose one of those. 15 | 1. Download and extract the [FFHQ dataset](https://github.com/NVlabs/ffhq-dataset). 16 | 1. Convert the TF model to a Pytorch one supported by DLAS: 17 | 18 | `python scripts/stylegan2/convert_weights_rosinality.py stylegan2-ffhq-config-f.pkl` 19 | 20 | 1. The above conversion script outputs a *.pth file as well as JPG preview of model outputs. Check the JPG to ensure 21 | the StyleGAN is performing as expected. If so, copy the *.pth file to your experiments/ directory within DLAS. 22 | 1. Edit the provided trainer configuration. Find comments starting with '<--' and make changes as indicated. 23 | 1. 
Train the model: 24 | 25 | `python train.py -opt train_ffhq_glean.yml` -------------------------------------------------------------------------------- /recipes/segformer/train_byol_segformer.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: train_byol_segformer 3 | use_tb_logger: true 4 | model: extensibletrainer 5 | distortion: sr 6 | scale: 1 7 | gpu_ids: [0] 8 | fp16: false 9 | start_step: -1 10 | checkpointing_enabled: false 11 | wandb: false 12 | 13 | datasets: 14 | train: 15 | n_workers: 1 16 | batch_size: 96 17 | mode: byol_dataset 18 | crop_size: 224 19 | key1: hq 20 | key2: hq 21 | dataset: 22 | mode: imagefolder 23 | paths: <> 24 | target_size: 224 25 | scale: 1 26 | fetch_alt_image: false 27 | skip_lq: true 28 | normalize: imagenet 29 | 30 | networks: 31 | generator: 32 | type: generator 33 | which_model_G: pixel_local_byol 34 | image_size: 224 35 | hidden_layer: tail 36 | subnet: 37 | which_model_G: segformer 38 | 39 | #### path 40 | path: 41 | strict_load: true 42 | #resume_state: <> 43 | 44 | steps: 45 | generator: 46 | training: generator 47 | optimizer: lars 48 | optimizer_params: 49 | # All parameters from appendix J of BYOL. 50 | lr: .08 # From BYOL: LR=.2*/256 51 | weight_decay: !!float 1.5e-6 52 | lars_coefficient: .001 53 | momentum: .9 54 | 55 | injectors: 56 | gen_inj: 57 | type: generator 58 | generator: generator 59 | in: aug1 60 | out: loss 61 | 62 | losses: 63 | byol_loss: 64 | type: direct 65 | key: loss 66 | weight: 1 67 | 68 | train: 69 | warmup_iter: -1 70 | mega_batch_factor: 2 71 | val_freq: 1000 72 | niter: 300000 73 | 74 | # Default LR scheduler options 75 | default_lr_scheme: CosineAnnealingLR_Restart 76 | T_period: [120000, 120000, 120000] 77 | warmup: 10000 78 | eta_min: .01 # Unspecified by the paper.. 79 | restarts: [140000, 280000] # Paper says no re-starts, but this scheduler will add them automatically if we don't set them. 80 | # likely I won't train this far. 81 | restart_weights: [.5, .25] 82 | 83 | 84 | eval: 85 | output_state: loss 86 | evaluators: 87 | single_point_pair_contrastive_eval: 88 | for: generator 89 | type: single_point_pair_contrastive_eval 90 | batch_size: 16 91 | quantity: 96 92 | similar_set_args: 93 | path: <> 94 | size: 256 95 | dissimilar_set_args: 96 | path: <> 97 | size: 256 98 | 99 | logger: 100 | print_freq: 30 101 | save_checkpoint_freq: 1000 102 | visuals: [hq, aug1] 103 | visual_debug_rate: 100 -------------------------------------------------------------------------------- /recipes/srflow/convert_official_weights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Quick script that can be used to convert from pretrained SRFlow weights to the variants used in this repo. The only 4 | # differences between the two is the variable naming conventions used by the RRDBNet. (FWIW this repo is using the 5 | # more up-to-date names that conform to Python standards). 
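# Edit the two filenames below to point at your pretrained SRFlow weights, then run the script; the renamed state_dict is written to `output`.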
6 | 7 | official_weight_file = 'SRFlow_CelebA_8X.pth' 8 | output = 'CelebA_converted.pth' 9 | 10 | sd = torch.load(official_weight_file) 11 | sdp = {} 12 | for k,v in sd.items(): 13 | k = k.replace('RRDB.RRDB_trunk', 'RRDB.body') 14 | k = k.replace('.RDB', '.rdb') 15 | k = k.replace('trunk_conv.', 'conv_body.') 16 | k = k.replace('.upconv', '.conv_up') 17 | k = k.replace('.HRconv', '.conv_hr') 18 | sdp[k] = v 19 | torch.save(sdp, output) 20 | -------------------------------------------------------------------------------- /recipes/srflow/train_div2k_rrdb_psnr.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: train_div2k_rrdb_psnr 3 | use_tb_logger: true 4 | model: extensibletrainer 5 | distortion: sr 6 | scale: 2 7 | gpu_ids: [0] 8 | fp16: false 9 | start_step: 0 10 | checkpointing_enabled: true # <-- Highly recommended for single-GPU training. Will not work with DDP. 11 | wandb: false 12 | 13 | datasets: 14 | train: 15 | n_workers: 4 16 | batch_size: 32 17 | name: div2k 18 | mode: single_image_extensible 19 | paths: /content/div2k # <-- Put your path here. 20 | target_size: 128 21 | force_multiple: 1 22 | scale: 4 23 | eval: False 24 | num_corrupts_per_image: 0 25 | strict: false 26 | val: 27 | name: val 28 | mode: fullimage 29 | dataroot_GT: /content/set14 30 | scale: 4 31 | force_multiple: 16 32 | 33 | networks: 34 | generator: 35 | type: generator 36 | which_model_G: RRDBNet 37 | in_nc: 3 38 | out_nc: 3 39 | nf: 64 40 | nb: 23 41 | scale: 4 42 | blocks_per_checkpoint: 3 43 | 44 | #### path 45 | path: 46 | #pretrain_model_generator: 47 | strict_load: true 48 | #resume_state: ../experiments/train_div2k_rrdb_psnr/training_state/0.state # <-- Set this to resume from a previous training state. 49 | 50 | steps: 51 | generator: 52 | training: generator 53 | 54 | optimizer_params: 55 | # Optimizer params 56 | lr: !!float 2e-4 57 | weight_decay: 0 58 | beta1: 0.9 59 | beta2: 0.99 60 | 61 | injectors: 62 | gen_inj: 63 | type: generator 64 | generator: generator 65 | in: lq 66 | out: gen 67 | 68 | losses: 69 | pix: 70 | type: pix 71 | weight: 1 72 | criterion: l1 73 | real: hq 74 | fake: gen 75 | 76 | train: 77 | niter: 500000 78 | warmup_iter: -1 79 | mega_batch_factor: 1 # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8]. 80 | val_freq: 2000 81 | 82 | # Default LR scheduler options 83 | default_lr_scheme: MultiStepLR 84 | gen_lr_steps: [50000, 100000, 150000, 200000] 85 | lr_gamma: 0.5 86 | 87 | eval: 88 | output_state: gen 89 | 90 | logger: 91 | print_freq: 30 92 | save_checkpoint_freq: 1000 93 | visuals: [gen, hq, lq] 94 | visual_debug_rate: 100 -------------------------------------------------------------------------------- /recipes/stylegan/README.md: -------------------------------------------------------------------------------- 1 | # StyleGAN Implementations 2 | DLAS supports two different StyleGAN2 implementations: 3 | 4 | - [@rosinality implementation](https://github.com/rosinality/stylegan2-pytorch/commits/master) 5 | Designed to reach parity with the nVidia reference implementation in TF1.5 6 | - [@lucidrains implementation](https://github.com/lucidrains/stylegan2-pytorch) 7 | Designed with simplicity and readability in mind. 8 | 9 | I prefer the readability of @lucidrains implementation, but you cannot (yet) use pretrained weights 10 | with it. I'm working on that. 
11 | -------------------------------------------------------------------------------- /recipes/tacotron2/test_tacotron2_lj.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: test_tacotron2_lj 3 | use_tb_logger: true 4 | gpu_ids: [0] 5 | start_step: -1 6 | fp16: false 7 | checkpointing_enabled: true 8 | wandb: false 9 | 10 | datasets: 11 | train: 12 | name: lj 13 | n_workers: 0 14 | batch_size: 1 15 | mode: nv_tacotron 16 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_train_filelist.txt 17 | 18 | networks: 19 | mel_gen: 20 | type: generator 21 | which_model_G: nv_tacotron2 22 | args: 23 | encoder_kernel_size: 5 24 | encoder_n_convolutions: 3 25 | encoder_embedding_dim: 512 26 | decoder_rnn_dim: 1024 27 | prenet_dim: 256 28 | max_decoder_steps: 1000 29 | attention_rnn_dim: 1024 30 | attention_dim: 128 31 | attention_location_n_filters: 32 32 | attention_location_kernel_size: 31 33 | postnet_embedding_dim: 512 34 | postnet_kernel_size: 5 35 | postnet_n_convolutions: 5 36 | waveglow: 37 | type: generator 38 | which_model_G: nv_waveglow 39 | args: 40 | n_mel_channels: 80 41 | n_flows: 12 42 | n_group: 8 43 | n_early_every: 4 44 | n_early_size: 2 45 | WN_config: 46 | n_layers: 8 47 | n_channels: 256 48 | kernel_size: 3 49 | 50 | #### path 51 | path: 52 | pretrain_model_mel_gen: ../experiments/train_tacotron2_lj/models/22000_mel_gen_ema.pth 53 | pretrain_model_waveglow: ../experiments/waveglow_256channels_universal_v5.pth 54 | strict_load: true 55 | #resume_state: ../experiments/train_imgset_unet_diffusion/training_state/54000.state 56 | 57 | steps: 58 | generator: 59 | training: mel_gen 60 | injectors: 61 | mel: 62 | type: generator 63 | generator: mel_gen 64 | in: [padded_text, input_lengths, padded_mel, output_lengths] 65 | out: [mel_outputs, mel_outputs_postnet, gate_outputs, alignments] 66 | wave: 67 | type: generator 68 | generator: waveglow 69 | method: infer 70 | in: mel_outputs 71 | out: waveform 72 | 73 | eval: 74 | output_state: waveform -------------------------------------------------------------------------------- /recipes/tacotron2/train_tacotron2_lj.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: train_tacotron2_lj 3 | use_tb_logger: true 4 | gpu_ids: [0] 5 | start_step: -1 6 | fp16: false 7 | checkpointing_enabled: true 8 | wandb: false 9 | 10 | datasets: 11 | train: 12 | name: lj 13 | n_workers: 1 14 | batch_size: 72 15 | mode: nv_tacotron 16 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_train_filelist.txt 17 | 18 | networks: 19 | mel_gen: 20 | type: generator 21 | which_model_G: nv_tacotron2 22 | args: 23 | encoder_kernel_size: 5 24 | encoder_n_convolutions: 3 25 | encoder_embedding_dim: 512 26 | decoder_rnn_dim: 1024 27 | prenet_dim: 256 28 | max_decoder_steps: 1000 29 | attention_rnn_dim: 1024 30 | attention_dim: 128 31 | attention_location_n_filters: 32 32 | attention_location_kernel_size: 31 33 | postnet_embedding_dim: 512 34 | postnet_kernel_size: 5 35 | postnet_n_convolutions: 5 36 | 37 | #### path 38 | path: 39 | #pretrain_model_generator: ../experiments/diffusion_unet_128_imageset_22000.pt 40 | strict_load: true 41 | #resume_state: ../experiments/train_imgset_unet_diffusion/training_state/54000.state 42 | 43 | steps: 44 | generator: 45 | training: mel_gen 46 | 47 | optimizer: adamw 48 | optimizer_params: 49 | lr: !!float 1.2e-3 50 | weight_decay: !!float 1e-6 51 | beta1: 0.9 52 | beta2: 0.9999 53 | 
clip_grad_eps: 1.0 54 | 55 | injectors: 56 | mel: 57 | type: generator 58 | generator: mel_gen 59 | in: [padded_text, input_lengths, padded_mel, output_lengths] 60 | out: [mel_outputs, mel_outputs_postnet, gate_outputs, alignments] 61 | losses: 62 | tacotron_loss: 63 | type: nv_tacotron2_loss 64 | weight: 1 65 | mel_target_key: padded_mel 66 | mel_output_key: mel_outputs 67 | mel_output_postnet_key: mel_outputs_postnet 68 | gate_target_key: padded_gate 69 | gate_output_key: gate_outputs 70 | 71 | train: 72 | niter: 500000 73 | warmup_iter: -1 74 | mega_batch_factor: 3 75 | ema_rate: .999 76 | val_freq: 500 77 | 78 | default_lr_scheme: MultiStepLR 79 | gen_lr_steps: [ 50000, 100000, 150000 ] 80 | lr_gamma: 0.5 81 | 82 | eval: 83 | evaluators: 84 | val: 85 | type: mel 86 | for: mel_gen 87 | batch_size: 16 88 | dataset: 89 | mode: nv_tacotron 90 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_val_filelist.txt 91 | 92 | 93 | logger: 94 | print_freq: 30 95 | save_checkpoint_freq: 500 96 | visuals: [mel_outputs, padded_mel] 97 | is_mel_spectrogram: true 98 | visual_debug_rate: 100 -------------------------------------------------------------------------------- /recipes/vqvae2/README.md: -------------------------------------------------------------------------------- 1 | # VQVAE2 in Pytorch 2 | 3 | [VQVAE2](https://arxiv.org/pdf/1906.00446.pdf) is a generative autoencoder developed by Deepmind. It's unique innovation is 4 | discretizing the latent space into a fixed set of "codebook" vectors. This codebook 5 | can then be used in downstream tasks to rebuild images from the training set. 6 | 7 | This model is in DLAS thanks to work [@rosinality](https://github.com/rosinality) did 8 | [converting the Deepmind model](https://github.com/rosinality/vq-vae-2-pytorch) to Pytorch. 9 | 10 | # Training VQVAE2 11 | 12 | VQVAE2 is trained in two steps: 13 | 14 | ## Training the autoencoder 15 | 16 | This first step is to train the autoencoder itself. The config file `train_imgnet_vqvae_stage1.yml` provided shows how to do this 17 | for imagenet with the hyperparameters specified by deepmind. You'll need to bring your own imagenet folder for this. 18 | 19 | ## Training the PixelCNN encoder 20 | 21 | The second step is to train the PixelCNN model which will create "codebook" vectors given an 22 | input image. -------------------------------------------------------------------------------- /resources/bitsandbytes_windows/cextension.py: -------------------------------------------------------------------------------- 1 | import ctypes as ct 2 | from pathlib import Path 3 | from warnings import warn 4 | 5 | from .cuda_setup.main import evaluate_cuda_setup 6 | 7 | 8 | class CUDALibrary_Singleton(object): 9 | _instance = None 10 | 11 | def __init__(self): 12 | raise RuntimeError("Call get_instance() instead") 13 | 14 | def initialize(self): 15 | binary_name = evaluate_cuda_setup() 16 | package_dir = Path(__file__).parent 17 | binary_path = package_dir / binary_name 18 | 19 | if not binary_path.exists(): 20 | print(f"CUDA SETUP: TODO: compile library for specific version: {binary_name}") 21 | legacy_binary_name = "libbitsandbytes.so" 22 | print(f"CUDA SETUP: Defaulting to {legacy_binary_name}...") 23 | binary_path = package_dir / legacy_binary_name 24 | if not binary_path.exists(): 25 | print('CUDA SETUP: CUDA detection failed. 
Either CUDA driver not installed, CUDA not installed, or you have multiple conflicting CUDA libraries!') 26 | print('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.') 27 | raise Exception('CUDA SETUP: Setup Failed!') 28 | # self.lib = ct.cdll.LoadLibrary(binary_path) 29 | self.lib = ct.cdll.LoadLibrary(str(binary_path)) # $$$ 30 | else: 31 | print(f"CUDA SETUP: Loading binary {binary_path}...") 32 | # self.lib = ct.cdll.LoadLibrary(binary_path) 33 | self.lib = ct.cdll.LoadLibrary(str(binary_path)) # $$$ 34 | 35 | @classmethod 36 | def get_instance(cls): 37 | if cls._instance is None: 38 | cls._instance = cls.__new__(cls) 39 | cls._instance.initialize() 40 | return cls._instance 41 | 42 | 43 | lib = CUDALibrary_Singleton.get_instance().lib 44 | try: 45 | lib.cadam32bit_g32 46 | lib.get_context.restype = ct.c_void_p 47 | lib.get_cusparse.restype = ct.c_void_p 48 | COMPILED_WITH_CUDA = True 49 | except AttributeError: 50 | warn( 51 | "The installed version of bitsandbytes was compiled without GPU support. " 52 | "8-bit optimizers and GPU quantization are unavailable." 53 | ) 54 | COMPILED_WITH_CUDA = False 55 | -------------------------------------------------------------------------------- /resources/bitsandbytes_windows/libbitsandbytes_cpu.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/resources/bitsandbytes_windows/libbitsandbytes_cpu.dll -------------------------------------------------------------------------------- /resources/bitsandbytes_windows/libbitsandbytes_cuda116.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/resources/bitsandbytes_windows/libbitsandbytes_cuda116.dll -------------------------------------------------------------------------------- /sandbox.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from PIL import Image 4 | from pytorch_wavelets import DWTForward, DWTInverse 5 | import torch.nn.functional as F 6 | 7 | def load_img(path): 8 | im = Image.open(path).convert(mode="RGB") 9 | return torchvision.transforms.ToTensor()(im) 10 | 11 | def save_img(t, path): 12 | torchvision.utils.save_image(t, path) 13 | 14 | img = load_img("pu.jpg") 15 | img = img.unsqueeze(0) 16 | 17 | # Reshape image to be multiple of 32 18 | w, h = img.shape[2:] 19 | w = (w // 32) * 32 20 | h = (h // 32) * 32 21 | img = F.interpolate(img, size=(w, h)) 22 | print("Input shape:", img.shape) 23 | 24 | J_spec = 5 25 | 26 | Yl, Yh = DWTForward(J=J_spec, mode='periodization', wave='db3')(img) 27 | print(Yl.shape, [h.shape for h in Yh]) 28 | 29 | imgLR = F.interpolate(img, scale_factor=.5) 30 | LQYl, LQYh = DWTForward(J=J_spec-1, mode='periodization', wave='db3')(imgLR) 31 | print(LQYl.shape, [h.shape for h in LQYh]) 32 | 33 | for i in range(J_spec): 34 | smd = torch.sum(Yh[i], dim=2).cpu() 35 | save_img(smd, "high_%i.png" % (i,)) 36 | save_img(Yl, "lo.png") 37 | 38 | ''' 39 | Following code reconstructs the image with different high passes cancelled out. 
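Zeroing the i-th high-pass band removes detail at that scale; the final reconstruction instead replaces the low-pass band with its mean, which flattens global brightness while keeping detail.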
40 | ''' 41 | for i in range(J_spec): 42 | corrupted_im = [y for y in Yh] 43 | corrupted_im[i] = torch.zeros_like(corrupted_im[i]) 44 | im = DWTInverse(mode='periodization', wave='db3')((Yl, corrupted_im)) 45 | save_img(im, "corrupt_%i.png" % (i,)) 46 | im = DWTInverse(mode='periodization', wave='db3')((torch.full_like(Yl, fill_value=torch.mean(Yl)), Yh)) 47 | save_img(im, "corrupt_im.png") 48 | 49 | 50 | ''' 51 | Following code reconstructs a hybrid image with the first high pass from the HR and the rest of the data from the LR. 52 | highpass = [Yh[0]] + LQYh 53 | im = DWTInverse(mode='periodization', wave='db3')((LQYl, highpass)) 54 | save_img(im, "hybrid_lrhr.png") 55 | save_img(F.interpolate(imgLR, scale_factor=2), "upscaled.png") 56 | ''' -------------------------------------------------------------------------------- /static/drive_copied_file_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/drive_copied_file_tree.png -------------------------------------------------------------------------------- /static/export_to_gdrive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/export_to_gdrive.png -------------------------------------------------------------------------------- /static/file_directory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/file_directory.png -------------------------------------------------------------------------------- /static/good_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/good_gpu.png -------------------------------------------------------------------------------- /static/hyperparam_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/hyperparam_dataset.png -------------------------------------------------------------------------------- /static/ljspeech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/ljspeech.png -------------------------------------------------------------------------------- /static/notebook_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/notebook_header.png -------------------------------------------------------------------------------- /static/params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/params.png -------------------------------------------------------------------------------- /static/runtime_type.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/runtime_type.png -------------------------------------------------------------------------------- /static/settings_options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/settings_options.png -------------------------------------------------------------------------------- /static/stop_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/stop_training.png -------------------------------------------------------------------------------- /static/training_button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/training_button.png -------------------------------------------------------------------------------- /static/very_long_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/very_long_process.png -------------------------------------------------------------------------------- /static/very_recent_save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/very_recent_save.png -------------------------------------------------------------------------------- /static/warning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/warning.png -------------------------------------------------------------------------------- /static/yml_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/yml_file.png -------------------------------------------------------------------------------- /voice_samples/kk_500/kk_0_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_0.wav -------------------------------------------------------------------------------- /voice_samples/kk_500/kk_0_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_1.wav -------------------------------------------------------------------------------- /voice_samples/kk_500/kk_0_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_2.wav -------------------------------------------------------------------------------- /voice_samples/kk_500_emma/emma_0_0.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_0.wav -------------------------------------------------------------------------------- /voice_samples/kk_500_emma/emma_0_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_1.wav -------------------------------------------------------------------------------- /voice_samples/kk_500_emma/emma_0_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_2.wav -------------------------------------------------------------------------------- /voice_samples/kk_orig/kk_0_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_0.wav -------------------------------------------------------------------------------- /voice_samples/kk_orig/kk_0_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_1.wav -------------------------------------------------------------------------------- /voice_samples/kk_orig/kk_0_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_2.wav --------------------------------------------------------------------------------