├── .flake8
├── .gitignore
├── .gitmodules
├── COLAB_USAGE.md
├── LICENSE
├── LICENSE.old
├── README.md
├── Setup DLAS.bat
├── Start DLAS.cmd
├── Start Training Monitor.cmd
├── codes
├── NOTES.md
├── configuration_gui.py
├── data
│ ├── README.md
│ ├── __init__.py
│ ├── audio
│ │ ├── audio_with_noise_dataset.py
│ │ ├── fast_paired_dataset.py
│ │ ├── fast_paired_dataset_with_phonemes.py
│ │ ├── gpt_tts_dataset.py
│ │ ├── gpt_tts_tokenizer.json
│ │ ├── grand_conjoined_dataset.py
│ │ ├── nv_tacotron_dataset.py
│ │ ├── paired_voice_audio_dataset.py
│ │ ├── preprocessed_mel_dataset.py
│ │ ├── unsupervised_audio_dataset.py
│ │ ├── voice_tokenizer.py
│ │ └── wav_aug.py
│ ├── combined_dataset.py
│ ├── data_sampler.py
│ ├── images
│ │ ├── __init__.py
│ │ ├── base_unsupervised_image_dataset.py
│ │ ├── byol_attachment.py
│ │ ├── chunk_with_reference.py
│ │ ├── cifar.py
│ │ ├── full_image_dataset.py
│ │ ├── image_corruptor.py
│ │ ├── image_folder_dataset.py
│ │ ├── image_label_parser.py
│ │ ├── image_pair_with_corresponding_points_dataset.py
│ │ ├── multi_frame_dataset.py
│ │ ├── multiscale_dataset.py
│ │ ├── paired_frame_dataset.py
│ │ ├── random_dataset.py
│ │ ├── single_image_dataset.py
│ │ ├── stylegan2_dataset.py
│ │ └── zip_file_dataset.py
│ ├── text
│ │ └── hf_datasets_wrapper.py
│ ├── torch_dataset.py
│ ├── util.py
│ └── zero_pad_dict_collate.py
├── maybe_bnb.py
├── models
│ ├── __init__.py
│ ├── arch_util.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── asr
│ │ │ ├── __init__.py
│ │ │ └── w2v_wrapper.py
│ │ ├── audio_resnet.py
│ │ ├── mel2vec.py
│ │ ├── music
│ │ │ ├── __init__.py
│ │ │ ├── cheater_gen_ar.py
│ │ │ ├── diffwave.py
│ │ │ ├── encoders.py
│ │ │ ├── flat_diffusion.py
│ │ │ ├── gpt_music.py
│ │ │ ├── gpt_music2.py
│ │ │ ├── instrument_quantizer.py
│ │ │ ├── m2v_code_to_mel.py
│ │ │ ├── mel2vec_codes_gpt.py
│ │ │ ├── music_quantizer.py
│ │ │ ├── music_quantizer2.py
│ │ │ ├── tfdpc_v5.py
│ │ │ ├── transformer_diffusion12.py
│ │ │ ├── transformer_diffusion13.py
│ │ │ ├── transformer_diffusion14.py
│ │ │ ├── unet_diffusion_music_codes.py
│ │ │ ├── unet_diffusion_waveform_gen.py
│ │ │ ├── unet_diffusion_waveform_gen3.py
│ │ │ └── unet_diffusion_waveform_gen_simple.py
│ │ ├── tts
│ │ │ ├── __init__.py
│ │ │ ├── autoregressive_codegen.py
│ │ │ ├── autoregressive_codegen2.py
│ │ │ ├── ctc_code_generator.py
│ │ │ ├── diffusion_encoder.py
│ │ │ ├── lucidrains_dvae.py
│ │ │ ├── mini_encoder.py
│ │ │ ├── random_latent_converter.py
│ │ │ ├── tacotron2
│ │ │ │ ├── LICENSE
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio_processing.py
│ │ │ │ ├── hparams.py
│ │ │ │ ├── layers.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── stft.py
│ │ │ │ ├── taco_utils.py
│ │ │ │ ├── tacotron2.py
│ │ │ │ ├── text
│ │ │ │ │ ├── LICENSE
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cleaners.py
│ │ │ │ │ ├── cmudict.py
│ │ │ │ │ ├── numbers.py
│ │ │ │ │ └── symbols.py
│ │ │ │ └── wave_tacotron.py
│ │ │ ├── transformer_builders.py
│ │ │ ├── transformer_diffusion_tts.py
│ │ │ ├── transformer_diffusion_tts2.py
│ │ │ ├── unet_diffusion_tts7.py
│ │ │ ├── unet_diffusion_tts9.py
│ │ │ ├── unet_diffusion_tts_flat.py
│ │ │ ├── unet_diffusion_vocoder.py
│ │ │ ├── unet_diffusion_vocoder_with_ref.py
│ │ │ ├── unified_voice2.py
│ │ │ ├── unified_voice3.py
│ │ │ ├── unified_voice4.py
│ │ │ ├── voice_voice_clip.py
│ │ │ └── w2v_matcher.py
│ │ └── vocoders
│ │ │ ├── __init__.py
│ │ │ ├── univnet
│ │ │ ├── __init__.py
│ │ │ ├── generator.py
│ │ │ └── lvcnet.py
│ │ │ └── waveglow
│ │ │ ├── __init__.py
│ │ │ ├── denoiser.py
│ │ │ └── waveglow.py
│ ├── classifiers
│ │ ├── __init__.py
│ │ ├── cifar_resnet.py
│ │ ├── resnet_with_checkpointing.py
│ │ ├── torch_models.py
│ │ ├── twin_cifar_resnet.py
│ │ ├── weighted_conv_resnet.py
│ │ └── wide_kernel_vgg.py
│ ├── clip
│ │ ├── __init__.py
│ │ ├── clip.py
│ │ ├── clvp.py
│ │ ├── contrastive_audio.py
│ │ ├── cvvp.py
│ │ ├── mel_text_clip.py
│ │ ├── text_cond_clip.py
│ │ └── text_voice_clip.py
│ ├── composable
│ │ ├── README.md
│ │ └── __init__.py
│ ├── diffusion
│ │ ├── __init__.py
│ │ ├── fp16_util.py
│ │ ├── gaussian_diffusion.py
│ │ ├── losses.py
│ │ ├── nn.py
│ │ ├── resample.py
│ │ ├── respace.py
│ │ ├── rrdb_diffusion.py
│ │ ├── unet_diffusion.py
│ │ └── unet_latent_guide.py
│ ├── image_generation
│ │ ├── RRDBNet_arch.py
│ │ ├── ResGen_arch.py
│ │ ├── __init__.py
│ │ ├── discriminator_vgg_arch.py
│ │ ├── glean
│ │ │ ├── __init__.py
│ │ │ ├── glean.py
│ │ │ └── stylegan2_latent_bank.py
│ │ ├── srflow
│ │ │ ├── FlowActNorms.py
│ │ │ ├── FlowAffineCouplingsAblation.py
│ │ │ ├── FlowStep.py
│ │ │ ├── FlowUpsamplerNet.py
│ │ │ ├── Permutations.py
│ │ │ ├── RRDBNet_arch.py
│ │ │ ├── SRFlowNet_arch.py
│ │ │ ├── Split.py
│ │ │ ├── __init__.py
│ │ │ ├── flow.py
│ │ │ ├── glow_arch.py
│ │ │ ├── module_util.py
│ │ │ └── thops.py
│ │ └── stylegan
│ │ │ ├── Discriminator_StyleGAN.py
│ │ │ ├── __init__.py
│ │ │ ├── stylegan2_lucidrains.py
│ │ │ └── stylegan2_rosinality.py
│ ├── image_latents
│ │ ├── __init__.py
│ │ ├── byol
│ │ │ ├── __init__.py
│ │ │ ├── byol_model_wrapper.py
│ │ │ └── byol_structural.py
│ │ ├── fixup_resnet
│ │ │ ├── DiscriminatorResnet_arch.py
│ │ │ └── __init__.py
│ │ ├── spinenet_arch.py
│ │ └── vit_latent.py
│ ├── lucidrains
│ │ ├── dalle
│ │ │ ├── __init__.py
│ │ │ ├── attention.py
│ │ │ ├── reversible.py
│ │ │ └── transformer.py
│ │ ├── performer
│ │ │ ├── __init__.py
│ │ │ ├── autoregressive_wrapper.py
│ │ │ ├── performer_enc_dec.py
│ │ │ ├── performer_pytorch.py
│ │ │ └── reversible.py
│ │ ├── vq.py
│ │ └── x_transformers.py
│ ├── optical_flow
│ │ └── PWCNet.py
│ └── vqvae
│ │ ├── __init__.py
│ │ ├── dvae.py
│ │ ├── gumbel_quantizer.py
│ │ ├── scaled_weight_conv.py
│ │ ├── vector_quantizer.py
│ │ └── vqvae.py
├── multi_modal_train.py
├── process_video.py
├── requirements.laxed.txt
├── requirements.txt
├── requirements_frozen_only_use_if_something_broken.txt
├── scripts
│ ├── __init__.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── gen
│ │ │ ├── __init__.py
│ │ │ ├── ctc_codes.py
│ │ │ ├── music_joiner.py
│ │ │ ├── speech_synthesis_utils.py
│ │ │ ├── use_diffuse_tts.py
│ │ │ ├── use_diffuse_voice_translation.py
│ │ │ ├── use_discrete_vocoder.py
│ │ │ ├── use_discrete_vocoder_one_way.py
│ │ │ ├── use_gpt_tts.py
│ │ │ ├── use_mel2vec_codes.py
│ │ │ └── w2v_patcher.py
│ │ ├── gen_mel.py
│ │ ├── mel_bin_norm_compute.py
│ │ ├── play_with_spectral_representations.py
│ │ ├── prep_music
│ │ │ ├── demucs_notes.txt
│ │ │ ├── generate_long_cheaters.py
│ │ │ ├── generate_long_mels.py
│ │ │ └── phase_1_split_files.py
│ │ ├── preparation
│ │ │ ├── __init__.py
│ │ │ ├── combine_phonetic_and_text.py
│ │ │ ├── filter_clips_with_no_hifreq_data.py
│ │ │ ├── gen_dvae_codes.py
│ │ │ ├── phase_1_split_files.py
│ │ │ ├── phase_2_sample_and_filter.py
│ │ │ ├── phase_3_generate_similarities.py
│ │ │ ├── pipeline.py
│ │ │ ├── process_spleeter_filter_outputs.py
│ │ │ ├── save_mels_to_disk.py
│ │ │ ├── spleeter_filter_noisy_clips.py
│ │ │ ├── spleeter_utils
│ │ │ │ ├── __init__.py
│ │ │ │ └── spleeter_dataset.py
│ │ │ └── split_on_silence.py
│ │ ├── random_mp3_splitter.py
│ │ ├── spleeter_split_voice_and_background.py
│ │ ├── test_audio_gen.py
│ │ ├── test_audio_segmentor.py
│ │ ├── test_audio_similarity.py
│ │ ├── test_audio_speech_recognition.py
│ │ ├── use_vocoder.py
│ │ └── word_error_rate.py
│ ├── byol
│ │ ├── byol_extract_wrapped_model.py
│ │ ├── byol_resnet_playground.py
│ │ ├── byol_segformer_playground.py
│ │ ├── byol_spinenet_playground.py
│ │ └── tsne_torch.py
│ ├── classify_into_folders.py
│ ├── diffusion
│ │ ├── diffusion_correction_surfer.py
│ │ ├── diffusion_inference.py
│ │ ├── diffusion_noise_surfer.py
│ │ ├── diffusion_recursive_sampler.py
│ │ └── diffusion_spacing_surfer.py
│ ├── do_to_files.py
│ ├── extract_square_images.py
│ ├── extract_subimages.py
│ ├── extract_subimages_with_ref.py
│ ├── extract_temporal_squares.py
│ ├── find_faulty_files.py
│ ├── folderize_imagenet_val.py
│ ├── gen_kmeans_clusters.py
│ ├── hugging_face_hub_upload.py
│ ├── post_install.py
│ ├── srflow_latent_space_playground.py
│ ├── start_tensorboard.py
│ ├── stitch_images.py
│ ├── stylegan2
│ │ ├── convert_weights_rosinality.py
│ │ └── dnnlib
│ │ │ └── tflib
│ │ │ └── network.py
│ ├── ui
│ │ ├── image_labeler
│ │ │ ├── image_labeler_ui.py
│ │ │ ├── label_editor.py
│ │ │ ├── pretrained_image_patch_classifier.py
│ │ │ └── test_image_patch_classifier.py
│ │ └── image_pair_labeler
│ │ │ └── image_pair_ui.py
│ ├── use_generator_as_filter.py
│ └── validate_data.py
├── sweep.py
├── test.py
├── train.py
├── trainer
│ ├── ExtensibleTrainer.py
│ ├── README.md
│ ├── __init__.py
│ ├── base_model.py
│ ├── batch_size_optimizer.py
│ ├── custom_training_components
│ │ ├── __init__.py
│ │ ├── progressive_zoom.py
│ │ ├── stereoscopic.py
│ │ └── tecogan_losses.py
│ ├── eval
│ │ ├── __init__.py
│ │ ├── audio_diffusion_fid.py
│ │ ├── eval_wer.py
│ │ ├── evaluator.py
│ │ ├── fid.py
│ │ ├── flow_gaussian_nll.py
│ │ ├── mel_evaluator.py
│ │ ├── music_diffusion_fid.py
│ │ ├── single_point_pair_contrastive_eval.py
│ │ ├── sr_diffusion_fid.py
│ │ ├── sr_fid.py
│ │ └── sr_style.py
│ ├── experiments
│ │ ├── __init__.py
│ │ └── experiments.py
│ ├── feature_model.py
│ ├── inject.py
│ ├── injectors
│ │ ├── __init__.py
│ │ ├── audio_injectors.py
│ │ ├── base_injectors.py
│ │ ├── gaussian_diffusion_injector.py
│ │ └── spec_augment.py
│ ├── loss.py
│ ├── losses.py
│ ├── lr_scheduler.py
│ ├── networks.py
│ ├── optimizers
│ │ ├── lamb.py
│ │ ├── larc.py
│ │ └── sgd.py
│ └── steps.py
├── use_discriminator_as_filter.py
└── utils
│ ├── BASE_gpt.yaml
│ ├── UI_icon.png
│ ├── __init__.py
│ ├── audio.py
│ ├── audio_resampler.py
│ ├── colors.py
│ ├── convert_model.py
│ ├── distributed_checkpont.py
│ ├── gpu_mem_track.py
│ ├── kmeans.py
│ ├── loss_accumulator.py
│ ├── music_utils.py
│ ├── numeric_stability.py
│ ├── options.py
│ ├── util.py
│ └── weight_scheduler.py
├── environment.yaml
├── experiments
├── EXAMPLE_diff.yml
├── EXAMPLE_gpt.yml
├── bpe_lowercase_asr_256.json
├── clips_mel_norms.pth
├── train_diffusion_vocoder_22k_level.yml
└── train_gpt_tts_unified.yml
├── param_calc.py
├── recipes
├── byol
│ ├── README.md
│ └── train_div2k_byol.yml
├── diffusion
│ ├── README.md
│ ├── test_diffusion_unet.yml
│ └── train_ddpm_unet.yml
├── esrgan
│ ├── README.md
│ ├── rrdb_process_video.yml
│ ├── train_div2k_esrgan.yml
│ └── train_div2k_esrgan_reference.yml
├── glean
│ ├── README.md
│ └── train_ffhq_glean.yml
├── segformer
│ └── train_byol_segformer.yml
├── srflow
│ ├── README.md
│ ├── convert_official_weights.py
│ ├── train_div2k_rrdb_psnr.yml
│ └── train_div2k_srflow.yml
├── stylegan
│ └── README.md
├── tacotron2
│ ├── test_tacotron2_lj.yml
│ └── train_tacotron2_lj.yml
└── vqvae2
│ ├── README.md
│ └── train_imgnet_vqvae_stage1.yml
├── resources
└── bitsandbytes_windows
│ ├── cextension.py
│ ├── libbitsandbytes_cpu.dll
│ ├── libbitsandbytes_cuda116.dll
│ └── main.py
├── sandbox.py
├── static
├── drive_copied_file_tree.png
├── export_to_gdrive.png
├── file_directory.png
├── good_gpu.png
├── hyperparam_dataset.png
├── ljspeech.png
├── notebook_header.png
├── params.png
├── runtime_type.png
├── settings_options.png
├── stop_training.png
├── training_button.png
├── very_long_process.png
├── very_recent_save.png
├── warning.png
└── yml_file.png
└── voice_samples
├── kk_500
├── kk_0_0.wav
├── kk_0_1.wav
└── kk_0_2.wav
├── kk_500_emma
├── emma_0_0.wav
├── emma_0_1.wav
└── emma_0_2.wav
└── kk_orig
├── kk_0_0.wav
├── kk_0_1.wav
└── kk_0_2.wav
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore =
3 | # Too many leading '#' for block comment (E266)
4 | E266
5 |
6 | max-line-length=100
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "flownet2"]
2 | path = flownet2
3 | url = https://github.com/NVIDIA/flownet2-pytorch.git
4 | [submodule "codes/models/flownet2"]
5 | path = codes/models/flownet2
6 | url = https://github.com/neonbjb/flownet2-pytorch.git
7 |
--------------------------------------------------------------------------------
/Setup DLAS.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/).
3 | ::
4 | :: Copyright 2022 Sygil-Dev team.
5 | :: This program is free software: you can redistribute it and/or modify
6 | :: it under the terms of the GNU Affero General Public License as published by
7 | :: the Free Software Foundation, either version 3 of the License, or
8 | :: (at your option) any later version.
9 | ::
10 | :: This program is distributed in the hope that it will be useful,
11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | :: GNU Affero General Public License for more details.
14 | ::
15 | :: You should have received a copy of the GNU Affero General Public License
16 | :: along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | :: Run all commands using this script's directory as the working directory
18 | cd %~dp0
19 |
20 | :: Read the first line of environment.yaml (e.g. "name: DLAS") and take the second word after splitting on the ":" delimiter
21 | for /F "tokens=2 delims=: " %%i in (environment.yaml) DO (
22 | set v_conda_env_name=%%i
23 | goto EOL
24 | )
25 | :EOL
26 |
27 | echo Environment name is set as %v_conda_env_name% as per environment.yaml
28 |
29 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path
30 | IF EXIST custom-conda-path.txt (
31 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i
32 | )
33 |
34 | set INSTALL_ENV_DIR=%cd%\installer_files\env
35 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH%
36 |
37 | set v_paths=%INSTALL_ENV_DIR%
38 | set v_paths=%v_paths%;%ProgramData%\miniconda3
39 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3
40 | set v_paths=%v_paths%;%ProgramData%\anaconda3
41 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3
42 |
43 | for %%a in (%v_paths%) do (
44 | IF NOT "%v_custom_path%"=="" (
45 | set v_paths=%v_custom_path%;%v_paths%
46 | )
47 | )
48 |
49 | for %%a in (%v_paths%) do (
50 | if EXIST "%%a\Scripts\activate.bat" (
51 | SET v_conda_path=%%a
52 | echo anaconda3/miniconda3 detected in %%a
53 | goto :CONDA_FOUND
54 | )
55 | )
56 |
57 | IF "%v_conda_path%"=="" (
58 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html
59 | pause
60 | exit /b 1
61 | )
62 |
63 | :CONDA_FOUND
64 | echo Found Anaconda
65 |
66 | :SKIP_RESTORE
67 | call "%v_conda_path%\Scripts\activate.bat"
68 | echo %v_conda_env_name%
69 |
70 | call conda env create --name "%v_conda_env_name%" -f environment.yaml
71 |
72 |
73 |
74 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%"
75 |
76 | :PROMPT
77 | python codes/scripts/post_install.py
78 | pause
--------------------------------------------------------------------------------
/Start DLAS.cmd:
--------------------------------------------------------------------------------
1 | @echo off
2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/).
3 | ::
4 | :: Copyright 2022 Sygil-Dev team.
5 | :: This program is free software: you can redistribute it and/or modify
6 | :: it under the terms of the GNU Affero General Public License as published by
7 | :: the Free Software Foundation, either version 3 of the License, or
8 | :: (at your option) any later version.
9 | ::
10 | :: This program is distributed in the hope that it will be useful,
11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | :: GNU Affero General Public License for more details.
14 | ::
15 | :: You should have received a copy of the GNU Affero General Public License
16 | :: along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | :: Run all commands using this script's directory as the working directory
18 | cd %~dp0
19 |
20 | :: The conda environment name is hardcoded here rather than read from environment.yaml
21 | set v_conda_env_name="DLAS"
22 |
23 |
24 | echo Environment name is set as %v_conda_env_name%
25 |
26 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path
27 | IF EXIST custom-conda-path.txt (
28 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i
29 | )
30 |
31 | set INSTALL_ENV_DIR=%cd%\installer_files\env
32 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH%
33 |
34 | set v_paths=%INSTALL_ENV_DIR%
35 | set v_paths=%v_paths%;%ProgramData%\miniconda3
36 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3
37 | set v_paths=%v_paths%;%ProgramData%\anaconda3
38 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3
39 |
40 | for %%a in (%v_paths%) do (
41 | IF NOT "%v_custom_path%"=="" (
42 | set v_paths=%v_custom_path%;%v_paths%
43 | )
44 | )
45 |
46 | for %%a in (%v_paths%) do (
47 | if EXIST "%%a\Scripts\activate.bat" (
48 | SET v_conda_path=%%a
49 | echo anaconda3/miniconda3 detected in %%a
50 | goto :CONDA_FOUND
51 | )
52 | )
53 |
54 | IF "%v_conda_path%"=="" (
55 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html
56 | pause
57 | exit /b 1
58 | )
59 |
60 | :CONDA_FOUND
61 | echo Starting conda environment %v_conda_env_name% from %v_conda_path%
62 |
63 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%"
64 |
65 | :START_GUI
66 | python codes/configuration_gui.py
67 |
68 | ::cmd /k
69 |
--------------------------------------------------------------------------------
/Start Training Monitor.cmd:
--------------------------------------------------------------------------------
1 | @echo off
2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/).
3 | ::
4 | :: Copyright 2022 Sygil-Dev team.
5 | :: This program is free software: you can redistribute it and/or modify
6 | :: it under the terms of the GNU Affero General Public License as published by
7 | :: the Free Software Foundation, either version 3 of the License, or
8 | :: (at your option) any later version.
9 | ::
10 | :: This program is distributed in the hope that it will be useful,
11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | :: GNU Affero General Public License for more details.
14 | ::
15 | :: You should have received a copy of the GNU Affero General Public License
16 | :: along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | :: Run all commands using this script's directory as the working directory
18 | cd %~dp0
19 |
20 | :: The conda environment name is hardcoded here rather than read from environment.yaml
21 | set v_conda_env_name="DLAS"
22 |
23 |
24 | echo Environment name is set as %v_conda_env_name%
25 |
26 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path
27 | IF EXIST custom-conda-path.txt (
28 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i
29 | )
30 |
31 | set INSTALL_ENV_DIR=%cd%\installer_files\env
32 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH%
33 |
34 | set v_paths=%INSTALL_ENV_DIR%
35 | set v_paths=%v_paths%;%ProgramData%\miniconda3
36 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3
37 | set v_paths=%v_paths%;%ProgramData%\anaconda3
38 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3
39 |
40 | for %%a in (%v_paths%) do (
41 | IF NOT "%v_custom_path%"=="" (
42 | set v_paths=%v_custom_path%;%v_paths%
43 | )
44 | )
45 |
46 | for %%a in (%v_paths%) do (
47 | if EXIST "%%a\Scripts\activate.bat" (
48 | SET v_conda_path=%%a
49 | echo anaconda3/miniconda3 detected in %%a
50 | goto :CONDA_FOUND
51 | )
52 | )
53 |
54 | IF "%v_conda_path%"=="" (
55 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html
56 | pause
57 | exit /b 1
58 | )
59 |
60 | :CONDA_FOUND
61 | echo Starting conda environment %v_conda_env_name% from %v_conda_path%
62 |
63 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%"
64 |
65 | :START_GUI
66 | python codes/scripts/start_tensorboard.py
67 |
68 | ::cmd /k
69 |
--------------------------------------------------------------------------------
/codes/data/audio/preprocessed_mel_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn.functional as F
7 | import torch.utils.data
8 | import torchaudio
9 | import torchvision
10 | from tqdm import tqdm
11 |
12 | from utils.util import opt_get
13 |
14 |
15 | class PreprocessedMelDataset(torch.utils.data.Dataset):
16 |
17 | def __init__(self, opt):
18 | path = opt['path']
19 |         cache_path = opt['cache_path']  # Cache building will fail when multiple paths are specified; cache_path must be provided in that case.
20 | if os.path.exists(cache_path):
21 | self.paths = torch.load(cache_path)
22 | else:
23 | print("Building cache..")
24 | path = Path(path)
25 | self.paths = [str(p) for p in path.rglob("*.npz")]
26 | torch.save(self.paths, cache_path)
27 | self.pad_to = opt_get(opt, ['pad_to_samples'], 10336)
28 | self.squeeze = opt_get(opt, ['should_squeeze'], False)
29 |
30 | def __getitem__(self, index):
31 | with np.load(self.paths[index]) as npz_file:
32 | mel = torch.tensor(npz_file['arr_0'])
33 | assert mel.shape[-1] <= self.pad_to
34 | if self.squeeze:
35 | mel = mel.squeeze()
36 | padding_needed = self.pad_to - mel.shape[-1]
37 | mask = torch.zeros_like(mel)
38 | if padding_needed > 0:
39 | mel = F.pad(mel, (0,padding_needed))
40 | mask = F.pad(mask, (0,padding_needed), value=1)
41 |
42 | output = {
43 | 'mel': mel,
44 | 'mel_lengths': torch.tensor(mel.shape[-1]),
45 | 'mask': mask,
46 | 'mask_lengths': torch.tensor(mask.shape[-1]),
47 | 'path': self.paths[index],
48 | }
49 | return output
50 |
51 | def __len__(self):
52 | return len(self.paths)
53 |
54 |
55 | if __name__ == '__main__':
56 | params = {
57 | 'mode': 'preprocessed_mel',
58 | 'path': 'Y:\\separated\\large_mel_cheaters',
59 | 'cache_path': 'Y:\\separated\\large_mel_cheaters_win.pth',
60 | 'pad_to_samples': 646,
61 | 'phase': 'train',
62 | 'n_workers': 0,
63 | 'batch_size': 16,
64 | }
65 | from data import create_dataset, create_dataloader
66 |
67 | ds = create_dataset(params)
68 | dl = create_dataloader(ds, params)
69 | i = 0
70 | for b in tqdm(dl):
71 | #pass
72 | torchvision.utils.save_image((b['mel'].unsqueeze(1)+1)/2, f'{i}.png')
73 | i += 1
74 | if i > 20:
75 | break
76 |
--------------------------------------------------------------------------------
/codes/data/audio/wav_aug.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import torch
4 | import torchaudio.sox_effects
5 |
6 | from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
7 |
8 |
9 | # Returns random double on [l,h] as a string
10 | def rdstr(l=0,h=1):
11 | assert h > l
12 | i=h-l
13 | return str(random.random() * i + l)
14 |
15 |
16 | # Returns a randint on [s,e] as a string
17 | def rdi(e, s=0):
18 | return str(random.randint(s,e))
19 |
20 |
21 | class WavAugmentor:
22 | def __init__(self):
23 | pass
24 |
25 | def augment(self, wav, sample_rate):
26 | speed_effect = ['speed', rdstr(.8, 1)]
27 | '''
28 | Band effects are disabled until I can audit them better.
29 | band_effects = [
30 | ['reverb', '-w'],
31 | ['reverb'],
32 | ['band', rdi(8000, 3000), rdi(1000, 100)],
33 | ['bandpass', rdi(8000, 3000), rdi(1000, 100)],
34 | ['bass', rdi(20,-20)],
35 | ['treble', rdi(20,-20)],
36 | ['dither'],
37 | ['equalizer', rdi(3000, 100), rdi(1000, 100), rdi(10, -10)],
38 | ['hilbert'],
39 | ['sinc', '3k'],
40 | ['sinc', '-4k'],
41 | ['sinc', '3k-4k']
42 | ]
43 | band_effect = random.choice(band_effects)
44 | '''
45 | volume_effects = [
46 | ['loudness', rdi(10,-2)],
47 | ['overdrive', rdi(20,0), rdi(20,0)],
48 | ]
49 | vol_effect = random.choice(volume_effects)
50 | effects = [speed_effect, vol_effect]
51 | out, sr = torchaudio.sox_effects.apply_effects_tensor(wav, sample_rate, effects)
52 | # Add a variable amount of noise
53 | out = out + torch.rand_like(out) * random.random() * .03
54 | return out
55 |
56 |
57 | if __name__ == '__main__':
58 | sample, _ = load_wav_to_torch('obama1.wav')
59 | sample = sample / 32768.0
60 | aug = WavAugmentor()
61 | for j in range(10):
62 | out = aug.augment(sample, 24000)
63 | torchaudio.save(f'out{j}.wav', out, 24000)
--------------------------------------------------------------------------------
/codes/data/combined_dataset.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from data import create_dataset
3 |
4 |
5 | # Simple composite dataset that combines multiple other datasets.
6 | # Assumes that the datasets output dicts.
7 | class CombinedDataset(torch.utils.data.Dataset):
8 | def __init__(self, opt):
9 | self.datasets = {}
10 | for k, v in opt.items():
11 | if not isinstance(v, dict):
12 | continue
13 |             # Scale & phase get injected by options.py.
14 | v['scale'] = opt['scale']
15 | v['phase'] = opt['phase']
16 | self.datasets[k] = create_dataset(v)
17 | self.items_fetched = 0
18 |
19 | def __getitem__(self, i):
20 | self.items_fetched += 1
21 | output = {}
22 | for name, dataset in self.datasets.items():
23 | prefix = ""
24 | # 'default' dataset gets no prefix, other ones get `key_`
25 | if name != 'default':
26 | prefix = name + "_"
27 |
28 | data = dataset[i % len(dataset)]
29 | for k, v in data.items():
30 | output[prefix + k] = v
31 | return output
32 |
33 | def __len__(self):
34 | return max(len(d) for d in self.datasets.values())
--------------------------------------------------------------------------------
/codes/data/data_sampler.py:
--------------------------------------------------------------------------------
1 | """
2 | Modified from torch.utils.data.distributed.DistributedSampler
 3 | Supports enlarging the dataset for *iteration-oriented* training, which saves the time spent
 4 | restarting the dataloader after each epoch.
5 | """
6 | import math
7 | import torch
8 | from torch.utils.data.sampler import Sampler
9 | import torch.distributed as dist
10 |
11 |
12 | class DistIterSampler(Sampler):
13 | """Sampler that restricts data loading to a subset of the dataset.
14 |
15 | It is especially useful in conjunction with
16 |     :class:`torch.nn.parallel.DistributedDataParallel`. In such a case, each
17 |     process can pass a DistIterSampler instance as a DataLoader sampler,
18 |     and load a subset of the original dataset that is exclusive to it.
19 |
20 | .. note::
21 | Dataset is assumed to be of constant size.
22 |
23 | Arguments:
24 | dataset: Dataset used for sampling.
25 | num_replicas (optional): Number of processes participating in
26 | distributed training.
27 | rank (optional): Rank of the current process within num_replicas.
28 | """
29 |
30 | def __init__(self, dataset, num_replicas=None, rank=None, ratio=100):
31 | if num_replicas is None:
32 | if not dist.is_available():
33 | raise RuntimeError("Requires distributed package to be available")
34 | num_replicas = dist.get_world_size()
35 | if rank is None:
36 | if not dist.is_available():
37 | raise RuntimeError("Requires distributed package to be available")
38 | rank = dist.get_rank()
39 | self.dataset = dataset
40 | self.num_replicas = num_replicas
41 | self.rank = rank
42 | self.epoch = 0
43 | self.num_samples = int(math.ceil(len(self.dataset) * ratio / self.num_replicas))
44 | self.total_size = self.num_samples * self.num_replicas
45 |
46 | def __iter__(self):
47 | # deterministically shuffle based on epoch
48 | g = torch.Generator()
49 | g.manual_seed(self.epoch)
50 | indices = torch.randperm(self.total_size, generator=g).tolist()
51 |
52 | dsize = len(self.dataset)
53 | indices = [v % dsize for v in indices]
54 |
55 | # subsample
56 | indices = indices[self.rank:self.total_size:self.num_replicas]
57 | assert len(indices) == self.num_samples
58 |
59 | return iter(indices)
60 |
61 | def __len__(self):
62 | return self.num_samples
63 |
64 | def set_epoch(self, epoch):
65 | self.epoch = epoch
66 |
--------------------------------------------------------------------------------
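Usage sketch for DistIterSampler above (editor's note, not part of the repository): the sampler is meant to be handed to a DataLoader, with set_epoch() called each epoch to reseed the deterministic shuffle. The single-process num_replicas/rank values and the toy dataset are illustrative assumptions.

    # Hypothetical single-process example; in real runs num_replicas/rank come from torch.distributed.
    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from data.data_sampler import DistIterSampler  # assumes codes/ is on PYTHONPATH

    dataset = TensorDataset(torch.randn(100, 4))
    sampler = DistIterSampler(dataset, num_replicas=1, rank=0, ratio=2)
    loader = DataLoader(dataset, batch_size=8, sampler=sampler)

    for epoch in range(2):
        sampler.set_epoch(epoch)     # deterministic per-epoch reshuffle
        for (batch,) in loader:
            pass                     # training step would go here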
/codes/data/images/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/data/images/__init__.py
--------------------------------------------------------------------------------
/codes/data/images/chunk_with_reference.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | from data import util
3 | import torch
4 | import numpy as np
5 |
6 | # Iterable that reads all the images in a directory that contains a reference image, tile images and center coordinates.
7 | from utils.util import opt_get
8 |
9 |
10 | class ChunkWithReference:
11 | def __init__(self, opt, path):
12 | self.path = path.path
13 | self.tiles, _ = util.find_files_of_type('img', self.path)
14 | self.need_metadata = opt_get(opt, ['strict'], False) or opt_get(opt, ['needs_metadata'], False)
15 | self.need_ref = opt_get(opt, ['need_ref'], False)
16 | if 'ignore_first' in opt.keys():
17 | self.tiles = self.tiles[opt['ignore_first']:]
18 |
19 | # Odd failures occur at times. Rather than crashing, report the error and just return zeros.
20 | def read_image_or_get_zero(self, img_path):
21 | img = util.read_img(None, img_path, rgb=True)
22 | if img is None:
23 |             return np.zeros((128, 128, 3))
24 | return img
25 |
26 | def __getitem__(self, item):
27 | tile = self.read_image_or_get_zero(self.tiles[item])
28 | if self.need_ref and osp.exists(osp.join(self.path, "ref.jpg")):
29 | tile_id = int(osp.splitext(osp.basename(self.tiles[item]))[0])
30 | ref = self.read_image_or_get_zero(osp.join(self.path, "ref.jpg"))
31 | if self.need_metadata:
32 | centers = torch.load(osp.join(self.path, "centers.pt"))
33 | if tile_id in centers.keys():
34 | center, tile_width = centers[tile_id]
35 | else:
36 | print("Could not find the given tile id in the accompanying centers.pt. This generally means that "
37 | "centers.pt was overwritten at some point e.g. by duplicate data. If you don't care about tile "
38 |                           "centers, consider passing strict=false to the dataset options. (Note: you must re-build your "
39 | "caches for this setting change to take effect.)")
40 | raise FileNotFoundError(tile_id, self.tiles[item])
41 | else:
42 | center = torch.tensor([128, 128], dtype=torch.long)
43 | tile_width = 256
44 | mask = np.full(tile.shape[:2] + (1,), fill_value=.1, dtype=tile.dtype)
45 | mask[center[0] - tile_width // 2:center[0] + tile_width // 2, center[1] - tile_width // 2:center[1] + tile_width // 2] = 1
46 | else:
47 | ref = np.zeros_like(tile)
48 | mask = np.zeros(tile.shape[:2] + (1,))
49 | center = (0,0)
50 |
51 | return tile, ref, center, mask, self.tiles[item]
52 |
53 | def __len__(self):
54 | return len(self.tiles)
55 |
--------------------------------------------------------------------------------
/codes/data/images/random_dataset.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 |
4 |
5 | # Dataset that feeds random data into the state. Can be useful for testing or demo purposes without actual data.
6 | class RandomDataset(Dataset):
7 | def __init__(self, opt):
8 | self.hq_shape = tuple(opt['hq_shape'])
9 | self.lq_shape = tuple(opt['lq_shape'])
10 |
11 | def __getitem__(self, item):
12 | return {'lq': torch.rand(self.lq_shape), 'hq': torch.rand(self.hq_shape),
13 | 'LQ_path': '', 'GT_path': ''}
14 |
15 | def __len__(self):
16 | # Arbitrary
17 | return 1024 * 1024
18 |
--------------------------------------------------------------------------------
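Quick illustration of the dict contract RandomDataset above emits (editor's note, not part of the repository); the shapes are arbitrary and the import assumes codes/ is on PYTHONPATH.

    from data.images.random_dataset import RandomDataset

    ds = RandomDataset({'hq_shape': (3, 64, 64), 'lq_shape': (3, 16, 16)})
    sample = ds[0]
    print(sample['hq'].shape, sample['lq'].shape)  # torch.Size([3, 64, 64]) torch.Size([3, 16, 16])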
/codes/data/images/zip_file_dataset.py:
--------------------------------------------------------------------------------
1 | import PIL.Image
2 | import zipfile
3 | import torch
4 | import torchvision
5 | from torch.utils.data import DataLoader
6 | from torchvision.transforms import Compose, ToTensor, Normalize, Resize
7 |
8 |
9 | class ZipFileDataset(torch.utils.data.Dataset):
10 | def __init__(self, opt):
11 | self.path = opt['path']
12 | zip = zipfile.ZipFile(self.path)
13 | self.all_files = list(zip.namelist())
14 | self.resolution = opt['resolution']
15 | self.paired_mode = opt['paired_mode']
16 | self.transforms = Compose([ToTensor(),
17 | Resize(self.resolution),
18 | Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
19 | ])
20 | self.zip = None
21 |
22 | def __len__(self):
23 | return len(self.all_files)
24 |
25 | # Loaded on the fly because ZipFile does not tolerate pickling.
26 | def get_zip(self):
27 | if self.zip is None:
28 | self.zip = zipfile.ZipFile(self.path)
29 | return self.zip
30 |
31 | def load_image(self, path):
32 | file = self.get_zip().open(path, 'r')
33 | pilimg = PIL.Image.open(file)
34 | tensor = self.transforms(pilimg)
35 | return tensor
36 |
37 | def __getitem__(self, i):
38 | try:
39 | fname = self.all_files[i]
40 | out = {
41 | 'hq': self.load_image(fname),
42 | 'HQ_path': fname,
43 | 'has_alt': self.paired_mode
44 | }
45 | if self.paired_mode:
46 | if fname.endswith('0.jpg'):
47 | aname = fname.replace('0.jpg', '1.jpg')
48 | else:
49 | aname = fname.replace('1.jpg', '0.jpg')
50 | out['alt_hq'] = self.load_image(aname)
51 |         except Exception:
52 | print(f"Error loading {fname} from zipfile. Attempting to recover by loading next element.")
53 | return self[i+1]
54 | return out
55 |
56 | if __name__ == '__main__':
57 | opt = {
58 | 'path': 'E:\\4k6k\\datasets\\images\\youtube-imagenet-paired\\output.zip',
59 | 'resolution': 224,
60 | 'paired_mode': True
61 | }
62 | dataset = ZipFileDataset(opt)
63 | print(len(dataset))
64 | loader = DataLoader(dataset, shuffle=True)
65 | for i, d in enumerate(loader):
66 | torchvision.utils.save_image(d['hq'], f'{i}_hq.png')
67 | torchvision.utils.save_image(d['alt_hq'], f'{i}_althq.png')
68 |
69 |
--------------------------------------------------------------------------------
/codes/data/text/hf_datasets_wrapper.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | import datasets
3 |
4 |
5 | class HfDataset(Dataset):
6 | """
7 | Simple wrapper for a HuggingFace dataset that can re-map keys if desired.
8 | """
9 | def __init__(self, corpi, cache_path=None, key_maps=None, dataset_spec_key='train'):
10 | self.hfd = []
11 | for corpus in corpi:
12 | dataset_name, config = corpus
13 | if config == '' or config == 'None':
14 | config = None
15 | self.hfd.append(datasets.load_dataset(dataset_name, config, cache_dir=cache_path)[dataset_spec_key])
16 | self.key_maps = key_maps
17 |
18 | def __getitem__(self, item):
19 | for dataset in self.hfd:
20 | if item < len(dataset):
21 | val = dataset[item]
22 | if self.key_maps is None:
23 | return val
24 | else:
25 | return {k: val[v] for k, v in self.key_maps.items()}
26 | else:
27 | item -= len(dataset)
28 | raise IndexError()
29 |
30 | def __len__(self):
31 | return sum([len(h) for h in self.hfd])
32 |
33 |
34 | if __name__ == '__main__':
35 | d = HfDataset([['wikipedia', '20200501.en'], ['bookcorpus', '']], dataset_spec_key='train', cache_path='Z:\\huggingface_datasets\\cache')
36 | print(d[5])
37 |
--------------------------------------------------------------------------------
/codes/data/torch_dataset.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | import torchvision.transforms as T
3 | from torchvision import datasets
4 |
5 | # Wrapper for basic pytorch datasets which re-wraps them into a format usable by ExtensibleTrainer.
6 | from data.images.cifar import CIFAR100, CIFAR10
7 | from utils.util import opt_get
8 |
9 |
10 | class TorchDataset(Dataset):
11 | def __init__(self, opt):
12 | DATASET_MAP = {
13 | "mnist": datasets.MNIST,
14 | "fmnist": datasets.FashionMNIST,
15 | "cifar10": CIFAR10,
16 | "cifar100": CIFAR100,
17 | "imagenet": datasets.ImageNet,
18 | "imagefolder": datasets.ImageFolder
19 | }
20 | normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
21 | if opt_get(opt, ['random_crop'], False):
22 | transforms = [
23 | T.RandomResizedCrop(opt['image_size']),
24 | T.RandomHorizontalFlip(),
25 | T.ToTensor(),
26 | normalize,
27 | ]
28 | else:
29 | transforms = [
30 | T.Resize(opt['image_size']),
31 | T.CenterCrop(opt['image_size']),
32 | T.RandomHorizontalFlip(),
33 | T.ToTensor(),
34 | normalize,
35 | ]
36 | transforms = T.Compose(transforms)
37 | self.dataset = DATASET_MAP[opt['dataset']](transform=transforms, **opt['kwargs'])
38 | self.len = opt_get(opt, ['fixed_len'], len(self.dataset))
39 | self.offset = opt_get(opt, ['offset'], 0)
40 |
41 | def __getitem__(self, item):
42 | item = self.dataset[item+self.offset]
43 | if len(item) == 2:
44 | underlying_item, lbl = item
45 | coarselbl = None
46 | elif len(item) == 3:
47 | underlying_item, lbl, coarselbl = item
48 | else:
49 | raise NotImplementedError
50 | return {'lq': underlying_item, 'hq': underlying_item, 'labels': lbl, 'coarse_labels': coarselbl,
51 | 'LQ_path': str(item), 'GT_path': str(item)}
52 |
53 | def __len__(self):
54 | return self.len-self.offset
55 |
56 | if __name__ == '__main__':
57 | opt = {
58 | 'flip': True,
59 | 'crop_sz': None,
60 | 'dataset': 'cifar100',
61 | 'image_size': 32,
62 | 'normalize': True,
63 | 'kwargs': {
64 | 'root': 'E:\\4k6k\\datasets\\images\\cifar100',
65 | 'download': True
66 | }
67 | }
68 | set = TorchDataset(opt)
69 | j = set[0]
70 |
--------------------------------------------------------------------------------
/codes/data/zero_pad_dict_collate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
5 | class ZeroPadDictCollate():
6 | """
7 | Given a list of dictionary outputs with torch.Tensors from a Dataset, iterates through each one, finds the longest
8 | tensor, and zero pads all the other tensors together.
9 | """
10 | def collate_tensors(self, batch, key):
11 | result = []
12 | largest_dims = [0 for _ in range(len(batch[0][key].shape))]
13 | for elem in batch:
14 | result.append(elem[key])
15 | largest_dims = [max(current_largest, new_consideration) for current_largest, new_consideration in zip(largest_dims, elem[key].shape)]
16 | # Now pad each tensor by the largest dimension.
17 | for i in range(len(result)):
18 | padding_tuple = ()
19 | for d in range(len(largest_dims)):
20 | padding_needed = largest_dims[d] - result[i].shape[d]
21 | assert padding_needed >= 0
22 | padding_tuple = (0, padding_needed) + padding_tuple
23 | result[i] = F.pad(result[i], padding_tuple)
24 |
25 | return torch.stack(result, dim=0)
26 |
27 |
28 | def collate_into_list(self, batch, key):
29 | result = []
30 | for elem in batch:
31 | result.append(elem[key])
32 | return result
33 |
34 | def __call__(self, batch):
35 | first_dict = batch[0]
36 | collated = {}
37 | for key in first_dict.keys():
38 | if isinstance(first_dict[key], torch.Tensor):
39 | if len(first_dict[key].shape) > 0:
40 | collated[key] = self.collate_tensors(batch, key)
41 | else:
42 | collated[key] = torch.stack([b[key] for b in batch])
43 | else:
44 | collated[key] = self.collate_into_list(batch, key)
45 | return collated
--------------------------------------------------------------------------------
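Usage sketch for ZeroPadDictCollate above (editor's note, not part of the repository): it is designed to be passed as a DataLoader collate_fn so that variable-length tensors in dict-style samples are zero-padded to a common shape, while non-tensor values are gathered into lists. The toy dataset is an illustrative assumption; the import assumes codes/ is on PYTHONPATH.

    import torch
    from torch.utils.data import DataLoader, Dataset
    from data.zero_pad_dict_collate import ZeroPadDictCollate

    class ToyVarLenDataset(Dataset):
        def __len__(self):
            return 8
        def __getitem__(self, i):
            return {'mel': torch.randn(80, 100 + i), 'path': f'clip_{i}.npz'}

    loader = DataLoader(ToyVarLenDataset(), batch_size=4, collate_fn=ZeroPadDictCollate())
    batch = next(iter(loader))
    print(batch['mel'].shape)  # torch.Size([4, 80, 103]) -- padded to the longest item in the batch
    print(batch['path'])       # strings collected into a plain list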
/codes/maybe_bnb.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing import Optional,Literal
3 |
 4 | # Explicitly leave these empty; an AttributeError will be raised if they are used before populate() initialises them.
5 | class nn: pass
6 | class optim: pass
7 |
8 | def populate(adam=True, adamw=True, linear=False, embedding: Optional[Literal["STABLE", "NORMAL"]]="NORMAL"):
9 | nn.Linear = torch.nn.Linear
10 | nn.Embedding = torch.nn.Embedding
11 | optim.Adam = torch.optim.Adam # this does nothing tbh
12 | optim.AdamW = torch.optim.AdamW
13 | #
14 | try:
15 | import bitsandbytes as bnb
16 | except ImportError:
17 | return print("WARNING: bnb was missing, not using 8bit for anything!")
18 | #
19 | if adam: optim.Adam = bnb.optim.Adam8bit
20 | if adamw: optim.AdamW = bnb.optim.AdamW8bit
21 | if linear: nn.Linear = bnb.nn.Linear8bitLt
22 | if embedding:
23 | nn.Embedding = bnb.nn.StableEmbedding if embedding == 'STABLE' else bnb.nn.modules.Embedding
24 |
25 |
--------------------------------------------------------------------------------
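Usage sketch for maybe_bnb above (editor's note, not part of the repository): other modules in this dump (e.g. mel2vec_codes_gpt.py) import it as `mbnb` and build layers through mbnb.nn/mbnb.optim, so populate() has to run once at startup; the exact call site below is an assumption.

    # populate() swaps in bitsandbytes 8-bit variants when the package is installed,
    # otherwise it falls back to the stock torch classes.
    import maybe_bnb as mbnb

    mbnb.populate(adam=True, adamw=True, linear=False, embedding='NORMAL')

    layer = mbnb.nn.Linear(512, 512)                      # torch.nn.Linear (linear=False)
    emb = mbnb.nn.Embedding(1000, 512)                    # torch or bitsandbytes embedding
    opt = mbnb.optim.AdamW(layer.parameters(), lr=1e-4)   # AdamW8bit when bitsandbytes is available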
/codes/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/__init__.py
--------------------------------------------------------------------------------
/codes/models/audio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/__init__.py
--------------------------------------------------------------------------------
/codes/models/audio/asr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/asr/__init__.py
--------------------------------------------------------------------------------
/codes/models/audio/music/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/music/__init__.py
--------------------------------------------------------------------------------
/codes/models/audio/music/encoders.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 | from transformers import GPT2Config, GPT2Model
5 |
6 | from models.arch_util import AttentionBlock, ResBlock
7 | from models.audio.tts.lucidrains_dvae import DiscreteVAE
8 | from trainer.networks import register_model
9 | from utils.util import opt_get, ceil_multiple, print_network
10 |
11 |
12 | class ResEncoder16x(nn.Module):
13 | def __init__(self,
14 | spec_dim,
15 | hidden_dim,
16 | embedding_dim,
17 | checkpointing_enabled=True,
18 | ):
19 | super().__init__()
20 | attn = []
21 | def edim(m):
22 | dd = min(spec_dim + m * 128, hidden_dim)
23 | return ceil_multiple(dd, 8)
24 | self.downsampler = nn.Sequential(
25 | ResBlock(spec_dim, out_channels=edim(2), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled),
26 | ResBlock(edim(2), out_channels=edim(3), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled),
27 | ResBlock(edim(3), out_channels=edim(3), use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled),
28 | ResBlock(edim(3), out_channels=edim(4), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled),
29 | ResBlock(edim(4), out_channels=edim(4), use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled),
30 | ResBlock(edim(4), out_channels=hidden_dim, use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled))
31 | self.encoder = nn.Sequential(
32 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled),
33 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled),
34 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled),
35 | nn.GroupNorm(8, hidden_dim),
36 | nn.SiLU(),
37 | nn.Conv1d(hidden_dim, embedding_dim, 1),
38 | nn.Tanh(),
39 | )
40 |
41 | def forward(self, x):
42 | h = self.downsampler(x)
43 | h = self.encoder(h)
44 | return h
45 |
--------------------------------------------------------------------------------
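Shape-check sketch for ResEncoder16x above (editor's note, not part of the repository): four strided ResBlocks give 16x temporal downsampling, so a (batch, spec_dim, time) spectrogram should come out as (batch, embedding_dim, time/16) with tanh-bounded values. The dimensions are illustrative, checkpointing is disabled only so the snippet runs outside a training graph, and the import assumes codes/ plus the repo's dependencies are available.

    import torch
    from models.audio.music.encoders import ResEncoder16x

    enc = ResEncoder16x(spec_dim=256, hidden_dim=1024, embedding_dim=512,
                        checkpointing_enabled=False)
    mel = torch.randn(2, 256, 320)   # (batch, spec_dim, time)
    latents = enc(mel)
    print(latents.shape)             # expected: torch.Size([2, 512, 20])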
/codes/models/audio/music/m2v_code_to_mel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from models.arch_util import ResBlock, AttentionBlock
6 | from models.audio.music.flat_diffusion import MultiGroupEmbedding
7 | from trainer.networks import register_model
8 | from utils.util import checkpoint
9 |
10 |
11 | class Code2Mel(nn.Module):
12 | def __init__(self, out_dim=256, base_dim=1024, num_tokens=16, num_groups=4, dropout=.1):
13 | super().__init__()
14 | self.emb = MultiGroupEmbedding(num_tokens, num_groups, base_dim)
15 | self.base_blocks = nn.Sequential(ResBlock(base_dim, dropout, dims=1),
16 | AttentionBlock(base_dim, num_heads=base_dim//64),
17 | ResBlock(base_dim, dropout, dims=1))
18 | l2dim = base_dim-256
19 | self.l2_up_block = nn.Conv1d(base_dim, l2dim, kernel_size=5, padding=2)
20 | self.l2_blocks = nn.Sequential(ResBlock(l2dim, dropout, kernel_size=5, dims=1),
21 | AttentionBlock(l2dim, num_heads=base_dim//64),
22 | ResBlock(l2dim, dropout, kernel_size=5, dims=1),
23 | AttentionBlock(l2dim, num_heads=base_dim//64),
24 | ResBlock(l2dim, dropout, dims=1),
25 | ResBlock(l2dim, dropout, dims=1))
26 | l3dim = l2dim-256
27 | self.l3_up_block = nn.Conv1d(l2dim, l3dim, kernel_size=5, padding=2)
28 | self.l3_blocks = nn.Sequential(ResBlock(l3dim, dropout, kernel_size=5, dims=1),
29 | AttentionBlock(l3dim, num_heads=base_dim//64),
30 | ResBlock(l3dim, dropout, kernel_size=5, dims=1),
31 | ResBlock(l3dim, dropout, dims=1))
32 | self.final_block = nn.Conv1d(l3dim, out_dim, kernel_size=3, padding=1)
33 |
34 | def forward(self, codes, target):
35 | with torch.autocast(codes.device.type):
36 | h = self.emb(codes).permute(0,2,1)
37 | h = checkpoint(self.base_blocks, h)
38 | h = F.interpolate(h, scale_factor=2, mode='linear')
39 | h = self.l2_up_block(h)
40 | h = checkpoint(self.l2_blocks, h)
41 | h = F.interpolate(h, size=target.shape[-1], mode='linear')
42 | h = self.l3_up_block(h)
43 | h = checkpoint(self.l3_blocks, h.float())
44 | pred = self.final_block(h)
45 | return F.mse_loss(pred, target), pred
46 |
47 |
48 | @register_model
49 | def register_code2mel(opt_net, opt):
50 | return Code2Mel(**opt_net['kwargs'])
51 |
52 |
53 | if __name__ == '__main__':
54 | model = Code2Mel()
55 | codes = torch.randint(0,16, (2,200,4))
56 | target = torch.randn(2,256,804)
57 | model(codes, target)
--------------------------------------------------------------------------------
/codes/models/audio/music/mel2vec_codes_gpt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | import maybe_bnb as mbnb
5 | from transformers import GPT2Config, GPT2Model
6 |
7 | from trainer.networks import register_model
8 | from utils.util import opt_get
9 |
10 |
11 | class Mel2VecCodesGpt(nn.Module):
12 | def __init__(self, dim, layers, num_groups=8, num_vectors=8):
13 | super().__init__()
14 |
15 | self.num_groups = num_groups
16 |
17 | self.config = GPT2Config(vocab_size=1, n_positions=8192, n_embd=dim, n_layer=layers, n_head=dim//64,
18 | n_inner=dim*2)
19 | self.gpt = GPT2Model(self.config)
20 | del self.gpt.wte # Unused, we'll do our own embeddings.
21 | # nn.Embedding
22 | self.embeddings = nn.ModuleList([mbnb.nn.Embedding(num_vectors, dim//num_groups) for _ in range(num_groups)])
23 | self.heads = nn.ModuleList([mbnb.nn.Linear(dim, num_vectors) for _ in range(num_groups)])
24 |
25 | def forward(self, codes):
26 | assert codes.shape[-1] == self.num_groups
27 |
28 | inputs = codes[:, :-1]
29 | targets = codes[:, 1:]
30 |
31 | h = [embedding(inputs[:, :, i]) for i, embedding in enumerate(self.embeddings)]
32 | h = torch.cat(h, dim=-1)
33 | h = self.gpt(inputs_embeds=h, return_dict=True).last_hidden_state
34 |
35 | losses = 0
36 | for i, head in enumerate(self.heads):
37 | logits = head(h).permute(0,2,1)
38 | loss = F.cross_entropy(logits, targets[:,:,i])
39 | losses = losses + loss
40 |
41 | return losses / self.num_groups
42 |
43 |
44 | @register_model
45 | def register_music_gpt(opt_net, opt):
46 | return Mel2VecCodesGpt(**opt_get(opt_net, ['kwargs'], {}))
47 |
48 |
49 | if __name__ == '__main__':
50 | model = Mel2VecCodesGpt(512, 8)
51 | codes = torch.randint(0,8, (2,300,8))
52 | model(codes)
--------------------------------------------------------------------------------
/codes/models/audio/tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/tts/__init__.py
--------------------------------------------------------------------------------
/codes/models/audio/tts/random_latent_converter.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | import maybe_bnb as mbnb
7 |
8 | from trainer.networks import register_model
9 | from utils.util import opt_get
10 |
11 |
12 | def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5):
13 | if bias is not None:
14 | rest_dim = [1] * (input.ndim - bias.ndim - 1)
15 | return (
16 | F.leaky_relu(
17 | input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope
18 | )
19 | * scale
20 | )
21 | else:
22 | return F.leaky_relu(input, negative_slope=0.2) * scale
23 |
24 |
25 | class EqualLinear(nn.Module):
26 | def __init__(
27 | self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1
28 | ):
29 | super().__init__()
30 | self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
31 | if bias:
32 | self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
33 | else:
34 | self.bias = None
35 | self.scale = (1 / math.sqrt(in_dim)) * lr_mul
36 | self.lr_mul = lr_mul
37 |
38 | def forward(self, input):
39 | out = F.linear(input, self.weight * self.scale)
40 | out = fused_leaky_relu(out, self.bias * self.lr_mul)
41 | return out
42 |
43 |
44 | class RandomLatentConverter(nn.Module):
45 | def __init__(self, channels):
46 | super().__init__()
47 | self.layers = nn.Sequential(*[EqualLinear(channels, channels, lr_mul=.1) for _ in range(5)],
48 | mbnb.nn.Linear(channels, channels))
49 | self.channels = channels
50 |
51 | def forward(self, ref):
52 | r = torch.randn(ref.shape[0], self.channels, device=ref.device)
53 | y = self.layers(r)
54 | return y
55 |
56 |
57 | @register_model
58 | def register_random_latent_converter(opt_net, opt):
59 | return RandomLatentConverter(**opt_get(opt_net, ['kwargs'], {}))
60 |
61 |
62 | if __name__ == '__main__':
63 | model = RandomLatentConverter(512)
64 | model(torch.randn(5,512))
--------------------------------------------------------------------------------
/codes/models/audio/tts/tacotron2/LICENSE:
--------------------------------------------------------------------------------
1 | This directory contains works with the below licenses, which should be considered in addition
2 | to the base repository license.
3 |
4 | BSD 3-Clause License
5 |
6 | Copyright (c) 2018, NVIDIA Corporation
7 | All rights reserved.
8 |
9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 |
12 | * Redistributions of source code must retain the above copyright notice, this
13 | list of conditions and the following disclaimer.
14 |
15 | * Redistributions in binary form must reproduce the above copyright notice,
16 | this list of conditions and the following disclaimer in the documentation
17 | and/or other materials provided with the distribution.
18 |
19 | * Neither the name of the copyright holder nor the names of its
20 | contributors may be used to endorse or promote products derived from
21 | this software without specific prior written permission.
22 |
23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/codes/models/audio/tts/tacotron2/__init__.py:
--------------------------------------------------------------------------------
1 | from models.audio.tts.tacotron2.taco_utils import *
2 | from models.audio.tts.tacotron2.text import *
3 | from models.audio.tts.tacotron2.tacotron2 import *
4 | from models.audio.tts.tacotron2.stft import *
5 | from models.audio.tts.tacotron2.layers import *
6 | from models.audio.tts.tacotron2.loss import *
--------------------------------------------------------------------------------
/codes/models/audio/tts/tacotron2/audio_processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from scipy.signal import get_window
4 | import librosa.util as librosa_util
5 |
6 |
7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
8 | n_fft=800, dtype=np.float32, norm=None):
9 | """
10 | # from librosa 0.6
11 | Compute the sum-square envelope of a window function at a given hop length.
12 |
13 | This is used to estimate modulation effects induced by windowing
14 |     observations in short-time Fourier transforms.
15 |
16 | Parameters
17 | ----------
18 | window : string, tuple, number, callable, or list-like
19 | Window specification, as in `get_window`
20 |
21 | n_frames : int > 0
22 | The number of analysis frames
23 |
24 | hop_length : int > 0
25 | The number of samples to advance between frames
26 |
27 | win_length : [optional]
28 | The length of the window function. By default, this matches `n_fft`.
29 |
30 | n_fft : int > 0
31 | The length of each analysis frame.
32 |
33 | dtype : np.dtype
34 | The data type of the output
35 |
36 | Returns
37 | -------
38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
39 | The sum-squared envelope of the window function
40 | """
41 | if win_length is None:
42 | win_length = n_fft
43 |
44 | n = n_fft + hop_length * (n_frames - 1)
45 | x = np.zeros(n, dtype=dtype)
46 |
47 | # Compute the squared window at the desired length
48 | win_sq = get_window(window, win_length, fftbins=True)
49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2
50 | win_sq = librosa_util.pad_center(win_sq, n_fft)
51 |
52 | # Fill the envelope
53 | for i in range(n_frames):
54 | sample = i * hop_length
55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
56 | return x
57 |
58 |
59 | def griffin_lim(magnitudes, stft_fn, n_iters=30):
60 | """
61 | PARAMS
62 | ------
63 | magnitudes: spectrogram magnitudes
64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
65 | """
66 |
67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
68 | angles = angles.astype(np.float32)
69 | angles = torch.autograd.Variable(torch.from_numpy(angles))
70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
71 |
72 | for i in range(n_iters):
73 | _, angles = stft_fn.transform(signal)
74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
75 | return signal
76 |
77 |
78 | def dynamic_range_compression(x, C=1, clip_val=1e-5):
79 | """
80 | PARAMS
81 | ------
82 | C: compression factor
83 | """
84 | return torch.log(torch.clamp(x, min=clip_val) * C)
85 |
86 |
87 | def dynamic_range_decompression(x, C=1):
88 | """
89 | PARAMS
90 | ------
91 | C: compression factor used to compress
92 | """
93 | return torch.exp(x) / C
--------------------------------------------------------------------------------
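A minimal usage sketch for the helpers in audio_processing.py above. It assumes the `STFT` class exported by stft.py follows the usual Tacotron 2 constructor signature (`filter_length`, `hop_length`, `win_length`); the waveform and shapes are purely illustrative.

```
import torch
from models.audio.tts.tacotron2.stft import STFT
from models.audio.tts.tacotron2.audio_processing import (
    griffin_lim, dynamic_range_compression, dynamic_range_decompression)

wav = torch.randn(1, 22050)  # one second of fake mono audio at 22.05kHz

stft = STFT(filter_length=1024, hop_length=256, win_length=1024)
magnitudes, _phases = stft.transform(wav)

# Log-compress the magnitudes and invert the compression (approximate round trip).
compressed = dynamic_range_compression(magnitudes)
restored = dynamic_range_decompression(compressed)

# Estimate a waveform from magnitudes alone via Griffin-Lim.
recon = griffin_lim(magnitudes, stft, n_iters=30)
print(recon.shape)
```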
/codes/models/audio/tts/tacotron2/loss.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from trainer.losses import ConfigurableLoss
4 |
5 |
6 | class Tacotron2Loss(ConfigurableLoss):
7 | def __init__(self, opt_loss, env):
8 | super().__init__(opt_loss, env)
9 | self.mel_target_key = opt_loss['mel_target_key']
10 | self.mel_output_key = opt_loss['mel_output_key']
11 | self.mel_output_postnet_key = opt_loss['mel_output_postnet_key']
12 | self.gate_target_key = opt_loss['gate_target_key']
13 | self.gate_output_key = opt_loss['gate_output_key']
14 | self.last_mel_loss = 0
15 | self.last_gate_loss = 0
16 |
17 | def forward(self, _, state):
18 | mel_target, gate_target = state[self.mel_target_key], state[self.gate_target_key]
19 | mel_target.requires_grad = False
20 | gate_target.requires_grad = False
21 | gate_target = gate_target.view(-1, 1)
22 |
23 | mel_out, mel_out_postnet, gate_out = state[self.mel_output_key], state[self.mel_output_postnet_key], state[self.gate_output_key]
24 | gate_out = gate_out.view(-1, 1)
25 | mel_loss = nn.MSELoss()(mel_out, mel_target) + \
26 | nn.MSELoss()(mel_out_postnet, mel_target)
27 | gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
28 | self.last_mel_loss = mel_loss.detach().clone().mean().item()
29 | self.last_gate_loss = gate_loss.detach().clone().mean().item()
30 | return mel_loss + gate_loss
31 |
32 | def extra_metrics(self):
33 | return {
34 | 'mel_loss': self.last_mel_loss,
35 | 'gate_loss': self.last_gate_loss
36 | }
37 |
38 |
39 | class Tacotron2LossRaw(nn.Module):
40 | def __init__(self):
41 | super().__init__()
42 | self.last_mel_loss = 0
43 | self.last_gate_loss = 0
44 |
45 | def forward(self, model_output, targets):
46 | mel_target, gate_target = targets[0], targets[1]
47 | mel_target.requires_grad = False
48 | gate_target.requires_grad = False
49 | gate_target = gate_target.view(-1, 1)
50 |
51 | mel_out, mel_out_postnet, gate_out, _ = model_output
52 | gate_out = gate_out.view(-1, 1)
53 | mel_loss = nn.MSELoss()(mel_out, mel_target) + \
54 | nn.MSELoss()(mel_out_postnet, mel_target)
55 | gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
56 | self.last_mel_loss = mel_loss.detach().clone().mean().item()
57 | self.last_gate_loss = gate_loss.detach().clone().mean().item()
58 | return mel_loss + gate_loss
59 |
60 | def extra_metrics(self):
61 | return {
62 | 'mel_loss': self.last_mel_loss,
63 | 'gate_loss': self.last_gate_loss
64 | }
--------------------------------------------------------------------------------
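A quick sanity check of `Tacotron2LossRaw` on random tensors. The shapes are illustrative (batch of 4, 80 mel bins, 120 frames), and the fourth element of the model output tuple is ignored by the loss.

```
import torch
from models.audio.tts.tacotron2.loss import Tacotron2LossRaw

B, n_mels, T = 4, 80, 120
mel_out = torch.randn(B, n_mels, T)
mel_out_postnet = torch.randn(B, n_mels, T)
gate_out = torch.randn(B, T)  # raw logits for the stop token
mel_target = torch.randn(B, n_mels, T)
gate_target = torch.randint(0, 2, (B, T)).float()

criterion = Tacotron2LossRaw()
loss = criterion((mel_out, mel_out_postnet, gate_out, None),
                 (mel_target, gate_target))
print(loss.item(), criterion.extra_metrics())
```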
/codes/models/audio/tts/tacotron2/taco_utils.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
3 | import numpy as np
4 | import torch
5 | from scipy.io.wavfile import read
6 |
7 |
8 | def get_mask_from_lengths(lengths, max_len=None):
9 | if max_len is None:
10 | max_len = torch.max(lengths).item()
11 | ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)).to(lengths.device)
12 | mask = (ids < lengths.unsqueeze(1)).bool()
13 | return mask
14 |
15 |
16 | def load_wav_to_torch(full_path):
17 | sampling_rate, data = read(full_path)
18 | if data.dtype == np.int32:
19 | norm_fix = 2 ** 31
20 | elif data.dtype == np.int16:
21 | norm_fix = 2 ** 15
22 | elif data.dtype == np.float16 or data.dtype == np.float32:
23 | norm_fix = 1.
24 | else:
25 | raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
26 | return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
27 |
28 |
29 | def load_filepaths_and_text_type(filename, type, split="|"):
30 | with open(filename, encoding='utf-8') as f:
31 | filepaths_and_text = [list(line.strip().split(split)) + [type] for line in f]
32 | base = os.path.dirname(filename)
33 | for j in range(len(filepaths_and_text)):
34 | filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0])
35 | return filepaths_and_text
36 |
37 | def load_filepaths_and_text(filename, split="|"):
38 | with open(filename, encoding='utf-8') as f:
39 | filepaths_and_text = [line.strip().split(split) for line in f]
40 | base = os.path.dirname(filename)
41 | for j in range(len(filepaths_and_text)):
42 | filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0])
43 | return filepaths_and_text
44 |
45 |
46 | def to_gpu(x):
47 | x = x.contiguous()
48 |
49 | if torch.cuda.is_available():
50 | x = x.cuda(non_blocking=True)
51 | return torch.autograd.Variable(x)
--------------------------------------------------------------------------------
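A short sketch of the utilities in taco_utils.py; the file paths are placeholders, so those calls are left commented out.

```
import torch
from models.audio.tts.tacotron2.taco_utils import (
    get_mask_from_lengths, load_wav_to_torch, load_filepaths_and_text)

# Boolean mask for variable-length sequences in a padded batch.
lengths = torch.tensor([3, 5, 2])
mask = get_mask_from_lengths(lengths)  # shape (3, 5), True where the frame is valid
print(mask)

# Load a wav as a float tensor normalized by its integer range (placeholder path).
# audio, sampling_rate = load_wav_to_torch('some_clip.wav')

# Parse a pipe-delimited "relative_path|transcript" metadata file (placeholder path).
# entries = load_filepaths_and_text('train_files.txt')
```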
/codes/models/audio/tts/tacotron2/text/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Keith Ito
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/codes/models/audio/tts/tacotron2/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | import re
3 |
4 | import torch
5 |
6 | from models.audio.tts.tacotron2.text import cleaners
7 | from models.audio.tts.tacotron2.text.symbols import symbols
8 |
9 |
10 | # Mappings from symbol to numeric ID and vice versa:
11 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
12 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
13 |
14 | # Regular expression matching text enclosed in curly braces:
15 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
16 |
17 |
18 | def text_to_sequence(text, cleaner_names=['english_cleaners']):
19 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
20 |
21 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded
22 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
23 |
24 | Args:
25 | text: string to convert to a sequence
26 | cleaner_names: names of the cleaner functions to run the text through
27 |
28 | Returns:
29 | List of integers corresponding to the symbols in the text
30 | '''
31 | sequence = []
32 |
33 | # Check for curly braces and treat their contents as ARPAbet:
34 | while len(text):
35 | m = _curly_re.match(text)
36 | if not m:
37 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
38 | break
39 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
40 | sequence += _arpabet_to_sequence(m.group(2))
41 | text = m.group(3)
42 |
43 | return sequence
44 |
45 |
46 | def sequence_to_text(sequence):
47 | '''Converts a sequence of IDs back to a string'''
48 | result = ''
49 | for symbol_id in sequence:
50 | if isinstance(symbol_id, torch.Tensor):
51 | symbol_id = symbol_id.item()
52 | if symbol_id in _id_to_symbol:
53 | s = _id_to_symbol[symbol_id]
54 | # Enclose ARPAbet back in curly braces:
55 | if len(s) > 1 and s[0] == '@':
56 | s = '{%s}' % s[1:]
57 | result += s
58 | return result.replace('}{', ' ')
59 |
60 |
61 | def tacotron_symbols():
62 | return list(_symbol_to_id.keys())
63 |
64 |
65 | def tacotron_symbol_mapping():
66 | return _symbol_to_id.copy()
67 |
68 |
69 | def _clean_text(text, cleaner_names):
70 | for name in cleaner_names:
71 | cleaner = getattr(cleaners, name)
72 | if not cleaner:
73 | raise Exception('Unknown cleaner: %s' % name)
74 | text = cleaner(text)
75 | return text
76 |
77 |
78 | def _symbols_to_sequence(symbols):
79 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
80 |
81 |
82 | def _arpabet_to_sequence(text):
83 | return _symbols_to_sequence(['@' + s for s in text.split()])
84 |
85 |
86 | def _should_keep_symbol(s):
87 | return s in _symbol_to_id and s != '_' and s != '~'
88 |
--------------------------------------------------------------------------------
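A round-trip sketch of the symbol mapping above: cleaned characters map to integer IDs, ARPAbet spans in curly braces map to the '@'-prefixed symbols, and `sequence_to_text` reconstructs the braces.

```
from models.audio.tts.tacotron2.text import text_to_sequence, sequence_to_text

text = "Turn left on {HH AW1 S S T AH0 N} Street."
seq = text_to_sequence(text, cleaner_names=['english_cleaners'])
print(seq)                    # list of integer symbol IDs
print(sequence_to_text(seq))  # "turn left on {HH AW1 S S T AH0 N} street."
```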
/codes/models/audio/tts/tacotron2/text/cleaners.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 |
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 | 1. "english_cleaners" for English text
9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 | the symbols in symbols.py to match your data).
13 | '''
14 |
15 | import re
16 | from unidecode import unidecode
17 | from .numbers import normalize_numbers
18 |
19 |
20 | # Regular expression matching whitespace:
21 | _whitespace_re = re.compile(r'\s+')
22 |
23 | # List of (regular expression, replacement) pairs for abbreviations:
24 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
25 | ('mrs', 'misess'),
26 | ('mr', 'mister'),
27 | ('dr', 'doctor'),
28 | ('st', 'saint'),
29 | ('co', 'company'),
30 | ('jr', 'junior'),
31 | ('maj', 'major'),
32 | ('gen', 'general'),
33 | ('drs', 'doctors'),
34 | ('rev', 'reverend'),
35 | ('lt', 'lieutenant'),
36 | ('hon', 'honorable'),
37 | ('sgt', 'sergeant'),
38 | ('capt', 'captain'),
39 | ('esq', 'esquire'),
40 | ('ltd', 'limited'),
41 | ('col', 'colonel'),
42 | ('ft', 'fort'),
43 | ]]
44 |
45 |
46 | def expand_abbreviations(text):
47 | for regex, replacement in _abbreviations:
48 | text = re.sub(regex, replacement, text)
49 | return text
50 |
51 |
52 | def expand_numbers(text):
53 | return normalize_numbers(text)
54 |
55 |
56 | def lowercase(text):
57 | return text.lower()
58 |
59 |
60 | def collapse_whitespace(text):
61 | return re.sub(_whitespace_re, ' ', text)
62 |
63 |
64 | def convert_to_ascii(text):
65 | return unidecode(text)
66 |
67 |
68 | def basic_cleaners(text):
69 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
70 | text = lowercase(text)
71 | text = collapse_whitespace(text)
72 | return text
73 |
74 |
75 | def transliteration_cleaners(text):
76 | '''Pipeline for non-English text that transliterates to ASCII.'''
77 | text = convert_to_ascii(text)
78 | text = lowercase(text)
79 | text = collapse_whitespace(text)
80 | return text
81 |
82 |
83 | def english_cleaners(text):
84 | '''Pipeline for English text, including number and abbreviation expansion.'''
85 | text = convert_to_ascii(text)
86 | text = lowercase(text)
87 | text = expand_numbers(text)
88 | text = expand_abbreviations(text)
89 | text = collapse_whitespace(text)
90 | text = text.replace('"', '')
91 | return text
92 |
--------------------------------------------------------------------------------
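For instance, the English pipeline lowercases, expands currency amounts and listed abbreviations, strips double quotes, and collapses whitespace; the output shown below is approximate and depends on the wording produced by the inflect library.

```
from models.audio.tts.tacotron2.text.cleaners import english_cleaners

print(english_cleaners('Dr. Smith paid $3.50,   "allegedly".'))
# -> 'doctor smith paid three dollars, fifty cents, allegedly.'
```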
/codes/models/audio/tts/tacotron2/text/cmudict.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import re
4 |
5 |
6 | valid_symbols = [
7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
14 | ]
15 |
16 | _valid_symbol_set = set(valid_symbols)
17 |
18 |
19 | class CMUDict:
20 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
21 | def __init__(self, file_or_path, keep_ambiguous=True):
22 | if isinstance(file_or_path, str):
23 | with open(file_or_path, encoding='latin-1') as f:
24 | entries = _parse_cmudict(f)
25 | else:
26 | entries = _parse_cmudict(file_or_path)
27 | if not keep_ambiguous:
28 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
29 | self._entries = entries
30 |
31 |
32 | def __len__(self):
33 | return len(self._entries)
34 |
35 |
36 | def lookup(self, word):
37 | '''Returns list of ARPAbet pronunciations of the given word.'''
38 | return self._entries.get(word.upper())
39 |
40 |
41 |
42 | _alt_re = re.compile(r'\([0-9]+\)')
43 |
44 |
45 | def _parse_cmudict(file):
46 | cmudict = {}
47 | for line in file:
48 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
49 | parts = line.split('  ')  # word and pronunciation are separated by two spaces in CMUdict entries
50 | word = re.sub(_alt_re, '', parts[0])
51 | pronunciation = _get_pronunciation(parts[1])
52 | if pronunciation:
53 | if word in cmudict:
54 | cmudict[word].append(pronunciation)
55 | else:
56 | cmudict[word] = [pronunciation]
57 | return cmudict
58 |
59 |
60 | def _get_pronunciation(s):
61 | parts = s.strip().split(' ')
62 | for part in parts:
63 | if part not in _valid_symbol_set:
64 | return None
65 | return ' '.join(parts)
66 |
--------------------------------------------------------------------------------
/codes/models/audio/tts/tacotron2/text/numbers.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import inflect
4 | import re
5 |
6 |
7 | _inflect = inflect.engine()
8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
13 | _number_re = re.compile(r'[0-9]+')
14 |
15 |
16 | def _remove_commas(m):
17 | return m.group(1).replace(',', '')
18 |
19 |
20 | def _expand_decimal_point(m):
21 | return m.group(1).replace('.', ' point ')
22 |
23 |
24 | def _expand_dollars(m):
25 | match = m.group(1)
26 | parts = match.split('.')
27 | if len(parts) > 2:
28 | return match + ' dollars' # Unexpected format
29 | dollars = int(parts[0]) if parts[0] else 0
30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
31 | if dollars and cents:
32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars'
33 | cent_unit = 'cent' if cents == 1 else 'cents'
34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
35 | elif dollars:
36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars'
37 | return '%s %s' % (dollars, dollar_unit)
38 | elif cents:
39 | cent_unit = 'cent' if cents == 1 else 'cents'
40 | return '%s %s' % (cents, cent_unit)
41 | else:
42 | return 'zero dollars'
43 |
44 |
45 | def _expand_ordinal(m):
46 | return _inflect.number_to_words(m.group(0))
47 |
48 |
49 | def _expand_number(m):
50 | num = int(m.group(0))
51 | if num > 1000 and num < 3000:
52 | if num == 2000:
53 | return 'two thousand'
54 | elif num > 2000 and num < 2010:
55 | return 'two thousand ' + _inflect.number_to_words(num % 100)
56 | elif num % 100 == 0:
57 | return _inflect.number_to_words(num // 100) + ' hundred'
58 | else:
59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
60 | else:
61 | return _inflect.number_to_words(num, andword='')
62 |
63 |
64 | def normalize_numbers(text):
65 | text = re.sub(_comma_number_re, _remove_commas, text)
66 | text = re.sub(_pounds_re, r'\1 pounds', text)
67 | text = re.sub(_dollars_re, _expand_dollars, text)
68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
69 | text = re.sub(_ordinal_re, _expand_ordinal, text)
70 | text = re.sub(_number_re, _expand_number, text)
71 | return text
72 |
--------------------------------------------------------------------------------
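A few illustrative cases for `normalize_numbers`, showing the year and round-hundred special cases; the exact wording comes from the inflect library.

```
from models.audio.tts.tacotron2.text.numbers import normalize_numbers

print(normalize_numbers('in 2003'))   # -> 'in two thousand three'
print(normalize_numbers('1500 men'))  # -> 'fifteen hundred men'
print(normalize_numbers('$12.25'))    # -> 'twelve dollars, twenty-five cents'
```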
/codes/models/audio/tts/tacotron2/text/symbols.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Defines the set of symbols used in text input to the model.
5 |
6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
7 | from models.audio.tts.tacotron2.text import cmudict
8 |
9 | _pad = '_'
10 | _punctuation = '!\'(),.:;? '
11 | _special = '-'
12 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
13 |
14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
15 | _arpabet = ['@' + s for s in cmudict.valid_symbols]
16 |
17 | # Export all symbols:
18 | symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
19 |
--------------------------------------------------------------------------------
/codes/models/audio/vocoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/__init__.py
--------------------------------------------------------------------------------
/codes/models/audio/vocoders/univnet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/univnet/__init__.py
--------------------------------------------------------------------------------
/codes/models/audio/vocoders/waveglow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/waveglow/__init__.py
--------------------------------------------------------------------------------
/codes/models/audio/vocoders/waveglow/denoiser.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from models.audio.tts.tacotron2.stft import STFT
4 |
5 | sys.path.append('tacotron2')
6 | import torch
7 |
8 |
9 | class Denoiser(torch.nn.Module):
10 | """ Removes model bias from audio produced with waveglow """
11 |
12 | def __init__(self, waveglow, filter_length=1024, n_overlap=4,
13 | win_length=1024, mode='zeros'):
14 | super(Denoiser, self).__init__()
15 | self.stft = STFT(filter_length=filter_length,
16 | hop_length=int(filter_length/n_overlap),
17 | win_length=win_length).cuda()
18 | if mode == 'zeros':
19 | mel_input = torch.zeros(
20 | (1, 80, 88),
21 | dtype=waveglow.upsample.weight.dtype,
22 | device=waveglow.upsample.weight.device)
23 | elif mode == 'normal':
24 | mel_input = torch.randn(
25 | (1, 80, 88),
26 | dtype=waveglow.upsample.weight.dtype,
27 | device=waveglow.upsample.weight.device)
28 | else:
29 | raise Exception("Mode {} is not supported".format(mode))
30 |
31 | with torch.no_grad():
32 | bias_audio = waveglow.infer(mel_input, sigma=0.0).float()
33 | bias_spec, _ = self.stft.transform(bias_audio)
34 |
35 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])
36 |
37 | def forward(self, audio, strength=0.1):
38 | audio_spec, audio_angles = self.stft.transform(audio.cuda().float())
39 | audio_spec_denoised = audio_spec - self.bias_spec * strength
40 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
41 | audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)
42 | return audio_denoised
43 |
--------------------------------------------------------------------------------
/codes/models/classifiers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/classifiers/__init__.py
--------------------------------------------------------------------------------
/codes/models/classifiers/torch_models.py:
--------------------------------------------------------------------------------
1 | from torchvision.models import vgg16
2 |
3 | from trainer.networks import register_model
4 | from utils.util import opt_get
5 |
6 |
7 | @register_model
8 | def register_torch_vgg16(opt_net, opt):
9 | """ return a ResNet 18 object
10 | """
11 | return vgg16(**opt_get(opt_net, ['kwargs'], {}))
12 |
--------------------------------------------------------------------------------
/codes/models/clip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/clip/__init__.py
--------------------------------------------------------------------------------
/codes/models/clip/clip.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from trainer.networks import register_model
4 | from utils.util import opt_get
5 |
6 |
7 | def encoder_for_type(type, master_dim, enc_kwargs):
8 | from x_clip.x_clip import VisionTransformer, TextTransformer
9 | if type == 'image':
10 | # xclip_kwargs: image_size, patch_size, channels, depth, heads
11 | return VisionTransformer(dim=master_dim, **enc_kwargs)
12 | elif type == 'tokens':
13 | # xclip_kwargs: num_tokens, max_seq_len, depth, heads
14 | return TextTransformer(dim=master_dim, **enc_kwargs)
15 | raise NotImplementedError()
16 |
17 |
18 | class XClipWrapper(nn.Module):
19 | def __init__(self,
20 | master_dim=512,
21 | enc1_type='image',
22 | enc1_kwargs={},
23 | enc2_type='tokens',
24 | enc2_kwargs={},
25 | mask_seq1_percentage=0,
26 | mask_seq2_percentage=0,
27 | **xclip_kwargs):
28 | super().__init__()
29 | self.mask_seq1_percentage = mask_seq1_percentage
30 | self.mask_seq2_percentage = mask_seq2_percentage
31 | enc1 = encoder_for_type(enc1_type, master_dim, enc1_kwargs)
32 | enc2 = encoder_for_type(enc2_type, master_dim, enc2_kwargs)
33 | xclip_kwargs['dim_text'] = master_dim
34 | xclip_kwargs['dim_image'] = master_dim
35 | xclip_kwargs['dim_latent'] = master_dim
36 | xclip_kwargs['text_encoder'] = enc1 # The first argument of forward
37 | xclip_kwargs['image_encoder'] = enc2
38 | # xclip_kwargs:
39 | # use_all_token_embeds
40 | # downsample_image_embeds
41 | # decoupled_contrastive_learning
42 | # extra_latent_projection
43 | # use_mlm
44 | from x_clip import CLIP
45 | self.clip = CLIP(**xclip_kwargs)
46 |
47 | def forward(self, seq1, seq2, return_loss=False):
48 | seq1_mask = torch.rand_like(seq1.float()) > self.mask_seq1_percentage
49 | # TODO: add support for seq2 mask..
50 | #seq2_mask = torch.rand_like(seq2.float()) > self.mask_seq2_percentage
51 | return self.clip(seq1, seq2, seq1_mask, return_loss=return_loss)
52 |
53 |
54 | @register_model
55 | def register_clip(opt_net, opt):
56 | return XClipWrapper(**opt_get(opt_net, ['kwargs'], {}))
57 |
58 | if __name__ == '__main__':
59 | model = XClipWrapper(enc1_type='tokens', enc2_type='tokens',
60 | enc1_kwargs={'num_tokens': 256, 'max_seq_len': 200, 'depth': 8, 'heads': 8},
61 | enc2_kwargs={'num_tokens': 8192, 'max_seq_len': 250, 'depth': 8, 'heads': 8})
62 | loss = model(torch.randint(0,256, (2,200)), torch.randint(0,8192, (2,250)), True)
63 | print(loss)
--------------------------------------------------------------------------------
/codes/models/composable/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/composable/__init__.py
--------------------------------------------------------------------------------
/codes/models/diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/diffusion/__init__.py
--------------------------------------------------------------------------------
/codes/models/diffusion/losses.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers for various likelihood-based losses. These are ported from the original
3 | Ho et al. diffusion models codebase:
4 | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py
5 | """
6 |
7 | import numpy as np
8 |
9 | import torch as th
10 |
11 |
12 | def normal_kl(mean1, logvar1, mean2, logvar2):
13 | """
14 | Compute the KL divergence between two gaussians.
15 |
16 | Shapes are automatically broadcasted, so batches can be compared to
17 | scalars, among other use cases.
18 | """
19 | tensor = None
20 | for obj in (mean1, logvar1, mean2, logvar2):
21 | if isinstance(obj, th.Tensor):
22 | tensor = obj
23 | break
24 | assert tensor is not None, "at least one argument must be a Tensor"
25 |
26 | # Force variances to be Tensors. Broadcasting helps convert scalars to
27 | # Tensors, but it does not work for th.exp().
28 | logvar1, logvar2 = [
29 | x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
30 | for x in (logvar1, logvar2)
31 | ]
32 |
33 | return 0.5 * (
34 | -1.0
35 | + logvar2
36 | - logvar1
37 | + th.exp(logvar1 - logvar2)
38 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
39 | )
40 |
41 |
42 | def approx_standard_normal_cdf(x):
43 | """
44 | A fast approximation of the cumulative distribution function of the
45 | standard normal.
46 | """
47 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
48 |
49 |
50 | def discretized_gaussian_log_likelihood(x, *, means, log_scales):
51 | """
52 | Compute the log-likelihood of a Gaussian distribution discretizing to a
53 | given image.
54 |
55 | :param x: the target images. It is assumed that this was uint8 values,
56 | rescaled to the range [-1, 1].
57 | :param means: the Gaussian mean Tensor.
58 | :param log_scales: the Gaussian log stddev Tensor.
59 | :return: a tensor like x of log probabilities (in nats).
60 | """
61 | assert x.shape == means.shape == log_scales.shape
62 | centered_x = x - means
63 | inv_stdv = th.exp(-log_scales)
64 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
65 | cdf_plus = approx_standard_normal_cdf(plus_in)
66 | min_in = inv_stdv * (centered_x - 1.0 / 255.0)
67 | cdf_min = approx_standard_normal_cdf(min_in)
68 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
69 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
70 | cdf_delta = cdf_plus - cdf_min
71 | log_probs = th.where(
72 | x < -0.999,
73 | log_cdf_plus,
74 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
75 | )
76 | assert log_probs.shape == x.shape
77 | return log_probs
78 |
--------------------------------------------------------------------------------
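A small sketch exercising the two likelihood helpers above on toy tensors; the scalar parameters of the second Gaussian are broadcast against the tensor arguments.

```
import torch as th
from models.diffusion.losses import normal_kl, discretized_gaussian_log_likelihood

# Elementwise KL between N(0, 1) and N(0.5, exp(0.2)).
mean1 = th.zeros(4, 3)
logvar1 = th.zeros(4, 3)
kl = normal_kl(mean1, logvar1, mean2=0.5, logvar2=0.2)
print(kl.shape, kl.mean().item())

# Log-likelihood of discretized [-1, 1] "pixel" values under per-element Gaussians.
x = (th.randint(0, 256, (4, 3)).float() / 127.5) - 1.0
means = th.zeros_like(x)
log_scales = th.full_like(x, -2.0)
ll = discretized_gaussian_log_likelihood(x, means=means, log_scales=log_scales)
print(ll.shape)
```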
/codes/models/image_generation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/__init__.py
--------------------------------------------------------------------------------
/codes/models/image_generation/glean/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/glean/__init__.py
--------------------------------------------------------------------------------
/codes/models/image_generation/srflow/Permutations.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import nn as nn
4 | from torch.nn import functional as F
5 |
6 | from models.image_generation.srflow import thops
7 |
8 |
9 | class InvertibleConv1x1(nn.Module):
10 | def __init__(self, num_channels, LU_decomposed=False):
11 | super().__init__()
12 | w_shape = [num_channels, num_channels]
13 | w_init = np.linalg.qr(np.random.randn(*w_shape))[0].astype(np.float32)
14 | self.register_parameter("weight", nn.Parameter(torch.Tensor(w_init)))
15 | self.w_shape = w_shape
16 | self.LU = LU_decomposed
17 |
18 | def get_weight(self, input, reverse):
19 | w_shape = self.w_shape
20 | pixels = thops.pixels(input)
21 | dlogdet = torch.slogdet(self.weight)[1] * pixels
22 | if not reverse:
23 | weight = self.weight.view(w_shape[0], w_shape[1], 1, 1)
24 | else:
25 | weight = torch.inverse(self.weight.double()).float() \
26 | .view(w_shape[0], w_shape[1], 1, 1)
27 | return weight, dlogdet
28 | def forward(self, input, logdet=None, reverse=False):
29 | """
30 | log-det = log|abs(|W|)| * pixels
31 | """
32 | weight, dlogdet = self.get_weight(input, reverse)
33 | if not reverse:
34 | z = F.conv2d(input, weight)
35 | if logdet is not None:
36 | logdet = logdet + dlogdet
37 | return z, logdet
38 | else:
39 | z = F.conv2d(input, weight)
40 | if logdet is not None:
41 | logdet = logdet - dlogdet
42 | return z, logdet
43 |
--------------------------------------------------------------------------------
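A quick invertibility check for `InvertibleConv1x1`: running the reverse pass on the forward output should reconstruct the input, and the two log-determinant contributions should cancel.

```
import torch
from models.image_generation.srflow.Permutations import InvertibleConv1x1

x = torch.randn(2, 8, 16, 16)
conv = InvertibleConv1x1(num_channels=8)

z, logdet = conv(x, logdet=torch.zeros(2))                # forward: z = W x
x_rec, logdet_rec = conv(z, logdet=logdet, reverse=True)  # reverse: x = W^-1 z

print(torch.allclose(x, x_rec, atol=1e-4))  # True, up to numerical error
print(logdet_rec.abs().max().item())        # ~0
```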
/codes/models/image_generation/srflow/Split.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn as nn
3 |
4 | from models.image_generation.srflow import thops
5 | from models.image_generation.srflow.flow import Conv2dZeros, GaussianDiag
6 | from utils.util import opt_get
7 |
8 |
9 | class Split2d(nn.Module):
10 | def __init__(self, num_channels, logs_eps=0, cond_channels=0, position=None, consume_ratio=0.5, opt=None):
11 | super().__init__()
12 |
13 | self.num_channels_consume = int(round(num_channels * consume_ratio))
14 | self.num_channels_pass = num_channels - self.num_channels_consume
15 |
16 | self.conv = Conv2dZeros(in_channels=self.num_channels_pass + cond_channels,
17 | out_channels=self.num_channels_consume * 2)
18 | self.logs_eps = logs_eps
19 | self.position = position
20 | self.gaussian_nll_weight = opt_get(opt, ['networks', 'generator', 'flow', 'gaussian_loss_weight'], 1)
21 |
22 | def split2d_prior(self, z, ft):
23 | if ft is not None:
24 | z = torch.cat([z, ft], dim=1)
25 | h = self.conv(z)
26 | return thops.split_feature(h, "cross")
27 |
28 | def exp_eps(self, logs):
29 | return torch.exp(logs) + self.logs_eps
30 |
31 | def forward(self, input, logdet=0., reverse=False, eps_std=None, eps=None, ft=None, y_onehot=None):
32 | if not reverse:
33 | # self.input = input
34 | z1, z2 = self.split_ratio(input)
35 | mean, logs = self.split2d_prior(z1, ft)
36 |
37 | eps = (z2 - mean) / self.exp_eps(logs)
38 |
39 | logdet = logdet + self.get_logdet(logs, mean, z2)
40 |
41 | # print(logs.shape, mean.shape, z2.shape)
42 | # self.eps = eps
43 | # print('split, enc eps:', eps)
44 | return z1, logdet, eps
45 | else:
46 | z1 = input
47 | mean, logs = self.split2d_prior(z1, ft)
48 |
49 | if eps is None:
50 | #print("WARNING: eps is None, generating eps untested functionality!")
51 | eps = GaussianDiag.sample(mean, logs, eps_std)
52 | #eps = GaussianDiag.sample_eps(mean.shape, eps_std)
53 |
54 | eps = eps.to(mean.device)
55 | z2 = mean + self.exp_eps(logs) * eps
56 | z = thops.cat_feature(z1, z2)
57 |
58 | logdet = logdet - self.get_logdet(logs, mean, z2)
59 |
60 | return z, logdet
61 | # return z, logdet, eps
62 |
63 | def get_logdet(self, logs, mean, z2):
64 | logdet_diff = GaussianDiag.logp(mean, logs, z2)
65 | return logdet_diff * self.gaussian_nll_weight
66 |
67 | def split_ratio(self, input):
68 | z1, z2 = input[:, :self.num_channels_pass, ...], input[:, self.num_channels_pass:, ...]
69 | return z1, z2
--------------------------------------------------------------------------------
/codes/models/image_generation/srflow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/srflow/__init__.py
--------------------------------------------------------------------------------
/codes/models/image_generation/srflow/glow_arch.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 |
4 | def f_conv2d_bias(in_channels, out_channels):
5 | def padding_same(kernel, stride):
6 | return [((k - 1) * s + 1) // 2 for k, s in zip(kernel, stride)]
7 |
8 | padding = padding_same([3, 3], [1, 1])
9 | assert padding == [1, 1], padding
10 | return nn.Sequential(
11 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=[3, 3], stride=1, padding=1,
12 | bias=True))
13 |
--------------------------------------------------------------------------------
/codes/models/image_generation/srflow/module_util.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.init as init
4 | import torch.nn.functional as F
5 | import maybe_bnb as mbnb
6 |
7 |
8 | def initialize_weights(net_l, scale=1):
9 | if not isinstance(net_l, list):
10 | net_l = [net_l]
11 | for net in net_l:
12 | for m in net.modules():
13 | if isinstance(m, nn.Conv2d):
14 | init.kaiming_normal_(m.weight, a=0, mode='fan_in')
15 | m.weight.data *= scale # for residual block
16 | if m.bias is not None:
17 | m.bias.data.zero_()
18 | elif isinstance(m, mbnb.nn.Linear):
19 | init.kaiming_normal_(m.weight, a=0, mode='fan_in')
20 | m.weight.data *= scale
21 | if m.bias is not None:
22 | m.bias.data.zero_()
23 | elif isinstance(m, nn.BatchNorm2d):
24 | init.constant_(m.weight, 1)
25 | init.constant_(m.bias.data, 0.0)
26 |
27 |
28 | def make_layer(block, n_layers):
29 | layers = []
30 | for _ in range(n_layers):
31 | layers.append(block())
32 | return nn.Sequential(*layers)
33 |
34 |
35 | class ResidualBlock_noBN(nn.Module):
36 | '''Residual block w/o BN
37 | ---Conv-ReLU-Conv-+-
38 | |________________|
39 | '''
40 |
41 | def __init__(self, nf=64):
42 | super(ResidualBlock_noBN, self).__init__()
43 | self.conv1 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
44 | self.conv2 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
45 |
46 | # initialization
47 | initialize_weights([self.conv1, self.conv2], 0.1)
48 |
49 | def forward(self, x):
50 | identity = x
51 | out = F.relu(self.conv1(x), inplace=True)
52 | out = self.conv2(out)
53 | return identity + out
54 |
55 |
56 | def flow_warp(x, flow, interp_mode='bilinear', padding_mode='zeros'):
57 | """Warp an image or feature map with optical flow
58 | Args:
59 | x (Tensor): size (N, C, H, W)
60 | flow (Tensor): size (N, H, W, 2), normal value
61 | interp_mode (str): 'nearest' or 'bilinear'
62 | padding_mode (str): 'zeros' or 'border' or 'reflection'
63 |
64 | Returns:
65 | Tensor: warped image or feature map
66 | """
67 | assert x.size()[-2:] == flow.size()[1:3]
68 | B, C, H, W = x.size()
69 | # mesh grid
70 | grid_y, grid_x = torch.meshgrid(torch.arange(0, H), torch.arange(0, W))
71 | grid = torch.stack((grid_x, grid_y), 2).float() # W(x), H(y), 2
72 | grid.requires_grad = False
73 | grid = grid.type_as(x)
74 | vgrid = grid + flow
75 | # scale grid to [-1,1]
76 | vgrid_x = 2.0 * vgrid[:, :, :, 0] / max(W - 1, 1) - 1.0
77 | vgrid_y = 2.0 * vgrid[:, :, :, 1] / max(H - 1, 1) - 1.0
78 | vgrid_scaled = torch.stack((vgrid_x, vgrid_y), dim=3)
79 | output = F.grid_sample(x, vgrid_scaled, mode=interp_mode, padding_mode=padding_mode)
80 | return output
81 |
--------------------------------------------------------------------------------
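A short sketch of the building blocks in module_util.py; `make_layer` expects a zero-argument block factory, and `flow_warp` expects the flow field in (N, H, W, 2) layout.

```
import functools
import torch
from models.image_generation.srflow.module_util import (
    ResidualBlock_noBN, make_layer, flow_warp)

# Stack a few residual blocks; their conv weights are scaled down at init.
trunk = make_layer(functools.partial(ResidualBlock_noBN, nf=16), n_layers=3)
x = torch.randn(1, 16, 32, 32)
print(trunk(x).shape)  # torch.Size([1, 16, 32, 32])

# Warp the feature map with an all-zero flow field (no displacement).
flow = torch.zeros(1, 32, 32, 2)
print(flow_warp(x, flow).shape)  # torch.Size([1, 16, 32, 32])
```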
/codes/models/image_generation/srflow/thops.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def sum(tensor, dim=None, keepdim=False):
5 | if dim is None:
6 | # sum up all dim
7 | return torch.sum(tensor)
8 | else:
9 | if isinstance(dim, int):
10 | dim = [dim]
11 | dim = sorted(dim)
12 | for d in dim:
13 | tensor = tensor.sum(dim=d, keepdim=True)
14 | if not keepdim:
15 | for i, d in enumerate(dim):
16 | tensor.squeeze_(d-i)
17 | return tensor
18 |
19 |
20 | def mean(tensor, dim=None, keepdim=False):
21 | if dim is None:
22 | # mean all dim
23 | return torch.mean(tensor)
24 | else:
25 | if isinstance(dim, int):
26 | dim = [dim]
27 | dim = sorted(dim)
28 | for d in dim:
29 | tensor = tensor.mean(dim=d, keepdim=True)
30 | if not keepdim:
31 | for i, d in enumerate(dim):
32 | tensor.squeeze_(d-i)
33 | return tensor
34 |
35 |
36 | def split_feature(tensor, type="split"):
37 | """
38 | type = ["split", "cross"]
39 | """
40 | C = tensor.size(1)
41 | if type == "split":
42 | return tensor[:, :C // 2, ...], tensor[:, C // 2:, ...]
43 | elif type == "cross":
44 | return tensor[:, 0::2, ...], tensor[:, 1::2, ...]
45 |
46 |
47 | def cat_feature(tensor_a, tensor_b):
48 | return torch.cat((tensor_a, tensor_b), dim=1)
49 |
50 |
51 | def pixels(tensor):
52 | return int(tensor.size(2) * tensor.size(3))
--------------------------------------------------------------------------------
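The reduction and split helpers above in action on a small 4-D tensor:

```
import torch
from models.image_generation.srflow import thops

t = torch.arange(32, dtype=torch.float32).view(2, 4, 2, 2)

print(thops.sum(t, dim=[2, 3]).shape)            # torch.Size([2, 4])
print(thops.mean(t, dim=1, keepdim=True).shape)  # torch.Size([2, 1, 2, 2])

a, b = thops.split_feature(t, "split")  # first half vs. second half of the channels
c, d = thops.split_feature(t, "cross")  # even-indexed vs. odd-indexed channels
print(a.shape, c.shape)                 # both torch.Size([2, 2, 2, 2])
print(thops.pixels(t))                  # 4 (H * W)
```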
/codes/models/image_generation/stylegan/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | def create_stylegan2_loss(opt_loss, env):
3 | type = opt_loss['type']
4 | if type == 'stylegan2_divergence':
5 | import models.image_generation.stylegan.stylegan2_lucidrains as stylegan2
6 | return stylegan2.StyleGan2DivergenceLoss(opt_loss, env)
7 | elif type == 'stylegan2_pathlen':
8 | import models.image_generation.stylegan.stylegan2_lucidrains as stylegan2
9 | return stylegan2.StyleGan2PathLengthLoss(opt_loss, env)
10 | else:
11 | raise NotImplementedError
--------------------------------------------------------------------------------
/codes/models/image_latents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/__init__.py
--------------------------------------------------------------------------------
/codes/models/image_latents/byol/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/byol/__init__.py
--------------------------------------------------------------------------------
/codes/models/image_latents/fixup_resnet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/fixup_resnet/__init__.py
--------------------------------------------------------------------------------
/codes/models/lucidrains/dalle/__init__.py:
--------------------------------------------------------------------------------
1 | # This directory contains some useful code from https://github.com/lucidrains/DALLE-pytorch/tree/main/dalle_pytorch
--------------------------------------------------------------------------------
/codes/models/lucidrains/performer/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/codes/models/vqvae/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/vqvae/__init__.py
--------------------------------------------------------------------------------
/codes/models/vqvae/gumbel_quantizer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch import einsum
5 |
6 | from utils.weight_scheduler import LinearDecayWeightScheduler
7 | import maybe_bnb as mbnb
8 |
9 |
10 | class GumbelQuantizer(nn.Module):
11 | def __init__(self, inp_dim, codebook_dim, num_tokens, straight_through=False):
12 | super().__init__()
13 | self.to_logits = nn.Conv1d(inp_dim, num_tokens, 1)
14 | # nn.Embedding
15 | self.codebook = mbnb.nn.Embedding(num_tokens, codebook_dim)
16 | self.straight_through = straight_through
17 | self.temperature_scheduler = LinearDecayWeightScheduler(10, 5000, .9, 2000)
18 | self.step = 0
19 | self.norm = SwitchNorm(num_tokens)
20 |
21 | def get_temperature(self, step):
22 | self.step = step # VERY POOR DESIGN. WHEN WILL HE EVER LEARN???
23 | return self.temperature_scheduler.get_weight_for_step(step)
24 |
25 | def embed_code(self, codes):
26 | return self.codebook(codes)
27 |
28 | def gumbel_softmax(self, logits, tau, dim, hard):
29 | gumbels = torch.rand_like(logits)
30 | gumbels = -torch.log(-torch.log(gumbels + 1e-8) + 1e-8)
31 | logits = (logits + gumbels) / tau # ~Gumbel(logits,tau)
32 | y_soft = F.softmax(logits, dim=dim)
33 |
34 | if hard:
35 | index = y_soft.max(dim, keepdim=True)[1]
36 | y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
37 | ret = y_hard - y_soft.detach() + y_soft
38 | else:
39 | ret = y_soft
40 | return ret
41 |
42 | def forward(self, h):
43 | h = h.permute(0,2,1)
44 | logits = self.to_logits(h)
45 | logits = self.gumbel_softmax(logits, tau=self.temperature_scheduler.get_weight_for_step(self.step), dim=1, hard=self.straight_through)
46 | logits = self.norm(logits)
47 | codes = logits.argmax(dim=1).flatten(1)
48 | sampled = einsum('b n l, n d -> b d l', logits, self.codebook.weight)
49 | return sampled.permute(0,2,1), 0, codes
50 |
51 | if __name__ == '__main__':
52 | j = torch.randn(8,40,1024)
53 | m = GumbelQuantizer(1024, 1024, 4096)
54 | m2 = DiscreteDecoder(1024, (512, 256), 2)  # NOTE: assumes DiscreteDecoder is importable from elsewhere in this repo
55 | l=m2(m(j)[0].permute(0,2,1))
56 | mean = 0
57 | for ls in l:
58 | mean = mean + ls.mean()
59 | mean.backward()
--------------------------------------------------------------------------------
/codes/requirements.laxed.txt:
--------------------------------------------------------------------------------
1 | # Fundamentals
2 | numpy
3 | pyyaml
4 | tb-nightly
5 | future
6 | scp
7 | tqdm
8 | matplotlib
9 | scipy
10 | munch
11 | tqdm
12 | scp
13 | tensorboard
14 | orjson
15 | einops
16 | lambda-networks
17 | mup
18 |
19 | #UI
20 | customtkinter
21 | ruamel.yaml
22 | # For image generation stuff
23 | opencv-python
24 | kornia
25 | pytorch_ssim
26 | gsa-pytorch
27 | pytorch_fid
28 |
29 | # For audio generation stuff
30 | inflect
31 | librosa
32 | Unidecode
33 | tgt
34 | pyworld
35 | audio2numpy
36 | SoundFile
37 |
38 | # For text stuff
39 | transformers
40 | tokenizers
41 | jiwer # calculating WER
42 | omegaconf
43 |
44 | # lucidrains stuff
45 | vector_quantize_pytorch
46 | linear_attention_transformer
47 | rotary-embedding-torch
48 | axial_positional_embedding
49 | g-mlp-pytorch
50 | x-clip
51 | x_transformers==1.0.4
52 |
53 | # bitsandbytes
54 | bitsandbytes
55 | lion-pytorch==0.0.7
56 | # triton==2.0.0a2
57 |
--------------------------------------------------------------------------------
/codes/requirements.txt:
--------------------------------------------------------------------------------
1 | # Fundamentals
2 | numpy
3 | pyyaml
4 | tb-nightly
5 | future
6 | scp
7 | tqdm
8 | matplotlib
9 | scipy
10 | munch
11 | tqdm
12 | scp
13 | tensorboard
14 | orjson
15 | einops
16 | lambda-networks
17 | mup
18 |
19 | # For image generation stuff
20 | opencv-python
21 | kornia
22 | pytorch_ssim
23 | gsa-pytorch
24 | pytorch_fid==0.1.1
25 |
26 | # For audio generation stuff
27 | inflect==0.2.5
28 | librosa==0.6.0
29 | Unidecode==1.0.22
30 | tgt == 1.4.4
31 | pyworld == 0.2.10
32 | audio2numpy
33 | SoundFile
34 |
35 | # For text stuff
36 | transformers
37 | tokenizers
38 | jiwer # calculating WER
39 | omegaconf
40 |
41 | # lucidrains stuff
42 | vector_quantize_pytorch
43 | linear_attention_transformer
44 | rotary-embedding-torch
45 | axial_positional_embedding
46 | g-mlp-pytorch
47 | x-clip
48 | x_transformers
49 |
50 | bitsandbytes
51 | lion-pytorch==0.0.7
52 | # triton==2.0.0a2
53 |
--------------------------------------------------------------------------------
/codes/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/__init__.py
--------------------------------------------------------------------------------
/codes/scripts/audio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/__init__.py
--------------------------------------------------------------------------------
/codes/scripts/audio/gen/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/gen/__init__.py
--------------------------------------------------------------------------------
/codes/scripts/audio/gen/use_discrete_vocoder_one_way.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | import torchaudio
5 |
6 | from data.audio.unsupervised_audio_dataset import load_audio
7 | from scripts.audio.gen.speech_synthesis_utils import do_spectrogram_diffusion, \
8 | load_discrete_vocoder_diffuser, wav_to_mel, convert_mel_to_codes
9 | from utils.audio import plot_spectrogram
10 | from utils.util import load_model_from_config
11 |
12 |
13 | def roundtrip_vocoding(dvae, vocoder, diffuser, clip, cond=None, plot_spec=False):
14 | clip = clip.unsqueeze(0)
15 | if cond is None:
16 | cond = clip
17 | else:
18 | cond = cond.unsqueeze(0)
19 | mel = wav_to_mel(clip)
20 | if plot_spec:
21 | plot_spectrogram(mel[0].cpu())
22 | codes = convert_mel_to_codes(dvae, mel)
23 | return
24 |
25 |
26 | if __name__ == '__main__':
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='X:\\dlas\\experiments\\train_diffusion_vocoder_with_cond_new_dvae.yml')
29 | parser.add_argument('-diffusion_model_name', type=str, help='Name of the diffusion model in opt.', default='generator')
30 | parser.add_argument('-diffusion_model_path', type=str, help='Path to the diffusion model checkpoint to load.', default='X:\\dlas\\experiments\\train_diffusion_vocoder_with_cond_new_dvae_full\\models\\6100_generator_ema.pth')
31 | parser.add_argument('-dvae_model_name', type=str, help='Name of the DVAE model in opt.', default='dvae')
32 | parser.add_argument('-input_file', type=str, help='Path to the input torch save file.', default='speech_forward_mels.pth')
33 | parser.add_argument('-cond', type=str, help='Path to the conditioning input audio file.', default='Z:\\clips\\books1\\3042_18_Holden__000000000\\00037.wav')
34 | args = parser.parse_args()
35 |
36 | print("Loading DVAE..")
37 | dvae = load_model_from_config(args.opt, args.dvae_model_name)
38 | print("Loading Diffusion Model..")
39 | diffusion = load_model_from_config(args.opt, args.diffusion_model_name, also_load_savepoint=False, load_path=args.diffusion_model_path)
40 |
41 | print("Loading data..")
42 | cond = load_audio(args.cond, 22050)
43 | if cond.shape[-1] > 44100+10000:
44 | cond = cond[:,10000:54100]
45 | cond = cond.unsqueeze(0).cuda()
46 |
47 | diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=20)
48 | inp = torch.load(args.input_file)
49 | codes = inp
50 |
51 | print("Performing inference..")
52 | for i, cb in enumerate(codes):
53 | roundtripped = do_spectrogram_diffusion(diffusion, dvae, diffuser, cb.unsqueeze(0).cuda(), cond, spectrogram_compression_factor=128, plt_spec=False)
54 | torchaudio.save(f'vocoded_output_sp_{i}.wav', roundtripped.squeeze(0).cpu(), 11025)
--------------------------------------------------------------------------------
/codes/scripts/audio/gen/use_mel2vec_codes.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchvision
3 |
4 | from models.audio.mel2vec import ContrastiveTrainingWrapper
5 | from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector, normalize_mel
6 | from utils.util import load_audio
7 |
8 | def collapse_codegroups(codes):
9 | codes = codes.clone()
10 | groups = codes.shape[-1]
11 | for k in range(groups):
12 | codes[:,:,k] = codes[:,:,k] * groups ** k
13 | codes = codes.sum(-1)
14 | return codes
15 |
16 |
17 | def recover_codegroups(codes, groups):
18 | codes = codes.clone()
19 | output = torch.empty(codes.shape[0], codes.shape[1], groups, dtype=torch.long, device=codes.device)
20 | for k in range(groups):
21 | output[:,:,k] = codes % groups
22 | codes = codes // groups
23 | return output
24 |
25 |
26 | if __name__ == '__main__':
27 | model = ContrastiveTrainingWrapper(mel_input_channels=256, inner_dim=1024, layers=24, dropout=0, mask_time_prob=0,
28 | mask_time_length=6, num_negatives=100, codebook_size=16, codebook_groups=4,
29 | disable_custom_linear_init=True, feature_producer_type='standard',
30 | freq_mask_percent=0, do_reconstruction_loss=True)
31 | model.load_state_dict(torch.load("../experiments/m2v_music2.pth"))
32 | model.eval()
33 |
34 | wav = load_audio("Y:/separated/bt-music-1/100 Hits - Running Songs 2014 CD 2/100 Hits - Running Songs 2014 Cd2 - 02 - 7Th Heaven - Ain't Nothin' Goin' On But The Rent/00001/no_vocals.wav", 22050)
35 | mel = TorchMelSpectrogramInjector({'n_mel_channels': 256, 'mel_fmax': 11000, 'filter_length': 16000,
36 | 'normalize': True, 'in': 'in', 'out': 'out'}, {})({'in': wav.unsqueeze(0)})['out']
37 | codes = model.get_codes(mel)
38 | reconstruction = model.reconstruct(mel)
39 |
40 | torchvision.utils.save_image((normalize_mel(mel).unsqueeze(1)+1)/2, 'mel.png')
41 | torchvision.utils.save_image((normalize_mel(reconstruction).unsqueeze(1)+1)/2, 'reconstructed.png')
42 |
43 | collapsed = collapse_codegroups(codes)
44 | recovered = recover_codegroups(collapsed, 4)
45 |
46 | print(codes)
--------------------------------------------------------------------------------
/codes/scripts/audio/gen/w2v_patcher.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from utils.util import load_model_from_config
4 |
5 | if __name__ == '__main__':
6 | config = "D:\\dlas\\options\\train_wav2vec_matcher.yml"
7 | model_name = "generator"
8 | model_path = "D:\dlas\experiments\train_wav2vec_matcher\models"
9 | wav_dump_path = "FIXME"
10 |
11 | model = load_model_from_config(config, model_name, also_load_savepoint=False, load_path=model_path, device='cuda').eval()
12 | w2v_logits, audio_samples = torch.load(wav_dump_path)
13 |
14 | w2v_logits_chunked = torch.chunk(w2v_logits, 32)
15 | for chunk in w2v_logits_chunked:
16 | pass  # placeholder: the loop body is not present in the source file
--------------------------------------------------------------------------------
/codes/scripts/audio/gen_mel.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 |
5 | from data.util import find_files_of_type, is_audio_file
6 | from trainer.injectors.audio_injectors import MelSpectrogramInjector
7 | from utils.util import load_audio
8 |
9 | if __name__ == '__main__':
10 | path = 'C:\\Users\\jbetk\\Documents\\tmp\\some_audio'
11 |
12 | inj = MelSpectrogramInjector({'in': 'wav', 'out': 'mel',
13 | 'mel_fmax': 12000, 'sampling_rate': 22050, 'n_mel_channels': 100
14 | },{})
15 | audio = find_files_of_type('img', path, qualifier=is_audio_file)[0]
16 | for clip in audio:
17 | if not clip.endswith('.wav'):
18 | continue
19 | wav = load_audio(clip, 24000)
20 | mel = inj({'wav': wav.unsqueeze(0)})['mel']
21 | torch.save(mel, clip.replace('.wav', '.mel'))
--------------------------------------------------------------------------------
/codes/scripts/audio/mel_bin_norm_compute.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | import yaml
5 | from tqdm import tqdm
6 |
7 | from data import create_dataset, create_dataloader
8 | from scripts.audio.gen.speech_synthesis_utils import wav_to_univnet_mel
9 | from utils.options import Loader
10 |
11 | if __name__ == '__main__':
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='D:\\dlas\\options\\train_diffusion_tts9.yml')
14 | parser.add_argument('-key', type=str, help='Key where audio data is stored', default='wav')
15 | parser.add_argument('-num_batches', type=int, help='Number of batches to collect to compute the norm', default=50000)
16 | args = parser.parse_args()
17 |
18 | with open(args.opt, mode='r') as f:
19 | opt = yaml.load(f, Loader=Loader)
20 | dopt = opt['datasets']['train']
21 | dopt['phase'] = 'train'
22 | dataset, collate = create_dataset(dopt, return_collate=True)
23 | dataloader = create_dataloader(dataset, dopt, collate_fn=collate, shuffle=True)
24 |
25 | mel_means = []
26 | mel_max = -999999999
27 | mel_min = 999999999
28 | mel_stds = []
29 | mel_vars = []
30 | for batch in tqdm(dataloader):
31 | if len(mel_means) > args.num_batches:
32 | break
33 | clip = batch[args.key].cuda()
34 | for b in range(clip.shape[0]):
35 | wav = clip[b].unsqueeze(0)
36 | wav = wav[:, :, :batch[f'{args.key}_lengths'][b]]
37 | mel = wav_to_univnet_mel(wav) # Caution: make sure this isn't already normed.
38 | mel_means.append(mel.mean((0,2)).cpu())
39 | mel_max = max(mel.max().item(), mel_max)
40 | mel_min = min(mel.min().item(), mel_min)
41 | mel_stds.append(mel.std((0,2)).cpu())
42 | mel_vars.append(mel.var((0,2)).cpu())
43 | mel_means = torch.stack(mel_means).mean(0)
44 | mel_stds = torch.stack(mel_stds).mean(0)
45 | mel_vars = torch.stack(mel_vars).mean(0)
46 | torch.save((mel_means,mel_max,mel_min,mel_stds,mel_vars), 'univnet_mel_norms.pth')
--------------------------------------------------------------------------------
/codes/scripts/audio/play_with_spectral_representations.py:
--------------------------------------------------------------------------------
1 | import torchvision.utils
2 |
3 | from utils.music_utils import music2mel, music2cqt
4 | from utils.util import load_audio
5 |
6 | if __name__ == '__main__':
7 | clip = load_audio('Y:\\split\\yt-music-eval\\00001.wav', 22050)
8 | mel = music2mel(clip)
9 | cqt = music2cqt(clip)
10 | torchvision.utils.save_image((mel.unsqueeze(1) + 1) / 2, 'mel.png')
11 | torchvision.utils.save_image((cqt.unsqueeze(1) + 1) / 2, 'cqt.png')
12 |
--------------------------------------------------------------------------------
/codes/scripts/audio/prep_music/demucs_notes.txt:
--------------------------------------------------------------------------------
1 | My custom demucs library is used for batch source separation:
2 | https://github.com/neonbjb/demucs
3 |
4 | ```
5 | conda activate demucs
6 | python setup.py install
7 | CUDA_VISIBLE_DEVICES=0 python -m demucs /y/split/bt-music-5 --out=/y/separated/bt-music-5 --num_workers=2 --device cuda --two-stems=vocals
8 | ```
9 |
10 | Example usage of generate_long_cheaters and generate_long_mels, post demucs:
11 |
12 | ```
13 | CUDA_VISIBLE_DEVICES=0 python generate_long_mels.py --path=/y/separated/mpm/1 --progress_file=/y/separated/large_mels/mpm/already_processed.txt \
14 | --output_path=/y/separated/large_mels/mpm/1 --num_threads=2
15 |
16 | CUDA_VISIBLE_DEVICES=2 python generate_long_cheaters.py --path=/y/separated/large_mels/mpm/3 --progress_file=/y/separated/large_mel_cheaters/mpm/already_processed.txt \
17 | --output_path=/y/separated/large_mel_cheaters/mpm/3 --num_threads=1
18 | ```
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/preparation/__init__.py
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/combine_phonetic_and_text.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | if __name__ == '__main__':
4 | basepath = 'Y:\\bigasr_dataset\\hifi_tts'
5 |
6 | english_file = os.path.join(basepath, 'transcribed-oco-realtext.tsv')
7 | if not os.path.exists(english_file):
8 | english_file = os.path.join(basepath, 'transcribed-oco.tsv')
9 | phoneme_file = os.path.join(basepath, 'transcribed-phoneme-oco.tsv')
10 |
11 | texts = {}
12 | with open(english_file, 'r', encoding='utf-8') as f:
13 | for line in f.readlines():
14 | spl = line.split('\t')
15 | if len(spl) == 3:
16 | text, p, _ = spl
17 | texts[p] = text
18 | else:
19 | print(f'Error processing line {line}')
20 |
21 | with open(phoneme_file, 'r', encoding='utf-8') as f:
22 | wf = open(os.path.join(basepath, 'transcribed-phoneme-english-oco.tsv'), 'w', encoding='utf-8')
23 | for line in f.readlines():
24 | spl = line.split('\t')
25 | if len(spl) == 3:
26 | _, p, codes = spl
27 | codes = codes.strip()
28 | if p not in texts:
29 | print(f'Could not find the text for {p}')
30 | continue
31 | wf.write(f'{texts[p]}\t{p}\t{codes}\n')
32 | wf.close()
33 |
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/filter_clips_with_no_hifreq_data.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio
3 |
4 | from data.audio.unsupervised_audio_dataset import load_audio
5 | from scripts.do_to_files import do_to_files
6 |
7 |
8 | def get_spec_mags(clip):
9 | stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
10 | stft = stft[0, -2000:, :]
11 | return (stft.real ** 2 + stft.imag ** 2).sqrt()
12 |
13 |
14 | def filter_no_hifreq_data(path, output_path):
15 | clip = load_audio(path, 22050)
16 | if clip.shape[-1] < 22050:
17 | return
18 | stft = get_spec_mags(clip)
19 | if stft.mean() < .08:
20 | with open(output_path, 'a') as o:
21 | o.write(f'{path}\n')
22 |
23 | if __name__ == '__main__':
24 | do_to_files(filter_no_hifreq_data)
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/gen_dvae_codes.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from tqdm import tqdm
5 |
6 | from scripts.audio.gen.speech_synthesis_utils import load_speech_dvae, wav_to_mel
7 |
8 | if __name__ == '__main__':
9 | input_folder = 'C:\\Users\\James\\Downloads\\lex2\\lexfridman_training_mp3'
10 | output_folder = 'C:\\Users\\James\\Downloads\\lex2\\quantized'
11 |
12 | params = {
13 | 'mode': 'unsupervised_audio',
14 | 'path': [input_folder],
15 | 'cache_path': f'{input_folder}/cache.pth',
16 | 'sampling_rate': 22050,
17 | 'pad_to_samples': 441000,
18 | 'resample_clip': False,
19 | 'extra_samples': 0,
20 | 'phase': 'train',
21 | 'n_workers': 2,
22 | 'batch_size': 64,
23 | }
24 | from data import create_dataset, create_dataloader
25 | os.makedirs(output_folder, exist_ok=True)
26 |
27 | ds = create_dataset(params)
28 | dl = create_dataloader(ds, params)
29 |
30 | dvae = load_speech_dvae().cuda()
31 | with torch.no_grad():
32 | for batch in tqdm(dl):
33 | audio = batch['clip'].cuda()
34 | mel = wav_to_mel(audio)
35 | codes = dvae.get_codebook_indices(mel)
36 | for i in range(audio.shape[0]):
37 | c = codes[i, :batch['clip_lengths'][i]//1024+4] # +4 seems empirically to be a good clipping point - it seems to preserve the termination codes.
38 | fn = batch['path'][i]
39 | outp = os.path.join(output_folder, os.path.relpath(fn, input_folder) + ".pth")
40 | os.makedirs(os.path.dirname(outp), exist_ok=True)
41 | torch.save(c.tolist(), outp)
42 |
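
A tiny sketch (hypothetical file name) of reading one of the quantized-code files produced above; each output holds a plain Python list because the codes are saved with `c.tolist()`.

```
import torch

# Output files mirror the input folder structure with ".pth" appended; this name is made up.
codes = torch.load('C:\\Users\\James\\Downloads\\lex2\\quantized\\example.mp3.pth')
print(type(codes), len(codes))  # a plain list of integer dVAE codebook indices
```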
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/pipeline.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import shutil
4 | from subprocess import Popen
5 |
6 | if __name__ == '__main__':
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--path', type=str, help='Path to search for files')
9 | parser.add_argument('--output_path', type=str, help='Path for output files')
10 | args = parser.parse_args()
11 |
12 | cmds = [
13 | f"scripts/audio/preparation/phase_1_split_files.py --path={args.path} --progress_file={args.output_path}_t1/progress.txt --num_threads=6 --output_path={args.output_path}_t1",
14 | f"scripts/audio/preparation/phase_2_sample_and_filter.py --path={args.output_path}_t1 --progress_file={args.output_path}/progress.txt --num_threads=6 --output_path={args.output_path}",
15 | f"scripts/audio/preparation/phase_3_generate_similarities.py --path={args.output_path} --num_workers=4",
16 | ]
17 | os.makedirs(args.output_path, exist_ok=True)
18 | os.makedirs(args.output_path + "_t1", exist_ok=True)
19 |
20 | for cmd in cmds:
21 | p = Popen("python " + cmd, shell=True)
22 | p.wait()
23 |
24 | shutil.rmtree(args.output_path + "_t1")
25 |
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/process_spleeter_filter_outputs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import argparse
4 | from tqdm import tqdm
5 |
6 | if __name__ == '__main__':
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('input', metavar='in', type=str)
9 | parser.add_argument('basis', metavar='basis', type=str)
10 | parser.add_argument('garbage', metavar='garbage', type=str)
11 | args = parser.parse_args()
12 | print(f"Moving files from {args.input} to {args.garbage}")
13 | os.makedirs(args.garbage, exist_ok=True)
14 |
15 | with open(args.input) as f:
16 | lines = f.readlines()
17 | for line in tqdm(lines):
18 | line = line.strip()
19 | assert args.basis in line
20 | movefile = os.path.join(args.garbage, line.replace(args.basis, '')[1:])
21 | print(f'{line} -> {movefile}')
22 | os.makedirs(os.path.dirname(movefile), exist_ok=True)
23 | shutil.move(line, movefile)
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/save_mels_to_disk.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import numpy
5 | import torch
6 | from spleeter.audio.adapter import AudioAdapter
7 | from tqdm import tqdm
8 |
9 | from data.util import find_audio_files
10 | # Iterates over a directory of audio files, computes a mel spectrogram for each clip with MelSpectrogramInjector,
11 | # and saves it to disk alongside the source file as a compressed .npz.
12 | from trainer.injectors.base_injectors import MelSpectrogramInjector
13 |
14 |
15 | def main():
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument('--path')
18 | args = parser.parse_args()
19 | files = find_audio_files(args.path, include_nonwav=True)
20 | mel_inj = MelSpectrogramInjector({'in':'in', 'out':'out'}, {})
21 | audio_loader = AudioAdapter.default()
22 | for e, wav_file in enumerate(tqdm(files)):
23 | if e < 0:
24 | continue
25 | print(f"Processing {wav_file}..")
26 | outfile = f'{wav_file}.npz'
27 | if os.path.exists(outfile):
28 | continue
29 |
30 | try:
31 | wave, sample_rate = audio_loader.load(wav_file, sample_rate=22050)
32 | wave = torch.tensor(wave)[:,0].unsqueeze(0)
33 | wave = wave / wave.abs().max()
34 | except:
35 | print(f"Error with {wav_file}")
36 | continue
37 |
38 | inj = mel_inj({'in': wave})
39 | numpy.savez_compressed(outfile, inj['out'].numpy())
40 |
41 |
42 | if __name__ == '__main__':
43 | main()
44 |
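
A minimal sketch (hypothetical file name) of loading one of the saved spectrograms back; `savez_compressed` was called with a positional array above, so it lands under numpy's default `arr_0` key.

```
import numpy as np

mel = np.load('example_clip.wav.npz')['arr_0']  # file name is made up
print(mel.shape)
```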
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import numpy as np
4 | from spleeter.separator import Separator
5 | from torch.utils.data import DataLoader
6 | from tqdm import tqdm
7 |
8 | from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset
9 |
10 |
11 | def main():
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--path')
14 | parser.add_argument('--out')
15 | parser.add_argument('--resume', default=None)
16 | parser.add_argument('--partition_size', default=None)
17 | parser.add_argument('--partition', default=None)
18 | args = parser.parse_args()
19 |
20 | src_dir = args.path
21 | out_file = args.out
22 | output_sample_rate=22050
23 | resume_file = args.resume
24 |
25 | loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate,
26 | max_duration=10, partition=args.partition, partition_size=args.partition_size,
27 | resume=resume_file), batch_size=1, num_workers=1)
28 |
29 | separator = Separator('spleeter:2stems')
30 | unacceptable_files = open(out_file, 'a')
31 | for batch in tqdm(loader):
32 | audio, files, ends = batch['audio'], batch['files'], batch['ends']
33 | sep = separator.separate(audio.squeeze(0).numpy())
34 | vocals = sep['vocals']
35 | bg = sep['accompaniment']
36 | start = 0
37 | for path, end in zip(files, ends):
38 | vmax = np.abs(vocals[start:end]).mean()
39 | bmax = np.abs(bg[start:end]).mean()
40 | start = end
41 |
42 |             # Record the clip as unacceptable when the ratio of vocal energy to background energy is too low.
43 | ratio = vmax / (bmax+.0000001)
44 | if ratio < 18: # These values were derived empirically
45 | unacceptable_files.write(f'{path[0]}\n')
46 | unacceptable_files.flush()
47 |
48 | unacceptable_files.close()
49 |
50 |
51 | if __name__ == '__main__':
52 | main()
53 |
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/spleeter_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/preparation/spleeter_utils/__init__.py
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/spleeter_utils/spleeter_dataset.py:
--------------------------------------------------------------------------------
1 | from math import ceil
2 |
3 | import numpy as np
4 |
5 | from spleeter.audio.adapter import AudioAdapter
6 | from torch.utils.data import Dataset
7 |
8 | from data.util import find_audio_files
9 |
10 |
11 | class SpleeterDataset(Dataset):
12 | def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
13 | self.batch_sz = batch_sz
14 | self.max_duration = max_duration
15 | self.files = find_audio_files(src_dir, include_nonwav=True)
16 | self.sample_rate = sample_rate
17 |
18 | # Partition files if needed.
19 | if partition_size is not None:
20 | psz = int(partition_size)
21 | prt = int(partition)
22 | self.files = self.files[prt * psz:(prt + 1) * psz]
23 |
24 | # Find the resume point and carry on from there.
25 | if resume is not None:
26 | for i, f in enumerate(self.files):
27 | if resume in f:
28 | break
29 | assert i < len(self.files)
30 | self.files = self.files[i:]
31 | self.loader = AudioAdapter.default()
32 |
33 | def __len__(self):
34 | return ceil(len(self.files) / self.batch_sz)
35 |
36 | def __getitem__(self, item):
37 | item = item * self.batch_sz
38 | wavs = None
39 | files = []
40 | ends = []
41 | for k in range(self.batch_sz):
42 | ind = k+item
43 | if ind >= len(self.files):
44 | break
45 |
46 | #try:
47 | wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
48 | assert sr == 22050
49 | # Get rid of all channels except one.
50 | if wav.shape[1] > 1:
51 | wav = wav[:, 0]
52 |
53 | if wavs is None:
54 | wavs = wav
55 | else:
56 | wavs = np.concatenate([wavs, wav])
57 | ends.append(wavs.shape[0])
58 | files.append(self.files[ind])
59 | #except:
60 | # print(f'Error loading {self.files[ind]}')
61 | return {
62 | 'audio': wavs,
63 | 'files': files,
64 | 'ends': ends
65 | }
--------------------------------------------------------------------------------
/codes/scripts/audio/preparation/split_on_silence.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | from pydub import AudioSegment
5 | from pydub.exceptions import CouldntDecodeError
6 | from pydub.silence import split_on_silence
7 | from data.util import find_audio_files
8 | from tqdm import tqdm
9 |
10 |
11 | # Uses pydub to process a directory of audio files, splitting them into clips at points where it detects a small amount
12 | # of silence.
13 | def main():
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('--path')
16 | parser.add_argument('--out')
17 | args = parser.parse_args()
18 | minimum_duration = 2
19 | maximum_duration = 20
20 | files = find_audio_files(args.path, include_nonwav=True)
21 | for e, wav_file in enumerate(tqdm(files)):
22 | print(f"Processing {wav_file}..")
23 | outdir = os.path.join(args.out, f'{e}_{os.path.basename(wav_file[:-4])}').replace('.', '').strip()
24 | os.makedirs(outdir, exist_ok=True)
25 |
26 | try:
27 | speech = AudioSegment.from_file(wav_file)
28 | except CouldntDecodeError as e:
29 | print(e)
30 | continue
31 | chunks = split_on_silence(speech, min_silence_len=400, silence_thresh=-40,
32 | seek_step=100, keep_silence=50)
33 |
34 | for i in range(0, len(chunks)):
35 | if chunks[i].duration_seconds < minimum_duration or chunks[i].duration_seconds > maximum_duration:
36 | continue
37 | chunks[i].export(f"{outdir}/{i:05d}.mp3", format='mp3', parameters=["-ac", "1"])
38 |
39 | if __name__ == '__main__':
40 | main()
41 |
--------------------------------------------------------------------------------
/codes/scripts/audio/random_mp3_splitter.py:
--------------------------------------------------------------------------------
1 | from scipy.io import wavfile
2 | from spleeter.separator import Separator
3 | from tqdm import tqdm
4 |
5 | from data.util import find_audio_files
6 | import os.path as osp
7 | from spleeter.audio.adapter import AudioAdapter
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 | src_dir = 'P:\\Audiobooks-Podcasts'
13 | #src_dir = 'E:\\audio\\books'
14 | output_dir = 'D:\\data\\audio\\misc-split'
15 | output_dir_lq = 'D:\\data\\audio\\misc-split-with-bg'
16 | output_dir_garbage = 'D:\\data\\audio\\misc-split-garbage'
17 | #output_dir = 'E:\\audio\\books-clips'
18 | clip_length = 5 # In seconds
19 | sparsity = .1 # Only this proportion of the total clips are extracted as wavs.
20 | output_sample_rate=22050
21 |
22 | audio_loader = AudioAdapter.default()
23 | separator = Separator('spleeter:2stems')
24 | files = find_audio_files(src_dir, include_nonwav=True)
25 | for e, file in enumerate(tqdm(files)):
26 | if e < 1092:
27 | continue
28 | file_basis = osp.relpath(file, src_dir)\
29 | .replace('/', '_')\
30 | .replace('\\', '_')\
31 | .replace('.', '_')\
32 | .replace(' ', '_')\
33 | .replace('!', '_')\
34 | .replace(',', '_')
35 | if len(file_basis) > 100:
36 | file_basis = file_basis[:100]
37 | try:
38 | wave, sample_rate = audio_loader.load(file, sample_rate=output_sample_rate)
39 | except:
40 | print(f"Error with {file}")
41 | continue
42 |
43 | #if len(wave.shape) < 2:
44 | # continue
45 |
46 | # Calculate how much data we need to extract for each clip.
47 | clip_sz = sample_rate * clip_length
48 | interval = int(sample_rate * (clip_length / sparsity))
49 | i = 0
50 | while (i+clip_sz) < wave.shape[0]:
51 | clip = wave[i:i+clip_sz]
52 | sep = separator.separate(clip)
53 | vocals = sep['vocals']
54 | bg = sep['accompaniment']
55 | vmax = np.abs(vocals).mean()
56 | bmax = np.abs(bg).mean()
57 |
58 |             # Only route the clip to the "good" sample dir if the ratio of vocal energy to background energy is high enough.
59 |             ratio = vmax / (bmax+.0000001)
60 |             if ratio >= 25: # These values were derived empirically
61 |                 od = output_dir
62 |                 out = clip
63 |             elif ratio >= 1:
64 |                 od = output_dir_lq
65 |                 out = vocals
66 |             else:
67 |                 od = output_dir_garbage
68 |                 out = vocals
69 |
70 |             # Strip out channels.
71 |             if len(out.shape) > 1:
72 |                 out = out[:, 0] # Just use the first channel.
73 |
74 |             wavfile.write(osp.join(od, f'{e}_{file_basis}_{i}.wav'), output_sample_rate, out)
75 | i = i + interval
76 |
--------------------------------------------------------------------------------
/codes/scripts/audio/spleeter_split_voice_and_background.py:
--------------------------------------------------------------------------------
1 | from scipy.io import wavfile
2 | from spleeter.separator import Separator
3 | from tqdm import tqdm
4 | '''
5 | Uses spleeter to separate each audio clip into vocals and accompaniment, then bins the clip based on how much
6 | background noise it contains.
7 | '''
8 |
9 | from data.util import find_audio_files
10 | import os
11 | import os.path as osp
12 | from spleeter.audio.adapter import AudioAdapter
13 | import numpy as np
14 |
15 |
16 | # Uses spleeter_utils to divide audio clips into one of two bins:
17 | # 1. Audio has little to no background noise, saved to "output_dir"
18 | # 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
19 | if __name__ == '__main__':
20 | src_dir = 'F:\\split\\joe_rogan'
21 | output_dir = 'F:\\split\\cleaned\\joe_rogan'
22 | output_dir_bg = 'F:\\split\\background-noise\\joe_rogan'
23 | output_sample_rate=22050
24 |
25 | os.makedirs(output_dir_bg, exist_ok=True)
26 | os.makedirs(output_dir, exist_ok=True)
27 |
28 | audio_loader = AudioAdapter.default()
29 | separator = Separator('spleeter:2stems')
30 | files = find_audio_files(src_dir, include_nonwav=True)
31 | for e, file in enumerate(tqdm(files)):
32 | #if e < 406500:
33 | # continue
34 | file_basis = osp.relpath(file, src_dir)\
35 | .replace('/', '_')\
36 | .replace('\\', '_')\
37 | .replace('.', '_')\
38 | .replace(' ', '_')\
39 | .replace('!', '_')\
40 | .replace(',', '_')
41 | if len(file_basis) > 100:
42 | file_basis = file_basis[:100]
43 | try:
44 | wave, sample_rate = audio_loader.load(file, sample_rate=output_sample_rate)
45 | except:
46 | print(f"Error with {file}")
47 | continue
48 |
49 | sep = separator.separate(wave)
50 | vocals = sep['vocals']
51 | bg = sep['accompaniment']
52 | vmax = np.abs(vocals).mean()
53 | bmax = np.abs(bg).mean()
54 |
55 |         # Only output to the "good" sample dir if the ratio of vocal energy to background energy is high enough.
56 |         ratio = vmax / (bmax+.0000001)
57 |         if ratio >= 25: # These values were derived empirically
58 |             od = output_dir
59 |             out = wave
60 |         elif ratio <= 1:
61 |             od = output_dir_bg
62 |             out = bg
63 |         else:
64 |             continue
65 |
66 |         # Strip out channels.
67 |         if len(out.shape) > 1:
68 |             out = out[:, 0] # Just use the first channel.
69 |         os.makedirs(osp.join(od, file_basis), exist_ok=True) # Ensure the per-file output folder exists.
70 |         wavfile.write(osp.join(od, file_basis, f'{e}.wav'), output_sample_rate, out)
71 |
--------------------------------------------------------------------------------
/codes/scripts/audio/test_audio_similarity.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | import torch.nn.functional as F
5 |
6 | from data.util import is_wav_file, find_files_of_type
7 | from models.audio.audio_resnet import resnet50
8 | from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
9 | from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict
10 |
11 | if __name__ == '__main__':
12 | window = 48000
13 | root_path = 'D:\\tmp\\clips'
14 | paths = find_files_of_type('img', root_path, qualifier=is_wav_file)[0]
15 | clips = []
16 | for path in paths:
17 | clip, sr = load_wav_to_torch(os.path.join(root_path, path))
18 | if len(clip.shape) > 1:
19 | clip = clip[:,0]
20 | clip = clip[:window].unsqueeze(0)
21 | clip = clip / 32768.0 # Normalize
22 | #clip = clip + torch.rand_like(clip) * .03 # Noise (this is how the model was trained)
23 | assert sr == 24000
24 | clips.append(clip)
25 | clips = torch.stack(clips, dim=0)
26 |
27 | resnet = resnet50()
28 | sd = torch.load('../experiments/train_byol_audio_clips/models/8000_generator.pth')
29 | sd = extract_byol_model_from_state_dict(sd)
30 | resnet.load_state_dict(sd)
31 | embedding = resnet(clips, return_pool=True)
32 |
33 | for i, path in enumerate(paths):
34 | print(f'Using a baseline of {path}..')
35 | for j, cpath in enumerate(paths):
36 | if i == j:
37 | continue
38 | l2 = F.mse_loss(embedding[j], embedding[i])
39 | print(f'Compared to {cpath}: {l2}')
40 |
41 |
--------------------------------------------------------------------------------
/codes/scripts/audio/test_audio_speech_recognition.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import logging
3 | import random
4 | import argparse
5 |
6 | import utils
7 | import utils.options as option
8 | import utils.util as util
9 | from models.audio.tts.tacotron2 import sequence_to_text
10 | from trainer.ExtensibleTrainer import ExtensibleTrainer
11 | from data import create_dataset, create_dataloader
12 | from tqdm import tqdm
13 | import torch
14 | import numpy as np
15 | from scipy.io import wavfile
16 |
17 |
18 | def forward_pass(model, data, output_dir, opt, b):
19 | with torch.no_grad():
20 | model.feed_data(data, 0)
21 | model.test()
22 |
23 | if 'real_text' in opt['eval'].keys():
24 | real = data[opt['eval']['real_text']][0]
25 | print(f'{b} Real text: "{real}"')
26 |
27 | pred_seq = model.eval_state[opt['eval']['gen_text']][0]
28 | pred_text = [sequence_to_text(ts) for ts in pred_seq]
29 | audio = model.eval_state[opt['eval']['audio']][0].cpu().numpy()
30 | wavfile.write(osp.join(output_dir, f'{b}_clip.wav'), 22050, audio)
31 | for i, text in enumerate(pred_text):
32 | print(f'{b} Predicted text {i}: "{text}"')
33 |
34 |
35 | if __name__ == "__main__":
36 | # Set seeds
37 | torch.manual_seed(5555)
38 | random.seed(5555)
39 | np.random.seed(5555)
40 |
41 | #### options
42 | torch.backends.cudnn.benchmark = True
43 | want_metrics = False
44 | parser = argparse.ArgumentParser()
45 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_gpt_asr_mass.yml')
46 | opt = option.parse(parser.parse_args().opt, is_train=False)
47 | opt = option.dict_to_nonedict(opt)
48 | utils.util.loaded_options = opt
49 |
50 | util.mkdirs(
51 | (path for key, path in opt['path'].items()
52 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
53 | util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
54 | screen=True, tofile=True)
55 | logger = logging.getLogger('base')
56 | logger.info(option.dict2str(opt))
57 |
58 | test_loaders = []
59 | for phase, dataset_opt in sorted(opt['datasets'].items()):
60 | test_set, collate_fn = create_dataset(dataset_opt, return_collate=True)
61 | test_loader = create_dataloader(test_set, dataset_opt, collate_fn=collate_fn)
62 | logger.info('Number of test texts in [{:s}]: {:d}'.format(dataset_opt['name'], len(test_set)))
63 | test_loaders.append(test_loader)
64 |
65 | model = ExtensibleTrainer(opt)
66 |
67 | batch = 0
68 | for test_loader in test_loaders:
69 | dataset_dir = opt['path']['results_root']
70 | util.mkdir(dataset_dir)
71 |
72 | tq = tqdm(test_loader)
73 | for data in tq:
74 | forward_pass(model, data, dataset_dir, opt, batch)
75 | batch += 1
76 |
77 |
--------------------------------------------------------------------------------
/codes/scripts/audio/use_vocoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from scipy.io import wavfile
3 |
4 | from models.audio.vocoders.waveglow.waveglow import WaveGlow
5 |
6 |
7 | class Vocoder:
8 | def __init__(self):
9 | self.model = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8, n_early_size=2, n_early_every=4, WN_config={'n_layers': 8, 'n_channels': 256, 'kernel_size': 3})
10 | sd = torch.load('../experiments/waveglow_256channels_universal_v5.pth')
11 | self.model.load_state_dict(sd)
12 | self.model = self.model.cpu()
13 | self.model.eval()
14 |
15 | def transform_mel_to_audio(self, mel):
16 | if len(mel.shape) == 2: # Assume it's missing the batch dimension and fix that.
17 | mel = mel.unsqueeze(0)
18 | with torch.no_grad():
19 | return self.model.infer(mel)
20 |
21 |
22 | if __name__ == '__main__':
23 | vocoder = Vocoder()
24 | m = torch.load('C:\\Users\\jbetk\\Documents\\tmp\\some_audio\\00008.mel').cpu()
25 | wav = vocoder.transform_mel_to_audio(m)
26 | wavfile.write(f'0.wav', 22050, wav[0].cpu().numpy())
--------------------------------------------------------------------------------
/codes/scripts/audio/word_error_rate.py:
--------------------------------------------------------------------------------
1 | import Levenshtein
2 | from jiwer import wer, compute_measures
3 | import torch
4 | from tqdm import tqdm
5 |
6 | from data.audio.voice_tokenizer import VoiceBpeTokenizer
7 |
8 |
9 | def load_truths(file):
10 | niltok = VoiceBpeTokenizer(None)
11 | out = {}
12 | with open(file, 'r', encoding='utf-8') as f:
13 | for line in f.readlines():
14 | spl = line.split('|')
15 | if len(spl) != 2:
16 | print(spl)
17 | continue
18 | path, truth = spl
19 | #path = path.replace('wav/', '')
20 | # This preprocesses the truth data in the same way that training data is processed: removing punctuation, all lowercase, removing unnecessary
21 | # whitespace, and applying "english cleaners", which convert words like "mrs" to "missus" and such.
22 | truth = niltok.preprocess_text(truth)
23 | out[path] = truth
24 | return out
25 |
26 |
27 | if __name__ == '__main__':
28 | inference_tsv = 'results.tsv'
29 | libri_base = 'y:\\bigasr_dataset/librispeech/test_clean/test_clean.txt'
30 |
31 | # Pre-process truth values
32 | truths = load_truths(libri_base)
33 |
34 | niltok = VoiceBpeTokenizer(None)
35 | ground_truths = []
36 | hypotheses = []
37 | with open(inference_tsv, 'r') as tsv_file:
38 | tsv = tsv_file.read().splitlines()
39 | for line in tqdm(tsv):
40 | sentence_pred, wav = line.split('\t')
41 | hypotheses.append(niltok.preprocess_text(sentence_pred))
42 | ground_truths.append(truths[wav])
43 |     word_error_rate = wer(ground_truths, hypotheses) * 100
44 |     print(f"WER: {word_error_rate}")
45 |
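
For clarity, hypothetical one-line samples of the two inputs parsed above (paths and text are made up):

```
# libri_base format: "<wav path>|<ground-truth transcript>", one pair per line.
truth_line = '2830-3980-0026.wav|FULTON STREET WAS A MESS\n'
# inference_tsv format: "<predicted sentence>\t<wav path>", one pair per line.
inference_line = 'fulton street was a mess\t2830-3980-0026.wav\n'
```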
--------------------------------------------------------------------------------
/codes/scripts/byol/byol_extract_wrapped_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def extract_byol_model_from_state_dict(sd):
5 | wrap_key = 'online_encoder.net.'
6 | sdo = {}
7 | for k,v in sd.items():
8 | if wrap_key in k:
9 | sdo[k.replace(wrap_key, '')] = v
10 | return sdo
11 |
12 | if __name__ == '__main__':
13 | pretrained_path = '../../../experiments/uresnet_pixpro4_imgset.pth'
14 | output_path = '../../../experiments/uresnet_pixpro4_imgset.pth'
15 |
16 | sd = torch.load(pretrained_path)
17 | sd = extract_byol_model_from_state_dict(sd)
18 |
19 | #model = SpineNet('49', in_channels=3, use_input_norm=True).to('cuda')
20 | #model.load_state_dict(sdo, strict=True)
21 |
22 |     print("Dumping extracted state dict to output path.")
23 |     torch.save(sd, output_path)
--------------------------------------------------------------------------------
/codes/scripts/do_to_files.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import functools
3 | import os
4 | import pathlib
5 | from multiprocessing.pool import ThreadPool
6 |
7 | from tqdm import tqdm
8 |
9 |
10 | '''
11 | Helper function for scripts that iterate over large sets of files. Defines command-line arguments
12 | for operating over a large set of files, then handles setting up a worker queue system to operate
13 | on those files. You need to provide your own process_file_fn.
14 |
15 | process_file_fn expected signature:
16 | (path, output_path)
17 | '''
18 | def do_to_files(process_file_fn):
19 | parser = argparse.ArgumentParser()
20 | parser.add_argument('--path')
21 | parser.add_argument('--glob')
22 | parser.add_argument('--out')
23 |     parser.add_argument('--resume', type=int, default=0)
24 |     parser.add_argument('--num_workers', type=int, default=0)
25 |
26 | args = parser.parse_args()
27 | src = args.path
28 | glob = args.glob
29 | out = args.out
30 |     resume = args.resume
31 |     num_workers = args.num_workers
32 |
33 | path = pathlib.Path(src)
34 | files = path.rglob(glob)
35 | files = [str(f) for f in files]
36 | files = files[resume:]
37 | pfn = functools.partial(process_file_fn, output_path=out)
38 | if num_workers > 0:
39 | with ThreadPool(num_workers) as pool:
40 | list(tqdm(pool.imap(pfn, files), total=len(files)))
41 | else:
42 | for f in tqdm(files):
43 | pfn(f)
44 |
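
A minimal usage sketch, mirroring how filter_clips_with_no_hifreq_data.py uses this helper; the worker function here is made up.

```
from scripts.do_to_files import do_to_files


def record_path(path, output_path):
    # Hypothetical worker: append every visited file to a manifest at output_path.
    with open(output_path, 'a') as f:
        f.write(f'{path}\n')


if __name__ == '__main__':
    # e.g. python my_script.py --path /data/clips --glob "*.wav" --out manifest.txt --num_workers 4
    do_to_files(record_path)
```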
--------------------------------------------------------------------------------
/codes/scripts/folderize_imagenet_val.py:
--------------------------------------------------------------------------------
1 | from glob import glob
2 |
3 | import torch
4 | import os
5 | import shutil
6 |
7 | if __name__ == '__main__':
8 | index_map_file = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\imagenet_index_to_train_folder_name_map.pth'
9 | ground_truth = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\validation_ground_truth.txt'
10 | val_path = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\val'
11 |
12 | index_map = torch.load(index_map_file)
13 |
14 | for folder in index_map.values():
15 | os.makedirs(os.path.join(val_path, folder), exist_ok=True)
16 |
17 | gtfile = open(ground_truth, 'r')
18 | gtids = []
19 | for line in gtfile:
20 | gtids.append(int(line.strip()))
21 | gtfile.close()
22 |
23 | for i, img_file in enumerate(glob(os.path.join(val_path, "*.JPEG"))):
24 | shutil.move(img_file, os.path.join(val_path, index_map[gtids[i]],
25 | os.path.basename(img_file)))
26 | print("Done!")
27 |
--------------------------------------------------------------------------------
/codes/scripts/hugging_face_hub_upload.py:
--------------------------------------------------------------------------------
1 | if __name__ == '__main__':
2 | """
3 | Utility script for uploading model weights to the HF hub
4 | """
5 |
6 | '''
7 | model = Wav2VecWrapper(vocab_size=148, basis_model='facebook/wav2vec2-large-robust-ft-libri-960h', freeze_transformer=True, checkpointing_enabled=False)
8 | weights = torch.load('D:\\dlas\\experiments\\train_wav2vec_mass_large2\\models\\22500_wav2vec.pth')
9 | model.load_state_dict(weights)
10 | model.w2v.save_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli", push_to_hub=True)
11 | '''
12 |
13 | # Build tokenizer vocab
14 | #mapping = tacotron_symbol_mapping()
15 | #print(json.dumps(mapping))
--------------------------------------------------------------------------------
/codes/scripts/start_tensorboard.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | #script to find the latest directory in a directory and start tensorboard from there
4 |
5 |
6 | def get_latest_dir(path):
7 | dirs = os.listdir(path)
8 | dirs = [os.path.join(path, d) for d in dirs]
9 | dirs = [d for d in dirs if os.path.isdir(d)]
10 | return max(dirs, key=os.path.getmtime)
11 |
12 | def start_tensorboard(path):
13 | latest_dir = get_latest_dir(path)
14 |     latest_dir = os.path.join(latest_dir, 'tb_logger')
15 | os.system('tensorboard --logdir ' + latest_dir)
16 |
17 | if __name__ == '__main__':
18 | #process experiments folder
19 | print('Starting tensorboard from latest experiment folder:' + get_latest_dir('experiments') + '...')
20 | start_tensorboard('experiments')
--------------------------------------------------------------------------------
/codes/scripts/stitch_images.py:
--------------------------------------------------------------------------------
1 | import glob
2 |
3 | import torch
4 | import torchvision
5 | from PIL import Image
6 | from torchvision.transforms import ToTensor
7 |
8 | if __name__ == '__main__':
9 | imfolder = 'F:\\dlas\\results\\test_diffusion_unet\\imgset5'
10 | cols, rows = 10, 5
11 |     images = iter(glob.glob(f'{imfolder}/*.png'))
12 | output = None
13 | for r in range(rows):
14 | for c in range(cols):
15 | im = ToTensor()(Image.open(next(images)))
16 | if output is None:
17 |                 ch, h, w = im.shape
18 |                 output = torch.zeros(ch, h * rows, w * cols)
19 | output[:,r*h:(r+1)*h,c*w:(c+1)*w] = im
20 | torchvision.utils.save_image(output, "out.png")
--------------------------------------------------------------------------------
/codes/scripts/stylegan2/dnnlib/tflib/network.py:
--------------------------------------------------------------------------------
1 | # Pretends to be the stylegan2 Network class for intercepting pickle load requests.
2 | # Horrible hack. Please don't judge me.
3 |
4 | # Globals for storing these networks because I have no idea how pickle is doing this internally.
5 | generator, discriminator, gen_ema = {}, {}, {}
6 |
7 | class Network:
8 | def __setstate__(self, state: dict) -> None:
9 | global generator, discriminator, gen_ema
10 | name = state['name']
11 | if name in ['G_synthesis', 'G_mapping', 'G', 'G_main']:
12 | if name != 'G' and name not in generator.keys():
13 | generator[name] = state
14 | else:
15 | gen_ema[name] = state
16 | elif name in ['D']:
17 | discriminator[name] = state
18 |
--------------------------------------------------------------------------------
/codes/scripts/ui/image_labeler/label_editor.py:
--------------------------------------------------------------------------------
1 | import orjson
2 |
3 | from data.images.image_label_parser import VsNetImageLabeler
4 |
5 |
6 | # Translates from the label JSON output of the VS.NET UI to something more compact and usable.
7 | def convert_from_vsnet_labels():
8 |     labeler = VsNetImageLabeler(['F:\\4k6k\\datasets\\ns_images\\512_unsupervised\\categories.json',
9 |                                  'F:\\4k6k\\datasets\\ns_images\\512_unsupervised\\categories_new.json',
10 |                                  'F:\\4k6k\\datasets\\ns_images\\512_unsupervised\\categories_new_new.json'])
11 | # Proposed format:
12 | # 'config': { 'dim' }
13 | # 'labels': [{ 'label', 'key'}] <- ordered by label index.
14 | # 'images': {'file': [{ 'lid', 'top', 'left' }}
15 | # 'labelMap' {}
16 | out_dict = {
17 | 'config': {
18 | 'dim': next(iter(labeler.labeled_images.values()))[0]['patch_width']
19 | },
20 | 'labels': [{'label': cat['label'], 'key': cat['keyBinding']} for cat in labeler.categories.values()],
21 | }
22 | out_dict['labelMap'] = {}
23 | for i, lbl in enumerate(out_dict['labels']):
24 | out_dict['labelMap'][lbl['label']] = i
25 | out_dict['images'] = {}
26 | for fname, ilbls in labeler.labeled_images.items():
27 | out_dict['images'][fname] = [{'lid': out_dict['labelMap'][il['label']], 'top': il['patch_top'], 'left': il['patch_left']} for il in ilbls]
28 | with open("label_editor.json", 'wb') as fout:
29 | fout.write(orjson.dumps(out_dict))
30 |
31 |
32 | if __name__ == '__main__':
33 | convert_from_vsnet_labels()
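
A hypothetical example (made-up file name, labels and coordinates) of the compact structure assembled above, written as a Python literal rather than JSON:

```
example_output = {
    'config': {'dim': 64},                            # patch size, taken from the first labeled image
    'labels': [{'label': 'sky', 'key': 's'},          # ordered; the list index is the label id
               {'label': 'tree', 'key': 't'}],
    'labelMap': {'sky': 0, 'tree': 1},
    'images': {
        '00001.png': [{'lid': 0, 'top': 0, 'left': 128},
                      {'lid': 1, 'top': 64, 'left': 0}],
    },
}
```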
--------------------------------------------------------------------------------
/codes/scripts/ui/image_labeler/pretrained_image_patch_classifier.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os.path as osp
3 |
4 | import utils
5 | import utils.options as option
6 | import utils.util as util
7 | from data import create_dataset, create_dataloader
8 | from trainer.ExtensibleTrainer import ExtensibleTrainer
9 |
10 |
11 | class PretrainedImagePatchClassifier:
12 | def __init__(self, cfg):
13 | self.cfg = cfg
14 |
15 | opt = option.parse(cfg, is_train=False)
16 | opt = option.dict_to_nonedict(opt)
17 | utils.util.loaded_options = opt
18 |
19 | util.mkdirs(
20 | (path for key, path in opt['path'].items()
21 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
22 | util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
23 | screen=True, tofile=True)
24 | logger = logging.getLogger('base')
25 | logger.info(option.dict2str(opt))
26 |
27 | #### Create test dataset and dataloader
28 | dataset_opt = list(opt['datasets'].values())[0]
29 | # Remove labeling features from the dataset config and wrappers.
30 | if 'dataset' in dataset_opt.keys():
31 | if 'labeler' in dataset_opt['dataset'].keys():
32 | dataset_opt['dataset']['includes_labels'] = False
33 | del dataset_opt['dataset']['labeler']
34 | test_set = create_dataset(dataset_opt)
35 | if hasattr(test_set, 'wrapped_dataset'):
36 | test_set = test_set.wrapped_dataset
37 | else:
38 | test_set = create_dataset(dataset_opt)
39 | logger.info('Number of test images: {:d}'.format(len(test_set)))
40 | self.test_loader = create_dataloader(test_set, dataset_opt, opt)
41 | self.model = ExtensibleTrainer(opt)
42 | self.gen = self.model.netsG['generator']
43 | self.dataset_dir = osp.join(opt['path']['results_root'], opt['name'])
44 | util.mkdir(self.dataset_dir)
45 |
46 | def get_next_sample(self):
47 |
48 | for data in self.test_loader:
49 | hq = data['hq'].to('cuda')
50 | res = self.gen(hq)
51 | yield hq, res, data
52 |
53 |
--------------------------------------------------------------------------------
/codes/scripts/ui/image_labeler/test_image_patch_classifier.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import torch
5 | import torchvision
6 |
7 | import utils.options as option
8 | from scripts.ui.image_labeler.pretrained_image_patch_classifier import PretrainedImagePatchClassifier
9 |
10 | if __name__ == "__main__":
11 | #### options
12 | torch.backends.cudnn.benchmark = True
13 | want_metrics = False
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/train_imgset_structural_classifier.yml')
16 |
17 | classifier = PretrainedImagePatchClassifier(parser.parse_args().opt)
18 | label_to_search_for = 4
19 | step = 1
20 |     for hq, res, _ in classifier.get_next_sample():
21 | res = torch.nn.functional.interpolate(res, size=hq.shape[2:], mode="nearest")
22 | res_lbl = res[:, label_to_search_for, :, :].unsqueeze(1)
23 | res_lbl_mask = (1.0 * (res_lbl > .5))*.5 + .5
24 | hq = hq * res_lbl_mask
25 | torchvision.utils.save_image(hq, os.path.join(classifier.dataset_dir, "%i.png" % (step,)))
26 | step += 1
27 |
--------------------------------------------------------------------------------
/codes/scripts/use_generator_as_filter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 |
4 | from torch.utils.data import DataLoader
5 |
6 | from data.images.single_image_dataset import SingleImageDataset
7 | from tqdm import tqdm
8 | import torch
9 |
10 | from models.vqvae.vqvae_no_conv_transpose import VQVAE
11 |
12 | if __name__ == "__main__":
13 | bin_path = "f:\\binned"
14 | good_path = "f:\\good"
15 | os.makedirs(bin_path, exist_ok=True)
16 | os.makedirs(good_path, exist_ok=True)
17 |
18 | torch.backends.cudnn.benchmark = True
19 |
20 | model = VQVAE().cuda()
21 | model.load_state_dict(torch.load('../experiments/nvqvae_imgset.pth'))
22 | ds = SingleImageDataset({
23 | 'name': 'amalgam',
24 | 'paths': ['F:\\4k6k\\datasets\\ns_images\\imagesets\\256_with_ref_v5'],
25 | 'weights': [1],
26 | 'target_size': 128,
27 | 'force_multiple': 32,
28 | 'scale': 1,
29 | 'eval': False
30 | })
31 | dl = DataLoader(ds, batch_size=256, num_workers=1)
32 |
33 | means = []
34 | model.eval()
35 | with torch.no_grad():
36 | for i, data in enumerate(tqdm(dl)):
37 | hq = data['hq'].cuda()
38 | gen = model(hq)[0]
39 | l2 = torch.mean(torch.square(hq - gen), dim=[1,2,3])
40 | for b in range(len(l2)):
41 | if l2[b] > .0004:
42 | shutil.copy(data['GT_path'][b], good_path)
43 | #else:
44 | # shutil.copy(data['GT_path'][b], bin_path)
45 |
46 |
47 | #means.append(l2.cpu())
48 | #if i % 10 == 0:
49 | # print(torch.stack(means, dim=0).mean())
50 |
--------------------------------------------------------------------------------
/codes/scripts/validate_data.py:
--------------------------------------------------------------------------------
1 | # This script iterates through all the data with no worker threads and performs whatever transformations are prescribed.
2 | # The idea is to find bad/corrupt images.
3 |
4 | import math
5 | import argparse
6 | import random
7 | import torch
8 | from utils import util, options as option
9 | from data import create_dataloader, create_dataset
10 | from tqdm import tqdm
11 | from skimage import io
12 |
13 | def main():
14 | #### options
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../../options/train_prog_mi1_rrdb_6bypass.yml')
17 | parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none',
18 | help='job launcher')
19 | parser.add_argument('--local_rank', type=int, default=0)
20 | args = parser.parse_args()
21 | opt = option.parse(args.opt, is_train=True)
22 |
23 | #### distributed training settings
24 | opt['dist'] = False
25 | rank = -1
26 |
27 | # convert to NoneDict, which returns None for missing keys
28 | opt = option.dict_to_nonedict(opt)
29 |
30 | #### random seed
31 | seed = opt['train']['manual_seed']
32 | if seed is None:
33 | seed = random.randint(1, 10000)
34 | util.set_random_seed(seed)
35 |
36 | torch.backends.cudnn.benchmark = True
37 | # torch.backends.cudnn.deterministic = True
38 |
39 | #### create train and val dataloader
40 | for phase, dataset_opt in opt['datasets'].items():
41 | if phase == 'train':
42 | train_set = create_dataset(dataset_opt)
43 | train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size']))
44 | total_iters = int(opt['train']['niter'])
45 | total_epochs = int(math.ceil(total_iters / train_size))
46 | dataset_opt['n_workers'] = 0 # Force num_workers=0 to make dataloader work in process.
47 | train_loader = create_dataloader(train_set, dataset_opt, opt, None)
48 | if rank <= 0:
49 | print('Number of training data elements: {:,d}, iters: {:,d}'.format(
50 | len(train_set), train_size))
51 | assert train_loader is not None
52 |
53 | '''
54 | tq_ldr = tqdm(train_set.get_paths())
55 | for path in tq_ldr:
56 | try:
57 | _ = io.imread(path)
58 | # Do stuff with img
59 | except Exception as e:
60 | print("Error with %s" % (path,))
61 | print(e)
62 | '''
63 | tq_ldr = tqdm(train_set)
64 | for ds in tq_ldr:
65 | pass
66 |
67 |
68 | if __name__ == '__main__':
69 | main()
70 |
--------------------------------------------------------------------------------
/codes/sweep.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import functools
3 | import os
4 | from multiprocessing.pool import ThreadPool
5 |
6 | import torch
7 |
8 | from train import Trainer
9 | from utils import options as option
10 | import collections.abc
11 |
12 |
13 | def deep_update(d, u):
14 | for k, v in u.items():
15 | if isinstance(v, collections.abc.Mapping):
16 | d[k] = deep_update(d.get(k, {}), v)
17 | else:
18 | d[k] = v
19 | return d
20 |
21 |
22 | def launch_trainer(opt, opt_path, rank):
23 | os.environ['CUDA_VISIBLE_DEVICES'] = str(rank)
24 | print('export CUDA_VISIBLE_DEVICES=' + str(rank))
25 | trainer = Trainer()
26 | opt['dist'] = False
27 | trainer.rank = -1
28 | trainer.init(opt_path, opt, 'none')
29 | trainer.do_training()
30 |
31 |
32 | if __name__ == '__main__':
33 | """
34 | Ad-hoc script (hard coded; no command-line parameters) that spawns multiple separate trainers from a single options
35 | file, with a hard-coded set of modifications.
36 | """
37 | base_opt = '../experiments/sweep_music_mel2vec.yml'
38 | modifications = {
39 | 'baseline': {},
40 |         'lr1e3': {'steps': {'generator': {'optimizer_params': {'lr': .001}}}},
41 |         'lr1e5': {'steps': {'generator': {'optimizer_params': {'lr': .00001}}}},
42 | 'no_warmup': {'train': {'warmup_steps': 0}},
43 | }
44 | base_rank = 4
45 | opt = option.parse(base_opt, is_train=True)
46 | all_opts = []
47 | for i, (mod, mod_dict) in enumerate(modifications.items()):
48 | nd = copy.deepcopy(opt)
49 | deep_update(nd, mod_dict)
50 | nd['name'] = f'{nd["name"]}_{mod}'
51 | nd['wandb_run_name'] = mod
52 | base_path = nd['path']['log']
53 | for k, p in nd['path'].items():
54 | if isinstance(p, str) and base_path in p:
55 | nd['path'][k] = p.replace(base_path, f'{base_path}/{mod}')
56 | all_opts.append(nd)
57 |
58 | for i in range(1,len(modifications)):
59 | pid = os.fork()
60 | if pid == 0:
61 | rank = i
62 | break
63 | else:
64 | rank = 0
65 | launch_trainer(all_opts[rank], base_opt, rank+base_rank)
66 |
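
A quick illustration of deep_update() above (values are made up): nested dictionaries are merged key-by-key rather than replaced wholesale, which is what lets each sweep entry override a single leaf of the options tree.

```
from sweep import deep_update  # assumes the codes/ directory is on the import path

base = {'steps': {'generator': {'optimizer_params': {'lr': 1e-4, 'weight_decay': 0.01}}}}
deep_update(base, {'steps': {'generator': {'optimizer_params': {'lr': 1e-5}}}})
assert base['steps']['generator']['optimizer_params'] == {'lr': 1e-5, 'weight_decay': 0.01}
```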
--------------------------------------------------------------------------------
/codes/trainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/__init__.py
--------------------------------------------------------------------------------
/codes/trainer/custom_training_components/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/custom_training_components/__init__.py
--------------------------------------------------------------------------------
/codes/trainer/custom_training_components/stereoscopic.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.cuda.amp import autocast
3 | from models.flownet2.networks import Resample2d
4 | from models.flownet2 import flow2img
5 | from trainer.inject import Injector
6 |
7 |
8 | def create_stereoscopic_injector(opt, env):
9 | type = opt['type']
10 | if type == 'stereoscopic_resample':
11 | return ResampleInjector(opt, env)
12 | elif type == 'stereoscopic_flow2image':
13 | return Flow2Image(opt, env)
14 | return None
15 |
16 |
17 | class ResampleInjector(Injector):
18 | def __init__(self, opt, env):
19 | super(ResampleInjector, self).__init__(opt, env)
20 | self.resample = Resample2d()
21 | self.flow = opt['flowfield']
22 |
23 | def forward(self, state):
24 | with autocast(enabled=False):
25 | return {self.output: self.resample(state[self.input], state[self.flow])}
26 |
27 |
28 | # Converts a flowfield to an image representation for viewing purposes.
29 | # Uses flownet's implementation to do so. Which really sucks. TODO: just do my own implementation in the future.
30 | # Note: this is not differentiable and is only usable for debugging purposes.
31 | class Flow2Image(Injector):
32 | def __init__(self, opt, env):
33 | super(Flow2Image, self).__init__(opt, env)
34 |
35 | def forward(self, state):
36 | with torch.no_grad():
37 | flo = state[self.input].cpu()
38 | bs, c, h, w = flo.shape
39 | flo = flo.permute(0, 2, 3, 1) # flow2img works in numpy space for some reason..
40 | imgs = torch.empty_like(flo)
41 | flo = flo.numpy()
42 | for b in range(bs):
43 | img = flow2img(flo[b]) # Note that this returns the image in an integer format.
44 | img = torch.tensor(img, dtype=torch.float) / 255
45 | imgs[b] = img
46 | imgs = imgs.permute(0, 3, 1, 2)
47 | return {self.output: imgs}
48 |
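
A hypothetical injector configuration (written as a Python dict; in the repo these live in the options YAML) showing the keys read by create_stereoscopic_injector and ResampleInjector above. The state-tensor names are made up.

```
resample_injector_opt = {
    'type': 'stereoscopic_resample',  # dispatched by create_stereoscopic_injector
    'in': 'right_frame',              # key of the image to warp in the trainer state (assumed name)
    'flowfield': 'predicted_flow',    # key of the flow tensor in the trainer state (assumed name)
    'out': 'resampled_frame',         # key under which the warped result is stored
}
```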
--------------------------------------------------------------------------------
/codes/trainer/eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/eval/__init__.py
--------------------------------------------------------------------------------
/codes/trainer/eval/evaluator.py:
--------------------------------------------------------------------------------
1 | # Base class for an evaluator, which is responsible for feeding test data through a model and evaluating the response.
2 | import importlib
3 | import inspect
4 | import pkgutil
5 | import re
6 | import sys
7 |
8 |
9 | class Evaluator:
10 | def __init__(self, model, opt_eval, env, uses_all_ddp=True):
11 | self.model = model.module if hasattr(model, 'module') else model
12 | self.opt = opt_eval
13 | self.env = env
14 | self.uses_all_ddp = uses_all_ddp
15 |
16 | def perform_eval(self):
17 | return {}
18 |
19 |
20 | def format_evaluator_name(name):
21 | # Formats by converting from CamelCase to snake_case and removing trailing "_evaluator"
22 | name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
23 | name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
24 | return name.replace("_evaluator", "")
25 |
26 |
27 | # Works by loading all python modules in the eval/ directory and sniffing out subclasses of Evaluator.
28 | def find_registered_evaluators(base_path="trainer/eval"):
29 | module_iter = pkgutil.walk_packages([base_path])
30 | results = {}
31 | for mod in module_iter:
32 | if mod.ispkg:
33 | EXCLUSION_LIST = []
34 | if mod.name not in EXCLUSION_LIST:
35 | results.update(find_registered_evaluators(f'{base_path}/{mod.name}'))
36 | else:
37 | mod_name = f'{base_path}/{mod.name}'.replace('/', '.')
38 | if 'eval_wer' in mod.name: continue # TODO: this causes an import error for PyCtcDecode. get rid of this if there's a need to use that evaluator.
39 | importlib.import_module(mod_name)
40 | classes = inspect.getmembers(sys.modules[mod_name], inspect.isclass)
41 | for name, obj in classes:
42 | if 'Evaluator' in [mro.__name__ for mro in inspect.getmro(obj)]:
43 | results[format_evaluator_name(name)] = obj
44 | return results
45 |
46 |
47 | class CreateEvaluatorError(Exception):
48 | def __init__(self, name, available):
49 |         super().__init__(f'Could not find the specified evaluator name: {name}. Available evaluators: '
50 | f'{available}')
51 |
52 |
53 | def create_evaluator(model, opt_eval, env):
54 | evaluators = find_registered_evaluators()
55 | type = opt_eval['type']
56 | if type not in evaluators.keys():
57 | raise CreateEvaluatorError(type, list(evaluators.keys()))
58 | return evaluators[opt_eval['type']](model, opt_eval, env)
59 |
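
A minimal sketch (not an evaluator that exists in the repo) of how registration works: drop a subclass of Evaluator into a module under trainer/eval/ and find_registered_evaluators() exposes it under its snake_cased name with the trailing "_evaluator" stripped, so an options file can select it via the eval `type` key (here that key would be 'my_noop').

```
import trainer.eval.evaluator as evaluator


class MyNoopEvaluator(evaluator.Evaluator):
    def __init__(self, model, opt_eval, env):
        super().__init__(model, opt_eval, env, uses_all_ddp=False)

    def perform_eval(self):
        # Return a dict of metric name -> value; an empty dict (as in the base class) is also legal.
        return {'noop_metric': 0.0}
```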
--------------------------------------------------------------------------------
/codes/trainer/eval/fid.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | import os.path as osp
5 | import torchvision
6 | import trainer.eval.evaluator as evaluator
7 | from pytorch_fid import fid_score
8 | from utils.util import opt_get
9 |
10 | # Evaluator that generates uniform noise to feed into a generator, then calculates a FID score on the results.
11 | class StyleTransferEvaluator(evaluator.Evaluator):
12 | def __init__(self, model, opt_eval, env):
13 | super().__init__(model, opt_eval, env, uses_all_ddp=False)
14 | self.batches_per_eval = opt_eval['batches_per_eval']
15 | self.batch_sz = opt_eval['batch_size']
16 | self.im_sz = opt_eval['image_size']
17 | self.fid_real_samples = opt_eval['real_fid_path']
18 | self.gen_output_index = opt_eval['gen_index'] if 'gen_index' in opt_eval.keys() else 0
19 | self.noise_type = opt_get(opt_eval, ['noise_type'], 'imgnoise')
20 | self.latent_dim = opt_get(opt_eval, ['latent_dim'], 512) # Not needed if using 'imgnoise' input.
21 | self.image_norm_range = tuple(opt_get(env['opt'], ['image_normalization_range'], [0,1]))
22 |
23 | def perform_eval(self):
24 | fid_fake_path = osp.join(self.env['base_path'], "../", "fid", str(self.env["step"]))
25 | os.makedirs(fid_fake_path, exist_ok=True)
26 | counter = 0
27 | self.model.eval()
28 | for i in range(self.batches_per_eval):
29 | if self.noise_type == 'imgnoise':
30 | batch = torch.FloatTensor(self.batch_sz, 3, self.im_sz, self.im_sz).uniform_(0., 1.).to(self.env['device'])
31 | elif self.noise_type == 'stylenoise':
32 | batch = torch.randn(self.batch_sz, self.latent_dim).to(self.env['device'])
33 | gen = self.model(batch)
34 | if not isinstance(gen, list) and not isinstance(gen, tuple):
35 | gen = [gen]
36 | gen = gen[self.gen_output_index]
37 | gen = (gen - self.image_norm_range[0]) / (self.image_norm_range[1]-self.image_norm_range[0])
38 | for b in range(self.batch_sz):
39 | torchvision.utils.save_image(gen[b], osp.join(fid_fake_path, "%i_.png" % (counter)))
40 | counter += 1
41 | self.model.train()
42 |
43 | print("Got all images, computing fid")
44 | return {"fid": fid_score.calculate_fid_given_paths([self.fid_real_samples, fid_fake_path], self.batch_sz, True,
45 | 2048)}
46 |
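
A hypothetical opt_eval block (shown as a Python dict; in practice it comes from the options YAML) covering the keys read by StyleTransferEvaluator.__init__ above. The 'type' value assumes the CamelCase-to-snake_case registration scheme in evaluator.py; paths and values are made up.

```
style_fid_eval_opt = {
    'type': 'style_transfer',            # registry key derived from the class name (assumed)
    'batches_per_eval': 10,
    'batch_size': 8,
    'image_size': 256,
    'real_fid_path': '/data/fid_reference_images',
    'gen_index': 0,                      # optional: which generator output to score
    'noise_type': 'stylenoise',          # 'imgnoise' (default) or 'stylenoise'
    'latent_dim': 512,                   # only used with 'stylenoise'
}
```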
--------------------------------------------------------------------------------
/codes/trainer/eval/flow_gaussian_nll.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import DataLoader
3 | from tqdm import tqdm
4 |
5 | import trainer.eval.evaluator as evaluator
6 |
7 | # Evaluate how close to true Gaussian a flow network predicts in a "normal" pass given a LQ/HQ image pair.
8 | from data.images.image_folder_dataset import ImageFolderDataset
9 | from models.image_generation.srflow.flow import GaussianDiag
10 |
11 |
12 | class FlowGaussianNll(evaluator.Evaluator):
13 | def __init__(self, model, opt_eval, env):
14 | super().__init__(model, opt_eval, env, uses_all_ddp=False)
15 | self.batch_sz = opt_eval['batch_size']
16 | self.dataset = ImageFolderDataset(opt_eval['dataset'])
17 | self.dataloader = DataLoader(self.dataset, self.batch_sz)
18 |
19 | def perform_eval(self):
20 | total_zs = 0
21 | z_loss = 0
22 | self.model.eval()
23 | with torch.no_grad():
24 | print("Evaluating FlowGaussianNll..")
25 | for batch in tqdm(self.dataloader):
26 | dev = self.env['device']
27 | z, _, _ = self.model(gt=batch['hq'].to(dev),
28 | lr=batch['lq'].to(dev),
29 | epses=[],
30 | reverse=False,
31 | add_gt_noise=False)
32 | for z_ in z:
33 | z_loss += GaussianDiag.logp(None, None, z_).mean()
34 | total_zs += 1
35 | self.model.train()
36 | return {"gaussian_diff": z_loss / total_zs}
37 |
--------------------------------------------------------------------------------
/codes/trainer/eval/mel_evaluator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import trainer.eval.evaluator as evaluator
4 |
5 | from data import create_dataset
6 | from data.audio.nv_tacotron_dataset import TextMelCollate
7 | from models.audio.tts.tacotron2 import Tacotron2LossRaw
8 | from torch.utils.data import DataLoader
9 | from tqdm import tqdm
10 |
11 |
12 | # Evaluates the performance of a MEL spectrogram predictor.
13 | class MelEvaluator(evaluator.Evaluator):
14 | def __init__(self, model, opt_eval, env):
15 | super().__init__(model, opt_eval, env, uses_all_ddp=True)
16 | self.batch_sz = opt_eval['batch_size']
17 | self.dataset = create_dataset(opt_eval['dataset'])
18 | assert self.batch_sz is not None
19 | self.dataloader = DataLoader(self.dataset, self.batch_sz, shuffle=False, num_workers=1, collate_fn=TextMelCollate(n_frames_per_step=1))
20 | self.criterion = Tacotron2LossRaw()
21 |
22 | def perform_eval(self):
23 | counter = 0
24 | total_error = 0
25 | self.model.eval()
26 | for batch in tqdm(self.dataloader):
27 | model_params = {
28 | 'text_inputs': 'padded_text',
29 | 'text_lengths': 'input_lengths',
30 | 'mels': 'padded_mel',
31 | 'output_lengths': 'output_lengths',
32 | }
33 | params = {k: batch[v].to(self.env['device']) for k, v in model_params.items()}
34 | with torch.no_grad():
35 | pred = self.model(**params)
36 |
37 | targets = ['padded_mel', 'padded_gate']
38 | targets = [batch[t].to(self.env['device']) for t in targets]
39 | total_error += self.criterion(pred, targets).item()
40 | counter += 1
41 | self.model.train()
42 |
43 | return {"validation-score": total_error / counter}
44 |
45 |
--------------------------------------------------------------------------------
/codes/trainer/eval/sr_diffusion_fid.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import os.path as osp
4 | import torchvision
5 | from torch.nn.functional import interpolate
6 | from tqdm import tqdm
7 |
8 | import trainer.eval.evaluator as evaluator
9 |
10 | from pytorch_fid import fid_score
11 | from data import create_dataset
12 | from torch.utils.data import DataLoader, DistributedSampler, SequentialSampler
13 |
14 | from trainer.injectors.gaussian_diffusion_injector import GaussianDiffusionInferenceInjector
15 | from utils.util import opt_get
16 |
17 |
18 | # Performs a FID evaluation on a diffusion network
19 | class SrDiffusionFidEvaluator(evaluator.Evaluator):
20 | def __init__(self, model, opt_eval, env):
21 | super().__init__(model, opt_eval, env)
22 | self.batch_sz = opt_eval['batch_size']
23 | self.fid_batch_size = opt_get(opt_eval, ['fid_batch_size'], 64)
24 | assert self.batch_sz is not None
25 | self.dataset = create_dataset(opt_eval['dataset'])
26 | if torch.distributed.is_available() and torch.distributed.is_initialized():
27 | self.sampler = DistributedSampler(self.dataset, shuffle=False, drop_last=True)
28 | else:
29 | self.sampler = SequentialSampler(self.dataset)
30 | self.fid_real_samples = opt_eval['dataset']['paths'] # This is assumed to exist for the given dataset.
31 | assert isinstance(self.fid_real_samples, str)
32 | self.gd = GaussianDiffusionInferenceInjector(opt_eval['diffusion_params'], env)
33 | self.out_key = opt_eval['diffusion_params']['out']
34 |
35 | def perform_eval(self):
36 | # Attempt to make the dataset deterministic.
37 | self.dataset.reset_random()
38 | dataloader = DataLoader(self.dataset, self.batch_sz, sampler=self.sampler, num_workers=0)
39 |
40 | fid_fake_path = osp.join(self.env['base_path'], "..", "fid", str(self.env["step"]))
41 | os.makedirs(fid_fake_path, exist_ok=True)
42 | counter = 0
43 | for batch in tqdm(dataloader):
44 | batch = {k: v.to(self.env['device']) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
45 | gen = self.gd(batch)[self.out_key]
46 |
47 | # All gather if we're in distributed mode.
48 | if torch.distributed.is_available() and torch.distributed.is_initialized():
49 | gather_list = [torch.zeros_like(gen) for _ in range(torch.distributed.get_world_size())]
50 | torch.distributed.all_gather(gather_list, gen)
51 | gen = torch.cat(gather_list, dim=0)
52 |
53 | if self.env['rank'] <= 0:
54 | for g in gen:
55 | torchvision.utils.save_image(g, osp.join(fid_fake_path, f"{counter}.png"))
56 | counter += 1
57 |
58 | if self.env['rank'] <= 0:
59 | return {"fid": fid_score.calculate_fid_given_paths([self.fid_real_samples, fid_fake_path], self.fid_batch_size,
60 | True, 2048)}
61 | else:
62 | return {}
63 |
--------------------------------------------------------------------------------
/codes/trainer/experiments/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/experiments/__init__.py
--------------------------------------------------------------------------------
/codes/trainer/inject.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import inspect
3 | import pkgutil
4 | import re
5 | import sys
6 |
7 | import torch.nn
8 |
9 |
10 | # Base class for all other injectors.
11 | class Injector(torch.nn.Module):
12 | def __init__(self, opt, env):
13 | super(Injector, self).__init__()
14 | self.opt = opt
15 | self.env = env
16 | if 'in' in opt.keys():
17 | self.input = opt['in']
18 | if 'out' in opt.keys():
19 | self.output = opt['out']
20 |
21 | # This should return a dict of new state variables.
22 | def forward(self, state):
23 | raise NotImplementedError
24 |
25 |
26 | def format_injector_name(name):
27 | # Formats by converting from CamelCase to snake_case and removing trailing "_injector"
28 | name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
29 | name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
30 | return name.replace("_injector", "")
31 |
32 |
33 | # Works by loading all python modules in the injectors/ directory and sniffing out subclasses of Injector.
34 | # Returns a dict mapping each injector's formatted name to its class.
35 | def find_registered_injectors(base_path="trainer/injectors"):
36 | module_iter = pkgutil.walk_packages([base_path])
37 | results = {}
38 | for mod in module_iter:
39 | if mod.ispkg:
40 | EXCLUSION_LIST = []
41 | if mod.name not in EXCLUSION_LIST:
42 | results.update(find_registered_injectors(f'{base_path}/{mod.name}'))
43 | else:
44 | mod_name = f'{base_path}/{mod.name}'.replace('/', '.')
45 | importlib.import_module(mod_name)
46 | classes = inspect.getmembers(sys.modules[mod_name], inspect.isclass)
47 | for name, obj in classes:
48 | if 'Injector' in [mro.__name__ for mro in inspect.getmro(obj)]:
49 | results[format_injector_name(name)] = obj
50 | return results
51 |
52 |
53 | class CreateInjectorError(Exception):
54 | def __init__(self, name, available):
55 | super().__init__(f'Could not find the specified injector name: {name}. Available injectors:'
56 | f'{available}')
57 |
58 |
59 | # Injectors are a way to synthesize data within a step that can then be used (and reused) by loss functions.
60 | def create_injector(opt_inject, env):
61 | injectors = find_registered_injectors()
62 | type = opt_inject['type']
63 | if type not in injectors.keys():
64 | raise CreateInjectorError(type, list(injectors.keys()))
65 | return injectors[opt_inject['type']](opt_inject, env)
66 |
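# Usage sketch: a hypothetical `opt_inject` block in the style of the injector configs found under
# recipes/ (the exact keys accepted depend on the injector class being constructed):
#
#   inj = create_injector({'type': 'generator', 'generator': 'gen', 'in': 'lq', 'out': 'gen'}, env)
#   state.update(inj(state))  # forward() returns a dict of new state variables for the trainer to merge in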
--------------------------------------------------------------------------------
/codes/trainer/injectors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/injectors/__init__.py
--------------------------------------------------------------------------------
/codes/trainer/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | from utils.colors import rgb2ycbcr
5 |
6 |
7 | class CharbonnierLoss(nn.Module):
8 | """Charbonnier Loss (L1)"""
9 |
10 | def __init__(self, eps=1e-6):
11 | super(CharbonnierLoss, self).__init__()
12 | self.eps = eps
13 |
14 | def forward(self, x, y):
15 | diff = x - y
16 | loss = torch.sum(torch.sqrt(diff * diff + self.eps))
17 | return loss
18 |
19 |
20 | class ZeroSpreadLoss(nn.Module):
21 | def __init__(self):
22 | super(ZeroSpreadLoss, self).__init__()
23 |
24 | def forward(self, x, _):
25 | return 2 * torch.nn.functional.sigmoid(1 / torch.abs(torch.mean(x))) - 1
26 |
27 |
28 | # Define GAN loss: [gan | ragan | pixgan | pixgan_fea | crossgan | crossgan_lrref | lsgan | max_spread]
29 | class GANLoss(nn.Module):
30 | def __init__(self, gan_type, real_label_val=1.0, fake_label_val=0.0):
31 | super(GANLoss, self).__init__()
32 | self.gan_type = gan_type.lower()
33 | self.real_label_val = real_label_val
34 | self.fake_label_val = fake_label_val
35 |
36 | if self.gan_type in ['gan', 'ragan', 'pixgan', 'pixgan_fea', 'crossgan', 'crossgan_lrref']:
37 | self.loss = nn.BCEWithLogitsLoss()
38 | elif self.gan_type == 'lsgan':
39 | self.loss = nn.MSELoss()
40 | elif self.gan_type == 'max_spread':
41 | self.loss = ZeroSpreadLoss()
42 | else:
43 | raise NotImplementedError('GAN type [{:s}] is not found'.format(self.gan_type))
44 |
45 | def get_target_label(self, input, target_is_real):
46 | if target_is_real:
47 | return torch.empty_like(input).fill_(self.real_label_val)
48 | else:
49 | return torch.empty_like(input).fill_(self.fake_label_val)
50 |
51 | def forward(self, input, target_is_real):
52 | if self.gan_type in ['pixgan', 'pixgan_fea', 'crossgan', 'crossgan_lrref'] and not isinstance(target_is_real, bool):
53 | target_label = target_is_real
54 | else:
55 | target_label = self.get_target_label(input, target_is_real)
56 | loss = self.loss(input.float(), target_label.float())
57 | return loss
58 |
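# Usage sketch (the discriminator outputs below are hypothetical placeholders):
#
#   criterion = GANLoss('lsgan')
#   d_loss = criterion(d_out_real, True) + criterion(d_out_fake, False)
#   g_loss = criterion(d_out_fake_for_g, True)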
--------------------------------------------------------------------------------
/codes/trainer/optimizers/sgd.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.optim import Optimizer
3 |
4 |
5 | class SGDNoBiasMomentum(Optimizer):
6 | r"""
7 | Copy of pytorch implementation of SGD with a modification which turns off momentum for params marked
8 | with `is_norm` or `is_bias`.
9 | """
10 |
11 | def __init__(self, params, lr, momentum=0, dampening=0,
12 | weight_decay=0, nesterov=False):
13 | if lr < 0.0:
14 | raise ValueError("Invalid learning rate: {}".format(lr))
15 | if momentum < 0.0:
16 | raise ValueError("Invalid momentum value: {}".format(momentum))
17 | if weight_decay < 0.0:
18 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
19 |
20 | defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
21 | weight_decay=weight_decay, nesterov=nesterov)
22 | if nesterov and (momentum <= 0 or dampening != 0):
23 | raise ValueError("Nesterov momentum requires a momentum and zero dampening")
24 | super().__init__(params, defaults)
25 |
26 | def __setstate__(self, state):
27 | super().__setstate__(state)
28 | for group in self.param_groups:
29 | group.setdefault('nesterov', False)
30 |
31 | @torch.no_grad()
32 | def step(self, closure=None):
33 | """Performs a single optimization step.
34 |
35 | Arguments:
36 | closure (callable, optional): A closure that reevaluates the model
37 | and returns the loss.
38 | """
39 | loss = None
40 | if closure is not None:
41 | with torch.enable_grad():
42 | loss = closure()
43 |
44 | for group in self.param_groups:
45 | weight_decay = group['weight_decay']
46 | momentum = group['momentum']
47 | dampening = group['dampening']
48 | nesterov = group['nesterov']
49 |
50 | for p in group['params']:
51 | if p.grad is None:
52 | continue
53 | d_p = p.grad
54 | if weight_decay != 0:
55 | d_p = d_p.add(p, alpha=weight_decay)
56 | # **this is the only modification over standard torch.optim.SGD:
57 | is_bn_or_bias = (hasattr(p, 'is_norm') and p.is_norm) or (hasattr(p, 'is_bias') and p.is_bias)
58 | if not is_bn_or_bias and momentum != 0:
59 | param_state = self.state[p]
60 | if 'momentum_buffer' not in param_state:
61 | buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
62 | else:
63 | buf = param_state['momentum_buffer']
64 | buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
65 | if nesterov:
66 | d_p = d_p.add(buf, alpha=momentum)
67 | else:
68 | d_p = buf
69 |
70 | p.add_(d_p, alpha=-group['lr'])
71 |
72 | return loss
73 |
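# Usage sketch (hypothetical tagging loop): parameters opt out of momentum by carrying an
# `is_norm` or `is_bias` attribute before the optimizer is constructed.
#
#   for name, p in model.named_parameters():
#       if name.endswith('.bias'):
#           p.is_bias = True
#   opt = SGDNoBiasMomentum(model.parameters(), lr=0.1, momentum=0.9)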
--------------------------------------------------------------------------------
/codes/use_discriminator_as_filter.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import logging
3 | import time
4 | import argparse
5 |
6 | import os
7 |
8 | from torchvision.transforms import CenterCrop
9 |
10 | from trainer.ExtensibleTrainer import ExtensibleTrainer
11 | from utils import options as option
12 | import utils.util as util
13 | from data import create_dataset, create_dataloader
14 | from tqdm import tqdm
15 | import torch
16 | import torchvision
17 |
18 |
19 | if __name__ == "__main__":
20 | bin_path = "f:\\tmp\\binned"
21 | good_path = "f:\\tmp\\good"
22 | os.makedirs(bin_path, exist_ok=True)
23 | os.makedirs(good_path, exist_ok=True)
24 |
25 |
26 | torch.backends.cudnn.benchmark = True
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/train_quality_detectors/train_resnet_jpeg.yml')
29 | opt = option.parse(parser.parse_args().opt, is_train=False)
30 | opt = option.dict_to_nonedict(opt)
31 | opt['dist'] = False
32 |
33 | util.mkdirs(
34 | (path for key, path in opt['path'].items()
35 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
36 | util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
37 | screen=True, tofile=True)
38 | logger = logging.getLogger('base')
39 | logger.info(option.dict2str(opt))
40 |
41 | #### Create test dataset and dataloader
42 | test_loaders = []
43 | for phase, dataset_opt in sorted(opt['datasets'].items()):
44 | test_set = create_dataset(dataset_opt)
45 | test_loader = create_dataloader(test_set, dataset_opt, opt=opt)
46 | logger.info('Number of test images in [{:s}]: {:d}'.format(dataset_opt['name'], len(test_set)))
47 | test_loaders.append(test_loader)
48 |
49 | model = ExtensibleTrainer(opt)
50 | fea_loss = 0
51 | for test_loader in test_loaders:
52 | test_set_name = test_loader.dataset.opt['name']
53 | logger.info('\nTesting [{:s}]...'.format(test_set_name))
54 | test_start_time = time.time()
55 | dataset_dir = osp.join(opt['path']['results_root'], test_set_name)
56 | util.mkdir(dataset_dir)
57 |
58 | tq = tqdm(test_loader)
59 | removed = 0
60 | means = []
61 | for k, data in enumerate(tq):
62 | model.feed_data(data, k)
63 | model.test()
64 | results = torch.argmax(torch.nn.functional.softmax(model.eval_state['logits'][0], dim=-1), dim=1)
65 | for i in range(results.shape[0]):
66 | if results[i] == 0:
67 | imname = osp.basename(data['HQ_path'][i])
68 | # For VERIFICATION:
69 | #torchvision.utils.save_image(data['hq'][i], osp.join(bin_path, imname))
70 | # 4 REALZ:
71 | os.remove(data['HQ_path'][i])
72 | removed += 1
73 |
74 | print("Removed %i/%i images" % (removed, len(test_set)))
--------------------------------------------------------------------------------
/codes/utils/UI_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/utils/UI_icon.png
--------------------------------------------------------------------------------
/codes/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/utils/__init__.py
--------------------------------------------------------------------------------
/codes/utils/audio.py:
--------------------------------------------------------------------------------
1 | import librosa
2 | import matplotlib.pyplot as plt
3 |
4 |
5 | def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None):
6 | fig, axs = plt.subplots(1, 1)
7 | axs.set_title(title or "Spectrogram (db)")
8 | axs.set_ylabel(ylabel)
9 | axs.set_xlabel("frame")
10 | im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect)
11 | if xmax:
12 | axs.set_xlim((0, xmax))
13 | fig.colorbar(im, ax=axs)
14 | plt.show(block=False)
--------------------------------------------------------------------------------
/codes/utils/convert_model.py:
--------------------------------------------------------------------------------
1 | # Tool that can be used to add a new layer into an existing model save file. Primarily useful for "progressive"
2 | # models which can be trained piecemeal.
3 |
4 | from utils import options as option
5 | from models import create_model
6 | import torch
7 | import os
8 |
9 | def get_model_for_opt_file(filename):
10 | opt = option.parse(filename, is_train=True)
11 | opt = option.dict_to_nonedict(opt)
12 | model = create_model(opt)
13 | return model, opt
14 |
15 | def copy_state_dict_list(l_from, l_to):
16 | for i, v in enumerate(l_from):
17 | if isinstance(v, list):
18 | copy_state_dict_list(v, l_to[i])
19 | elif isinstance(v, dict):
20 | copy_state_dict(v, l_to[i])
21 | else:
22 | l_to[i] = v
23 |
24 | def copy_state_dict(dict_from, dict_to):
25 | for k in dict_from.keys():
26 | if k == 'optimizers':
27 | for j in range(len(dict_from[k][0]['param_groups'])):
28 | for p in dict_to[k][0]['param_groups'][j]['params']:
29 | del dict_to[k][0]['state']
30 | dict_to[k][0]['param_groups'][j] = dict_from[k][0]['param_groups'][j]
31 | dict_to[k][0]['state'].update(dict_from[k][0]['state'])
32 | print(len(dict_from[k][0].keys()), dict_from[k][0].keys())
33 | print(len(dict_to[k][0].keys()), dict_to[k][0].keys())
34 | assert k in dict_to.keys()
35 | if isinstance(dict_from[k], dict):
36 | copy_state_dict(dict_from[k], dict_to[k])
37 | elif isinstance(dict_from[k], list):
38 | copy_state_dict_list(dict_from[k], dict_to[k])
39 | else:
40 | dict_to[k] = dict_from[k]
41 | return dict_to
42 |
43 | if __name__ == "__main__":
44 | os.chdir("..")
45 | model_from, opt_from = get_model_for_opt_file("../options/train_imgset_pixgan_progressive_srg2.yml")
46 | model_to, _ = get_model_for_opt_file("../options/train_imgset_pixgan_progressive_srg2_.yml")
47 |
48 | '''
49 | model_to.netG.module.update_for_step(1000000000000)
50 | l = torch.nn.MSELoss()
51 | o, _ = model_to.netG(torch.randn(1, 3, 64, 64))
52 | l(o, torch.randn_like(o)).backward()
53 | model_to.optimizer_G.step()
54 | o = model_to.netD(torch.randn(1, 3, 128, 128))
55 | l(o, torch.randn_like(o)).backward()
56 | model_to.optimizer_D.step()
57 | '''
58 |
59 | torch.save(copy_state_dict(model_from.netG.state_dict(), model_to.netG.state_dict()), "converted_g.pth")
60 | torch.save(copy_state_dict(model_from.netD.state_dict(), model_to.netD.state_dict()), "converted_d.pth")
61 |
62 | # Also convert the state.
63 | resume_state_from = torch.load(opt_from['path']['resume_state'])
64 | resume_state_to = model_to.save_training_state({}, return_state=True)
65 | resume_state_from['optimizers'][0]['param_groups'].append(resume_state_to['optimizers'][0]['param_groups'][-1])
66 | torch.save(resume_state_from, "converted_state.pth")
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/codes/utils/distributed_checkpont.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import warnings
3 |
4 |
5 | def detach_variable(inputs):
6 | if isinstance(inputs, tuple):
7 | out = []
8 | for inp in inputs:
9 | x = inp.detach()
10 | x.requires_grad = inp.requires_grad
11 | out.append(x)
12 | return tuple(out)
13 | else:
14 | raise RuntimeError(
15 | "Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__)
16 |
17 |
18 | def check_backward_validity(inputs):
19 | if not any(inp.requires_grad for inp in inputs):
20 | warnings.warn("None of the inputs have requires_grad=True. Gradients will be None")
21 |
22 |
23 | class CheckpointFunction(torch.autograd.Function):
24 | @staticmethod
25 | def forward(ctx, run_function, length, *args):
26 | ctx.run_function = run_function
27 | ctx.input_tensors = list(args[:length])
28 | ctx.input_params = list(args[length:])
29 | with torch.no_grad():
30 | output_tensors = ctx.run_function(*ctx.input_tensors)
31 | return output_tensors
32 |
33 | @staticmethod
34 | def backward(ctx, *output_grads):
35 | for i in range(len(ctx.input_tensors)):
36 | temp = ctx.input_tensors[i]
37 | ctx.input_tensors[i] = temp.detach()
38 | ctx.input_tensors[i].requires_grad = temp.requires_grad
39 | with torch.enable_grad():
40 | output_tensors = ctx.run_function(*ctx.input_tensors)
41 | input_grads = torch.autograd.grad(output_tensors, ctx.input_tensors + ctx.input_params, output_grads, allow_unused=True)
42 | return (None, None) + input_grads
43 |
44 |
45 | def checkpoint(module, *params):
46 | differentiable_params = tuple(filter(lambda p: p.requires_grad, module.parameters()))
47 | if len(differentiable_params) > 0:
48 | args = params + differentiable_params
49 | return CheckpointFunction.apply(module, len(params), *args)
50 | else:
51 | return module(*params)
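# Usage sketch (`module` and `x` are hypothetical stand-ins): the call behaves like `module(x)` in
# the forward pass, but the module's activations are recomputed during backward instead of being
# stored, trading compute for memory.
#
#   y = checkpoint(module, x)
#   y.sum().backward()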
--------------------------------------------------------------------------------
/codes/utils/weight_scheduler.py:
--------------------------------------------------------------------------------
1 | import math
2 | from matplotlib import pyplot as plt
3 |
4 | # Base class for weight schedulers. Holds weight at a fixed initial value.
5 | class WeightScheduler:
6 | def __init__(self, initial_weight):
7 | self.initial_weight = initial_weight
8 |
9 | def get_weight_for_step(self, step):
10 | return self.initial_weight
11 |
12 |
13 | class LinearDecayWeightScheduler(WeightScheduler):
14 | def __init__(self, initial_weight, steps_to_decay, lower_bound, initial_step=0):
15 | super(LinearDecayWeightScheduler, self).__init__(initial_weight)
16 | self.steps_to_decay = steps_to_decay
17 | self.lower_bound = lower_bound
18 | self.initial_step = initial_step
19 | self.decrease_per_step = (initial_weight - lower_bound) / self.steps_to_decay
20 |
21 | def get_weight_for_step(self, step):
22 | step = step - self.initial_step
23 | if step < 0:
24 | return self.initial_weight
25 | return max(self.lower_bound, self.initial_weight - step * self.decrease_per_step)
26 |
27 |
28 | class SinusoidalWeightScheduler(WeightScheduler):
29 | def __init__(self, upper_weight, lower_weight, period_steps, initial_step=0):
30 | super(SinusoidalWeightScheduler, self).__init__(upper_weight)
31 | self.center = (upper_weight + lower_weight) / 2
32 | self.amplitude = (upper_weight - lower_weight) / 2
33 | self.period = period_steps
34 | self.initial_step = initial_step
35 |
36 | def get_weight_for_step(self, step):
37 | step = step - self.initial_step
38 | if step < 0:
39 | return self.initial_weight
40 | # Use cosine because it starts at y=1 for x=0.
41 | return math.cos(step * math.pi * 2 / self.period) * self.amplitude + self.center
42 |
43 |
44 | def get_scheduler_for_opt(opt):
45 | if opt['type'] == 'fixed':
46 | return WeightScheduler(opt['weight'])
47 | elif opt['type'] == 'linear_decay':
48 | return LinearDecayWeightScheduler(opt['initial_weight'], opt['steps'], opt['lower_bound'], opt['start_step'])
49 | elif opt['type'] == 'sinusoidal':
50 | return SinusoidalWeightScheduler(opt['upper_weight'], opt['lower_weight'], opt['period'], opt['start_step'])
51 | else:
52 | raise NotImplementedError
53 |
54 |
55 | # Do some testing.
56 | if __name__ == "__main__":
57 | #sched = SinusoidalWeightScheduler(1, .1, 50, 10)
58 | sched = LinearDecayWeightScheduler(10, 5000, .9, 2000)
59 |
60 | x = []
61 | y = []
62 | for s in range(8000):
63 | x.append(s)
64 | y.append(sched.get_weight_for_step(s))
65 | plt.plot(x, y)
66 | plt.show()
--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
1 | name: DLAS
2 | channels:
3 | - conda-forge
4 | - nvidia
5 | - pytorch
6 | dependencies:
7 | - nvidia::cudatoolkit
8 | - git
9 | - numpy
10 | - pip
11 | - python=3.10.0
12 | - torchvision
13 | - torchaudio
14 | - pytorch::pytorch
15 | - pip:
16 | - -r codes/requirements.laxed.txt
--------------------------------------------------------------------------------
/experiments/clips_mel_norms.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/experiments/clips_mel_norms.pth
--------------------------------------------------------------------------------
/experiments/train_diffusion_vocoder_22k_level.yml:
--------------------------------------------------------------------------------
1 | path:
2 | pretrain_model_dvae: '../experiments/dvae.pth'
3 | strict_load: true
4 | #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
5 | networks:
6 | dvae:
7 | type: generator
8 | which_model_G: lucidrains_dvae
9 | kwargs:
10 | channels: 80
11 | codebook_dim: 512
12 | hidden_dim: 512
13 | kernel_size: 3
14 | num_layers: 2
15 | num_resnet_blocks: 3
16 | num_tokens: 8192
17 | positional_dims: 1
18 | use_transposed_convs: false
19 |
--------------------------------------------------------------------------------
/experiments/train_gpt_tts_unified.yml:
--------------------------------------------------------------------------------
1 | path:
2 | #pretrain_model_dvae: '../experiments/dvae.pth'
3 | strict_load: true
4 | #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
5 | networks:
6 | gpt:
7 | type: generator
8 | which_model_G: unified_voice2
9 | kwargs:
10 | layers: 30 # WAS 8
11 | model_dim: 1024 # WAS 512
12 | heads: 16 # WAS 8
13 | max_text_tokens: 402 # WAS 120
14 | max_mel_tokens: 604 # WAS 250
15 | max_conditioning_inputs: 2 # WAS 1
16 | mel_length_compression: 1024
17 | number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
18 | number_mel_codes: 8194
19 | start_mel_token: 8192
20 | stop_mel_token: 8193
21 | start_text_token: 255
22 | train_solo_embeddings: False # missing in uv3/4
23 | use_mel_codes_as_input: True # ditto
24 | checkpointing: True
25 | freeze_everything_but_position_embeddings: True
26 | tortoise_compat: True
27 |
--------------------------------------------------------------------------------
/recipes/byol/README.md:
--------------------------------------------------------------------------------
1 | # Working with BYOL in DLAS
2 |
3 | [BYOL](https://arxiv.org/abs/2006.07733) is a technique for pretraining an arbitrary image processing
4 | neural network. It is built upon previous self-supervised architectures like SimCLR.
5 |
6 | BYOL in DLAS is adapted from an implementation written by [lucidrains](https://github.com/lucidrains/byol-pytorch).
7 | It is implemented via two wrappers:
8 |
9 | 1. A Dataset wrapper that augments the LQ and HQ inputs from a typical DLAS dataset. Since differentiable
10 | augmentations don't actually matter for BYOL, it makes more sense (to me) to do this on the CPU at the
11 | dataset layer, so your GPU can focus on processing gradients.
12 | 1. A model wrapper that attaches a small MLP to the end of your input network to produce a fixed
13 | size latent. This latent is used to produce the BYOL loss which trains the master weights from
14 | your network.
15 |
16 | Thanks to the excellent implementation from lucidrains, this wrapping process makes training your
17 | network on unsupervised datasets extremely easy.
18 |
19 | The DLAS version improves on lucidrains' implementation by adding some important training details, such as
20 | a custom LARS optimizer implementation that aligns with the recommendations from the paper. By moving augmentation
21 | to the dataset level, additional augmentation options are unlocked - like being able to take two similar video frames
22 | as the image pair.
23 |
24 | # Training BYOL
25 |
26 | In this directory, you will find a sample training config for training BYOL on DIV2K. You will
27 | likely want to insert your own model architecture first.
28 |
29 | Run the trainer by:
30 |
31 | `python train.py -opt train_div2k_byol.yml`
32 |
33 | BYOL is data hungry, as most unsupervised training methods are. If you're providing your own dataset, make sure it
34 | contains hundreds of thousands of images or more!
35 |
36 | ## Using your own model
37 |
38 | Training your own model on this BYOL implementation is trivial:
39 | 1. Add your nn.Module model implementation to the models/ directory.
40 | 2. Register your model with `trainer/networks.py` as a generator (see the sketch at the end of this section). This
41 |    file tells DLAS how to build your model from a set of configuration options.
42 | 3. Copy the sample training config. Change the `subnet` and `hidden_layer` params.
43 | 4. Run your config with `python train.py -opt `.
44 |
45 | *hint: Your network architecture (including layer names) is printed out when running train.py
46 | against your network.*
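
For step 2, here is a minimal registration sketch. It assumes the `register_model` decorator pattern;
the decorator import, its signature, and the `MyEncoder` module are illustrative only, so check
`trainer/networks.py` in your checkout for the exact registration hook it exposes.

```python
# models/my_encoder.py -- hypothetical example module, not part of the repository.
import torch.nn as nn

from trainer.networks import register_model  # assumed registration decorator


class MyEncoder(nn.Module):
    def __init__(self, nf=64):
        super().__init__()
        # 'tail' is the layer you would point the `hidden_layer` config option at.
        self.tail = nn.Conv2d(3, nf, kernel_size=3, padding=1)

    def forward(self, x):
        return self.tail(x)


@register_model
def register_my_encoder(opt_net, opt):
    # opt_net is the `subnet` block from your training config.
    return MyEncoder(nf=opt_net['nf'])
```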
--------------------------------------------------------------------------------
/recipes/diffusion/README.md:
--------------------------------------------------------------------------------
1 | # Working with Gaussian Diffusion models in DLAS
2 |
3 | Diffusion Models are a method of generating structured data using a gradual de-noising process. This process allows for a
4 | simple network training regime.
5 |
6 | This implementation of Gaussian Diffusion is largely based on the work done by OpenAI in their paper ["Diffusion Models
7 | Beat GANs on Image Synthesis"](https://arxiv.org/pdf/2105.05233.pdf) and ["Improved Denoising Diffusion Probabilistic
8 | Models"](https://arxiv.org/pdf/2102.09672).
9 |
10 | OpenAI open-sourced their reference implementations [here](https://github.com/openai/guided-diffusion). The diffusion
11 | model that DLAS trains uses the [gaussian_diffusion.py](https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/gaussian_diffusion.py)
12 | script from that repo for training and inference with these models. We also include the UNet from that repo as a model
13 | that can be used to train a diffusion network.
14 |
15 | Diffusion networks can be re-purposed for pretty much any image generation task, including super-resolution. Even though
16 | they are trained with MSE losses, they produce incredibly crisp images with FID scores competitive with the best GANs.
17 | More importantly, it is easy to track training progress since diffusion networks use a "normal" loss.
18 |
19 | Diffusion networks are unique in that during inference, they perform multiple forward passes to generate a single image.
20 | During training, these networks are trained to denoise images over 4000 steps. At inference, this sample rate can be
21 | adjusted. For the purposes of super-resolution, I have found images sampled in 50 steps to be of very good quality.
22 | This still means that a diffusion generator is 50x slower than generators trained in different ways.
23 |
24 | What's more, I have found that diffusion networks can be trained with the tiled methodology used by ESRGAN: instead
25 | of training on whole images, you can train on tiles of larger images. At inference time, the network can be applied to
26 | larger images than the network was initially trained on. I have found this works well on inference images within ~3x
27 | the training size. I have not tried larger, because the size of the UNet model means that inference at ultra-high
28 | resolutions is impossible (I run out of GPU memory).
29 |
30 | I have provided a reference configuration for training a diffusion model in this manner. The config performs a 2x
31 | upsampling to 256px, de-blurs it and removes JPEG artifacts. The deblurring and image repairs are done on a configurable
32 | scale. The scale is a value in [0,1] passed to the model as `corruption_entropy`. `1` represents a maximum correction factor.
33 | You can try reducing this to 128px for faster training. It should work fine.
34 |
35 | Diffusion models also have a fairly arcane inference method. To help you along, I've provided an inference configuration
36 | that can be used with models trained in DLAS.
--------------------------------------------------------------------------------
/recipes/diffusion/test_diffusion_unet.yml:
--------------------------------------------------------------------------------
1 | #### general settings
2 | name: test_diffusion_unet
3 | use_tb_logger: true
4 | model: extensibletrainer
5 | scale: 1
6 | gpu_ids: [0]
7 | start_step: -1
8 | checkpointing_enabled: true
9 | fp16: false
10 | wandb: false
11 |
12 | datasets:
13 | train:
14 | name: my_inference_images
15 | n_workers: 0
16 | batch_size: 1
17 | mode: imagefolder
18 | rgb_n1_to_1: true
19 | disable_flip: true
20 | force_square: false
21 | paths:
22 | scale: 1
23 | skip_lq: true
24 | fixed_parameters:
25 | # Specify correction factors here. For networks trained with the paired training configuration, the first number
26 | # is a JPEG correction factor, and the second number is a deblurring factor. Testing shows that if you attempt to
27 | # deblur too far, you get extremely distorted images. It's actually pretty cool - the network clearly knows how
28 | # much deblurring is appropriate.
29 | corruption_entropy: [.2, .5]
30 |
31 | networks:
32 | generator:
33 | type: generator
34 | which_model_G: unet_diffusion
35 | args:
36 | image_size: 256
37 | in_channels: 3
38 | num_corruptions: 2
39 | model_channels: 192
40 | out_channels: 6
41 | num_res_blocks: 2
42 | attention_resolutions: [8,16]
43 | dropout: 0
44 | channel_mult: [1,1,2,2,4,4]
45 | num_heads: 4
46 | num_heads_upsample: -1
47 | use_scale_shift_norm: true
48 |
49 | #### path
50 | path:
51 | pretrain_model_generator:
52 | strict_load: true
53 |
54 | steps:
55 | generator:
56 | training: generator
57 | injectors:
58 | visual_debug:
59 | type: gaussian_diffusion_inference
60 | generator: generator
61 | output_batch_size: 1
62 | output_scale_factor: 2
63 | respaced_timestep_spacing: 50 # This can be tweaked to perform inference faster or slower. 50-200 seems to be the sweet spot. At 4000 steps, the quality is actually worse often.
64 | undo_n1_to_1: true
65 | beta_schedule:
66 | schedule_name: linear
67 | num_diffusion_timesteps: 4000
68 | diffusion_args:
69 | model_mean_type: epsilon
70 | model_var_type: learned_range
71 | loss_type: mse
72 | model_input_keys:
73 | low_res: hq
74 | corruption_factor: corruption_entropy
75 | out: sample
76 |
77 | eval:
78 | output_state: sample
--------------------------------------------------------------------------------
/recipes/esrgan/rrdb_process_video.yml:
--------------------------------------------------------------------------------
1 | name: video_process
2 | suffix: ~ # add suffix to saved images
3 | model: extensibletrainer
4 | scale: 4
5 | gpu_ids: [0]
6 | fp16: true
7 | minivid_crf: 12 # Defines the 'crf' output video quality parameter fed to FFMPEG
8 | frames_per_mini_vid: 360 # How many frames to process before generating a small video segment. Used to reduce number of images you must store to convert an entire video.
9 | minivid_start_no: 360
10 | recurrent_mode: false
11 |
12 | dataset:
13 | n_workers: 1
14 | name: myvideo
15 | video_file: # <-- Path to your video file here. any format supported by ffmpeg works.
16 | frame_rate: 30 # Set to the frame rate of your video.
17 | start_at_seconds: 0 # Set this if you want to start somewhere other than the beginning of the video.
18 | end_at_seconds: 5000 # Set to the time you want to stop at.
19 | batch_size: 1 # Set to the number of frames to convert at once. Larger batches provide a modest performance increase.
20 | vertical_splits: 1 # Used for 3d binocular videos. Leave at 1.
21 | force_multiple: 1
22 |
23 | #### network structures
24 | networks:
25 | generator:
26 | type: generator
27 | which_model_G: RRDBNet
28 | in_nc: 3
29 | out_nc: 3
30 | initial_stride: 1
31 | nf: 64
32 | nb: 23
33 | scale: 4
34 | blocks_per_checkpoint: 3
35 |
36 | #### path
37 | path:
38 | pretrain_model_generator: # <-- Set your generator path here.
39 |
40 | steps:
41 | generator:
42 | training: generator
43 | generator: generator
44 |
45 | # Optimizer params. Not used, but currently required to initialize ExtensibleTrainer, even in eval mode.
46 | lr: !!float 5e-6
47 | weight_decay: 0
48 | beta1: 0.9
49 | beta2: 0.99
50 |
51 | injectors:
52 | gen_inj:
53 | type: generator
54 | generator: generator
55 | in: lq
56 | out: gen
57 |
58 | # Train section is required, even though we are just evaluating.
59 | train:
60 | niter: 500000
61 | warmup_iter: -1
62 | mega_batch_factor: 1
63 | val_freq: 500
64 | default_lr_scheme: MultiStepLR
65 | gen_lr_steps: [20000, 40000, 80000, 100000, 140000, 180000]
66 | lr_gamma: 0.5
67 |
68 | eval:
69 | output_state: gen
--------------------------------------------------------------------------------
/recipes/glean/README.md:
--------------------------------------------------------------------------------
1 | # GLEAN
2 |
3 | DLAS contains an attempt at implementing [GLEAN](https://ckkelvinchan.github.io/papers/glean.pdf), which performs image
4 | super-resolution guided by pretrained StyleGAN networks. Since the official implementation is currently closed-source, it was
5 | implemented entirely from what information I could glean from the paper.
6 |
7 | ## Training
8 |
9 | GLEAN requires a pre-trained StyleGAN network to operate. DLAS currently only has support for StyleGAN2 models, so
10 | you will need to use one of those. The pre-eminent StyleGAN 2 model is the one trained on FFHQ faces, so I will use
11 | that in this training example.
12 |
13 | 1. Download the ffhq model from [nVidia's Drive](https://drive.google.com/drive/folders/1yanUI9m4b4PWzR0eurKNq6JR1Bbfbh6L).
14 | This repo currently only supports the "-f.pkl" files without further modifications, so choose one of those.
15 | 1. Download and extract the [FFHQ dataset](https://github.com/NVlabs/ffhq-dataset).
16 | 1. Convert the TF model to a Pytorch one supported by DLAS:
17 |
18 | `python scripts/stylegan2/convert_weights_rosinality.py stylegan2-ffhq-config-f.pkl`
19 |
20 | 1. The above conversion script outputs a *.pth file as well as JPG preview of model outputs. Check the JPG to ensure
21 | the StyleGAN is performing as expected. If so, copy the *.pth file to your experiments/ directory within DLAS.
22 | 1. Edit the provided trainer configuration. Find comments starting with '<--' and make changes as indicated.
23 | 1. Train the model:
24 |
25 | `python train.py -opt train_ffhq_glean.yml`
--------------------------------------------------------------------------------
/recipes/segformer/train_byol_segformer.yml:
--------------------------------------------------------------------------------
1 | #### general settings
2 | name: train_byol_segformer
3 | use_tb_logger: true
4 | model: extensibletrainer
5 | distortion: sr
6 | scale: 1
7 | gpu_ids: [0]
8 | fp16: false
9 | start_step: -1
10 | checkpointing_enabled: false
11 | wandb: false
12 |
13 | datasets:
14 | train:
15 | n_workers: 1
16 | batch_size: 96
17 | mode: byol_dataset
18 | crop_size: 224
19 | key1: hq
20 | key2: hq
21 | dataset:
22 | mode: imagefolder
23 | paths: <>
24 | target_size: 224
25 | scale: 1
26 | fetch_alt_image: false
27 | skip_lq: true
28 | normalize: imagenet
29 |
30 | networks:
31 | generator:
32 | type: generator
33 | which_model_G: pixel_local_byol
34 | image_size: 224
35 | hidden_layer: tail
36 | subnet:
37 | which_model_G: segformer
38 |
39 | #### path
40 | path:
41 | strict_load: true
42 | #resume_state: <>
43 |
44 | steps:
45 | generator:
46 | training: generator
47 | optimizer: lars
48 | optimizer_params:
49 | # All parameters from appendix J of BYOL.
50 |         lr: .08 # From BYOL: LR = .2 * batch_size / 256
51 | weight_decay: !!float 1.5e-6
52 | lars_coefficient: .001
53 | momentum: .9
54 |
55 | injectors:
56 | gen_inj:
57 | type: generator
58 | generator: generator
59 | in: aug1
60 | out: loss
61 |
62 | losses:
63 | byol_loss:
64 | type: direct
65 | key: loss
66 | weight: 1
67 |
68 | train:
69 | warmup_iter: -1
70 | mega_batch_factor: 2
71 | val_freq: 1000
72 | niter: 300000
73 |
74 | # Default LR scheduler options
75 | default_lr_scheme: CosineAnnealingLR_Restart
76 | T_period: [120000, 120000, 120000]
77 | warmup: 10000
78 | eta_min: .01 # Unspecified by the paper..
79 | restarts: [140000, 280000] # Paper says no re-starts, but this scheduler will add them automatically if we don't set them.
80 | # likely I won't train this far.
81 | restart_weights: [.5, .25]
82 |
83 |
84 | eval:
85 | output_state: loss
86 | evaluators:
87 | single_point_pair_contrastive_eval:
88 | for: generator
89 | type: single_point_pair_contrastive_eval
90 | batch_size: 16
91 | quantity: 96
92 | similar_set_args:
93 | path: <>
94 | size: 256
95 | dissimilar_set_args:
96 | path: <>
97 | size: 256
98 |
99 | logger:
100 | print_freq: 30
101 | save_checkpoint_freq: 1000
102 | visuals: [hq, aug1]
103 | visual_debug_rate: 100
--------------------------------------------------------------------------------
/recipes/srflow/convert_official_weights.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | # Quick script that can be used to convert from pretrained SRFlow weights to the variants used in this repo. The only
4 | # difference between the two is the variable naming conventions used by the RRDBNet. (FWIW this repo is using the
5 | # more up-to-date names that conform to Python standards).
6 |
7 | official_weight_file = 'SRFlow_CelebA_8X.pth'
8 | output = 'CelebA_converted.pth'
9 |
10 | sd = torch.load(official_weight_file)
11 | sdp = {}
12 | for k,v in sd.items():
13 | k = k.replace('RRDB.RRDB_trunk', 'RRDB.body')
14 | k = k.replace('.RDB', '.rdb')
15 | k = k.replace('trunk_conv.', 'conv_body.')
16 | k = k.replace('.upconv', '.conv_up')
17 | k = k.replace('.HRconv', '.conv_hr')
18 | sdp[k] = v
19 | torch.save(sdp, output)
20 |
--------------------------------------------------------------------------------
/recipes/srflow/train_div2k_rrdb_psnr.yml:
--------------------------------------------------------------------------------
1 | #### general settings
2 | name: train_div2k_rrdb_psnr
3 | use_tb_logger: true
4 | model: extensibletrainer
5 | distortion: sr
6 | scale: 2
7 | gpu_ids: [0]
8 | fp16: false
9 | start_step: 0
10 | checkpointing_enabled: true # <-- Highly recommended for single-GPU training. Will not work with DDP.
11 | wandb: false
12 |
13 | datasets:
14 | train:
15 | n_workers: 4
16 | batch_size: 32
17 | name: div2k
18 | mode: single_image_extensible
19 | paths: /content/div2k # <-- Put your path here.
20 | target_size: 128
21 | force_multiple: 1
22 | scale: 4
23 | eval: False
24 | num_corrupts_per_image: 0
25 | strict: false
26 | val:
27 | name: val
28 | mode: fullimage
29 | dataroot_GT: /content/set14
30 | scale: 4
31 | force_multiple: 16
32 |
33 | networks:
34 | generator:
35 | type: generator
36 | which_model_G: RRDBNet
37 | in_nc: 3
38 | out_nc: 3
39 | nf: 64
40 | nb: 23
41 | scale: 4
42 | blocks_per_checkpoint: 3
43 |
44 | #### path
45 | path:
46 | #pretrain_model_generator:
47 | strict_load: true
48 | #resume_state: ../experiments/train_div2k_rrdb_psnr/training_state/0.state # <-- Set this to resume from a previous training state.
49 |
50 | steps:
51 | generator:
52 | training: generator
53 |
54 | optimizer_params:
55 | # Optimizer params
56 | lr: !!float 2e-4
57 | weight_decay: 0
58 | beta1: 0.9
59 | beta2: 0.99
60 |
61 | injectors:
62 | gen_inj:
63 | type: generator
64 | generator: generator
65 | in: lq
66 | out: gen
67 |
68 | losses:
69 | pix:
70 | type: pix
71 | weight: 1
72 | criterion: l1
73 | real: hq
74 | fake: gen
75 |
76 | train:
77 | niter: 500000
78 | warmup_iter: -1
79 | mega_batch_factor: 1 # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
80 | val_freq: 2000
81 |
82 | # Default LR scheduler options
83 | default_lr_scheme: MultiStepLR
84 | gen_lr_steps: [50000, 100000, 150000, 200000]
85 | lr_gamma: 0.5
86 |
87 | eval:
88 | output_state: gen
89 |
90 | logger:
91 | print_freq: 30
92 | save_checkpoint_freq: 1000
93 | visuals: [gen, hq, lq]
94 | visual_debug_rate: 100
--------------------------------------------------------------------------------
/recipes/stylegan/README.md:
--------------------------------------------------------------------------------
1 | # StyleGAN Implementations
2 | DLAS supports two different StyleGAN2 implementations:
3 |
4 | - [@rosinality implementation](https://github.com/rosinality/stylegan2-pytorch/commits/master)
5 |   Designed to reach parity with the nVidia reference implementation in TF 1.15
6 | - [@lucidrains implementation](https://github.com/lucidrains/stylegan2-pytorch)
7 | Designed with simplicity and readability in mind.
8 |
9 | I prefer the readability of @lucidrains implementation, but you cannot (yet) use pretrained weights
10 | with it. I'm working on that.
11 |
--------------------------------------------------------------------------------
/recipes/tacotron2/test_tacotron2_lj.yml:
--------------------------------------------------------------------------------
1 | #### general settings
2 | name: test_tacotron2_lj
3 | use_tb_logger: true
4 | gpu_ids: [0]
5 | start_step: -1
6 | fp16: false
7 | checkpointing_enabled: true
8 | wandb: false
9 |
10 | datasets:
11 | train:
12 | name: lj
13 | n_workers: 0
14 | batch_size: 1
15 | mode: nv_tacotron
16 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_train_filelist.txt
17 |
18 | networks:
19 | mel_gen:
20 | type: generator
21 | which_model_G: nv_tacotron2
22 | args:
23 | encoder_kernel_size: 5
24 | encoder_n_convolutions: 3
25 | encoder_embedding_dim: 512
26 | decoder_rnn_dim: 1024
27 | prenet_dim: 256
28 | max_decoder_steps: 1000
29 | attention_rnn_dim: 1024
30 | attention_dim: 128
31 | attention_location_n_filters: 32
32 | attention_location_kernel_size: 31
33 | postnet_embedding_dim: 512
34 | postnet_kernel_size: 5
35 | postnet_n_convolutions: 5
36 | waveglow:
37 | type: generator
38 | which_model_G: nv_waveglow
39 | args:
40 | n_mel_channels: 80
41 | n_flows: 12
42 | n_group: 8
43 | n_early_every: 4
44 | n_early_size: 2
45 | WN_config:
46 | n_layers: 8
47 | n_channels: 256
48 | kernel_size: 3
49 |
50 | #### path
51 | path:
52 | pretrain_model_mel_gen: ../experiments/train_tacotron2_lj/models/22000_mel_gen_ema.pth
53 | pretrain_model_waveglow: ../experiments/waveglow_256channels_universal_v5.pth
54 | strict_load: true
55 | #resume_state: ../experiments/train_imgset_unet_diffusion/training_state/54000.state
56 |
57 | steps:
58 | generator:
59 | training: mel_gen
60 | injectors:
61 | mel:
62 | type: generator
63 | generator: mel_gen
64 | in: [padded_text, input_lengths, padded_mel, output_lengths]
65 | out: [mel_outputs, mel_outputs_postnet, gate_outputs, alignments]
66 | wave:
67 | type: generator
68 | generator: waveglow
69 | method: infer
70 | in: mel_outputs
71 | out: waveform
72 |
73 | eval:
74 | output_state: waveform
--------------------------------------------------------------------------------
/recipes/tacotron2/train_tacotron2_lj.yml:
--------------------------------------------------------------------------------
1 | #### general settings
2 | name: train_tacotron2_lj
3 | use_tb_logger: true
4 | gpu_ids: [0]
5 | start_step: -1
6 | fp16: false
7 | checkpointing_enabled: true
8 | wandb: false
9 |
10 | datasets:
11 | train:
12 | name: lj
13 | n_workers: 1
14 | batch_size: 72
15 | mode: nv_tacotron
16 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_train_filelist.txt
17 |
18 | networks:
19 | mel_gen:
20 | type: generator
21 | which_model_G: nv_tacotron2
22 | args:
23 | encoder_kernel_size: 5
24 | encoder_n_convolutions: 3
25 | encoder_embedding_dim: 512
26 | decoder_rnn_dim: 1024
27 | prenet_dim: 256
28 | max_decoder_steps: 1000
29 | attention_rnn_dim: 1024
30 | attention_dim: 128
31 | attention_location_n_filters: 32
32 | attention_location_kernel_size: 31
33 | postnet_embedding_dim: 512
34 | postnet_kernel_size: 5
35 | postnet_n_convolutions: 5
36 |
37 | #### path
38 | path:
39 | #pretrain_model_generator: ../experiments/diffusion_unet_128_imageset_22000.pt
40 | strict_load: true
41 | #resume_state: ../experiments/train_imgset_unet_diffusion/training_state/54000.state
42 |
43 | steps:
44 | generator:
45 | training: mel_gen
46 |
47 | optimizer: adamw
48 | optimizer_params:
49 | lr: !!float 1.2e-3
50 | weight_decay: !!float 1e-6
51 | beta1: 0.9
52 | beta2: 0.9999
53 | clip_grad_eps: 1.0
54 |
55 | injectors:
56 | mel:
57 | type: generator
58 | generator: mel_gen
59 | in: [padded_text, input_lengths, padded_mel, output_lengths]
60 | out: [mel_outputs, mel_outputs_postnet, gate_outputs, alignments]
61 | losses:
62 | tacotron_loss:
63 | type: nv_tacotron2_loss
64 | weight: 1
65 | mel_target_key: padded_mel
66 | mel_output_key: mel_outputs
67 | mel_output_postnet_key: mel_outputs_postnet
68 | gate_target_key: padded_gate
69 | gate_output_key: gate_outputs
70 |
71 | train:
72 | niter: 500000
73 | warmup_iter: -1
74 | mega_batch_factor: 3
75 | ema_rate: .999
76 | val_freq: 500
77 |
78 | default_lr_scheme: MultiStepLR
79 | gen_lr_steps: [ 50000, 100000, 150000 ]
80 | lr_gamma: 0.5
81 |
82 | eval:
83 | evaluators:
84 | val:
85 | type: mel
86 | for: mel_gen
87 | batch_size: 16
88 | dataset:
89 | mode: nv_tacotron
90 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_val_filelist.txt
91 |
92 |
93 | logger:
94 | print_freq: 30
95 | save_checkpoint_freq: 500
96 | visuals: [mel_outputs, padded_mel]
97 | is_mel_spectrogram: true
98 | visual_debug_rate: 100
--------------------------------------------------------------------------------
/recipes/vqvae2/README.md:
--------------------------------------------------------------------------------
1 | # VQVAE2 in Pytorch
2 |
3 | [VQVAE2](https://arxiv.org/pdf/1906.00446.pdf) is a generative autoencoder developed by DeepMind. Its unique innovation is
4 | discretizing the latent space into a fixed set of "codebook" vectors. This codebook
5 | can then be used in downstream tasks to rebuild images from the training set.
6 |
7 | This model is in DLAS thanks to work [@rosinality](https://github.com/rosinality) did
8 | [converting the Deepmind model](https://github.com/rosinality/vq-vae-2-pytorch) to Pytorch.
9 |
10 | # Training VQVAE2
11 |
12 | VQVAE2 is trained in two steps:
13 |
14 | ## Training the autoencoder
15 |
16 | The first step is to train the autoencoder itself. The provided config file `train_imgnet_vqvae_stage1.yml` shows how to do this
17 | for imagenet with the hyperparameters specified by DeepMind. You'll need to bring your own imagenet folder for this.
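
Assuming the same launch pattern used by the other recipes in this repository, that stage is started with:

`python train.py -opt train_imgnet_vqvae_stage1.yml`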
18 |
19 | ## Training the PixelCNN encoder
20 |
21 | The second step is to train the PixelCNN model which will create "codebook" vectors given an
22 | input image.
--------------------------------------------------------------------------------
/resources/bitsandbytes_windows/cextension.py:
--------------------------------------------------------------------------------
1 | import ctypes as ct
2 | from pathlib import Path
3 | from warnings import warn
4 |
5 | from .cuda_setup.main import evaluate_cuda_setup
6 |
7 |
8 | class CUDALibrary_Singleton(object):
9 | _instance = None
10 |
11 | def __init__(self):
12 | raise RuntimeError("Call get_instance() instead")
13 |
14 | def initialize(self):
15 | binary_name = evaluate_cuda_setup()
16 | package_dir = Path(__file__).parent
17 | binary_path = package_dir / binary_name
18 |
19 | if not binary_path.exists():
20 | print(f"CUDA SETUP: TODO: compile library for specific version: {binary_name}")
21 | legacy_binary_name = "libbitsandbytes.so"
22 | print(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
23 | binary_path = package_dir / legacy_binary_name
24 | if not binary_path.exists():
25 | print('CUDA SETUP: CUDA detection failed. Either CUDA driver not installed, CUDA not installed, or you have multiple conflicting CUDA libraries!')
26 | print('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.')
27 | raise Exception('CUDA SETUP: Setup Failed!')
28 | # self.lib = ct.cdll.LoadLibrary(binary_path)
29 | self.lib = ct.cdll.LoadLibrary(str(binary_path)) # $$$
30 | else:
31 | print(f"CUDA SETUP: Loading binary {binary_path}...")
32 | # self.lib = ct.cdll.LoadLibrary(binary_path)
33 | self.lib = ct.cdll.LoadLibrary(str(binary_path)) # $$$
34 |
35 | @classmethod
36 | def get_instance(cls):
37 | if cls._instance is None:
38 | cls._instance = cls.__new__(cls)
39 | cls._instance.initialize()
40 | return cls._instance
41 |
42 |
43 | lib = CUDALibrary_Singleton.get_instance().lib
44 | try:
45 | lib.cadam32bit_g32
46 | lib.get_context.restype = ct.c_void_p
47 | lib.get_cusparse.restype = ct.c_void_p
48 | COMPILED_WITH_CUDA = True
49 | except AttributeError:
50 | warn(
51 | "The installed version of bitsandbytes was compiled without GPU support. "
52 | "8-bit optimizers and GPU quantization are unavailable."
53 | )
54 | COMPILED_WITH_CUDA = False
55 |
--------------------------------------------------------------------------------
/resources/bitsandbytes_windows/libbitsandbytes_cpu.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/resources/bitsandbytes_windows/libbitsandbytes_cpu.dll
--------------------------------------------------------------------------------
/resources/bitsandbytes_windows/libbitsandbytes_cuda116.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/resources/bitsandbytes_windows/libbitsandbytes_cuda116.dll
--------------------------------------------------------------------------------
/sandbox.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchvision
3 | from PIL import Image
4 | from pytorch_wavelets import DWTForward, DWTInverse
5 | import torch.nn.functional as F
6 |
7 | def load_img(path):
8 | im = Image.open(path).convert(mode="RGB")
9 | return torchvision.transforms.ToTensor()(im)
10 |
11 | def save_img(t, path):
12 | torchvision.utils.save_image(t, path)
13 |
14 | img = load_img("pu.jpg")
15 | img = img.unsqueeze(0)
16 |
17 | # Reshape image to be multiple of 32
18 | w, h = img.shape[2:]
19 | w = (w // 32) * 32
20 | h = (h // 32) * 32
21 | img = F.interpolate(img, size=(w, h))
22 | print("Input shape:", img.shape)
23 |
24 | J_spec = 5
25 |
26 | Yl, Yh = DWTForward(J=J_spec, mode='periodization', wave='db3')(img)
27 | print(Yl.shape, [h.shape for h in Yh])
28 |
29 | imgLR = F.interpolate(img, scale_factor=.5)
30 | LQYl, LQYh = DWTForward(J=J_spec-1, mode='periodization', wave='db3')(imgLR)
31 | print(LQYl.shape, [h.shape for h in LQYh])
32 |
33 | for i in range(J_spec):
34 | smd = torch.sum(Yh[i], dim=2).cpu()
35 | save_img(smd, "high_%i.png" % (i,))
36 | save_img(Yl, "lo.png")
37 |
38 | '''
39 | Following code reconstructs the image with different high passes cancelled out.
40 | '''
41 | for i in range(J_spec):
42 | corrupted_im = [y for y in Yh]
43 | corrupted_im[i] = torch.zeros_like(corrupted_im[i])
44 | im = DWTInverse(mode='periodization', wave='db3')((Yl, corrupted_im))
45 | save_img(im, "corrupt_%i.png" % (i,))
46 | im = DWTInverse(mode='periodization', wave='db3')((torch.full_like(Yl, fill_value=torch.mean(Yl)), Yh))
47 | save_img(im, "corrupt_im.png")
48 |
49 |
50 | '''
51 | Following code reconstructs a hybrid image with the first high pass from the HR and the rest of the data from the LR.
52 | highpass = [Yh[0]] + LQYh
53 | im = DWTInverse(mode='periodization', wave='db3')((LQYl, highpass))
54 | save_img(im, "hybrid_lrhr.png")
55 | save_img(F.interpolate(imgLR, scale_factor=2), "upscaled.png")
56 | '''
--------------------------------------------------------------------------------
/static/drive_copied_file_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/drive_copied_file_tree.png
--------------------------------------------------------------------------------
/static/export_to_gdrive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/export_to_gdrive.png
--------------------------------------------------------------------------------
/static/file_directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/file_directory.png
--------------------------------------------------------------------------------
/static/good_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/good_gpu.png
--------------------------------------------------------------------------------
/static/hyperparam_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/hyperparam_dataset.png
--------------------------------------------------------------------------------
/static/ljspeech.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/ljspeech.png
--------------------------------------------------------------------------------
/static/notebook_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/notebook_header.png
--------------------------------------------------------------------------------
/static/params.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/params.png
--------------------------------------------------------------------------------
/static/runtime_type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/runtime_type.png
--------------------------------------------------------------------------------
/static/settings_options.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/settings_options.png
--------------------------------------------------------------------------------
/static/stop_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/stop_training.png
--------------------------------------------------------------------------------
/static/training_button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/training_button.png
--------------------------------------------------------------------------------
/static/very_long_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/very_long_process.png
--------------------------------------------------------------------------------
/static/very_recent_save.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/very_recent_save.png
--------------------------------------------------------------------------------
/static/warning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/warning.png
--------------------------------------------------------------------------------
/static/yml_file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/yml_file.png
--------------------------------------------------------------------------------
/voice_samples/kk_500/kk_0_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_0.wav
--------------------------------------------------------------------------------
/voice_samples/kk_500/kk_0_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_1.wav
--------------------------------------------------------------------------------
/voice_samples/kk_500/kk_0_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_2.wav
--------------------------------------------------------------------------------
/voice_samples/kk_500_emma/emma_0_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_0.wav
--------------------------------------------------------------------------------
/voice_samples/kk_500_emma/emma_0_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_1.wav
--------------------------------------------------------------------------------
/voice_samples/kk_500_emma/emma_0_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_2.wav
--------------------------------------------------------------------------------
/voice_samples/kk_orig/kk_0_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_0.wav
--------------------------------------------------------------------------------
/voice_samples/kk_orig/kk_0_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_1.wav
--------------------------------------------------------------------------------
/voice_samples/kk_orig/kk_0_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_2.wav
--------------------------------------------------------------------------------