├── .flake8 ├── .gitignore ├── .gitmodules ├── COLAB_USAGE.md ├── LICENSE ├── LICENSE.old ├── README.md ├── Setup DLAS.bat ├── Start DLAS.cmd ├── Start Training Monitor.cmd ├── codes ├── NOTES.md ├── configuration_gui.py ├── data │ ├── README.md │ ├── __init__.py │ ├── audio │ │ ├── audio_with_noise_dataset.py │ │ ├── fast_paired_dataset.py │ │ ├── fast_paired_dataset_with_phonemes.py │ │ ├── gpt_tts_dataset.py │ │ ├── gpt_tts_tokenizer.json │ │ ├── grand_conjoined_dataset.py │ │ ├── nv_tacotron_dataset.py │ │ ├── paired_voice_audio_dataset.py │ │ ├── preprocessed_mel_dataset.py │ │ ├── unsupervised_audio_dataset.py │ │ ├── voice_tokenizer.py │ │ └── wav_aug.py │ ├── combined_dataset.py │ ├── data_sampler.py │ ├── images │ │ ├── __init__.py │ │ ├── base_unsupervised_image_dataset.py │ │ ├── byol_attachment.py │ │ ├── chunk_with_reference.py │ │ ├── cifar.py │ │ ├── full_image_dataset.py │ │ ├── image_corruptor.py │ │ ├── image_folder_dataset.py │ │ ├── image_label_parser.py │ │ ├── image_pair_with_corresponding_points_dataset.py │ │ ├── multi_frame_dataset.py │ │ ├── multiscale_dataset.py │ │ ├── paired_frame_dataset.py │ │ ├── random_dataset.py │ │ ├── single_image_dataset.py │ │ ├── stylegan2_dataset.py │ │ └── zip_file_dataset.py │ ├── text │ │ └── hf_datasets_wrapper.py │ ├── torch_dataset.py │ ├── util.py │ └── zero_pad_dict_collate.py ├── maybe_bnb.py ├── models │ ├── __init__.py │ ├── arch_util.py │ ├── audio │ │ ├── __init__.py │ │ ├── asr │ │ │ ├── __init__.py │ │ │ └── w2v_wrapper.py │ │ ├── audio_resnet.py │ │ ├── mel2vec.py │ │ ├── music │ │ │ ├── __init__.py │ │ │ ├── cheater_gen_ar.py │ │ │ ├── diffwave.py │ │ │ ├── encoders.py │ │ │ ├── flat_diffusion.py │ │ │ ├── gpt_music.py │ │ │ ├── gpt_music2.py │ │ │ ├── instrument_quantizer.py │ │ │ ├── m2v_code_to_mel.py │ │ │ ├── mel2vec_codes_gpt.py │ │ │ ├── music_quantizer.py │ │ │ ├── music_quantizer2.py │ │ │ ├── tfdpc_v5.py │ │ │ ├── transformer_diffusion12.py │ │ │ ├── transformer_diffusion13.py │ │ │ ├── transformer_diffusion14.py │ │ │ ├── unet_diffusion_music_codes.py │ │ │ ├── unet_diffusion_waveform_gen.py │ │ │ ├── unet_diffusion_waveform_gen3.py │ │ │ └── unet_diffusion_waveform_gen_simple.py │ │ ├── tts │ │ │ ├── __init__.py │ │ │ ├── autoregressive_codegen.py │ │ │ ├── autoregressive_codegen2.py │ │ │ ├── ctc_code_generator.py │ │ │ ├── diffusion_encoder.py │ │ │ ├── lucidrains_dvae.py │ │ │ ├── mini_encoder.py │ │ │ ├── random_latent_converter.py │ │ │ ├── tacotron2 │ │ │ │ ├── LICENSE │ │ │ │ ├── __init__.py │ │ │ │ ├── audio_processing.py │ │ │ │ ├── hparams.py │ │ │ │ ├── layers.py │ │ │ │ ├── loss.py │ │ │ │ ├── stft.py │ │ │ │ ├── taco_utils.py │ │ │ │ ├── tacotron2.py │ │ │ │ ├── text │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cleaners.py │ │ │ │ │ ├── cmudict.py │ │ │ │ │ ├── numbers.py │ │ │ │ │ └── symbols.py │ │ │ │ └── wave_tacotron.py │ │ │ ├── transformer_builders.py │ │ │ ├── transformer_diffusion_tts.py │ │ │ ├── transformer_diffusion_tts2.py │ │ │ ├── unet_diffusion_tts7.py │ │ │ ├── unet_diffusion_tts9.py │ │ │ ├── unet_diffusion_tts_flat.py │ │ │ ├── unet_diffusion_vocoder.py │ │ │ ├── unet_diffusion_vocoder_with_ref.py │ │ │ ├── unified_voice2.py │ │ │ ├── unified_voice3.py │ │ │ ├── unified_voice4.py │ │ │ ├── voice_voice_clip.py │ │ │ └── w2v_matcher.py │ │ └── vocoders │ │ │ ├── __init__.py │ │ │ ├── univnet │ │ │ ├── __init__.py │ │ │ ├── generator.py │ │ │ └── lvcnet.py │ │ │ └── waveglow │ │ │ ├── __init__.py │ │ │ ├── denoiser.py │ │ │ └── waveglow.py │ ├── classifiers │ 
│ ├── __init__.py │ │ ├── cifar_resnet.py │ │ ├── resnet_with_checkpointing.py │ │ ├── torch_models.py │ │ ├── twin_cifar_resnet.py │ │ ├── weighted_conv_resnet.py │ │ └── wide_kernel_vgg.py │ ├── clip │ │ ├── __init__.py │ │ ├── clip.py │ │ ├── clvp.py │ │ ├── contrastive_audio.py │ │ ├── cvvp.py │ │ ├── mel_text_clip.py │ │ ├── text_cond_clip.py │ │ └── text_voice_clip.py │ ├── composable │ │ ├── README.md │ │ └── __init__.py │ ├── diffusion │ │ ├── __init__.py │ │ ├── fp16_util.py │ │ ├── gaussian_diffusion.py │ │ ├── losses.py │ │ ├── nn.py │ │ ├── resample.py │ │ ├── respace.py │ │ ├── rrdb_diffusion.py │ │ ├── unet_diffusion.py │ │ └── unet_latent_guide.py │ ├── image_generation │ │ ├── RRDBNet_arch.py │ │ ├── ResGen_arch.py │ │ ├── __init__.py │ │ ├── discriminator_vgg_arch.py │ │ ├── glean │ │ │ ├── __init__.py │ │ │ ├── glean.py │ │ │ └── stylegan2_latent_bank.py │ │ ├── srflow │ │ │ ├── FlowActNorms.py │ │ │ ├── FlowAffineCouplingsAblation.py │ │ │ ├── FlowStep.py │ │ │ ├── FlowUpsamplerNet.py │ │ │ ├── Permutations.py │ │ │ ├── RRDBNet_arch.py │ │ │ ├── SRFlowNet_arch.py │ │ │ ├── Split.py │ │ │ ├── __init__.py │ │ │ ├── flow.py │ │ │ ├── glow_arch.py │ │ │ ├── module_util.py │ │ │ └── thops.py │ │ └── stylegan │ │ │ ├── Discriminator_StyleGAN.py │ │ │ ├── __init__.py │ │ │ ├── stylegan2_lucidrains.py │ │ │ └── stylegan2_rosinality.py │ ├── image_latents │ │ ├── __init__.py │ │ ├── byol │ │ │ ├── __init__.py │ │ │ ├── byol_model_wrapper.py │ │ │ └── byol_structural.py │ │ ├── fixup_resnet │ │ │ ├── DiscriminatorResnet_arch.py │ │ │ └── __init__.py │ │ ├── spinenet_arch.py │ │ └── vit_latent.py │ ├── lucidrains │ │ ├── dalle │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── reversible.py │ │ │ └── transformer.py │ │ ├── performer │ │ │ ├── __init__.py │ │ │ ├── autoregressive_wrapper.py │ │ │ ├── performer_enc_dec.py │ │ │ ├── performer_pytorch.py │ │ │ └── reversible.py │ │ ├── vq.py │ │ └── x_transformers.py │ ├── optical_flow │ │ └── PWCNet.py │ └── vqvae │ │ ├── __init__.py │ │ ├── dvae.py │ │ ├── gumbel_quantizer.py │ │ ├── scaled_weight_conv.py │ │ ├── vector_quantizer.py │ │ └── vqvae.py ├── multi_modal_train.py ├── process_video.py ├── requirements.laxed.txt ├── requirements.txt ├── requirements_frozen_only_use_if_something_broken.txt ├── scripts │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── gen │ │ │ ├── __init__.py │ │ │ ├── ctc_codes.py │ │ │ ├── music_joiner.py │ │ │ ├── speech_synthesis_utils.py │ │ │ ├── use_diffuse_tts.py │ │ │ ├── use_diffuse_voice_translation.py │ │ │ ├── use_discrete_vocoder.py │ │ │ ├── use_discrete_vocoder_one_way.py │ │ │ ├── use_gpt_tts.py │ │ │ ├── use_mel2vec_codes.py │ │ │ └── w2v_patcher.py │ │ ├── gen_mel.py │ │ ├── mel_bin_norm_compute.py │ │ ├── play_with_spectral_representations.py │ │ ├── prep_music │ │ │ ├── demucs_notes.txt │ │ │ ├── generate_long_cheaters.py │ │ │ ├── generate_long_mels.py │ │ │ └── phase_1_split_files.py │ │ ├── preparation │ │ │ ├── __init__.py │ │ │ ├── combine_phonetic_and_text.py │ │ │ ├── filter_clips_with_no_hifreq_data.py │ │ │ ├── gen_dvae_codes.py │ │ │ ├── phase_1_split_files.py │ │ │ ├── phase_2_sample_and_filter.py │ │ │ ├── phase_3_generate_similarities.py │ │ │ ├── pipeline.py │ │ │ ├── process_spleeter_filter_outputs.py │ │ │ ├── save_mels_to_disk.py │ │ │ ├── spleeter_filter_noisy_clips.py │ │ │ ├── spleeter_utils │ │ │ │ ├── __init__.py │ │ │ │ └── spleeter_dataset.py │ │ │ └── split_on_silence.py │ │ ├── random_mp3_splitter.py │ │ ├── spleeter_split_voice_and_background.py │ │ 
├── test_audio_gen.py │ │ ├── test_audio_segmentor.py │ │ ├── test_audio_similarity.py │ │ ├── test_audio_speech_recognition.py │ │ ├── use_vocoder.py │ │ └── word_error_rate.py │ ├── byol │ │ ├── byol_extract_wrapped_model.py │ │ ├── byol_resnet_playground.py │ │ ├── byol_segformer_playground.py │ │ ├── byol_spinenet_playground.py │ │ └── tsne_torch.py │ ├── classify_into_folders.py │ ├── diffusion │ │ ├── diffusion_correction_surfer.py │ │ ├── diffusion_inference.py │ │ ├── diffusion_noise_surfer.py │ │ ├── diffusion_recursive_sampler.py │ │ └── diffusion_spacing_surfer.py │ ├── do_to_files.py │ ├── extract_square_images.py │ ├── extract_subimages.py │ ├── extract_subimages_with_ref.py │ ├── extract_temporal_squares.py │ ├── find_faulty_files.py │ ├── folderize_imagenet_val.py │ ├── gen_kmeans_clusters.py │ ├── hugging_face_hub_upload.py │ ├── post_install.py │ ├── srflow_latent_space_playground.py │ ├── start_tensorboard.py │ ├── stitch_images.py │ ├── stylegan2 │ │ ├── convert_weights_rosinality.py │ │ └── dnnlib │ │ │ └── tflib │ │ │ └── network.py │ ├── ui │ │ ├── image_labeler │ │ │ ├── image_labeler_ui.py │ │ │ ├── label_editor.py │ │ │ ├── pretrained_image_patch_classifier.py │ │ │ └── test_image_patch_classifier.py │ │ └── image_pair_labeler │ │ │ └── image_pair_ui.py │ ├── use_generator_as_filter.py │ └── validate_data.py ├── sweep.py ├── test.py ├── train.py ├── trainer │ ├── ExtensibleTrainer.py │ ├── README.md │ ├── __init__.py │ ├── base_model.py │ ├── batch_size_optimizer.py │ ├── custom_training_components │ │ ├── __init__.py │ │ ├── progressive_zoom.py │ │ ├── stereoscopic.py │ │ └── tecogan_losses.py │ ├── eval │ │ ├── __init__.py │ │ ├── audio_diffusion_fid.py │ │ ├── eval_wer.py │ │ ├── evaluator.py │ │ ├── fid.py │ │ ├── flow_gaussian_nll.py │ │ ├── mel_evaluator.py │ │ ├── music_diffusion_fid.py │ │ ├── single_point_pair_contrastive_eval.py │ │ ├── sr_diffusion_fid.py │ │ ├── sr_fid.py │ │ └── sr_style.py │ ├── experiments │ │ ├── __init__.py │ │ └── experiments.py │ ├── feature_model.py │ ├── inject.py │ ├── injectors │ │ ├── __init__.py │ │ ├── audio_injectors.py │ │ ├── base_injectors.py │ │ ├── gaussian_diffusion_injector.py │ │ └── spec_augment.py │ ├── loss.py │ ├── losses.py │ ├── lr_scheduler.py │ ├── networks.py │ ├── optimizers │ │ ├── lamb.py │ │ ├── larc.py │ │ └── sgd.py │ └── steps.py ├── use_discriminator_as_filter.py └── utils │ ├── BASE_gpt.yaml │ ├── UI_icon.png │ ├── __init__.py │ ├── audio.py │ ├── audio_resampler.py │ ├── colors.py │ ├── convert_model.py │ ├── distributed_checkpont.py │ ├── gpu_mem_track.py │ ├── kmeans.py │ ├── loss_accumulator.py │ ├── music_utils.py │ ├── numeric_stability.py │ ├── options.py │ ├── util.py │ └── weight_scheduler.py ├── environment.yaml ├── experiments ├── EXAMPLE_diff.yml ├── EXAMPLE_gpt.yml ├── bpe_lowercase_asr_256.json ├── clips_mel_norms.pth ├── train_diffusion_vocoder_22k_level.yml └── train_gpt_tts_unified.yml ├── param_calc.py ├── recipes ├── byol │ ├── README.md │ └── train_div2k_byol.yml ├── diffusion │ ├── README.md │ ├── test_diffusion_unet.yml │ └── train_ddpm_unet.yml ├── esrgan │ ├── README.md │ ├── rrdb_process_video.yml │ ├── train_div2k_esrgan.yml │ └── train_div2k_esrgan_reference.yml ├── glean │ ├── README.md │ └── train_ffhq_glean.yml ├── segformer │ └── train_byol_segformer.yml ├── srflow │ ├── README.md │ ├── convert_official_weights.py │ ├── train_div2k_rrdb_psnr.yml │ └── train_div2k_srflow.yml ├── stylegan │ └── README.md ├── tacotron2 │ ├── test_tacotron2_lj.yml │ └── 
train_tacotron2_lj.yml └── vqvae2 │ ├── README.md │ └── train_imgnet_vqvae_stage1.yml ├── resources └── bitsandbytes_windows │ ├── cextension.py │ ├── libbitsandbytes_cpu.dll │ ├── libbitsandbytes_cuda116.dll │ └── main.py ├── sandbox.py ├── static ├── drive_copied_file_tree.png ├── export_to_gdrive.png ├── file_directory.png ├── good_gpu.png ├── hyperparam_dataset.png ├── ljspeech.png ├── notebook_header.png ├── params.png ├── runtime_type.png ├── settings_options.png ├── stop_training.png ├── training_button.png ├── very_long_process.png ├── very_recent_save.png ├── warning.png └── yml_file.png └── voice_samples ├── kk_500 ├── kk_0_0.wav ├── kk_0_1.wav └── kk_0_2.wav ├── kk_500_emma ├── emma_0_0.wav ├── emma_0_1.wav └── emma_0_2.wav └── kk_orig ├── kk_0_0.wav ├── kk_0_1.wav └── kk_0_2.wav /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | # Too many leading '#' for block comment (E266) 4 | E266 5 | 6 | max-line-length=100 -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "flownet2"] 2 | path = flownet2 3 | url = https://github.com/NVIDIA/flownet2-pytorch.git 4 | [submodule "codes/models/flownet2"] 5 | path = codes/models/flownet2 6 | url = https://github.com/neonbjb/flownet2-pytorch.git 7 | -------------------------------------------------------------------------------- /Setup DLAS.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/). 3 | :: 4 | :: Copyright 2022 Sygil-Dev team. 5 | :: This program is free software: you can redistribute it and/or modify 6 | :: it under the terms of the GNU Affero General Public License as published by 7 | :: the Free Software Foundation, either version 3 of the License, or 8 | :: (at your option) any later version. 9 | :: 10 | :: This program is distributed in the hope that it will be useful, 11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | :: GNU Affero General Public License for more details. 14 | :: 15 | :: You should have received a copy of the GNU Affero General Public License 16 | :: along with this program. If not, see . 17 | :: Run all commands using this script's directory as the working directory 18 | cd %~dp0 19 | 20 | :: copy over the first line from environment.yaml, e.g. 
name: ldm, and take the second word after splitting by ":" delimiter 21 | for /F "tokens=2 delims=: " %%i in (environment.yaml) DO ( 22 | set v_conda_env_name=%%i 23 | goto EOL 24 | ) 25 | :EOL 26 | 27 | echo Environment name is set as %v_conda_env_name% as per environment.yaml 28 | 29 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path 30 | IF EXIST custom-conda-path.txt ( 31 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i 32 | ) 33 | 34 | set INSTALL_ENV_DIR=%cd%\installer_files\env 35 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH% 36 | 37 | set v_paths=%INSTALL_ENV_DIR% 38 | set v_paths=%v_paths%;%ProgramData%\miniconda3 39 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3 40 | set v_paths=%v_paths%;%ProgramData%\anaconda3 41 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3 42 | 43 | for %%a in (%v_paths%) do ( 44 | IF NOT "%v_custom_path%"=="" ( 45 | set v_paths=%v_custom_path%;%v_paths% 46 | ) 47 | ) 48 | 49 | for %%a in (%v_paths%) do ( 50 | if EXIST "%%a\Scripts\activate.bat" ( 51 | SET v_conda_path=%%a 52 | echo anaconda3/miniconda3 detected in %%a 53 | goto :CONDA_FOUND 54 | ) 55 | ) 56 | 57 | IF "%v_conda_path%"=="" ( 58 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html 59 | pause 60 | exit /b 1 61 | ) 62 | 63 | :CONDA_FOUND 64 | echo Found Anaconda 65 | 66 | :SKIP_RESTORE 67 | call "%v_conda_path%\Scripts\activate.bat" 68 | echo %v_conda_env_name% 69 | 70 | call conda env create --name "%v_conda_env_name%" -f environment.yaml 71 | 72 | 73 | 74 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%" 75 | 76 | :PROMPT 77 | python codes/scripts/post_install.py 78 | pause -------------------------------------------------------------------------------- /Start DLAS.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/). 3 | :: 4 | :: Copyright 2022 Sygil-Dev team. 5 | :: This program is free software: you can redistribute it and/or modify 6 | :: it under the terms of the GNU Affero General Public License as published by 7 | :: the Free Software Foundation, either version 3 of the License, or 8 | :: (at your option) any later version. 9 | :: 10 | :: This program is distributed in the hope that it will be useful, 11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | :: GNU Affero General Public License for more details. 14 | :: 15 | :: You should have received a copy of the GNU Affero General Public License 16 | :: along with this program. If not, see . 17 | :: Run all commands using this script's directory as the working directory 18 | cd %~dp0 19 | 20 | :: copy over the first line from environment.yaml, e.g. 
name: ldm, and take the second word after splitting by ":" delimiter 21 | set v_conda_env_name="DLAS" 22 | 23 | 24 | echo Environment name is set as %v_conda_env_name% as per environment.yaml 25 | 26 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path 27 | IF EXIST custom-conda-path.txt ( 28 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i 29 | ) 30 | 31 | set INSTALL_ENV_DIR=%cd%\installer_files\env 32 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH% 33 | 34 | set v_paths=%INSTALL_ENV_DIR% 35 | set v_paths=%v_paths%;%ProgramData%\miniconda3 36 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3 37 | set v_paths=%v_paths%;%ProgramData%\anaconda3 38 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3 39 | 40 | for %%a in (%v_paths%) do ( 41 | IF NOT "%v_custom_path%"=="" ( 42 | set v_paths=%v_custom_path%;%v_paths% 43 | ) 44 | ) 45 | 46 | for %%a in (%v_paths%) do ( 47 | if EXIST "%%a\Scripts\activate.bat" ( 48 | SET v_conda_path=%%a 49 | echo anaconda3/miniconda3 detected in %%a 50 | goto :CONDA_FOUND 51 | ) 52 | ) 53 | 54 | IF "%v_conda_path%"=="" ( 55 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html 56 | pause 57 | exit /b 1 58 | ) 59 | 60 | :CONDA_FOUND 61 | echo Starting conda environment %v_conda_env_name% from %v_conda_path% 62 | 63 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%" 64 | 65 | :START_GUI 66 | python codes/configuration_gui.py 67 | 68 | ::cmd /k 69 | -------------------------------------------------------------------------------- /Start Training Monitor.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This file is part of sygil-webui (https://github.com/Sygil-Dev/sygil-webui/). 3 | :: 4 | :: Copyright 2022 Sygil-Dev team. 5 | :: This program is free software: you can redistribute it and/or modify 6 | :: it under the terms of the GNU Affero General Public License as published by 7 | :: the Free Software Foundation, either version 3 of the License, or 8 | :: (at your option) any later version. 9 | :: 10 | :: This program is distributed in the hope that it will be useful, 11 | :: but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | :: GNU Affero General Public License for more details. 14 | :: 15 | :: You should have received a copy of the GNU Affero General Public License 16 | :: along with this program. If not, see . 17 | :: Run all commands using this script's directory as the working directory 18 | cd %~dp0 19 | 20 | :: copy over the first line from environment.yaml, e.g. 
name: ldm, and take the second word after splitting by ":" delimiter 21 | set v_conda_env_name="DLAS" 22 | 23 | 24 | echo Environment name is set as %v_conda_env_name% as per environment.yaml 25 | 26 | :: Put the path to conda directory in a file called "custom-conda-path.txt" if it's installed at non-standard path 27 | IF EXIST custom-conda-path.txt ( 28 | FOR /F %%i IN (custom-conda-path.txt) DO set v_custom_path=%%i 29 | ) 30 | 31 | set INSTALL_ENV_DIR=%cd%\installer_files\env 32 | set PATH=%INSTALL_ENV_DIR%;%INSTALL_ENV_DIR%\Library\bin;%INSTALL_ENV_DIR%\Scripts;%INSTALL_ENV_DIR%\Library\usr\bin;%PATH% 33 | 34 | set v_paths=%INSTALL_ENV_DIR% 35 | set v_paths=%v_paths%;%ProgramData%\miniconda3 36 | set v_paths=%v_paths%;%USERPROFILE%\miniconda3 37 | set v_paths=%v_paths%;%ProgramData%\anaconda3 38 | set v_paths=%v_paths%;%USERPROFILE%\anaconda3 39 | 40 | for %%a in (%v_paths%) do ( 41 | IF NOT "%v_custom_path%"=="" ( 42 | set v_paths=%v_custom_path%;%v_paths% 43 | ) 44 | ) 45 | 46 | for %%a in (%v_paths%) do ( 47 | if EXIST "%%a\Scripts\activate.bat" ( 48 | SET v_conda_path=%%a 49 | echo anaconda3/miniconda3 detected in %%a 50 | goto :CONDA_FOUND 51 | ) 52 | ) 53 | 54 | IF "%v_conda_path%"=="" ( 55 | echo anaconda3/miniconda3 not found. Install from here https://docs.conda.io/en/latest/miniconda.html 56 | pause 57 | exit /b 1 58 | ) 59 | 60 | :CONDA_FOUND 61 | echo Starting conda environment %v_conda_env_name% from %v_conda_path% 62 | 63 | call "%v_conda_path%\Scripts\activate.bat" "%v_conda_env_name%" 64 | 65 | :START_GUI 66 | python codes/scripts/start_tensorboard.py 67 | 68 | ::cmd /k 69 | -------------------------------------------------------------------------------- /codes/data/audio/preprocessed_mel_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import torchaudio 9 | import torchvision 10 | from tqdm import tqdm 11 | 12 | from utils.util import opt_get 13 | 14 | 15 | class PreprocessedMelDataset(torch.utils.data.Dataset): 16 | 17 | def __init__(self, opt): 18 | path = opt['path'] 19 | cache_path = opt['cache_path'] # Will fail when multiple paths specified, must be specified in this case. 
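# The cache at cache_path is just a torch-saved list of every *.npz path found under 'path'; it is built once on the first run and then trusted as-is, so delete (or re-point) the cache file whenever the contents of the mel directory change.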
20 | if os.path.exists(cache_path): 21 | self.paths = torch.load(cache_path) 22 | else: 23 | print("Building cache..") 24 | path = Path(path) 25 | self.paths = [str(p) for p in path.rglob("*.npz")] 26 | torch.save(self.paths, cache_path) 27 | self.pad_to = opt_get(opt, ['pad_to_samples'], 10336) 28 | self.squeeze = opt_get(opt, ['should_squeeze'], False) 29 | 30 | def __getitem__(self, index): 31 | with np.load(self.paths[index]) as npz_file: 32 | mel = torch.tensor(npz_file['arr_0']) 33 | assert mel.shape[-1] <= self.pad_to 34 | if self.squeeze: 35 | mel = mel.squeeze() 36 | padding_needed = self.pad_to - mel.shape[-1] 37 | mask = torch.zeros_like(mel) 38 | if padding_needed > 0: 39 | mel = F.pad(mel, (0,padding_needed)) 40 | mask = F.pad(mask, (0,padding_needed), value=1) 41 | 42 | output = { 43 | 'mel': mel, 44 | 'mel_lengths': torch.tensor(mel.shape[-1]), 45 | 'mask': mask, 46 | 'mask_lengths': torch.tensor(mask.shape[-1]), 47 | 'path': self.paths[index], 48 | } 49 | return output 50 | 51 | def __len__(self): 52 | return len(self.paths) 53 | 54 | 55 | if __name__ == '__main__': 56 | params = { 57 | 'mode': 'preprocessed_mel', 58 | 'path': 'Y:\\separated\\large_mel_cheaters', 59 | 'cache_path': 'Y:\\separated\\large_mel_cheaters_win.pth', 60 | 'pad_to_samples': 646, 61 | 'phase': 'train', 62 | 'n_workers': 0, 63 | 'batch_size': 16, 64 | } 65 | from data import create_dataset, create_dataloader 66 | 67 | ds = create_dataset(params) 68 | dl = create_dataloader(ds, params) 69 | i = 0 70 | for b in tqdm(dl): 71 | #pass 72 | torchvision.utils.save_image((b['mel'].unsqueeze(1)+1)/2, f'{i}.png') 73 | i += 1 74 | if i > 20: 75 | break 76 | -------------------------------------------------------------------------------- /codes/data/audio/wav_aug.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | import torchaudio.sox_effects 5 | 6 | from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch 7 | 8 | 9 | # Returns random double on [l,h] as a string 10 | def rdstr(l=0,h=1): 11 | assert h > l 12 | i=h-l 13 | return str(random.random() * i + l) 14 | 15 | 16 | # Returns a randint on [s,e] as a string 17 | def rdi(e, s=0): 18 | return str(random.randint(s,e)) 19 | 20 | 21 | class WavAugmentor: 22 | def __init__(self): 23 | pass 24 | 25 | def augment(self, wav, sample_rate): 26 | speed_effect = ['speed', rdstr(.8, 1)] 27 | ''' 28 | Band effects are disabled until I can audit them better. 
29 | band_effects = [ 30 | ['reverb', '-w'], 31 | ['reverb'], 32 | ['band', rdi(8000, 3000), rdi(1000, 100)], 33 | ['bandpass', rdi(8000, 3000), rdi(1000, 100)], 34 | ['bass', rdi(20,-20)], 35 | ['treble', rdi(20,-20)], 36 | ['dither'], 37 | ['equalizer', rdi(3000, 100), rdi(1000, 100), rdi(10, -10)], 38 | ['hilbert'], 39 | ['sinc', '3k'], 40 | ['sinc', '-4k'], 41 | ['sinc', '3k-4k'] 42 | ] 43 | band_effect = random.choice(band_effects) 44 | ''' 45 | volume_effects = [ 46 | ['loudness', rdi(10,-2)], 47 | ['overdrive', rdi(20,0), rdi(20,0)], 48 | ] 49 | vol_effect = random.choice(volume_effects) 50 | effects = [speed_effect, vol_effect] 51 | out, sr = torchaudio.sox_effects.apply_effects_tensor(wav, sample_rate, effects) 52 | # Add a variable amount of noise 53 | out = out + torch.rand_like(out) * random.random() * .03 54 | return out 55 | 56 | 57 | if __name__ == '__main__': 58 | sample, _ = load_wav_to_torch('obama1.wav') 59 | sample = sample / 32768.0 60 | aug = WavAugmentor() 61 | for j in range(10): 62 | out = aug.augment(sample, 24000) 63 | torchaudio.save(f'out{j}.wav', out, 24000) -------------------------------------------------------------------------------- /codes/data/combined_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from data import create_dataset 3 | 4 | 5 | # Simple composite dataset that combines multiple other datasets. 6 | # Assumes that the datasets output dicts. 7 | class CombinedDataset(torch.utils.data.Dataset): 8 | def __init__(self, opt): 9 | self.datasets = {} 10 | for k, v in opt.items(): 11 | if not isinstance(v, dict): 12 | continue 13 | # Scale&phase gets injected by options.py.. 14 | v['scale'] = opt['scale'] 15 | v['phase'] = opt['phase'] 16 | self.datasets[k] = create_dataset(v) 17 | self.items_fetched = 0 18 | 19 | def __getitem__(self, i): 20 | self.items_fetched += 1 21 | output = {} 22 | for name, dataset in self.datasets.items(): 23 | prefix = "" 24 | # 'default' dataset gets no prefix, other ones get `key_` 25 | if name != 'default': 26 | prefix = name + "_" 27 | 28 | data = dataset[i % len(dataset)] 29 | for k, v in data.items(): 30 | output[prefix + k] = v 31 | return output 32 | 33 | def __len__(self): 34 | return max(len(d) for d in self.datasets.values()) -------------------------------------------------------------------------------- /codes/data/data_sampler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modified from torch.utils.data.distributed.DistributedSampler 3 | Support enlarging the dataset for *iteration-oriented* training, for saving time when restart the 4 | dataloader after each epoch 5 | """ 6 | import math 7 | import torch 8 | from torch.utils.data.sampler import Sampler 9 | import torch.distributed as dist 10 | 11 | 12 | class DistIterSampler(Sampler): 13 | """Sampler that restricts data loading to a subset of the dataset. 14 | 15 | It is especially useful in conjunction with 16 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 17 | process can pass a DistributedSampler instance as a DataLoader sampler, 18 | and load a subset of the original dataset that is exclusive to it. 19 | 20 | .. note:: 21 | Dataset is assumed to be of constant size. 22 | 23 | Arguments: 24 | dataset: Dataset used for sampling. 25 | num_replicas (optional): Number of processes participating in 26 | distributed training. 27 | rank (optional): Rank of the current process within num_replicas. 
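ratio (optional): Virtual enlargement factor for the dataset. Each replica draws ceil(len(dataset) * ratio / num_replicas) indices per epoch, wrapping indices modulo the true dataset length, so a single sampler epoch covers the dataset roughly ratio times in total across replicas.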
28 | """ 29 | 30 | def __init__(self, dataset, num_replicas=None, rank=None, ratio=100): 31 | if num_replicas is None: 32 | if not dist.is_available(): 33 | raise RuntimeError("Requires distributed package to be available") 34 | num_replicas = dist.get_world_size() 35 | if rank is None: 36 | if not dist.is_available(): 37 | raise RuntimeError("Requires distributed package to be available") 38 | rank = dist.get_rank() 39 | self.dataset = dataset 40 | self.num_replicas = num_replicas 41 | self.rank = rank 42 | self.epoch = 0 43 | self.num_samples = int(math.ceil(len(self.dataset) * ratio / self.num_replicas)) 44 | self.total_size = self.num_samples * self.num_replicas 45 | 46 | def __iter__(self): 47 | # deterministically shuffle based on epoch 48 | g = torch.Generator() 49 | g.manual_seed(self.epoch) 50 | indices = torch.randperm(self.total_size, generator=g).tolist() 51 | 52 | dsize = len(self.dataset) 53 | indices = [v % dsize for v in indices] 54 | 55 | # subsample 56 | indices = indices[self.rank:self.total_size:self.num_replicas] 57 | assert len(indices) == self.num_samples 58 | 59 | return iter(indices) 60 | 61 | def __len__(self): 62 | return self.num_samples 63 | 64 | def set_epoch(self, epoch): 65 | self.epoch = epoch 66 | -------------------------------------------------------------------------------- /codes/data/images/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/data/images/__init__.py -------------------------------------------------------------------------------- /codes/data/images/chunk_with_reference.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from data import util 3 | import torch 4 | import numpy as np 5 | 6 | # Iterable that reads all the images in a directory that contains a reference image, tile images and center coordinates. 7 | from utils.util import opt_get 8 | 9 | 10 | class ChunkWithReference: 11 | def __init__(self, opt, path): 12 | self.path = path.path 13 | self.tiles, _ = util.find_files_of_type('img', self.path) 14 | self.need_metadata = opt_get(opt, ['strict'], False) or opt_get(opt, ['needs_metadata'], False) 15 | self.need_ref = opt_get(opt, ['need_ref'], False) 16 | if 'ignore_first' in opt.keys(): 17 | self.tiles = self.tiles[opt['ignore_first']:] 18 | 19 | # Odd failures occur at times. Rather than crashing, report the error and just return zeros. 20 | def read_image_or_get_zero(self, img_path): 21 | img = util.read_img(None, img_path, rgb=True) 22 | if img is None: 23 | return np.zeros(128, 128, 3) 24 | return img 25 | 26 | def __getitem__(self, item): 27 | tile = self.read_image_or_get_zero(self.tiles[item]) 28 | if self.need_ref and osp.exists(osp.join(self.path, "ref.jpg")): 29 | tile_id = int(osp.splitext(osp.basename(self.tiles[item]))[0]) 30 | ref = self.read_image_or_get_zero(osp.join(self.path, "ref.jpg")) 31 | if self.need_metadata: 32 | centers = torch.load(osp.join(self.path, "centers.pt")) 33 | if tile_id in centers.keys(): 34 | center, tile_width = centers[tile_id] 35 | else: 36 | print("Could not find the given tile id in the accompanying centers.pt. This generally means that " 37 | "centers.pt was overwritten at some point e.g. by duplicate data. If you don't care about tile " 38 | "centers, consider passing strict=false to the dataset options. 
(Note: you must re-build your" 39 | "caches for this setting change to take effect.)") 40 | raise FileNotFoundError(tile_id, self.tiles[item]) 41 | else: 42 | center = torch.tensor([128, 128], dtype=torch.long) 43 | tile_width = 256 44 | mask = np.full(tile.shape[:2] + (1,), fill_value=.1, dtype=tile.dtype) 45 | mask[center[0] - tile_width // 2:center[0] + tile_width // 2, center[1] - tile_width // 2:center[1] + tile_width // 2] = 1 46 | else: 47 | ref = np.zeros_like(tile) 48 | mask = np.zeros(tile.shape[:2] + (1,)) 49 | center = (0,0) 50 | 51 | return tile, ref, center, mask, self.tiles[item] 52 | 53 | def __len__(self): 54 | return len(self.tiles) 55 | -------------------------------------------------------------------------------- /codes/data/images/random_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | # Dataset that feeds random data into the state. Can be useful for testing or demo purposes without actual data. 6 | class RandomDataset(Dataset): 7 | def __init__(self, opt): 8 | self.hq_shape = tuple(opt['hq_shape']) 9 | self.lq_shape = tuple(opt['lq_shape']) 10 | 11 | def __getitem__(self, item): 12 | return {'lq': torch.rand(self.lq_shape), 'hq': torch.rand(self.hq_shape), 13 | 'LQ_path': '', 'GT_path': ''} 14 | 15 | def __len__(self): 16 | # Arbitrary 17 | return 1024 * 1024 18 | -------------------------------------------------------------------------------- /codes/data/images/zip_file_dataset.py: -------------------------------------------------------------------------------- 1 | import PIL.Image 2 | import zipfile 3 | import torch 4 | import torchvision 5 | from torch.utils.data import DataLoader 6 | from torchvision.transforms import Compose, ToTensor, Normalize, Resize 7 | 8 | 9 | class ZipFileDataset(torch.utils.data.Dataset): 10 | def __init__(self, opt): 11 | self.path = opt['path'] 12 | zip = zipfile.ZipFile(self.path) 13 | self.all_files = list(zip.namelist()) 14 | self.resolution = opt['resolution'] 15 | self.paired_mode = opt['paired_mode'] 16 | self.transforms = Compose([ToTensor(), 17 | Resize(self.resolution), 18 | Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) 19 | ]) 20 | self.zip = None 21 | 22 | def __len__(self): 23 | return len(self.all_files) 24 | 25 | # Loaded on the fly because ZipFile does not tolerate pickling. 26 | def get_zip(self): 27 | if self.zip is None: 28 | self.zip = zipfile.ZipFile(self.path) 29 | return self.zip 30 | 31 | def load_image(self, path): 32 | file = self.get_zip().open(path, 'r') 33 | pilimg = PIL.Image.open(file) 34 | tensor = self.transforms(pilimg) 35 | return tensor 36 | 37 | def __getitem__(self, i): 38 | try: 39 | fname = self.all_files[i] 40 | out = { 41 | 'hq': self.load_image(fname), 42 | 'HQ_path': fname, 43 | 'has_alt': self.paired_mode 44 | } 45 | if self.paired_mode: 46 | if fname.endswith('0.jpg'): 47 | aname = fname.replace('0.jpg', '1.jpg') 48 | else: 49 | aname = fname.replace('1.jpg', '0.jpg') 50 | out['alt_hq'] = self.load_image(aname) 51 | except: 52 | print(f"Error loading {fname} from zipfile. 
Attempting to recover by loading next element.") 53 | return self[i+1] 54 | return out 55 | 56 | if __name__ == '__main__': 57 | opt = { 58 | 'path': 'E:\\4k6k\\datasets\\images\\youtube-imagenet-paired\\output.zip', 59 | 'resolution': 224, 60 | 'paired_mode': True 61 | } 62 | dataset = ZipFileDataset(opt) 63 | print(len(dataset)) 64 | loader = DataLoader(dataset, shuffle=True) 65 | for i, d in enumerate(loader): 66 | torchvision.utils.save_image(d['hq'], f'{i}_hq.png') 67 | torchvision.utils.save_image(d['alt_hq'], f'{i}_althq.png') 68 | 69 | -------------------------------------------------------------------------------- /codes/data/text/hf_datasets_wrapper.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import datasets 3 | 4 | 5 | class HfDataset(Dataset): 6 | """ 7 | Simple wrapper for a HuggingFace dataset that can re-map keys if desired. 8 | """ 9 | def __init__(self, corpi, cache_path=None, key_maps=None, dataset_spec_key='train'): 10 | self.hfd = [] 11 | for corpus in corpi: 12 | dataset_name, config = corpus 13 | if config == '' or config == 'None': 14 | config = None 15 | self.hfd.append(datasets.load_dataset(dataset_name, config, cache_dir=cache_path)[dataset_spec_key]) 16 | self.key_maps = key_maps 17 | 18 | def __getitem__(self, item): 19 | for dataset in self.hfd: 20 | if item < len(dataset): 21 | val = dataset[item] 22 | if self.key_maps is None: 23 | return val 24 | else: 25 | return {k: val[v] for k, v in self.key_maps.items()} 26 | else: 27 | item -= len(dataset) 28 | raise IndexError() 29 | 30 | def __len__(self): 31 | return sum([len(h) for h in self.hfd]) 32 | 33 | 34 | if __name__ == '__main__': 35 | d = HfDataset([['wikipedia', '20200501.en'], ['bookcorpus', '']], dataset_spec_key='train', cache_path='Z:\\huggingface_datasets\\cache') 36 | print(d[5]) 37 | -------------------------------------------------------------------------------- /codes/data/torch_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import torchvision.transforms as T 3 | from torchvision import datasets 4 | 5 | # Wrapper for basic pytorch datasets which re-wraps them into a format usable by ExtensibleTrainer. 
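# Expected opt keys, per the code below: 'dataset' (one of mnist, fmnist, cifar10, cifar100, imagenet, imagefolder), 'image_size', and 'kwargs' (splatted into the selected dataset constructor); 'random_crop', 'fixed_len' and 'offset' are optional.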
6 | from data.images.cifar import CIFAR100, CIFAR10 7 | from utils.util import opt_get 8 | 9 | 10 | class TorchDataset(Dataset): 11 | def __init__(self, opt): 12 | DATASET_MAP = { 13 | "mnist": datasets.MNIST, 14 | "fmnist": datasets.FashionMNIST, 15 | "cifar10": CIFAR10, 16 | "cifar100": CIFAR100, 17 | "imagenet": datasets.ImageNet, 18 | "imagefolder": datasets.ImageFolder 19 | } 20 | normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 21 | if opt_get(opt, ['random_crop'], False): 22 | transforms = [ 23 | T.RandomResizedCrop(opt['image_size']), 24 | T.RandomHorizontalFlip(), 25 | T.ToTensor(), 26 | normalize, 27 | ] 28 | else: 29 | transforms = [ 30 | T.Resize(opt['image_size']), 31 | T.CenterCrop(opt['image_size']), 32 | T.RandomHorizontalFlip(), 33 | T.ToTensor(), 34 | normalize, 35 | ] 36 | transforms = T.Compose(transforms) 37 | self.dataset = DATASET_MAP[opt['dataset']](transform=transforms, **opt['kwargs']) 38 | self.len = opt_get(opt, ['fixed_len'], len(self.dataset)) 39 | self.offset = opt_get(opt, ['offset'], 0) 40 | 41 | def __getitem__(self, item): 42 | item = self.dataset[item+self.offset] 43 | if len(item) == 2: 44 | underlying_item, lbl = item 45 | coarselbl = None 46 | elif len(item) == 3: 47 | underlying_item, lbl, coarselbl = item 48 | else: 49 | raise NotImplementedError 50 | return {'lq': underlying_item, 'hq': underlying_item, 'labels': lbl, 'coarse_labels': coarselbl, 51 | 'LQ_path': str(item), 'GT_path': str(item)} 52 | 53 | def __len__(self): 54 | return self.len-self.offset 55 | 56 | if __name__ == '__main__': 57 | opt = { 58 | 'flip': True, 59 | 'crop_sz': None, 60 | 'dataset': 'cifar100', 61 | 'image_size': 32, 62 | 'normalize': True, 63 | 'kwargs': { 64 | 'root': 'E:\\4k6k\\datasets\\images\\cifar100', 65 | 'download': True 66 | } 67 | } 68 | set = TorchDataset(opt) 69 | j = set[0] 70 | -------------------------------------------------------------------------------- /codes/data/zero_pad_dict_collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class ZeroPadDictCollate(): 6 | """ 7 | Given a list of dictionary outputs with torch.Tensors from a Dataset, iterates through each one, finds the longest 8 | tensor, and zero pads all the other tensors together. 9 | """ 10 | def collate_tensors(self, batch, key): 11 | result = [] 12 | largest_dims = [0 for _ in range(len(batch[0][key].shape))] 13 | for elem in batch: 14 | result.append(elem[key]) 15 | largest_dims = [max(current_largest, new_consideration) for current_largest, new_consideration in zip(largest_dims, elem[key].shape)] 16 | # Now pad each tensor by the largest dimension. 
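# F.pad takes (left, right) pairs ordered from the *last* dimension backwards, so each dimension's pair is prepended below to keep the tuple in that reversed order; only trailing (right-side) padding is ever applied.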
17 | for i in range(len(result)): 18 | padding_tuple = () 19 | for d in range(len(largest_dims)): 20 | padding_needed = largest_dims[d] - result[i].shape[d] 21 | assert padding_needed >= 0 22 | padding_tuple = (0, padding_needed) + padding_tuple 23 | result[i] = F.pad(result[i], padding_tuple) 24 | 25 | return torch.stack(result, dim=0) 26 | 27 | 28 | def collate_into_list(self, batch, key): 29 | result = [] 30 | for elem in batch: 31 | result.append(elem[key]) 32 | return result 33 | 34 | def __call__(self, batch): 35 | first_dict = batch[0] 36 | collated = {} 37 | for key in first_dict.keys(): 38 | if isinstance(first_dict[key], torch.Tensor): 39 | if len(first_dict[key].shape) > 0: 40 | collated[key] = self.collate_tensors(batch, key) 41 | else: 42 | collated[key] = torch.stack([b[key] for b in batch]) 43 | else: 44 | collated[key] = self.collate_into_list(batch, key) 45 | return collated -------------------------------------------------------------------------------- /codes/maybe_bnb.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Optional,Literal 3 | 4 | # EXPLICITLY leave these empty; ensure that an attributeerror is raised if these are not initialised properly. 5 | class nn: pass 6 | class optim: pass 7 | 8 | def populate(adam=True, adamw=True, linear=False, embedding: Optional[Literal["STABLE", "NORMAL"]]="NORMAL"): 9 | nn.Linear = torch.nn.Linear 10 | nn.Embedding = torch.nn.Embedding 11 | optim.Adam = torch.optim.Adam # this does nothing tbh 12 | optim.AdamW = torch.optim.AdamW 13 | # 14 | try: 15 | import bitsandbytes as bnb 16 | except ImportError: 17 | return print("WARNING: bnb was missing, not using 8bit for anything!") 18 | # 19 | if adam: optim.Adam = bnb.optim.Adam8bit 20 | if adamw: optim.AdamW = bnb.optim.AdamW8bit 21 | if linear: nn.Linear = bnb.nn.Linear8bitLt 22 | if embedding: 23 | nn.Embedding = bnb.nn.StableEmbedding if embedding == 'STABLE' else bnb.nn.modules.Embedding 24 | 25 | -------------------------------------------------------------------------------- /codes/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/asr/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/music/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/music/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/music/encoders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn.functional as F 3 | from torch import nn 4 | from transformers import GPT2Config, GPT2Model 5 | 6 | from models.arch_util import AttentionBlock, ResBlock 7 | from models.audio.tts.lucidrains_dvae import DiscreteVAE 8 | from trainer.networks import register_model 9 | from utils.util import opt_get, ceil_multiple, print_network 10 | 11 | 12 | class ResEncoder16x(nn.Module): 13 | def __init__(self, 14 | spec_dim, 15 | hidden_dim, 16 | embedding_dim, 17 | checkpointing_enabled=True, 18 | ): 19 | super().__init__() 20 | attn = [] 21 | def edim(m): 22 | dd = min(spec_dim + m * 128, hidden_dim) 23 | return ceil_multiple(dd, 8) 24 | self.downsampler = nn.Sequential( 25 | ResBlock(spec_dim, out_channels=edim(2), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled), 26 | ResBlock(edim(2), out_channels=edim(3), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled), 27 | ResBlock(edim(3), out_channels=edim(3), use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 28 | ResBlock(edim(3), out_channels=edim(4), use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled), 29 | ResBlock(edim(4), out_channels=edim(4), use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 30 | ResBlock(edim(4), out_channels=hidden_dim, use_conv=True, dims=1, down=True, checkpointing_enabled=checkpointing_enabled)) 31 | self.encoder = nn.Sequential( 32 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 33 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 34 | ResBlock(hidden_dim, out_channels=hidden_dim, use_conv=True, dims=1, checkpointing_enabled=checkpointing_enabled), 35 | nn.GroupNorm(8, hidden_dim), 36 | nn.SiLU(), 37 | nn.Conv1d(hidden_dim, embedding_dim, 1), 38 | nn.Tanh(), 39 | ) 40 | 41 | def forward(self, x): 42 | h = self.downsampler(x) 43 | h = self.encoder(h) 44 | return h 45 | -------------------------------------------------------------------------------- /codes/models/audio/music/m2v_code_to_mel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from models.arch_util import ResBlock, AttentionBlock 6 | from models.audio.music.flat_diffusion import MultiGroupEmbedding 7 | from trainer.networks import register_model 8 | from utils.util import checkpoint 9 | 10 | 11 | class Code2Mel(nn.Module): 12 | def __init__(self, out_dim=256, base_dim=1024, num_tokens=16, num_groups=4, dropout=.1): 13 | super().__init__() 14 | self.emb = MultiGroupEmbedding(num_tokens, num_groups, base_dim) 15 | self.base_blocks = nn.Sequential(ResBlock(base_dim, dropout, dims=1), 16 | AttentionBlock(base_dim, num_heads=base_dim//64), 17 | ResBlock(base_dim, dropout, dims=1)) 18 | l2dim = base_dim-256 19 | self.l2_up_block = nn.Conv1d(base_dim, l2dim, kernel_size=5, padding=2) 20 | self.l2_blocks = nn.Sequential(ResBlock(l2dim, dropout, kernel_size=5, dims=1), 21 | AttentionBlock(l2dim, num_heads=base_dim//64), 22 | ResBlock(l2dim, dropout, kernel_size=5, dims=1), 23 | AttentionBlock(l2dim, num_heads=base_dim//64), 24 | ResBlock(l2dim, dropout, dims=1), 25 | ResBlock(l2dim, dropout, dims=1)) 26 | l3dim = l2dim-256 27 | self.l3_up_block = nn.Conv1d(l2dim, l3dim, kernel_size=5, padding=2) 28 | self.l3_blocks = nn.Sequential(ResBlock(l3dim, dropout, kernel_size=5, dims=1), 29 | 
AttentionBlock(l3dim, num_heads=base_dim//64), 30 | ResBlock(l3dim, dropout, kernel_size=5, dims=1), 31 | ResBlock(l3dim, dropout, dims=1)) 32 | self.final_block = nn.Conv1d(l3dim, out_dim, kernel_size=3, padding=1) 33 | 34 | def forward(self, codes, target): 35 | with torch.autocast(codes.device.type): 36 | h = self.emb(codes).permute(0,2,1) 37 | h = checkpoint(self.base_blocks, h) 38 | h = F.interpolate(h, scale_factor=2, mode='linear') 39 | h = self.l2_up_block(h) 40 | h = checkpoint(self.l2_blocks, h) 41 | h = F.interpolate(h, size=target.shape[-1], mode='linear') 42 | h = self.l3_up_block(h) 43 | h = checkpoint(self.l3_blocks, h.float()) 44 | pred = self.final_block(h) 45 | return F.mse_loss(pred, target), pred 46 | 47 | 48 | @register_model 49 | def register_code2mel(opt_net, opt): 50 | return Code2Mel(**opt_net['kwargs']) 51 | 52 | 53 | if __name__ == '__main__': 54 | model = Code2Mel() 55 | codes = torch.randint(0,16, (2,200,4)) 56 | target = torch.randn(2,256,804) 57 | model(codes, target) -------------------------------------------------------------------------------- /codes/models/audio/music/mel2vec_codes_gpt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | import maybe_bnb as mbnb 5 | from transformers import GPT2Config, GPT2Model 6 | 7 | from trainer.networks import register_model 8 | from utils.util import opt_get 9 | 10 | 11 | class Mel2VecCodesGpt(nn.Module): 12 | def __init__(self, dim, layers, num_groups=8, num_vectors=8): 13 | super().__init__() 14 | 15 | self.num_groups = num_groups 16 | 17 | self.config = GPT2Config(vocab_size=1, n_positions=8192, n_embd=dim, n_layer=layers, n_head=dim//64, 18 | n_inner=dim*2) 19 | self.gpt = GPT2Model(self.config) 20 | del self.gpt.wte # Unused, we'll do our own embeddings. 
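# Each of the num_groups code groups gets its own embedding table of width dim//num_groups; forward() concatenates the group embeddings back up to dim and hands them to GPT-2 via inputs_embeds, so the stock token-embedding table is never needed.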
21 | # nn.Embedding 22 | self.embeddings = nn.ModuleList([mbnb.nn.Embedding(num_vectors, dim//num_groups) for _ in range(num_groups)]) 23 | self.heads = nn.ModuleList([mbnb.nn.Linear(dim, num_vectors) for _ in range(num_groups)]) 24 | 25 | def forward(self, codes): 26 | assert codes.shape[-1] == self.num_groups 27 | 28 | inputs = codes[:, :-1] 29 | targets = codes[:, 1:] 30 | 31 | h = [embedding(inputs[:, :, i]) for i, embedding in enumerate(self.embeddings)] 32 | h = torch.cat(h, dim=-1) 33 | h = self.gpt(inputs_embeds=h, return_dict=True).last_hidden_state 34 | 35 | losses = 0 36 | for i, head in enumerate(self.heads): 37 | logits = head(h).permute(0,2,1) 38 | loss = F.cross_entropy(logits, targets[:,:,i]) 39 | losses = losses + loss 40 | 41 | return losses / self.num_groups 42 | 43 | 44 | @register_model 45 | def register_music_gpt(opt_net, opt): 46 | return Mel2VecCodesGpt(**opt_get(opt_net, ['kwargs'], {})) 47 | 48 | 49 | if __name__ == '__main__': 50 | model = Mel2VecCodesGpt(512, 8) 51 | codes = torch.randint(0,8, (2,300,8)) 52 | model(codes) -------------------------------------------------------------------------------- /codes/models/audio/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/tts/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/tts/random_latent_converter.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import maybe_bnb as mbnb 7 | 8 | from trainer.networks import register_model 9 | from utils.util import opt_get 10 | 11 | 12 | def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5): 13 | if bias is not None: 14 | rest_dim = [1] * (input.ndim - bias.ndim - 1) 15 | return ( 16 | F.leaky_relu( 17 | input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope 18 | ) 19 | * scale 20 | ) 21 | else: 22 | return F.leaky_relu(input, negative_slope=0.2) * scale 23 | 24 | 25 | class EqualLinear(nn.Module): 26 | def __init__( 27 | self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1 28 | ): 29 | super().__init__() 30 | self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) 31 | if bias: 32 | self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) 33 | else: 34 | self.bias = None 35 | self.scale = (1 / math.sqrt(in_dim)) * lr_mul 36 | self.lr_mul = lr_mul 37 | 38 | def forward(self, input): 39 | out = F.linear(input, self.weight * self.scale) 40 | out = fused_leaky_relu(out, self.bias * self.lr_mul) 41 | return out 42 | 43 | 44 | class RandomLatentConverter(nn.Module): 45 | def __init__(self, channels): 46 | super().__init__() 47 | self.layers = nn.Sequential(*[EqualLinear(channels, channels, lr_mul=.1) for _ in range(5)], 48 | mbnb.nn.Linear(channels, channels)) 49 | self.channels = channels 50 | 51 | def forward(self, ref): 52 | r = torch.randn(ref.shape[0], self.channels, device=ref.device) 53 | y = self.layers(r) 54 | return y 55 | 56 | 57 | @register_model 58 | def register_random_latent_converter(opt_net, opt): 59 | return RandomLatentConverter(**opt_get(opt_net, ['kwargs'], {})) 60 | 61 | 62 | if __name__ == '__main__': 63 | model = RandomLatentConverter(512) 64 | model(torch.randn(5,512)) 
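# The model files in this section share a convention: a small @register_model factory receives a
# network-options dict and builds the module from its 'kwargs' entry (see register_random_latent_converter
# above). A minimal sketch of what that amounts to when driven by hand -- the options dict here is
# illustrative only, and codes/ is assumed to be on the import path:
#
#     import torch
#     from models.audio.tts.random_latent_converter import RandomLatentConverter
#     opt_net = {'kwargs': {'channels': 512}}   # hypothetical network-options fragment
#     model = RandomLatentConverter(**opt_net['kwargs'])
#     latents = model(torch.randn(4, 512))      # the reference tensor only supplies batch size and device -> (4, 512)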
-------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/LICENSE: -------------------------------------------------------------------------------- 1 | This directory contains works with the below licenses, which should be considered in addition 2 | to the base repository license. 3 | 4 | BSD 3-Clause License 5 | 6 | Copyright (c) 2018, NVIDIA Corporation 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | * Redistributions of source code must retain the above copyright notice, this 13 | list of conditions and the following disclaimer. 14 | 15 | * Redistributions in binary form must reproduce the above copyright notice, 16 | this list of conditions and the following disclaimer in the documentation 17 | and/or other materials provided with the distribution. 18 | 19 | * Neither the name of the copyright holder nor the names of its 20 | contributors may be used to endorse or promote products derived from 21 | this software without specific prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | from models.audio.tts.tacotron2.taco_utils import * 2 | from models.audio.tts.tacotron2.text import * 3 | from models.audio.tts.tacotron2.tacotron2 import * 4 | from models.audio.tts.tacotron2.stft import * 5 | from models.audio.tts.tacotron2.layers import * 6 | from models.audio.tts.tacotron2.loss import * -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | import librosa.util as librosa_util 5 | 6 | 7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 8 | n_fft=800, dtype=np.float32, norm=None): 9 | """ 10 | # from librosa 0.6 11 | Compute the sum-square envelope of a window function at a given hop length. 12 | 13 | This is used to estimate modulation effects induced by windowing 14 | observations in short-time fourier transforms. 
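Concretely, with w the window (normalized according to norm and zero-padded to n_fft), this returns wss[n] = sum_i w[n - i * hop_length]**2 over all frame offsets i, evaluated for n in [0, n_fft + hop_length * (n_frames - 1)).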
15 | 16 | Parameters 17 | ---------- 18 | window : string, tuple, number, callable, or list-like 19 | Window specification, as in `get_window` 20 | 21 | n_frames : int > 0 22 | The number of analysis frames 23 | 24 | hop_length : int > 0 25 | The number of samples to advance between frames 26 | 27 | win_length : [optional] 28 | The length of the window function. By default, this matches `n_fft`. 29 | 30 | n_fft : int > 0 31 | The length of each analysis frame. 32 | 33 | dtype : np.dtype 34 | The data type of the output 35 | 36 | Returns 37 | ------- 38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 39 | The sum-squared envelope of the window function 40 | """ 41 | if win_length is None: 42 | win_length = n_fft 43 | 44 | n = n_fft + hop_length * (n_frames - 1) 45 | x = np.zeros(n, dtype=dtype) 46 | 47 | # Compute the squared window at the desired length 48 | win_sq = get_window(window, win_length, fftbins=True) 49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 50 | win_sq = librosa_util.pad_center(win_sq, n_fft) 51 | 52 | # Fill the envelope 53 | for i in range(n_frames): 54 | sample = i * hop_length 55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 56 | return x 57 | 58 | 59 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 60 | """ 61 | PARAMS 62 | ------ 63 | magnitudes: spectrogram magnitudes 64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 65 | """ 66 | 67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 68 | angles = angles.astype(np.float32) 69 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 71 | 72 | for i in range(n_iters): 73 | _, angles = stft_fn.transform(signal) 74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 75 | return signal 76 | 77 | 78 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 79 | """ 80 | PARAMS 81 | ------ 82 | C: compression factor 83 | """ 84 | return torch.log(torch.clamp(x, min=clip_val) * C) 85 | 86 | 87 | def dynamic_range_decompression(x, C=1): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor used to compress 92 | """ 93 | return torch.exp(x) / C -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/loss.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from trainer.losses import ConfigurableLoss 4 | 5 | 6 | class Tacotron2Loss(ConfigurableLoss): 7 | def __init__(self, opt_loss, env): 8 | super().__init__(opt_loss, env) 9 | self.mel_target_key = opt_loss['mel_target_key'] 10 | self.mel_output_key = opt_loss['mel_output_key'] 11 | self.mel_output_postnet_key = opt_loss['mel_output_postnet_key'] 12 | self.gate_target_key = opt_loss['gate_target_key'] 13 | self.gate_output_key = opt_loss['gate_output_key'] 14 | self.last_mel_loss = 0 15 | self.last_gate_loss = 0 16 | 17 | def forward(self, _, state): 18 | mel_target, gate_target = state[self.mel_target_key], state[self.gate_target_key] 19 | mel_target.requires_grad = False 20 | gate_target.requires_grad = False 21 | gate_target = gate_target.view(-1, 1) 22 | 23 | mel_out, mel_out_postnet, gate_out = state[self.mel_output_key], state[self.mel_output_postnet_key], state[self.gate_output_key] 24 | gate_out = gate_out.view(-1, 1) 25 | mel_loss = nn.MSELoss()(mel_out, mel_target) + \ 26 | nn.MSELoss()(mel_out_postnet, mel_target) 27 | gate_loss = 
nn.BCEWithLogitsLoss()(gate_out, gate_target) 28 | self.last_mel_loss = mel_loss.detach().clone().mean().item() 29 | self.last_gate_loss = gate_loss.detach().clone().mean().item() 30 | return mel_loss + gate_loss 31 | 32 | def extra_metrics(self): 33 | return { 34 | 'mel_loss': self.last_mel_loss, 35 | 'gate_loss': self.last_gate_loss 36 | } 37 | 38 | 39 | class Tacotron2LossRaw(nn.Module): 40 | def __init__(self): 41 | super().__init__() 42 | self.last_mel_loss = 0 43 | self.last_gate_loss = 0 44 | 45 | def forward(self, model_output, targets): 46 | mel_target, gate_target = targets[0], targets[1] 47 | mel_target.requires_grad = False 48 | gate_target.requires_grad = False 49 | gate_target = gate_target.view(-1, 1) 50 | 51 | mel_out, mel_out_postnet, gate_out, _ = model_output 52 | gate_out = gate_out.view(-1, 1) 53 | mel_loss = nn.MSELoss()(mel_out, mel_target) + \ 54 | nn.MSELoss()(mel_out_postnet, mel_target) 55 | gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target) 56 | self.last_mel_loss = mel_loss.detach().clone().mean().item() 57 | self.last_gate_loss = gate_loss.detach().clone().mean().item() 58 | return mel_loss + gate_loss 59 | 60 | def extra_metrics(self): 61 | return { 62 | 'mel_loss': self.last_mel_loss, 63 | 'gate_loss': self.last_gate_loss 64 | } -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/taco_utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import numpy as np 4 | import torch 5 | from scipy.io.wavfile import read 6 | 7 | 8 | def get_mask_from_lengths(lengths, max_len=None): 9 | if max_len is None: 10 | max_len = torch.max(lengths).item() 11 | ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)).to(lengths.device) 12 | mask = (ids < lengths.unsqueeze(1)).bool() 13 | return mask 14 | 15 | 16 | def load_wav_to_torch(full_path): 17 | sampling_rate, data = read(full_path) 18 | if data.dtype == np.int32: 19 | norm_fix = 2 ** 31 20 | elif data.dtype == np.int16: 21 | norm_fix = 2 ** 15 22 | elif data.dtype == np.float16 or data.dtype == np.float32: 23 | norm_fix = 1. 
24 | else: 25 | raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}") 26 | return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate) 27 | 28 | 29 | def load_filepaths_and_text_type(filename, type, split="|"): 30 | with open(filename, encoding='utf-8') as f: 31 | filepaths_and_text = [list(line.strip().split(split)) + [type] for line in f] 32 | base = os.path.dirname(filename) 33 | for j in range(len(filepaths_and_text)): 34 | filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0]) 35 | return filepaths_and_text 36 | 37 | def load_filepaths_and_text(filename, split="|"): 38 | with open(filename, encoding='utf-8') as f: 39 | filepaths_and_text = [line.strip().split(split) for line in f] 40 | base = os.path.dirname(filename) 41 | for j in range(len(filepaths_and_text)): 42 | filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0]) 43 | return filepaths_and_text 44 | 45 | 46 | def to_gpu(x): 47 | x = x.contiguous() 48 | 49 | if torch.cuda.is_available(): 50 | x = x.cuda(non_blocking=True) 51 | return torch.autograd.Variable(x) -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | 4 | import torch 5 | 6 | from models.audio.tts.tacotron2.text import cleaners 7 | from models.audio.tts.tacotron2.text.symbols import symbols 8 | 9 | 10 | # Mappings from symbol to numeric ID and vice versa: 11 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 12 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 13 | 14 | # Regular expression matching text enclosed in curly braces: 15 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 16 | 17 | 18 | def text_to_sequence(text, cleaner_names=['english_cleaners']): 19 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 20 | 21 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 22 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
23 | 24 | Args: 25 | text: string to convert to a sequence 26 | cleaner_names: names of the cleaner functions to run the text through 27 | 28 | Returns: 29 | List of integers corresponding to the symbols in the text 30 | ''' 31 | sequence = [] 32 | 33 | # Check for curly braces and treat their contents as ARPAbet: 34 | while len(text): 35 | m = _curly_re.match(text) 36 | if not m: 37 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 38 | break 39 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 40 | sequence += _arpabet_to_sequence(m.group(2)) 41 | text = m.group(3) 42 | 43 | return sequence 44 | 45 | 46 | def sequence_to_text(sequence): 47 | '''Converts a sequence of IDs back to a string''' 48 | result = '' 49 | for symbol_id in sequence: 50 | if isinstance(symbol_id, torch.Tensor): 51 | symbol_id = symbol_id.item() 52 | if symbol_id in _id_to_symbol: 53 | s = _id_to_symbol[symbol_id] 54 | # Enclose ARPAbet back in curly braces: 55 | if len(s) > 1 and s[0] == '@': 56 | s = '{%s}' % s[1:] 57 | result += s 58 | return result.replace('}{', ' ') 59 | 60 | 61 | def tacotron_symbols(): 62 | return list(_symbol_to_id.keys()) 63 | 64 | 65 | def tacotron_symbol_mapping(): 66 | return _symbol_to_id.copy() 67 | 68 | 69 | def _clean_text(text, cleaner_names): 70 | for name in cleaner_names: 71 | cleaner = getattr(cleaners, name) 72 | if not cleaner: 73 | raise Exception('Unknown cleaner: %s' % name) 74 | text = cleaner(text) 75 | return text 76 | 77 | 78 | def _symbols_to_sequence(symbols): 79 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 80 | 81 | 82 | def _arpabet_to_sequence(text): 83 | return _symbols_to_sequence(['@' + s for s in text.split()]) 84 | 85 | 86 | def _should_keep_symbol(s): 87 | return s in _symbol_to_id and s != '_' and s != '~' 88 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from .numbers import normalize_numbers 18 | 19 | 20 | # Regular expression matching whitespace: 21 | _whitespace_re = re.compile(r'\s+') 22 | 23 | # List of (regular expression, replacement) pairs for abbreviations: 24 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 25 | ('mrs', 'misess'), 26 | ('mr', 'mister'), 27 | ('dr', 'doctor'), 28 | ('st', 'saint'), 29 | ('co', 'company'), 30 | ('jr', 'junior'), 31 | ('maj', 'major'), 32 | ('gen', 'general'), 33 | ('drs', 'doctors'), 34 | ('rev', 'reverend'), 35 | ('lt', 'lieutenant'), 36 | ('hon', 'honorable'), 37 | ('sgt', 'sergeant'), 38 | ('capt', 'captain'), 39 | ('esq', 'esquire'), 40 | ('ltd', 'limited'), 41 | ('col', 'colonel'), 42 | ('ft', 'fort'), 43 | ]] 44 | 45 | 46 | def expand_abbreviations(text): 47 | for regex, replacement in _abbreviations: 48 | text = re.sub(regex, replacement, text) 49 | return text 50 | 51 | 52 | def expand_numbers(text): 53 | return normalize_numbers(text) 54 | 55 | 56 | def lowercase(text): 57 | return text.lower() 58 | 59 | 60 | def collapse_whitespace(text): 61 | return re.sub(_whitespace_re, ' ', text) 62 | 63 | 64 | def convert_to_ascii(text): 65 | return unidecode(text) 66 | 67 | 68 | def basic_cleaners(text): 69 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 70 | text = lowercase(text) 71 | text = collapse_whitespace(text) 72 | return text 73 | 74 | 75 | def transliteration_cleaners(text): 76 | '''Pipeline for non-English text that transliterates to ASCII.''' 77 | text = convert_to_ascii(text) 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def english_cleaners(text): 84 | '''Pipeline for English text, including number and abbreviation expansion.''' 85 | text = convert_to_ascii(text) 86 | text = lowercase(text) 87 | text = expand_numbers(text) 88 | text = expand_abbreviations(text) 89 | text = collapse_whitespace(text) 90 | text = text.replace('"', '') 91 | return text 92 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | def __init__(self, file_or_path, keep_ambiguous=True): 22 | if isinstance(file_or_path, str): 23 | with open(file_or_path, encoding='latin-1') as f: 24 | entries = _parse_cmudict(f) 25 | else: 26 | entries = _parse_cmudict(file_or_path) 27 | if not keep_ambiguous: 28 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 29 | self._entries = entries 30 | 31 | 32 | def __len__(self): 33 | return len(self._entries) 34 | 35 | 36 | def lookup(self, word): 37 | '''Returns list of ARPAbet pronunciations of the given word.''' 38 | return self._entries.get(word.upper()) 39 | 40 | 41 | 42 | _alt_re = re.compile(r'\([0-9]+\)') 43 | 44 | 45 | def _parse_cmudict(file): 46 | cmudict = {} 47 | for line in file: 48 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 49 | parts = line.split(' ') 50 | word = re.sub(_alt_re, '', parts[0]) 51 | pronunciation = _get_pronunciation(parts[1]) 52 | if pronunciation: 53 | if word in cmudict: 54 | cmudict[word].append(pronunciation) 55 | else: 56 | cmudict[word] = [pronunciation] 57 | return cmudict 58 | 59 | 60 | def _get_pronunciation(s): 61 | parts = s.strip().split(' ') 62 | for part in parts: 63 | if part not in _valid_symbol_set: 64 | return None 65 | return ' '.join(parts) 66 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 
65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 | text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /codes/models/audio/tts/tacotron2/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | from models.audio.tts.tacotron2.text import cmudict 8 | 9 | _pad = '_' 10 | _punctuation = '!\'(),.:;? ' 11 | _special = '-' 12 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet 19 | -------------------------------------------------------------------------------- /codes/models/audio/vocoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/vocoders/univnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/univnet/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/vocoders/waveglow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/audio/vocoders/waveglow/__init__.py -------------------------------------------------------------------------------- /codes/models/audio/vocoders/waveglow/denoiser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from models.audio.tts.tacotron2.stft import STFT 4 | 5 | sys.path.append('tacotron2') 6 | import torch 7 | 8 | 9 | class Denoiser(torch.nn.Module): 10 | """ Removes model bias from audio produced with waveglow """ 11 | 12 | def __init__(self, waveglow, filter_length=1024, n_overlap=4, 13 | win_length=1024, mode='zeros'): 14 | super(Denoiser, self).__init__() 15 | self.stft = STFT(filter_length=filter_length, 16 | hop_length=int(filter_length/n_overlap), 17 | win_length=win_length).cuda() 18 | if mode == 'zeros': 19 | mel_input = torch.zeros( 20 | (1, 80, 88), 21 | dtype=waveglow.upsample.weight.dtype, 22 | device=waveglow.upsample.weight.device) 23 | elif mode == 'normal': 24 | mel_input = torch.randn( 25 | (1, 80, 88), 26 | dtype=waveglow.upsample.weight.dtype, 27 | device=waveglow.upsample.weight.device) 28 | else: 29 | raise Exception("Mode {} if 
not supported".format(mode)) 30 | 31 | with torch.no_grad(): 32 | bias_audio = waveglow.infer(mel_input, sigma=0.0).float() 33 | bias_spec, _ = self.stft.transform(bias_audio) 34 | 35 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) 36 | 37 | def forward(self, audio, strength=0.1): 38 | audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) 39 | audio_spec_denoised = audio_spec - self.bias_spec * strength 40 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 41 | audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) 42 | return audio_denoised 43 | -------------------------------------------------------------------------------- /codes/models/classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/classifiers/__init__.py -------------------------------------------------------------------------------- /codes/models/classifiers/torch_models.py: -------------------------------------------------------------------------------- 1 | from torchvision.models import vgg16 2 | 3 | from trainer.networks import register_model 4 | from utils.util import opt_get 5 | 6 | 7 | @register_model 8 | def register_torch_vgg16(opt_net, opt): 9 | """ return a ResNet 18 object 10 | """ 11 | return vgg16(**opt_get(opt_net, ['kwargs'], {})) 12 | -------------------------------------------------------------------------------- /codes/models/clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/clip/__init__.py -------------------------------------------------------------------------------- /codes/models/clip/clip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from trainer.networks import register_model 4 | from utils.util import opt_get 5 | 6 | 7 | def encoder_for_type(type, master_dim, enc_kwargs): 8 | from x_clip.x_clip import VisionTransformer, TextTransformer 9 | if type == 'image': 10 | # xclip_kwargs: image_size, patch_size, channels, depth, heads 11 | return VisionTransformer(dim=master_dim, **enc_kwargs) 12 | elif type == 'tokens': 13 | # xclip_kwargs: num_tokens, max_seq_len, depth, heads 14 | return TextTransformer(dim=master_dim, **enc_kwargs) 15 | raise NotImplementedError() 16 | 17 | 18 | class XClipWrapper(nn.Module): 19 | def __init__(self, 20 | master_dim=512, 21 | enc1_type='vision', 22 | enc1_kwargs={}, 23 | enc2_type='text', 24 | enc2_kwargs={}, 25 | mask_seq1_percentage=0, 26 | mask_seq2_percentage=0, 27 | **xclip_kwargs): 28 | super().__init__() 29 | self.mask_seq1_percentage = mask_seq1_percentage 30 | self.mask_seq2_percentage = mask_seq2_percentage 31 | enc1 = encoder_for_type(enc1_type, master_dim, enc1_kwargs) 32 | enc2 = encoder_for_type(enc2_type, master_dim, enc2_kwargs) 33 | xclip_kwargs['dim_text'] = master_dim 34 | xclip_kwargs['dim_image'] = master_dim 35 | xclip_kwargs['dim_latent'] = master_dim 36 | xclip_kwargs['text_encoder'] = enc1 # The first argument of forward 37 | xclip_kwargs['image_encoder'] = enc2 38 | # xclip_kwargs: 39 | # use_all_token_embeds 40 | # downsample_image_embeds 41 | # decoupled_contrastive_learning 42 | # extra_latent_projection 43 | # use_mlm 44 | from x_clip import CLIP 45 | self.clip = CLIP(**xclip_kwargs) 
46 | 47 | def forward(self, seq1, seq2, return_loss=False): 48 | seq1_mask = torch.rand_like(seq1.float()) > self.mask_seq1_percentage 49 | # TODO: add support for seq2 mask.. 50 | #seq2_mask = torch.rand_like(seq2.float()) > self.mask_seq2_percentage 51 | return self.clip(seq1, seq2, seq1_mask, return_loss=return_loss) 52 | 53 | 54 | @register_model 55 | def register_clip(opt_net, opt): 56 | return XClipWrapper(**opt_get(opt_net, ['kwargs'], {})) 57 | 58 | if __name__ == '__main__': 59 | model = XClipWrapper(enc1_type='tokens', enc2_type='tokens', 60 | enc1_kwargs={'num_tokens': 256, 'max_seq_len': 200, 'depth': 8, 'heads': 8}, 61 | enc2_kwargs={'num_tokens': 8192, 'max_seq_len': 250, 'depth': 8, 'heads': 8}) 62 | loss = model(torch.randint(0,256, (2,200)), torch.randint(0,8192, (2,250)), True) 63 | print(loss) -------------------------------------------------------------------------------- /codes/models/composable/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/composable/__init__.py -------------------------------------------------------------------------------- /codes/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/diffusion/__init__.py -------------------------------------------------------------------------------- /codes/models/diffusion/losses.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for various likelihood-based losses. These are ported from the original 3 | Ho et al. diffusion models codebase: 4 | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py 5 | """ 6 | 7 | import numpy as np 8 | 9 | import torch as th 10 | 11 | 12 | def normal_kl(mean1, logvar1, mean2, logvar2): 13 | """ 14 | Compute the KL divergence between two gaussians. 15 | 16 | Shapes are automatically broadcasted, so batches can be compared to 17 | scalars, among other use cases. 18 | """ 19 | tensor = None 20 | for obj in (mean1, logvar1, mean2, logvar2): 21 | if isinstance(obj, th.Tensor): 22 | tensor = obj 23 | break 24 | assert tensor is not None, "at least one argument must be a Tensor" 25 | 26 | # Force variances to be Tensors. Broadcasting helps convert scalars to 27 | # Tensors, but it does not work for th.exp(). 28 | logvar1, logvar2 = [ 29 | x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) 30 | for x in (logvar1, logvar2) 31 | ] 32 | 33 | return 0.5 * ( 34 | -1.0 35 | + logvar2 36 | - logvar1 37 | + th.exp(logvar1 - logvar2) 38 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2) 39 | ) 40 | 41 | 42 | def approx_standard_normal_cdf(x): 43 | """ 44 | A fast approximation of the cumulative distribution function of the 45 | standard normal. 46 | """ 47 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 48 | 49 | 50 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 51 | """ 52 | Compute the log-likelihood of a Gaussian distribution discretizing to a 53 | given image. 54 | 55 | :param x: the target images. It is assumed that this was uint8 values, 56 | rescaled to the range [-1, 1]. 57 | :param means: the Gaussian mean Tensor. 58 | :param log_scales: the Gaussian log stddev Tensor. 
59 | :return: a tensor like x of log probabilities (in nats). 60 | """ 61 | assert x.shape == means.shape == log_scales.shape 62 | centered_x = x - means 63 | inv_stdv = th.exp(-log_scales) 64 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 65 | cdf_plus = approx_standard_normal_cdf(plus_in) 66 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 67 | cdf_min = approx_standard_normal_cdf(min_in) 68 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 69 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 70 | cdf_delta = cdf_plus - cdf_min 71 | log_probs = th.where( 72 | x < -0.999, 73 | log_cdf_plus, 74 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 75 | ) 76 | assert log_probs.shape == x.shape 77 | return log_probs 78 | -------------------------------------------------------------------------------- /codes/models/image_generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/__init__.py -------------------------------------------------------------------------------- /codes/models/image_generation/glean/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/glean/__init__.py -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/Permutations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn as nn 4 | from torch.nn import functional as F 5 | 6 | from models.image_generation.srflow import thops 7 | 8 | 9 | class InvertibleConv1x1(nn.Module): 10 | def __init__(self, num_channels, LU_decomposed=False): 11 | super().__init__() 12 | w_shape = [num_channels, num_channels] 13 | w_init = np.linalg.qr(np.random.randn(*w_shape))[0].astype(np.float32) 14 | self.register_parameter("weight", nn.Parameter(torch.Tensor(w_init))) 15 | self.w_shape = w_shape 16 | self.LU = LU_decomposed 17 | 18 | def get_weight(self, input, reverse): 19 | w_shape = self.w_shape 20 | pixels = thops.pixels(input) 21 | dlogdet = torch.slogdet(self.weight)[1] * pixels 22 | if not reverse: 23 | weight = self.weight.view(w_shape[0], w_shape[1], 1, 1) 24 | else: 25 | weight = torch.inverse(self.weight.double()).float() \ 26 | .view(w_shape[0], w_shape[1], 1, 1) 27 | return weight, dlogdet 28 | def forward(self, input, logdet=None, reverse=False): 29 | """ 30 | log-det = log|abs(|W|)| * pixels 31 | """ 32 | weight, dlogdet = self.get_weight(input, reverse) 33 | if not reverse: 34 | z = F.conv2d(input, weight) 35 | if logdet is not None: 36 | logdet = logdet + dlogdet 37 | return z, logdet 38 | else: 39 | z = F.conv2d(input, weight) 40 | if logdet is not None: 41 | logdet = logdet - dlogdet 42 | return z, logdet 43 | -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/Split.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn as nn 3 | 4 | from models.image_generation.srflow import thops 5 | from models.image_generation.srflow.flow import Conv2dZeros, GaussianDiag 6 | from utils.util import opt_get 7 | 8 | 9 | class Split2d(nn.Module): 10 | def 
__init__(self, num_channels, logs_eps=0, cond_channels=0, position=None, consume_ratio=0.5, opt=None): 11 | super().__init__() 12 | 13 | self.num_channels_consume = int(round(num_channels * consume_ratio)) 14 | self.num_channels_pass = num_channels - self.num_channels_consume 15 | 16 | self.conv = Conv2dZeros(in_channels=self.num_channels_pass + cond_channels, 17 | out_channels=self.num_channels_consume * 2) 18 | self.logs_eps = logs_eps 19 | self.position = position 20 | self.gaussian_nll_weight = opt_get(opt, ['networks', 'generator', 'flow', 'gaussian_loss_weight'], 1) 21 | 22 | def split2d_prior(self, z, ft): 23 | if ft is not None: 24 | z = torch.cat([z, ft], dim=1) 25 | h = self.conv(z) 26 | return thops.split_feature(h, "cross") 27 | 28 | def exp_eps(self, logs): 29 | return torch.exp(logs) + self.logs_eps 30 | 31 | def forward(self, input, logdet=0., reverse=False, eps_std=None, eps=None, ft=None, y_onehot=None): 32 | if not reverse: 33 | # self.input = input 34 | z1, z2 = self.split_ratio(input) 35 | mean, logs = self.split2d_prior(z1, ft) 36 | 37 | eps = (z2 - mean) / self.exp_eps(logs) 38 | 39 | logdet = logdet + self.get_logdet(logs, mean, z2) 40 | 41 | # print(logs.shape, mean.shape, z2.shape) 42 | # self.eps = eps 43 | # print('split, enc eps:', eps) 44 | return z1, logdet, eps 45 | else: 46 | z1 = input 47 | mean, logs = self.split2d_prior(z1, ft) 48 | 49 | if eps is None: 50 | #print("WARNING: eps is None, generating eps untested functionality!") 51 | eps = GaussianDiag.sample(mean, logs, eps_std) 52 | #eps = GaussianDiag.sample_eps(mean.shape, eps_std) 53 | 54 | eps = eps.to(mean.device) 55 | z2 = mean + self.exp_eps(logs) * eps 56 | z = thops.cat_feature(z1, z2) 57 | 58 | logdet = logdet - self.get_logdet(logs, mean, z2) 59 | 60 | return z, logdet 61 | # return z, logdet, eps 62 | 63 | def get_logdet(self, logs, mean, z2): 64 | logdet_diff = GaussianDiag.logp(mean, logs, z2) 65 | return logdet_diff * self.gaussian_nll_weight 66 | 67 | def split_ratio(self, input): 68 | z1, z2 = input[:, :self.num_channels_pass, ...], input[:, self.num_channels_pass:, ...] 
69 | return z1, z2 -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_generation/srflow/__init__.py -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/glow_arch.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def f_conv2d_bias(in_channels, out_channels): 5 | def padding_same(kernel, stride): 6 | return [((k - 1) * s + 1) // 2 for k, s in zip(kernel, stride)] 7 | 8 | padding = padding_same([3, 3], [1, 1]) 9 | assert padding == [1, 1], padding 10 | return nn.Sequential( 11 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=[3, 3], stride=1, padding=1, 12 | bias=True)) 13 | -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/module_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | import maybe_bnb as mbnb 6 | 7 | 8 | def initialize_weights(net_l, scale=1): 9 | if not isinstance(net_l, list): 10 | net_l = [net_l] 11 | for net in net_l: 12 | for m in net.modules(): 13 | if isinstance(m, nn.Conv2d): 14 | init.kaiming_normal_(m.weight, a=0, mode='fan_in') 15 | m.weight.data *= scale # for residual block 16 | if m.bias is not None: 17 | m.bias.data.zero_() 18 | elif isinstance(m, mbnb.nn.Linear): 19 | init.kaiming_normal_(m.weight, a=0, mode='fan_in') 20 | m.weight.data *= scale 21 | if m.bias is not None: 22 | m.bias.data.zero_() 23 | elif isinstance(m, nn.BatchNorm2d): 24 | init.constant_(m.weight, 1) 25 | init.constant_(m.bias.data, 0.0) 26 | 27 | 28 | def make_layer(block, n_layers): 29 | layers = [] 30 | for _ in range(n_layers): 31 | layers.append(block()) 32 | return nn.Sequential(*layers) 33 | 34 | 35 | class ResidualBlock_noBN(nn.Module): 36 | '''Residual block w/o BN 37 | ---Conv-ReLU-Conv-+- 38 | |________________| 39 | ''' 40 | 41 | def __init__(self, nf=64): 42 | super(ResidualBlock_noBN, self).__init__() 43 | self.conv1 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True) 44 | self.conv2 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True) 45 | 46 | # initialization 47 | initialize_weights([self.conv1, self.conv2], 0.1) 48 | 49 | def forward(self, x): 50 | identity = x 51 | out = F.relu(self.conv1(x), inplace=True) 52 | out = self.conv2(out) 53 | return identity + out 54 | 55 | 56 | def flow_warp(x, flow, interp_mode='bilinear', padding_mode='zeros'): 57 | """Warp an image or feature map with optical flow 58 | Args: 59 | x (Tensor): size (N, C, H, W) 60 | flow (Tensor): size (N, H, W, 2), normal value 61 | interp_mode (str): 'nearest' or 'bilinear' 62 | padding_mode (str): 'zeros' or 'border' or 'reflection' 63 | 64 | Returns: 65 | Tensor: warped image or feature map 66 | """ 67 | assert x.size()[-2:] == flow.size()[1:3] 68 | B, C, H, W = x.size() 69 | # mesh grid 70 | grid_y, grid_x = torch.meshgrid(torch.arange(0, H), torch.arange(0, W)) 71 | grid = torch.stack((grid_x, grid_y), 2).float() # W(x), H(y), 2 72 | grid.requires_grad = False 73 | grid = grid.type_as(x) 74 | vgrid = grid + flow 75 | # scale grid to [-1,1] 76 | vgrid_x = 2.0 * vgrid[:, :, :, 
0] / max(W - 1, 1) - 1.0 77 | vgrid_y = 2.0 * vgrid[:, :, :, 1] / max(H - 1, 1) - 1.0 78 | vgrid_scaled = torch.stack((vgrid_x, vgrid_y), dim=3) 79 | output = F.grid_sample(x, vgrid_scaled, mode=interp_mode, padding_mode=padding_mode) 80 | return output 81 | -------------------------------------------------------------------------------- /codes/models/image_generation/srflow/thops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def sum(tensor, dim=None, keepdim=False): 5 | if dim is None: 6 | # sum up all dim 7 | return torch.sum(tensor) 8 | else: 9 | if isinstance(dim, int): 10 | dim = [dim] 11 | dim = sorted(dim) 12 | for d in dim: 13 | tensor = tensor.sum(dim=d, keepdim=True) 14 | if not keepdim: 15 | for i, d in enumerate(dim): 16 | tensor.squeeze_(d-i) 17 | return tensor 18 | 19 | 20 | def mean(tensor, dim=None, keepdim=False): 21 | if dim is None: 22 | # mean all dim 23 | return torch.mean(tensor) 24 | else: 25 | if isinstance(dim, int): 26 | dim = [dim] 27 | dim = sorted(dim) 28 | for d in dim: 29 | tensor = tensor.mean(dim=d, keepdim=True) 30 | if not keepdim: 31 | for i, d in enumerate(dim): 32 | tensor.squeeze_(d-i) 33 | return tensor 34 | 35 | 36 | def split_feature(tensor, type="split"): 37 | """ 38 | type = ["split", "cross"] 39 | """ 40 | C = tensor.size(1) 41 | if type == "split": 42 | return tensor[:, :C // 2, ...], tensor[:, C // 2:, ...] 43 | elif type == "cross": 44 | return tensor[:, 0::2, ...], tensor[:, 1::2, ...] 45 | 46 | 47 | def cat_feature(tensor_a, tensor_b): 48 | return torch.cat((tensor_a, tensor_b), dim=1) 49 | 50 | 51 | def pixels(tensor): 52 | return int(tensor.size(2) * tensor.size(3)) -------------------------------------------------------------------------------- /codes/models/image_generation/stylegan/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | def create_stylegan2_loss(opt_loss, env): 3 | type = opt_loss['type'] 4 | if type == 'stylegan2_divergence': 5 | import models.image_generation.stylegan.stylegan2_lucidrains as stylegan2 6 | return stylegan2.StyleGan2DivergenceLoss(opt_loss, env) 7 | elif type == 'stylegan2_pathlen': 8 | import models.image_generation.stylegan.stylegan2_lucidrains as stylegan2 9 | return stylegan2.StyleGan2PathLengthLoss(opt_loss, env) 10 | else: 11 | raise NotImplementedError -------------------------------------------------------------------------------- /codes/models/image_latents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/__init__.py -------------------------------------------------------------------------------- /codes/models/image_latents/byol/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/byol/__init__.py -------------------------------------------------------------------------------- /codes/models/image_latents/fixup_resnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/image_latents/fixup_resnet/__init__.py -------------------------------------------------------------------------------- 
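A minimal usage sketch for the `thops.py` helpers above, which the other srflow modules import as `from models.image_generation.srflow import thops` (see `Split.py` and `Permutations.py`). This is not a file from the repository; it assumes the repository's `codes/` directory is on the import path, and the tensor shape is an arbitrary assumption chosen only to illustrate how the "split" and "cross" modes of `split_feature` partition channels and how `cat_feature` recombines them:

```
# Illustrative sketch (assumed shapes, not from any repo config): channel-splitting
# behavior of thops.split_feature and recombination via thops.cat_feature.
import torch

from models.image_generation.srflow import thops

x = torch.randn(2, 8, 16, 16)  # (N, C, H, W) with an arbitrary C=8

# "split" returns two contiguous halves along the channel dim: channels 0-3 and 4-7.
z_a, z_b = thops.split_feature(x, "split")
assert z_a.shape == z_b.shape == (2, 4, 16, 16)

# "cross" interleaves instead: even channels (0,2,4,6) and odd channels (1,3,5,7).
z_even, z_odd = thops.split_feature(x, "cross")
assert z_even.shape == z_odd.shape == (2, 4, 16, 16)

# cat_feature is a plain channel-dim concatenation, so the "split" halves round-trip exactly.
assert torch.equal(thops.cat_feature(z_a, z_b), x)

# pixels() reports H * W, which InvertibleConv1x1 uses to scale its log-determinant.
assert thops.pixels(x) == 16 * 16
```

In the code above, `Split2d` uses the "cross" mode to separate its prior convolution's output into mean and log-scale halves, while `split_ratio`/`cat_feature` work with the contiguous channel layout.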
/codes/models/lucidrains/dalle/__init__.py: -------------------------------------------------------------------------------- 1 | # This directory contains some useful code from https://github.com/lucidrains/DALLE-pytorch/tree/main/dalle_pytorch -------------------------------------------------------------------------------- /codes/models/lucidrains/performer/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /codes/models/vqvae/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/models/vqvae/__init__.py -------------------------------------------------------------------------------- /codes/models/vqvae/gumbel_quantizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import einsum 5 | 6 | from utils.weight_scheduler import LinearDecayWeightScheduler 7 | import maybe_bnb as mbnb 8 | 9 | 10 | class GumbelQuantizer(nn.Module): 11 | def __init__(self, inp_dim, codebook_dim, num_tokens, straight_through=False): 12 | super().__init__() 13 | self.to_logits = nn.Conv1d(inp_dim, num_tokens, 1) 14 | # nn.Embedding 15 | self.codebook = mbnb.nn.Embedding(num_tokens, codebook_dim) 16 | self.straight_through = straight_through 17 | self.temperature_scheduler = LinearDecayWeightScheduler(10, 5000, .9, 2000) 18 | self.step = 0 19 | self.norm = SwitchNorm(num_tokens) 20 | 21 | def get_temperature(self, step): 22 | self.step = step # VERY POOR DESIGN. WHEN WILL HE EVER LEARN??? 
23 | return self.temperature_scheduler.get_weight_for_step(step) 24 | 25 | def embed_code(self, codes): 26 | return self.codebook(codes) 27 | 28 | def gumbel_softmax(self, logits, tau, dim, hard): 29 | gumbels = torch.rand_like(logits) 30 | gumbels = -torch.log(-torch.log(gumbels + 1e-8) + 1e-8) 31 | logits = (logits + gumbels) / tau # ~Gumbel(logits,tau) 32 | y_soft = F.softmax(logits, dim=dim) 33 | 34 | if hard: 35 | index = y_soft.max(dim, keepdim=True)[1] 36 | y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0) 37 | ret = y_hard - y_soft.detach() + y_soft 38 | else: 39 | ret = y_soft 40 | return ret 41 | 42 | def forward(self, h): 43 | h = h.permute(0,2,1) 44 | logits = self.to_logits(h) 45 | logits = self.gumbel_softmax(logits, tau=self.temperature_scheduler.get_weight_for_step(self.step), dim=1, hard=self.straight_through) 46 | logits = self.norm(logits) 47 | codes = logits.argmax(dim=1).flatten(1) 48 | sampled = einsum('b n l, n d -> b d l', logits, self.codebook.weight) 49 | return sampled.permute(0,2,1), 0, codes 50 | 51 | if __name__ == '__main__': 52 | j = torch.randn(8,40,1024) 53 | m = GumbelQuantizer(1024, 1024, 4096) 54 | m2 = DiscreteDecoder(1024, (512, 256), 2) 55 | l=m2(m(j)[0].permute(0,2,1)) 56 | mean = 0 57 | for ls in l: 58 | mean = mean + ls.mean() 59 | mean.backward() -------------------------------------------------------------------------------- /codes/requirements.laxed.txt: -------------------------------------------------------------------------------- 1 | # Fundamentals 2 | numpy 3 | pyyaml 4 | tb-nightly 5 | future 6 | scp 7 | tqdm 8 | matplotlib 9 | scipy 10 | munch 11 | tqdm 12 | scp 13 | tensorboard 14 | orjson 15 | einops 16 | lambda-networks 17 | mup 18 | 19 | #UI 20 | customtkinter 21 | ruamel.yaml 22 | # For image generation stuff 23 | opencv-python 24 | kornia 25 | pytorch_ssim 26 | gsa-pytorch 27 | pytorch_fid 28 | 29 | # For audio generation stuff 30 | inflect 31 | librosa 32 | Unidecode 33 | tgt 34 | pyworld 35 | audio2numpy 36 | SoundFile 37 | 38 | # For text stuff 39 | transformers 40 | tokenizers 41 | jiwer # calculating WER 42 | omegaconf 43 | 44 | # lucidrains stuff 45 | vector_quantize_pytorch 46 | linear_attention_transformer 47 | rotary-embedding-torch 48 | axial_positional_embedding 49 | g-mlp-pytorch 50 | x-clip 51 | x_transformers==1.0.4 52 | 53 | # bitsandbytes 54 | bitsandbytes 55 | lion-pytorch==0.0.7 56 | # triton==2.0.0a2 57 | -------------------------------------------------------------------------------- /codes/requirements.txt: -------------------------------------------------------------------------------- 1 | # Fundamentals 2 | numpy 3 | pyyaml 4 | tb-nightly 5 | future 6 | scp 7 | tqdm 8 | matplotlib 9 | scipy 10 | munch 11 | tqdm 12 | scp 13 | tensorboard 14 | orjson 15 | einops 16 | lambda-networks 17 | mup 18 | 19 | # For image generation stuff 20 | opencv-python 21 | kornia 22 | pytorch_ssim 23 | gsa-pytorch 24 | pytorch_fid==0.1.1 25 | 26 | # For audio generation stuff 27 | inflect==0.2.5 28 | librosa==0.6.0 29 | Unidecode==1.0.22 30 | tgt == 1.4.4 31 | pyworld == 0.2.10 32 | audio2numpy 33 | SoundFile 34 | 35 | # For text stuff 36 | transformers 37 | tokenizers 38 | jiwer # calculating WER 39 | omegaconf 40 | 41 | # lucidrains stuff 42 | vector_quantize_pytorch 43 | linear_attention_transformer 44 | rotary-embedding-torch 45 | axial_positional_embedding 46 | g-mlp-pytorch 47 | x-clip 48 | x_transformers 49 | 50 | bitsandbytes 51 | lion-pytorch==0.0.7 52 | # 
triton==2.0.0a2 53 | -------------------------------------------------------------------------------- /codes/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/gen/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/gen/use_discrete_vocoder_one_way.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import torchaudio 5 | 6 | from data.audio.unsupervised_audio_dataset import load_audio 7 | from scripts.audio.gen.speech_synthesis_utils import do_spectrogram_diffusion, \ 8 | load_discrete_vocoder_diffuser, wav_to_mel, convert_mel_to_codes 9 | from utils.audio import plot_spectrogram 10 | from utils.util import load_model_from_config 11 | 12 | 13 | def roundtrip_vocoding(dvae, vocoder, diffuser, clip, cond=None, plot_spec=False): 14 | clip = clip.unsqueeze(0) 15 | if cond is None: 16 | cond = clip 17 | else: 18 | cond = cond.unsqueeze(0) 19 | mel = wav_to_mel(clip) 20 | if plot_spec: 21 | plot_spectrogram(mel[0].cpu()) 22 | codes = convert_mel_to_codes(dvae, mel) 23 | return 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='X:\\dlas\\experiments\\train_diffusion_vocoder_with_cond_new_dvae.yml') 29 | parser.add_argument('-diffusion_model_name', type=str, help='Name of the diffusion model in opt.', default='generator') 30 | parser.add_argument('-diffusion_model_path', type=str, help='Name of the diffusion model in opt.', default='X:\\dlas\\experiments\\train_diffusion_vocoder_with_cond_new_dvae_full\\models\\6100_generator_ema.pth') 31 | parser.add_argument('-dvae_model_name', type=str, help='Name of the DVAE model in opt.', default='dvae') 32 | parser.add_argument('-input_file', type=str, help='Path to the input torch save file.', default='speech_forward_mels.pth') 33 | parser.add_argument('-cond', type=str, help='Path to the conditioning input audio file.', default='Z:\\clips\\books1\\3042_18_Holden__000000000\\00037.wav') 34 | args = parser.parse_args() 35 | 36 | print("Loading DVAE..") 37 | dvae = load_model_from_config(args.opt, args.dvae_model_name) 38 | print("Loading Diffusion Model..") 39 | diffusion = load_model_from_config(args.opt, args.diffusion_model_name, also_load_savepoint=False, load_path=args.diffusion_model_path) 40 | 41 | print("Loading data..") 42 | cond = load_audio(args.cond, 22050) 43 | if cond.shape[-1] > 44100+10000: 44 | cond = cond[:,10000:54100] 45 | cond = cond.unsqueeze(0).cuda() 46 | 47 | diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=20) 48 | inp = 
torch.load(args.input_file) 49 | codes = inp 50 | 51 | print("Performing inference..") 52 | for i, cb in enumerate(codes): 53 | roundtripped = do_spectrogram_diffusion(diffusion, dvae, diffuser, cb.unsqueeze(0).cuda(), cond, spectrogram_compression_factor=128, plt_spec=False) 54 | torchaudio.save(f'vocoded_output_sp_{i}.wav', roundtripped.squeeze(0).cpu(), 11025) -------------------------------------------------------------------------------- /codes/scripts/audio/gen/use_mel2vec_codes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | 4 | from models.audio.mel2vec import ContrastiveTrainingWrapper 5 | from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector, normalize_mel 6 | from utils.util import load_audio 7 | 8 | def collapse_codegroups(codes): 9 | codes = codes.clone() 10 | groups = codes.shape[-1] 11 | for k in range(groups): 12 | codes[:,:,k] = codes[:,:,k] * groups ** k 13 | codes = codes.sum(-1) 14 | return codes 15 | 16 | 17 | def recover_codegroups(codes, groups): 18 | codes = codes.clone() 19 | output = torch.LongTensor(codes.shape[0], codes.shape[1], groups, device=codes.device) 20 | for k in range(groups): 21 | output[:,:,k] = codes % groups 22 | codes = codes // groups 23 | return output 24 | 25 | 26 | if __name__ == '__main__': 27 | model = ContrastiveTrainingWrapper(mel_input_channels=256, inner_dim=1024, layers=24, dropout=0, mask_time_prob=0, 28 | mask_time_length=6, num_negatives=100, codebook_size=16, codebook_groups=4, 29 | disable_custom_linear_init=True, feature_producer_type='standard', 30 | freq_mask_percent=0, do_reconstruction_loss=True) 31 | model.load_state_dict(torch.load("../experiments/m2v_music2.pth")) 32 | model.eval() 33 | 34 | wav = load_audio("Y:/separated/bt-music-1/100 Hits - Running Songs 2014 CD 2/100 Hits - Running Songs 2014 Cd2 - 02 - 7Th Heaven - Ain't Nothin' Goin' On But The Rent/00001/no_vocals.wav", 22050) 35 | mel = TorchMelSpectrogramInjector({'n_mel_channels': 256, 'mel_fmax': 11000, 'filter_length': 16000, 36 | 'normalize': True, 'in': 'in', 'out': 'out'}, {})({'in': wav.unsqueeze(0)})['out'] 37 | codes = model.get_codes(mel) 38 | reconstruction = model.reconstruct(mel) 39 | 40 | torchvision.utils.save_image((normalize_mel(mel).unsqueeze(1)+1)/2, 'mel.png') 41 | torchvision.utils.save_image((normalize_mel(reconstruction).unsqueeze(1)+1)/2, 'reconstructed.png') 42 | 43 | collapsed = collapse_codegroups(codes) 44 | recovered = recover_codegroups(collapsed, 4) 45 | 46 | print(codes) -------------------------------------------------------------------------------- /codes/scripts/audio/gen/w2v_patcher.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from utils.util import load_model_from_config 4 | 5 | if __name__ == '__main__': 6 | config = "D:\\dlas\\options\\train_wav2vec_matcher.yml" 7 | model_name = "generator" 8 | model_path = "D:\dlas\experiments\train_wav2vec_matcher\models" 9 | wav_dump_path = "FIXME" 10 | 11 | model = load_model_from_config(config, model_name, also_load_savepoint=False, load_path=model_path, device='cuda').eval() 12 | w2v_logits, audio_samples = torch.load(wav_dump_path) 13 | 14 | w2v_logits_chunked = torch.chunk(w2v_logits, 32) 15 | for chunk in w2v_logits_chunked: 16 | -------------------------------------------------------------------------------- /codes/scripts/audio/gen_mel.py: -------------------------------------------------------------------------------- 
1 | import os 2 | 3 | import torch 4 | 5 | from data.util import find_files_of_type, is_audio_file 6 | from trainer.injectors.audio_injectors import MelSpectrogramInjector 7 | from utils.util import load_audio 8 | 9 | if __name__ == '__main__': 10 | path = 'C:\\Users\\jbetk\\Documents\\tmp\\some_audio' 11 | 12 | inj = MelSpectrogramInjector({'in': 'wav', 'out': 'mel', 13 | 'mel_fmax': 12000, 'sampling_rate': 22050, 'n_mel_channels': 100 14 | },{}) 15 | audio = find_files_of_type('img', path, qualifier=is_audio_file)[0] 16 | for clip in audio: 17 | if not clip.endswith('.wav'): 18 | continue 19 | wav = load_audio(clip, 24000) 20 | mel = inj({'wav': wav.unsqueeze(0)})['mel'] 21 | torch.save(mel, clip.replace('.wav', '.mel')) -------------------------------------------------------------------------------- /codes/scripts/audio/mel_bin_norm_compute.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import yaml 5 | from tqdm import tqdm 6 | 7 | from data import create_dataset, create_dataloader 8 | from scripts.audio.gen.speech_synthesis_utils import wav_to_univnet_mel 9 | from utils.options import Loader 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='D:\\dlas\\options\\train_diffusion_tts9.yml') 14 | parser.add_argument('-key', type=str, help='Key where audio data is stored', default='wav') 15 | parser.add_argument('-num_batches', type=int, help='Number of batches to collect to compute the norm', default=50000) 16 | args = parser.parse_args() 17 | 18 | with open(args.opt, mode='r') as f: 19 | opt = yaml.load(f, Loader=Loader) 20 | dopt = opt['datasets']['train'] 21 | dopt['phase'] = 'train' 22 | dataset, collate = create_dataset(dopt, return_collate=True) 23 | dataloader = create_dataloader(dataset, dopt, collate_fn=collate, shuffle=True) 24 | 25 | mel_means = [] 26 | mel_max = -999999999 27 | mel_min = 999999999 28 | mel_stds = [] 29 | mel_vars = [] 30 | for batch in tqdm(dataloader): 31 | if len(mel_means) > args.num_batches: 32 | break 33 | clip = batch[args.key].cuda() 34 | for b in range(clip.shape[0]): 35 | wav = clip[b].unsqueeze(0) 36 | wav = wav[:, :, :batch[f'{args.key}_lengths'][b]] 37 | mel = wav_to_univnet_mel(clip) # Caution: make sure this isn't already normed. 
38 | mel_means.append(mel.mean((0,2)).cpu()) 39 | mel_max = max(mel.max().item(), mel_max) 40 | mel_min = min(mel.min().item(), mel_min) 41 | mel_stds.append(mel.std((0,2)).cpu()) 42 | mel_vars.append(mel.var((0,2)).cpu()) 43 | mel_means = torch.stack(mel_means).mean(0) 44 | mel_stds = torch.stack(mel_stds).mean(0) 45 | mel_vars = torch.stack(mel_vars).mean(0) 46 | torch.save((mel_means,mel_max,mel_min,mel_stds,mel_vars), 'univnet_mel_norms.pth') -------------------------------------------------------------------------------- /codes/scripts/audio/play_with_spectral_representations.py: -------------------------------------------------------------------------------- 1 | import torchvision.utils 2 | 3 | from utils.music_utils import music2mel, music2cqt 4 | from utils.util import load_audio 5 | 6 | if __name__ == '__main__': 7 | clip = load_audio('Y:\\split\\yt-music-eval\\00001.wav', 22050) 8 | mel = music2mel(clip) 9 | cqt = music2cqt(clip) 10 | torchvision.utils.save_image((mel.unsqueeze(1) + 1) / 2, 'mel.png') 11 | torchvision.utils.save_image((cqt.unsqueeze(1) + 1) / 2, 'cqt.png') 12 | -------------------------------------------------------------------------------- /codes/scripts/audio/prep_music/demucs_notes.txt: -------------------------------------------------------------------------------- 1 | My custom demucs library is used for batch source separation: 2 | https://github.com/neonbjb/demucs 3 | 4 | ``` 5 | conda activate demucs 6 | python setup.py install 7 | CUDA_VISIBLE_DEVICES=0 python -m demucs /y/split/bt-music-5 --out=/y/separated/bt-music-5 --num_workers=2 --device cuda --two-stems=vocals 8 | ``` 9 | 10 | Example usage of generate_long_cheaters and generate_long_mels, post demucs: 11 | 12 | ``` 13 | CUDA_VISIBLE_DEVICES=0 python generate_long_mels.py --path=/y/separated/mpm/1 --progress_file=/y/separated/large_mels/mpm/already_processed.txt \ 14 | --output_path=/y/separated/large_mels/mpm/1 --num_threads=2 15 | 16 | CUDA_VISIBLE_DEVICES=2 python generate_long_cheaters.py --path=/y/separated/large_mels/mpm/3 --progress_file=/y/separated/large_mel_cheaters/mpm/already_processed.txt \ 17 | --output_path=/y/separated/large_mel_cheaters/mpm/3 --num_threads=1 18 | ``` -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/preparation/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/combine_phonetic_and_text.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | if __name__ == '__main__': 4 | basepath = 'Y:\\bigasr_dataset\\hifi_tts' 5 | 6 | english_file = os.path.join(basepath, 'transcribed-oco-realtext.tsv') 7 | if not os.path.exists(english_file): 8 | english_file = os.path.join(basepath, 'transcribed-oco.tsv') 9 | phoneme_file = os.path.join(basepath, 'transcribed-phoneme-oco.tsv') 10 | 11 | texts = {} 12 | with open(english_file, 'r', encoding='utf-8') as f: 13 | for line in f.readlines(): 14 | spl = line.split('\t') 15 | if len(spl) == 3: 16 | text, p, _ = spl 17 | texts[p] = text 18 | else: 19 | print(f'Error processing line {line}') 20 | 21 | with open(phoneme_file, 'r', encoding='utf-8') as f: 22 | wf = open(os.path.join(basepath, 'transcribed-phoneme-english-oco.tsv'), 'w', 
encoding='utf-8') 23 | for line in f.readlines(): 24 | spl = line.split('\t') 25 | if len(spl) == 3: 26 | _, p, codes = spl 27 | codes = codes.strip() 28 | if p not in texts: 29 | print(f'Could not find the text for {p}') 30 | continue 31 | wf.write(f'{texts[p]}\t{p}\t{codes}\n') 32 | wf.close() 33 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/filter_clips_with_no_hifreq_data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | 4 | from data.audio.unsupervised_audio_dataset import load_audio 5 | from scripts.do_to_files import do_to_files 6 | 7 | 8 | def get_spec_mags(clip): 9 | stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True) 10 | stft = stft[0, -2000:, :] 11 | return (stft.real ** 2 + stft.imag ** 2).sqrt() 12 | 13 | 14 | def filter_no_hifreq_data(path, output_path): 15 | clip = load_audio(path, 22050) 16 | if clip.shape[-1] < 22050: 17 | return 18 | stft = get_spec_mags(clip) 19 | if stft.mean() < .08: 20 | with open(output_path, 'a') as o: 21 | o.write(f'{path}\n') 22 | 23 | if __name__ == '__main__': 24 | do_to_files(filter_no_hifreq_data) -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/gen_dvae_codes.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from tqdm import tqdm 5 | 6 | from scripts.audio.gen.speech_synthesis_utils import load_speech_dvae, wav_to_mel 7 | 8 | if __name__ == '__main__': 9 | input_folder = 'C:\\Users\\James\\Downloads\\lex2\\lexfridman_training_mp3' 10 | output_folder = 'C:\\Users\\James\\Downloads\\lex2\\quantized' 11 | 12 | params = { 13 | 'mode': 'unsupervised_audio', 14 | 'path': [input_folder], 15 | 'cache_path': f'{input_folder}/cache.pth', 16 | 'sampling_rate': 22050, 17 | 'pad_to_samples': 441000, 18 | 'resample_clip': False, 19 | 'extra_samples': 0, 20 | 'phase': 'train', 21 | 'n_workers': 2, 22 | 'batch_size': 64, 23 | } 24 | from data import create_dataset, create_dataloader 25 | os.makedirs(output_folder, exist_ok=True) 26 | 27 | ds = create_dataset(params) 28 | dl = create_dataloader(ds, params) 29 | 30 | dvae = load_speech_dvae().cuda() 31 | with torch.no_grad(): 32 | for batch in tqdm(dl): 33 | audio = batch['clip'].cuda() 34 | mel = wav_to_mel(audio) 35 | codes = dvae.get_codebook_indices(mel) 36 | for i in range(audio.shape[0]): 37 | c = codes[i, :batch['clip_lengths'][i]//1024+4] # +4 seems empirically to be a good clipping point - it seems to preserve the termination codes. 
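                # Rough arithmetic behind the slice above (inferred from the expression itself, not from the dVAE config):
                # the dVAE emits on the order of one code per 1024 audio samples, so a 1-second clip at 22050Hz keeps
                # 22050//1024 + 4 = 25 codes; the +4 padding retains the trailing termination codes mentioned above.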
38 | fn = batch['path'][i] 39 | outp = os.path.join(output_folder, os.path.relpath(fn, input_folder) + ".pth") 40 | os.makedirs(os.path.dirname(outp), exist_ok=True) 41 | torch.save(c.tolist(), outp) 42 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | from subprocess import Popen 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--path', type=str, help='Path to search for files') 9 | parser.add_argument('--output_path', type=str, help='Path for output files') 10 | args = parser.parse_args() 11 | 12 | cmds = [ 13 | f"scripts/audio/preparation/phase_1_split_files.py --path={args.path} --progress_file={args.output_path}_t1/progress.txt --num_threads=6 --output_path={args.output_path}_t1", 14 | f"scripts/audio/preparation/phase_2_sample_and_filter.py --path={args.output_path}_t1 --progress_file={args.output_path}/progress.txt --num_threads=6 --output_path={args.output_path}", 15 | f"scripts/audio/preparation/phase_3_generate_similarities.py --path={args.output_path} --num_workers=4", 16 | ] 17 | os.makedirs(args.output_path, exist_ok=True) 18 | os.makedirs(args.output_path + "_t1", exist_ok=True) 19 | 20 | for cmd in cmds: 21 | p = Popen("python " + cmd, shell=True) 22 | p.wait() 23 | 24 | shutil.rmtree(args.output_path + "_t1") 25 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/process_spleeter_filter_outputs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('input', metavar='in', type=str) 9 | parser.add_argument('basis', metavar='basis', type=str) 10 | parser.add_argument('garbage', metavar='garbage', type=str) 11 | args = parser.parse_args() 12 | print(f"Moving files from {args.input} to {args.garbage}") 13 | os.makedirs(args.garbage, exist_ok=True) 14 | 15 | with open(args.input) as f: 16 | lines = f.readlines() 17 | for line in tqdm(lines): 18 | line = line.strip() 19 | assert args.basis in line 20 | movefile = os.path.join(args.garbage, line.replace(args.basis, '')[1:]) 21 | print(f'{line} -> {movefile}') 22 | os.makedirs(os.path.dirname(movefile), exist_ok=True) 23 | shutil.move(line, movefile) 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/save_mels_to_disk.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy 5 | import torch 6 | from spleeter.audio.adapter import AudioAdapter 7 | from tqdm import tqdm 8 | 9 | from data.util import find_audio_files 10 | # Uses pydub to process a directory of audio files, splitting them into clips at points where it detects a small amount 11 | # of silence. 
12 | from trainer.injectors.base_injectors import MelSpectrogramInjector 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--path') 18 | args = parser.parse_args() 19 | files = find_audio_files(args.path, include_nonwav=True) 20 | mel_inj = MelSpectrogramInjector({'in':'in', 'out':'out'}, {}) 21 | audio_loader = AudioAdapter.default() 22 | for e, wav_file in enumerate(tqdm(files)): 23 | if e < 0: 24 | continue 25 | print(f"Processing {wav_file}..") 26 | outfile = f'{wav_file}.npz' 27 | if os.path.exists(outfile): 28 | continue 29 | 30 | try: 31 | wave, sample_rate = audio_loader.load(wav_file, sample_rate=22050) 32 | wave = torch.tensor(wave)[:,0].unsqueeze(0) 33 | wave = wave / wave.abs().max() 34 | except: 35 | print(f"Error with {wav_file}") 36 | continue 37 | 38 | inj = mel_inj({'in': wave}) 39 | numpy.savez_compressed(outfile, inj['out'].numpy()) 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | from spleeter.separator import Separator 5 | from torch.utils.data import DataLoader 6 | from tqdm import tqdm 7 | 8 | from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--path') 14 | parser.add_argument('--out') 15 | parser.add_argument('--resume', default=None) 16 | parser.add_argument('--partition_size', default=None) 17 | parser.add_argument('--partition', default=None) 18 | args = parser.parse_args() 19 | 20 | src_dir = args.path 21 | out_file = args.out 22 | output_sample_rate=22050 23 | resume_file = args.resume 24 | 25 | loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate, 26 | max_duration=10, partition=args.partition, partition_size=args.partition_size, 27 | resume=resume_file), batch_size=1, num_workers=1) 28 | 29 | separator = Separator('spleeter:2stems') 30 | unacceptable_files = open(out_file, 'a') 31 | for batch in tqdm(loader): 32 | audio, files, ends = batch['audio'], batch['files'], batch['ends'] 33 | sep = separator.separate(audio.squeeze(0).numpy()) 34 | vocals = sep['vocals'] 35 | bg = sep['accompaniment'] 36 | start = 0 37 | for path, end in zip(files, ends): 38 | vmax = np.abs(vocals[start:end]).mean() 39 | bmax = np.abs(bg[start:end]).mean() 40 | start = end 41 | 42 | # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough. 
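            # vmax/bmax is the mean vocal magnitude over the mean accompaniment magnitude for this clip. Unlike the
            # splitter scripts later in this directory (which keep clips with ratio >= 25), this filter records the
            # *failures*: any clip whose ratio falls below the threshold is appended to the unacceptable-files list.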
43 | ratio = vmax / (bmax+.0000001) 44 | if ratio < 18: # These values were derived empirically 45 | unacceptable_files.write(f'{path[0]}\n') 46 | unacceptable_files.flush() 47 | 48 | unacceptable_files.close() 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/spleeter_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/scripts/audio/preparation/spleeter_utils/__init__.py -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/spleeter_utils/spleeter_dataset.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | 3 | import numpy as np 4 | 5 | from spleeter.audio.adapter import AudioAdapter 6 | from torch.utils.data import Dataset 7 | 8 | from data.util import find_audio_files 9 | 10 | 11 | class SpleeterDataset(Dataset): 12 | def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None): 13 | self.batch_sz = batch_sz 14 | self.max_duration = max_duration 15 | self.files = find_audio_files(src_dir, include_nonwav=True) 16 | self.sample_rate = sample_rate 17 | 18 | # Partition files if needed. 19 | if partition_size is not None: 20 | psz = int(partition_size) 21 | prt = int(partition) 22 | self.files = self.files[prt * psz:(prt + 1) * psz] 23 | 24 | # Find the resume point and carry on from there. 25 | if resume is not None: 26 | for i, f in enumerate(self.files): 27 | if resume in f: 28 | break 29 | assert i < len(self.files) 30 | self.files = self.files[i:] 31 | self.loader = AudioAdapter.default() 32 | 33 | def __len__(self): 34 | return ceil(len(self.files) / self.batch_sz) 35 | 36 | def __getitem__(self, item): 37 | item = item * self.batch_sz 38 | wavs = None 39 | files = [] 40 | ends = [] 41 | for k in range(self.batch_sz): 42 | ind = k+item 43 | if ind >= len(self.files): 44 | break 45 | 46 | #try: 47 | wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate) 48 | assert sr == 22050 49 | # Get rid of all channels except one. 50 | if wav.shape[1] > 1: 51 | wav = wav[:, 0] 52 | 53 | if wavs is None: 54 | wavs = wav 55 | else: 56 | wavs = np.concatenate([wavs, wav]) 57 | ends.append(wavs.shape[0]) 58 | files.append(self.files[ind]) 59 | #except: 60 | # print(f'Error loading {self.files[ind]}') 61 | return { 62 | 'audio': wavs, 63 | 'files': files, 64 | 'ends': ends 65 | } -------------------------------------------------------------------------------- /codes/scripts/audio/preparation/split_on_silence.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | from pydub import AudioSegment 5 | from pydub.exceptions import CouldntDecodeError 6 | from pydub.silence import split_on_silence 7 | from data.util import find_audio_files 8 | from tqdm import tqdm 9 | 10 | 11 | # Uses pydub to process a directory of audio files, splitting them into clips at points where it detects a small amount 12 | # of silence. 
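# The pydub knobs used below: min_silence_len (ms of silence required before a cut), silence_thresh (dBFS level
# treated as silence), seek_step and keep_silence. Chunks shorter than 2 seconds or longer than 20 seconds are
# discarded, and survivors are exported as mono mp3s into one output folder per source file.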
13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--path') 16 | parser.add_argument('--out') 17 | args = parser.parse_args() 18 | minimum_duration = 2 19 | maximum_duration = 20 20 | files = find_audio_files(args.path, include_nonwav=True) 21 | for e, wav_file in enumerate(tqdm(files)): 22 | print(f"Processing {wav_file}..") 23 | outdir = os.path.join(args.out, f'{e}_{os.path.basename(wav_file[:-4])}').replace('.', '').strip() 24 | os.makedirs(outdir, exist_ok=True) 25 | 26 | try: 27 | speech = AudioSegment.from_file(wav_file) 28 | except CouldntDecodeError as e: 29 | print(e) 30 | continue 31 | chunks = split_on_silence(speech, min_silence_len=400, silence_thresh=-40, 32 | seek_step=100, keep_silence=50) 33 | 34 | for i in range(0, len(chunks)): 35 | if chunks[i].duration_seconds < minimum_duration or chunks[i].duration_seconds > maximum_duration: 36 | continue 37 | chunks[i].export(f"{outdir}/{i:05d}.mp3", format='mp3', parameters=["-ac", "1"]) 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /codes/scripts/audio/random_mp3_splitter.py: -------------------------------------------------------------------------------- 1 | from scipy.io import wavfile 2 | from spleeter.separator import Separator 3 | from tqdm import tqdm 4 | 5 | from data.util import find_audio_files 6 | import os.path as osp 7 | from spleeter.audio.adapter import AudioAdapter 8 | import numpy as np 9 | 10 | 11 | if __name__ == '__main__': 12 | src_dir = 'P:\\Audiobooks-Podcasts' 13 | #src_dir = 'E:\\audio\\books' 14 | output_dir = 'D:\\data\\audio\\misc-split' 15 | output_dir_lq = 'D:\\data\\audio\\misc-split-with-bg' 16 | output_dir_garbage = 'D:\\data\\audio\\misc-split-garbage' 17 | #output_dir = 'E:\\audio\\books-clips' 18 | clip_length = 5 # In seconds 19 | sparsity = .1 # Only this proportion of the total clips are extracted as wavs. 20 | output_sample_rate=22050 21 | 22 | audio_loader = AudioAdapter.default() 23 | separator = Separator('spleeter:2stems') 24 | files = find_audio_files(src_dir, include_nonwav=True) 25 | for e, file in enumerate(tqdm(files)): 26 | if e < 1092: 27 | continue 28 | file_basis = osp.relpath(file, src_dir)\ 29 | .replace('/', '_')\ 30 | .replace('\\', '_')\ 31 | .replace('.', '_')\ 32 | .replace(' ', '_')\ 33 | .replace('!', '_')\ 34 | .replace(',', '_') 35 | if len(file_basis) > 100: 36 | file_basis = file_basis[:100] 37 | try: 38 | wave, sample_rate = audio_loader.load(file, sample_rate=output_sample_rate) 39 | except: 40 | print(f"Error with {file}") 41 | continue 42 | 43 | #if len(wave.shape) < 2: 44 | # continue 45 | 46 | # Calculate how much data we need to extract for each clip. 47 | clip_sz = sample_rate * clip_length 48 | interval = int(sample_rate * (clip_length / sparsity)) 49 | i = 0 50 | while (i+clip_sz) < wave.shape[0]: 51 | clip = wave[i:i+clip_sz] 52 | sep = separator.separate(clip) 53 | vocals = sep['vocals'] 54 | bg = sep['accompaniment'] 55 | vmax = np.abs(vocals).mean() 56 | bmax = np.abs(bg).mean() 57 | 58 | # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough. 59 | ratio = vmax / (bmax+.0000001) 60 | if ratio >= 25: # These values were derived empirically 61 | od = output_dir 62 | os = clip 63 | elif ratio >= 1: 64 | od = output_dir_lq 65 | os = vocals 66 | else: 67 | od = output_dir_garbage 68 | os = vocals 69 | 70 | # Strip out channels. 
71 | if len(os.shape) > 1: 72 | os = os[:, 0] # Just use the first channel. 73 | 74 | wavfile.write(osp.join(od, f'{e}_{file_basis}_{i}.wav'), output_sample_rate, os) 75 | i = i + interval 76 | -------------------------------------------------------------------------------- /codes/scripts/audio/spleeter_split_voice_and_background.py: -------------------------------------------------------------------------------- 1 | from scipy.io import wavfile 2 | from spleeter.separator import Separator 3 | from tqdm import tqdm 4 | ''' 5 | Uses a model configuration to load a classifier and iterate through a dataset, binning each class into it's own 6 | folder. 7 | ''' 8 | 9 | from data.util import find_audio_files 10 | import os 11 | import os.path as osp 12 | from spleeter.audio.adapter import AudioAdapter 13 | import numpy as np 14 | 15 | 16 | # Uses spleeter_utils to divide audio clips into one of two bins: 17 | # 1. Audio has little to no background noise, saved to "output_dir" 18 | # 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg" 19 | if __name__ == '__main__': 20 | src_dir = 'F:\\split\\joe_rogan' 21 | output_dir = 'F:\\split\\cleaned\\joe_rogan' 22 | output_dir_bg = 'F:\\split\\background-noise\\joe_rogan' 23 | output_sample_rate=22050 24 | 25 | os.makedirs(output_dir_bg, exist_ok=True) 26 | os.makedirs(output_dir, exist_ok=True) 27 | 28 | audio_loader = AudioAdapter.default() 29 | separator = Separator('spleeter:2stems') 30 | files = find_audio_files(src_dir, include_nonwav=True) 31 | for e, file in enumerate(tqdm(files)): 32 | #if e < 406500: 33 | # continue 34 | file_basis = osp.relpath(file, src_dir)\ 35 | .replace('/', '_')\ 36 | .replace('\\', '_')\ 37 | .replace('.', '_')\ 38 | .replace(' ', '_')\ 39 | .replace('!', '_')\ 40 | .replace(',', '_') 41 | if len(file_basis) > 100: 42 | file_basis = file_basis[:100] 43 | try: 44 | wave, sample_rate = audio_loader.load(file, sample_rate=output_sample_rate) 45 | except: 46 | print(f"Error with {file}") 47 | continue 48 | 49 | sep = separator.separate(wave) 50 | vocals = sep['vocals'] 51 | bg = sep['accompaniment'] 52 | vmax = np.abs(vocals).mean() 53 | bmax = np.abs(bg).mean() 54 | 55 | # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough. 56 | ratio = vmax / (bmax+.0000001) 57 | if ratio >= 25: # These values were derived empirically 58 | od = output_dir 59 | os = wave 60 | elif ratio <= 1: 61 | od = output_dir_bg 62 | os = bg 63 | else: 64 | continue 65 | 66 | # Strip out channels. 67 | if len(os.shape) > 1: 68 | os = os[:, 0] # Just use the first channel. 
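        # The name `os` is being reused here for the selected waveform, which shadows the os module imported above
        # (tolerable only because the module is not needed again inside this loop). Note also that the wavfile.write
        # call below targets a per-clip subdirectory (od/<file_basis>/) that the os.makedirs calls at the top never
        # create, so that folder must already exist; random_mp3_splitter.py sidesteps this by folding file_basis
        # into a flat filename instead.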
69 | 70 | wavfile.write(osp.join(od, file_basis, f'{e}.wav'), output_sample_rate, os) 71 | -------------------------------------------------------------------------------- /codes/scripts/audio/test_audio_similarity.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from data.util import is_wav_file, find_files_of_type 7 | from models.audio.audio_resnet import resnet50 8 | from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch 9 | from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict 10 | 11 | if __name__ == '__main__': 12 | window = 48000 13 | root_path = 'D:\\tmp\\clips' 14 | paths = find_files_of_type('img', root_path, qualifier=is_wav_file)[0] 15 | clips = [] 16 | for path in paths: 17 | clip, sr = load_wav_to_torch(os.path.join(root_path, path)) 18 | if len(clip.shape) > 1: 19 | clip = clip[:,0] 20 | clip = clip[:window].unsqueeze(0) 21 | clip = clip / 32768.0 # Normalize 22 | #clip = clip + torch.rand_like(clip) * .03 # Noise (this is how the model was trained) 23 | assert sr == 24000 24 | clips.append(clip) 25 | clips = torch.stack(clips, dim=0) 26 | 27 | resnet = resnet50() 28 | sd = torch.load('../experiments/train_byol_audio_clips/models/8000_generator.pth') 29 | sd = extract_byol_model_from_state_dict(sd) 30 | resnet.load_state_dict(sd) 31 | embedding = resnet(clips, return_pool=True) 32 | 33 | for i, path in enumerate(paths): 34 | print(f'Using a baseline of {path}..') 35 | for j, cpath in enumerate(paths): 36 | if i == j: 37 | continue 38 | l2 = F.mse_loss(embedding[j], embedding[i]) 39 | print(f'Compared to {cpath}: {l2}') 40 | 41 | -------------------------------------------------------------------------------- /codes/scripts/audio/test_audio_speech_recognition.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import logging 3 | import random 4 | import argparse 5 | 6 | import utils 7 | import utils.options as option 8 | import utils.util as util 9 | from models.audio.tts.tacotron2 import sequence_to_text 10 | from trainer.ExtensibleTrainer import ExtensibleTrainer 11 | from data import create_dataset, create_dataloader 12 | from tqdm import tqdm 13 | import torch 14 | import numpy as np 15 | from scipy.io import wavfile 16 | 17 | 18 | def forward_pass(model, data, output_dir, opt, b): 19 | with torch.no_grad(): 20 | model.feed_data(data, 0) 21 | model.test() 22 | 23 | if 'real_text' in opt['eval'].keys(): 24 | real = data[opt['eval']['real_text']][0] 25 | print(f'{b} Real text: "{real}"') 26 | 27 | pred_seq = model.eval_state[opt['eval']['gen_text']][0] 28 | pred_text = [sequence_to_text(ts) for ts in pred_seq] 29 | audio = model.eval_state[opt['eval']['audio']][0].cpu().numpy() 30 | wavfile.write(osp.join(output_dir, f'{b}_clip.wav'), 22050, audio) 31 | for i, text in enumerate(pred_text): 32 | print(f'{b} Predicted text {i}: "{text}"') 33 | 34 | 35 | if __name__ == "__main__": 36 | # Set seeds 37 | torch.manual_seed(5555) 38 | random.seed(5555) 39 | np.random.seed(5555) 40 | 41 | #### options 42 | torch.backends.cudnn.benchmark = True 43 | want_metrics = False 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_gpt_asr_mass.yml') 46 | opt = option.parse(parser.parse_args().opt, is_train=False) 47 | opt = option.dict_to_nonedict(opt) 48 | utils.util.loaded_options 
= opt 49 | 50 | util.mkdirs( 51 | (path for key, path in opt['path'].items() 52 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) 53 | util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO, 54 | screen=True, tofile=True) 55 | logger = logging.getLogger('base') 56 | logger.info(option.dict2str(opt)) 57 | 58 | test_loaders = [] 59 | for phase, dataset_opt in sorted(opt['datasets'].items()): 60 | test_set, collate_fn = create_dataset(dataset_opt, return_collate=True) 61 | test_loader = create_dataloader(test_set, dataset_opt, collate_fn=collate_fn) 62 | logger.info('Number of test texts in [{:s}]: {:d}'.format(dataset_opt['name'], len(test_set))) 63 | test_loaders.append(test_loader) 64 | 65 | model = ExtensibleTrainer(opt) 66 | 67 | batch = 0 68 | for test_loader in test_loaders: 69 | dataset_dir = opt['path']['results_root'] 70 | util.mkdir(dataset_dir) 71 | 72 | tq = tqdm(test_loader) 73 | for data in tq: 74 | forward_pass(model, data, dataset_dir, opt, batch) 75 | batch += 1 76 | 77 | -------------------------------------------------------------------------------- /codes/scripts/audio/use_vocoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from scipy.io import wavfile 3 | 4 | from models.audio.vocoders.waveglow.waveglow import WaveGlow 5 | 6 | 7 | class Vocoder: 8 | def __init__(self): 9 | self.model = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8, n_early_size=2, n_early_every=4, WN_config={'n_layers': 8, 'n_channels': 256, 'kernel_size': 3}) 10 | sd = torch.load('../experiments/waveglow_256channels_universal_v5.pth') 11 | self.model.load_state_dict(sd) 12 | self.model = self.model.cpu() 13 | self.model.eval() 14 | 15 | def transform_mel_to_audio(self, mel): 16 | if len(mel.shape) == 2: # Assume it's missing the batch dimension and fix that. 17 | mel = mel.unsqueeze(0) 18 | with torch.no_grad(): 19 | return self.model.infer(mel) 20 | 21 | 22 | if __name__ == '__main__': 23 | vocoder = Vocoder() 24 | m = torch.load('C:\\Users\\jbetk\\Documents\\tmp\\some_audio\\00008.mel').cpu() 25 | wav = vocoder.transform_mel_to_audio(m) 26 | wavfile.write(f'0.wav', 22050, wav[0].cpu().numpy()) -------------------------------------------------------------------------------- /codes/scripts/audio/word_error_rate.py: -------------------------------------------------------------------------------- 1 | import Levenshtein 2 | from jiwer import wer, compute_measures 3 | import torch 4 | from tqdm import tqdm 5 | 6 | from data.audio.voice_tokenizer import VoiceBpeTokenizer 7 | 8 | 9 | def load_truths(file): 10 | niltok = VoiceBpeTokenizer(None) 11 | out = {} 12 | with open(file, 'r', encoding='utf-8') as f: 13 | for line in f.readlines(): 14 | spl = line.split('|') 15 | if len(spl) != 2: 16 | print(spl) 17 | continue 18 | path, truth = spl 19 | #path = path.replace('wav/', '') 20 | # This preprocesses the truth data in the same way that training data is processed: removing punctuation, all lowercase, removing unnecessary 21 | # whitespace, and applying "english cleaners", which convert words like "mrs" to "missus" and such. 
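            # Illustrative only (the exact output depends on the tokenizer's cleaner rules): a truth line such as
            # "Mrs. Smith said,  'Hello!'" would come out roughly as "missus smith said hello".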
22 | truth = niltok.preprocess_text(truth) 23 | out[path] = truth 24 | return out 25 | 26 | 27 | if __name__ == '__main__': 28 | inference_tsv = 'results.tsv' 29 | libri_base = 'y:\\bigasr_dataset/librispeech/test_clean/test_clean.txt' 30 | 31 | # Pre-process truth values 32 | truths = load_truths(libri_base) 33 | 34 | niltok = VoiceBpeTokenizer(None) 35 | ground_truths = [] 36 | hypotheses = [] 37 | with open(inference_tsv, 'r') as tsv_file: 38 | tsv = tsv_file.read().splitlines() 39 | for line in tqdm(tsv): 40 | sentence_pred, wav = line.split('\t') 41 | hypotheses.append(niltok.preprocess_text(sentence_pred)) 42 | ground_truths.append(truths[wav]) 43 | wer = wer(ground_truths, hypotheses)*100 44 | print(f"WER: {wer}") 45 | -------------------------------------------------------------------------------- /codes/scripts/byol/byol_extract_wrapped_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def extract_byol_model_from_state_dict(sd): 5 | wrap_key = 'online_encoder.net.' 6 | sdo = {} 7 | for k,v in sd.items(): 8 | if wrap_key in k: 9 | sdo[k.replace(wrap_key, '')] = v 10 | return sdo 11 | 12 | if __name__ == '__main__': 13 | pretrained_path = '../../../experiments/uresnet_pixpro4_imgset.pth' 14 | output_path = '../../../experiments/uresnet_pixpro4_imgset.pth' 15 | 16 | sd = torch.load(pretrained_path) 17 | sd = extract_byol_model_from_state_dict(sd) 18 | 19 | #model = SpineNet('49', in_channels=3, use_input_norm=True).to('cuda') 20 | #model.load_state_dict(sdo, strict=True) 21 | 22 | print("Validation succeeded, dumping state dict to output path.") 23 | torch.save(sdo, output_path) -------------------------------------------------------------------------------- /codes/scripts/do_to_files.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os 4 | import pathlib 5 | from multiprocessing.pool import ThreadPool 6 | 7 | from tqdm import tqdm 8 | 9 | 10 | ''' 11 | Helper function for scripts that iterate over large sets of files. Defines command-line arguments 12 | for operating over a large set of files, then handles setting up a worker queue system to operate 13 | on those files. You need to provide your own process_file_fn. 
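A typical invocation of a script built on this helper (paths are hypothetical):

    python filter_clips_with_no_hifreq_data.py --path /data/clips --glob "*.wav" --out flagged.txt --num_workers 4

--num_workers must always be supplied, since it is cast with int() unconditionally below.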
14 | 15 | process_file_fn expected signature: 16 | (path, output_path) 17 | ''' 18 | def do_to_files(process_file_fn): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--path') 21 | parser.add_argument('--glob') 22 | parser.add_argument('--out') 23 | parser.add_argument('--resume') 24 | parser.add_argument('--num_workers') 25 | 26 | args = parser.parse_args() 27 | src = args.path 28 | glob = args.glob 29 | out = args.out 30 | resume = args.resume 31 | num_workers = int(args.num_workers) 32 | 33 | path = pathlib.Path(src) 34 | files = path.rglob(glob) 35 | files = [str(f) for f in files] 36 | files = files[resume:] 37 | pfn = functools.partial(process_file_fn, output_path=out) 38 | if num_workers > 0: 39 | with ThreadPool(num_workers) as pool: 40 | list(tqdm(pool.imap(pfn, files), total=len(files))) 41 | else: 42 | for f in tqdm(files): 43 | pfn(f) 44 | -------------------------------------------------------------------------------- /codes/scripts/folderize_imagenet_val.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | 3 | import torch 4 | import os 5 | import shutil 6 | 7 | if __name__ == '__main__': 8 | index_map_file = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\imagenet_index_to_train_folder_name_map.pth' 9 | ground_truth = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\validation_ground_truth.txt' 10 | val_path = 'F:\\4k6k\\datasets\\images\\imagenet_2017\\val' 11 | 12 | index_map = torch.load(index_map_file) 13 | 14 | for folder in index_map.values(): 15 | os.makedirs(os.path.join(val_path, folder), exist_ok=True) 16 | 17 | gtfile = open(ground_truth, 'r') 18 | gtids = [] 19 | for line in gtfile: 20 | gtids.append(int(line.strip())) 21 | gtfile.close() 22 | 23 | for i, img_file in enumerate(glob(os.path.join(val_path, "*.JPEG"))): 24 | shutil.move(img_file, os.path.join(val_path, index_map[gtids[i]], 25 | os.path.basename(img_file))) 26 | print("Done!") 27 | -------------------------------------------------------------------------------- /codes/scripts/hugging_face_hub_upload.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | """ 3 | Utility script for uploading model weights to the HF hub 4 | """ 5 | 6 | ''' 7 | model = Wav2VecWrapper(vocab_size=148, basis_model='facebook/wav2vec2-large-robust-ft-libri-960h', freeze_transformer=True, checkpointing_enabled=False) 8 | weights = torch.load('D:\\dlas\\experiments\\train_wav2vec_mass_large2\\models\\22500_wav2vec.pth') 9 | model.load_state_dict(weights) 10 | model.w2v.save_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli", push_to_hub=True) 11 | ''' 12 | 13 | # Build tokenizer vocab 14 | #mapping = tacotron_symbol_mapping() 15 | #print(json.dumps(mapping)) -------------------------------------------------------------------------------- /codes/scripts/start_tensorboard.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | #script to find the latest directory in a directory and start tensorboard from there 4 | 5 | 6 | def get_latest_dir(path): 7 | dirs = os.listdir(path) 8 | dirs = [os.path.join(path, d) for d in dirs] 9 | dirs = [d for d in dirs if os.path.isdir(d)] 10 | return max(dirs, key=os.path.getmtime) 11 | 12 | def start_tensorboard(path): 13 | latest_dir = get_latest_dir(path) 14 | os.path.join(latest_dir, 'tb_logger') 15 | os.system('tensorboard --logdir ' + latest_dir) 16 | 17 | if __name__ == '__main__': 18 | 
#process experiments folder 19 | print('Starting tensorboard from latest experiment folder:' + get_latest_dir('experiments') + '...') 20 | start_tensorboard('experiments') -------------------------------------------------------------------------------- /codes/scripts/stitch_images.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | import torch 4 | import torchvision 5 | from PIL import Image 6 | from torchvision.transforms import ToTensor 7 | 8 | if __name__ == '__main__': 9 | imfolder = 'F:\\dlas\\results\\test_diffusion_unet\\imgset5' 10 | cols, rows = 10, 5 11 | images = glob.glob(f'{imfolder}/*.png') 12 | output = None 13 | for r in range(rows): 14 | for c in range(cols): 15 | im = ToTensor()(Image.open(next(images))) 16 | if output is None: 17 | c, h, w = im.shape 18 | output = torch.zeros(c, h * rows, w * cols) 19 | output[:,r*h:(r+1)*h,c*w:(c+1)*w] = im 20 | torchvision.utils.save_image(output, "out.png") -------------------------------------------------------------------------------- /codes/scripts/stylegan2/dnnlib/tflib/network.py: -------------------------------------------------------------------------------- 1 | # Pretends to be the stylegan2 Network class for intercepting pickle load requests. 2 | # Horrible hack. Please don't judge me. 3 | 4 | # Globals for storing these networks because I have no idea how pickle is doing this internally. 5 | generator, discriminator, gen_ema = {}, {}, {} 6 | 7 | class Network: 8 | def __setstate__(self, state: dict) -> None: 9 | global generator, discriminator, gen_ema 10 | name = state['name'] 11 | if name in ['G_synthesis', 'G_mapping', 'G', 'G_main']: 12 | if name != 'G' and name not in generator.keys(): 13 | generator[name] = state 14 | else: 15 | gen_ema[name] = state 16 | elif name in ['D']: 17 | discriminator[name] = state 18 | -------------------------------------------------------------------------------- /codes/scripts/ui/image_labeler/label_editor.py: -------------------------------------------------------------------------------- 1 | import orjson 2 | 3 | from data.images.image_label_parser import VsNetImageLabeler 4 | 5 | 6 | # Translates from the label JSON output of the VS.NET UI to something more compact and usable. 7 | def convert_from_vsnet_labels(): 8 | labeler = VsNetImageLabeler(['F:\\4k6k\datasets\\ns_images\\512_unsupervised\\categories.json', 9 | 'F:\\4k6k\datasets\\ns_images\\512_unsupervised\\categories_new.json', 10 | 'F:\\4k6k\datasets\\ns_images\\512_unsupervised\\categories_new_new.json']) 11 | # Proposed format: 12 | # 'config': { 'dim' } 13 | # 'labels': [{ 'label', 'key'}] <- ordered by label index. 
14 | # 'images': {'file': [{ 'lid', 'top', 'left' }} 15 | # 'labelMap' {} 16 | out_dict = { 17 | 'config': { 18 | 'dim': next(iter(labeler.labeled_images.values()))[0]['patch_width'] 19 | }, 20 | 'labels': [{'label': cat['label'], 'key': cat['keyBinding']} for cat in labeler.categories.values()], 21 | } 22 | out_dict['labelMap'] = {} 23 | for i, lbl in enumerate(out_dict['labels']): 24 | out_dict['labelMap'][lbl['label']] = i 25 | out_dict['images'] = {} 26 | for fname, ilbls in labeler.labeled_images.items(): 27 | out_dict['images'][fname] = [{'lid': out_dict['labelMap'][il['label']], 'top': il['patch_top'], 'left': il['patch_left']} for il in ilbls] 28 | with open("label_editor.json", 'wb') as fout: 29 | fout.write(orjson.dumps(out_dict)) 30 | 31 | 32 | if __name__ == '__main__': 33 | convert_from_vsnet_labels() -------------------------------------------------------------------------------- /codes/scripts/ui/image_labeler/pretrained_image_patch_classifier.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os.path as osp 3 | 4 | import utils 5 | import utils.options as option 6 | import utils.util as util 7 | from data import create_dataset, create_dataloader 8 | from trainer.ExtensibleTrainer import ExtensibleTrainer 9 | 10 | 11 | class PretrainedImagePatchClassifier: 12 | def __init__(self, cfg): 13 | self.cfg = cfg 14 | 15 | opt = option.parse(cfg, is_train=False) 16 | opt = option.dict_to_nonedict(opt) 17 | utils.util.loaded_options = opt 18 | 19 | util.mkdirs( 20 | (path for key, path in opt['path'].items() 21 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) 22 | util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO, 23 | screen=True, tofile=True) 24 | logger = logging.getLogger('base') 25 | logger.info(option.dict2str(opt)) 26 | 27 | #### Create test dataset and dataloader 28 | dataset_opt = list(opt['datasets'].values())[0] 29 | # Remove labeling features from the dataset config and wrappers. 
30 | if 'dataset' in dataset_opt.keys(): 31 | if 'labeler' in dataset_opt['dataset'].keys(): 32 | dataset_opt['dataset']['includes_labels'] = False 33 | del dataset_opt['dataset']['labeler'] 34 | test_set = create_dataset(dataset_opt) 35 | if hasattr(test_set, 'wrapped_dataset'): 36 | test_set = test_set.wrapped_dataset 37 | else: 38 | test_set = create_dataset(dataset_opt) 39 | logger.info('Number of test images: {:d}'.format(len(test_set))) 40 | self.test_loader = create_dataloader(test_set, dataset_opt, opt) 41 | self.model = ExtensibleTrainer(opt) 42 | self.gen = self.model.netsG['generator'] 43 | self.dataset_dir = osp.join(opt['path']['results_root'], opt['name']) 44 | util.mkdir(self.dataset_dir) 45 | 46 | def get_next_sample(self): 47 | 48 | for data in self.test_loader: 49 | hq = data['hq'].to('cuda') 50 | res = self.gen(hq) 51 | yield hq, res, data 52 | 53 | -------------------------------------------------------------------------------- /codes/scripts/ui/image_labeler/test_image_patch_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | import torchvision 6 | 7 | import utils.options as option 8 | from scripts.ui.image_labeler.pretrained_image_patch_classifier import PretrainedImagePatchClassifier 9 | 10 | if __name__ == "__main__": 11 | #### options 12 | torch.backends.cudnn.benchmark = True 13 | want_metrics = False 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/train_imgset_structural_classifier.yml') 16 | 17 | classifier = PretrainedImagePatchClassifier(parser.parse_args().opt) 18 | label_to_search_for = 4 19 | step = 1 20 | for hq, res in classifier.get_next_sample(): 21 | res = torch.nn.functional.interpolate(res, size=hq.shape[2:], mode="nearest") 22 | res_lbl = res[:, label_to_search_for, :, :].unsqueeze(1) 23 | res_lbl_mask = (1.0 * (res_lbl > .5))*.5 + .5 24 | hq = hq * res_lbl_mask 25 | torchvision.utils.save_image(hq, os.path.join(classifier.dataset_dir, "%i.png" % (step,))) 26 | step += 1 27 | -------------------------------------------------------------------------------- /codes/scripts/use_generator_as_filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | from torch.utils.data import DataLoader 5 | 6 | from data.images.single_image_dataset import SingleImageDataset 7 | from tqdm import tqdm 8 | import torch 9 | 10 | from models.vqvae.vqvae_no_conv_transpose import VQVAE 11 | 12 | if __name__ == "__main__": 13 | bin_path = "f:\\binned" 14 | good_path = "f:\\good" 15 | os.makedirs(bin_path, exist_ok=True) 16 | os.makedirs(good_path, exist_ok=True) 17 | 18 | torch.backends.cudnn.benchmark = True 19 | 20 | model = VQVAE().cuda() 21 | model.load_state_dict(torch.load('../experiments/nvqvae_imgset.pth')) 22 | ds = SingleImageDataset({ 23 | 'name': 'amalgam', 24 | 'paths': ['F:\\4k6k\\datasets\\ns_images\\imagesets\\256_with_ref_v5'], 25 | 'weights': [1], 26 | 'target_size': 128, 27 | 'force_multiple': 32, 28 | 'scale': 1, 29 | 'eval': False 30 | }) 31 | dl = DataLoader(ds, batch_size=256, num_workers=1) 32 | 33 | means = [] 34 | model.eval() 35 | with torch.no_grad(): 36 | for i, data in enumerate(tqdm(dl)): 37 | hq = data['hq'].cuda() 38 | gen = model(hq)[0] 39 | l2 = torch.mean(torch.square(hq - gen), dim=[1,2,3]) 40 | for b in range(len(l2)): 41 | if l2[b] > .0004: 42 | shutil.copy(data['GT_path'][b], 
good_path) 43 | #else: 44 | # shutil.copy(data['GT_path'][b], bin_path) 45 | 46 | 47 | #means.append(l2.cpu()) 48 | #if i % 10 == 0: 49 | # print(torch.stack(means, dim=0).mean()) 50 | -------------------------------------------------------------------------------- /codes/scripts/validate_data.py: -------------------------------------------------------------------------------- 1 | # This script iterates through all the data with no worker threads and performs whatever transformations are prescribed. 2 | # The idea is to find bad/corrupt images. 3 | 4 | import math 5 | import argparse 6 | import random 7 | import torch 8 | from utils import util, options as option 9 | from data import create_dataloader, create_dataset 10 | from tqdm import tqdm 11 | from skimage import io 12 | 13 | def main(): 14 | #### options 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../../options/train_prog_mi1_rrdb_6bypass.yml') 17 | parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', 18 | help='job launcher') 19 | parser.add_argument('--local_rank', type=int, default=0) 20 | args = parser.parse_args() 21 | opt = option.parse(args.opt, is_train=True) 22 | 23 | #### distributed training settings 24 | opt['dist'] = False 25 | rank = -1 26 | 27 | # convert to NoneDict, which returns None for missing keys 28 | opt = option.dict_to_nonedict(opt) 29 | 30 | #### random seed 31 | seed = opt['train']['manual_seed'] 32 | if seed is None: 33 | seed = random.randint(1, 10000) 34 | util.set_random_seed(seed) 35 | 36 | torch.backends.cudnn.benchmark = True 37 | # torch.backends.cudnn.deterministic = True 38 | 39 | #### create train and val dataloader 40 | for phase, dataset_opt in opt['datasets'].items(): 41 | if phase == 'train': 42 | train_set = create_dataset(dataset_opt) 43 | train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) 44 | total_iters = int(opt['train']['niter']) 45 | total_epochs = int(math.ceil(total_iters / train_size)) 46 | dataset_opt['n_workers'] = 0 # Force num_workers=0 to make dataloader work in process. 
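            # Running the dataset in the main process means any exception raised by a corrupt image surfaces with
            # a stack trace that points directly at the offending sample rather than at a dataloader worker.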
47 | train_loader = create_dataloader(train_set, dataset_opt, opt, None) 48 | if rank <= 0: 49 | print('Number of training data elements: {:,d}, iters: {:,d}'.format( 50 | len(train_set), train_size)) 51 | assert train_loader is not None 52 | 53 | ''' 54 | tq_ldr = tqdm(train_set.get_paths()) 55 | for path in tq_ldr: 56 | try: 57 | _ = io.imread(path) 58 | # Do stuff with img 59 | except Exception as e: 60 | print("Error with %s" % (path,)) 61 | print(e) 62 | ''' 63 | tq_ldr = tqdm(train_set) 64 | for ds in tq_ldr: 65 | pass 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /codes/sweep.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import functools 3 | import os 4 | from multiprocessing.pool import ThreadPool 5 | 6 | import torch 7 | 8 | from train import Trainer 9 | from utils import options as option 10 | import collections.abc 11 | 12 | 13 | def deep_update(d, u): 14 | for k, v in u.items(): 15 | if isinstance(v, collections.abc.Mapping): 16 | d[k] = deep_update(d.get(k, {}), v) 17 | else: 18 | d[k] = v 19 | return d 20 | 21 | 22 | def launch_trainer(opt, opt_path, rank): 23 | os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) 24 | print('export CUDA_VISIBLE_DEVICES=' + str(rank)) 25 | trainer = Trainer() 26 | opt['dist'] = False 27 | trainer.rank = -1 28 | trainer.init(opt_path, opt, 'none') 29 | trainer.do_training() 30 | 31 | 32 | if __name__ == '__main__': 33 | """ 34 | Ad-hoc script (hard coded; no command-line parameters) that spawns multiple separate trainers from a single options 35 | file, with a hard-coded set of modifications. 36 | """ 37 | base_opt = '../experiments/sweep_music_mel2vec.yml' 38 | modifications = { 39 | 'baseline': {}, 40 | 'lr1e3': {'steps': {'generator': {'optimizer_params': {'lr': {.001}}}}}, 41 | 'lr1e5': {'steps': {'generator': {'optimizer_params': {'lr': {.00001}}}}}, 42 | 'no_warmup': {'train': {'warmup_steps': 0}}, 43 | } 44 | base_rank = 4 45 | opt = option.parse(base_opt, is_train=True) 46 | all_opts = [] 47 | for i, (mod, mod_dict) in enumerate(modifications.items()): 48 | nd = copy.deepcopy(opt) 49 | deep_update(nd, mod_dict) 50 | nd['name'] = f'{nd["name"]}_{mod}' 51 | nd['wandb_run_name'] = mod 52 | base_path = nd['path']['log'] 53 | for k, p in nd['path'].items(): 54 | if isinstance(p, str) and base_path in p: 55 | nd['path'][k] = p.replace(base_path, f'{base_path}/{mod}') 56 | all_opts.append(nd) 57 | 58 | for i in range(1,len(modifications)): 59 | pid = os.fork() 60 | if pid == 0: 61 | rank = i 62 | break 63 | else: 64 | rank = 0 65 | launch_trainer(all_opts[rank], base_opt, rank+base_rank) 66 | -------------------------------------------------------------------------------- /codes/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/__init__.py -------------------------------------------------------------------------------- /codes/trainer/custom_training_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/custom_training_components/__init__.py -------------------------------------------------------------------------------- /codes/trainer/custom_training_components/stereoscopic.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.cuda.amp import autocast 3 | from models.flownet2.networks import Resample2d 4 | from models.flownet2 import flow2img 5 | from trainer.inject import Injector 6 | 7 | 8 | def create_stereoscopic_injector(opt, env): 9 | type = opt['type'] 10 | if type == 'stereoscopic_resample': 11 | return ResampleInjector(opt, env) 12 | elif type == 'stereoscopic_flow2image': 13 | return Flow2Image(opt, env) 14 | return None 15 | 16 | 17 | class ResampleInjector(Injector): 18 | def __init__(self, opt, env): 19 | super(ResampleInjector, self).__init__(opt, env) 20 | self.resample = Resample2d() 21 | self.flow = opt['flowfield'] 22 | 23 | def forward(self, state): 24 | with autocast(enabled=False): 25 | return {self.output: self.resample(state[self.input], state[self.flow])} 26 | 27 | 28 | # Converts a flowfield to an image representation for viewing purposes. 29 | # Uses flownet's implementation to do so. Which really sucks. TODO: just do my own implementation in the future. 30 | # Note: this is not differentiable and is only usable for debugging purposes. 31 | class Flow2Image(Injector): 32 | def __init__(self, opt, env): 33 | super(Flow2Image, self).__init__(opt, env) 34 | 35 | def forward(self, state): 36 | with torch.no_grad(): 37 | flo = state[self.input].cpu() 38 | bs, c, h, w = flo.shape 39 | flo = flo.permute(0, 2, 3, 1) # flow2img works in numpy space for some reason.. 40 | imgs = torch.empty_like(flo) 41 | flo = flo.numpy() 42 | for b in range(bs): 43 | img = flow2img(flo[b]) # Note that this returns the image in an integer format. 44 | img = torch.tensor(img, dtype=torch.float) / 255 45 | imgs[b] = img 46 | imgs = imgs.permute(0, 3, 1, 2) 47 | return {self.output: imgs} 48 | -------------------------------------------------------------------------------- /codes/trainer/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/eval/__init__.py -------------------------------------------------------------------------------- /codes/trainer/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | # Base class for an evaluator, which is responsible for feeding test data through a model and evaluating the response. 2 | import importlib 3 | import inspect 4 | import pkgutil 5 | import re 6 | import sys 7 | 8 | 9 | class Evaluator: 10 | def __init__(self, model, opt_eval, env, uses_all_ddp=True): 11 | self.model = model.module if hasattr(model, 'module') else model 12 | self.opt = opt_eval 13 | self.env = env 14 | self.uses_all_ddp = uses_all_ddp 15 | 16 | def perform_eval(self): 17 | return {} 18 | 19 | 20 | def format_evaluator_name(name): 21 | # Formats by converting from CamelCase to snake_case and removing trailing "_evaluator" 22 | name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 23 | name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() 24 | return name.replace("_evaluator", "") 25 | 26 | 27 | # Works by loading all python modules in the eval/ directory and sniffing out subclasses of Evaluator. 
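# As a purely hypothetical illustration of the naming rule: a subclass called MusicFidEvaluator placed in a module
# under trainer/eval/ would be discovered by the walk below and referenced from an options file as type 'music_fid',
# because format_evaluator_name() converts CamelCase to snake_case and strips the trailing "_evaluator".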
28 | def find_registered_evaluators(base_path="trainer/eval"): 29 | module_iter = pkgutil.walk_packages([base_path]) 30 | results = {} 31 | for mod in module_iter: 32 | if mod.ispkg: 33 | EXCLUSION_LIST = [] 34 | if mod.name not in EXCLUSION_LIST: 35 | results.update(find_registered_evaluators(f'{base_path}/{mod.name}')) 36 | else: 37 | mod_name = f'{base_path}/{mod.name}'.replace('/', '.') 38 | if 'eval_wer' in mod.name: continue # TODO: this causes an import error for PyCtcDecode. get rid of this if there's a need to use that evaluator. 39 | importlib.import_module(mod_name) 40 | classes = inspect.getmembers(sys.modules[mod_name], inspect.isclass) 41 | for name, obj in classes: 42 | if 'Evaluator' in [mro.__name__ for mro in inspect.getmro(obj)]: 43 | results[format_evaluator_name(name)] = obj 44 | return results 45 | 46 | 47 | class CreateEvaluatorError(Exception): 48 | def __init__(self, name, available): 49 | super().__init__(f'Could not find the specified evaluator name: {name}. Available evaluators:' 50 | f'{available}') 51 | 52 | 53 | def create_evaluator(model, opt_eval, env): 54 | evaluators = find_registered_evaluators() 55 | type = opt_eval['type'] 56 | if type not in evaluators.keys(): 57 | raise CreateEvaluatorError(type, list(evaluators.keys())) 58 | return evaluators[opt_eval['type']](model, opt_eval, env) 59 | -------------------------------------------------------------------------------- /codes/trainer/eval/fid.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import os.path as osp 5 | import torchvision 6 | import trainer.eval.evaluator as evaluator 7 | from pytorch_fid import fid_score 8 | from utils.util import opt_get 9 | 10 | # Evaluator that generates uniform noise to feed into a generator, then calculates a FID score on the results. 11 | class StyleTransferEvaluator(evaluator.Evaluator): 12 | def __init__(self, model, opt_eval, env): 13 | super().__init__(model, opt_eval, env, uses_all_ddp=False) 14 | self.batches_per_eval = opt_eval['batches_per_eval'] 15 | self.batch_sz = opt_eval['batch_size'] 16 | self.im_sz = opt_eval['image_size'] 17 | self.fid_real_samples = opt_eval['real_fid_path'] 18 | self.gen_output_index = opt_eval['gen_index'] if 'gen_index' in opt_eval.keys() else 0 19 | self.noise_type = opt_get(opt_eval, ['noise_type'], 'imgnoise') 20 | self.latent_dim = opt_get(opt_eval, ['latent_dim'], 512) # Not needed if using 'imgnoise' input. 
21 | self.image_norm_range = tuple(opt_get(env['opt'], ['image_normalization_range'], [0,1])) 22 | 23 | def perform_eval(self): 24 | fid_fake_path = osp.join(self.env['base_path'], "../", "fid", str(self.env["step"])) 25 | os.makedirs(fid_fake_path, exist_ok=True) 26 | counter = 0 27 | self.model.eval() 28 | for i in range(self.batches_per_eval): 29 | if self.noise_type == 'imgnoise': 30 | batch = torch.FloatTensor(self.batch_sz, 3, self.im_sz, self.im_sz).uniform_(0., 1.).to(self.env['device']) 31 | elif self.noise_type == 'stylenoise': 32 | batch = torch.randn(self.batch_sz, self.latent_dim).to(self.env['device']) 33 | gen = self.model(batch) 34 | if not isinstance(gen, list) and not isinstance(gen, tuple): 35 | gen = [gen] 36 | gen = gen[self.gen_output_index] 37 | gen = (gen - self.image_norm_range[0]) / (self.image_norm_range[1]-self.image_norm_range[0]) 38 | for b in range(self.batch_sz): 39 | torchvision.utils.save_image(gen[b], osp.join(fid_fake_path, "%i_.png" % (counter))) 40 | counter += 1 41 | self.model.train() 42 | 43 | print("Got all images, computing fid") 44 | return {"fid": fid_score.calculate_fid_given_paths([self.fid_real_samples, fid_fake_path], self.batch_sz, True, 45 | 2048)} 46 | -------------------------------------------------------------------------------- /codes/trainer/eval/flow_gaussian_nll.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from tqdm import tqdm 4 | 5 | import trainer.eval.evaluator as evaluator 6 | 7 | # Evaluate how close to true Gaussian a flow network predicts in a "normal" pass given a LQ/HQ image pair. 8 | from data.images.image_folder_dataset import ImageFolderDataset 9 | from models.image_generation.srflow.flow import GaussianDiag 10 | 11 | 12 | class FlowGaussianNll(evaluator.Evaluator): 13 | def __init__(self, model, opt_eval, env): 14 | super().__init__(model, opt_eval, env, uses_all_ddp=False) 15 | self.batch_sz = opt_eval['batch_size'] 16 | self.dataset = ImageFolderDataset(opt_eval['dataset']) 17 | self.dataloader = DataLoader(self.dataset, self.batch_sz) 18 | 19 | def perform_eval(self): 20 | total_zs = 0 21 | z_loss = 0 22 | self.model.eval() 23 | with torch.no_grad(): 24 | print("Evaluating FlowGaussianNll..") 25 | for batch in tqdm(self.dataloader): 26 | dev = self.env['device'] 27 | z, _, _ = self.model(gt=batch['hq'].to(dev), 28 | lr=batch['lq'].to(dev), 29 | epses=[], 30 | reverse=False, 31 | add_gt_noise=False) 32 | for z_ in z: 33 | z_loss += GaussianDiag.logp(None, None, z_).mean() 34 | total_zs += 1 35 | self.model.train() 36 | return {"gaussian_diff": z_loss / total_zs} 37 | -------------------------------------------------------------------------------- /codes/trainer/eval/mel_evaluator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import trainer.eval.evaluator as evaluator 4 | 5 | from data import create_dataset 6 | from data.audio.nv_tacotron_dataset import TextMelCollate 7 | from models.audio.tts.tacotron2 import Tacotron2LossRaw 8 | from torch.utils.data import DataLoader 9 | from tqdm import tqdm 10 | 11 | 12 | # Evaluates the performance of a MEL spectrogram predictor. 
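# Registered under the type name 'mel' (per format_evaluator_name), and the opt_eval dict it receives needs at least
# 'batch_size' and a 'dataset' spec, e.g. roughly {'type': 'mel', 'batch_size': 16, 'dataset': {...}} -- values here
# are illustrative; how the dict is nested inside the training YAML is not shown in this file.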
13 | class MelEvaluator(evaluator.Evaluator): 14 | def __init__(self, model, opt_eval, env): 15 | super().__init__(model, opt_eval, env, uses_all_ddp=True) 16 | self.batch_sz = opt_eval['batch_size'] 17 | self.dataset = create_dataset(opt_eval['dataset']) 18 | assert self.batch_sz is not None 19 | self.dataloader = DataLoader(self.dataset, self.batch_sz, shuffle=False, num_workers=1, collate_fn=TextMelCollate(n_frames_per_step=1)) 20 | self.criterion = Tacotron2LossRaw() 21 | 22 | def perform_eval(self): 23 | counter = 0 24 | total_error = 0 25 | self.model.eval() 26 | for batch in tqdm(self.dataloader): 27 | model_params = { 28 | 'text_inputs': 'padded_text', 29 | 'text_lengths': 'input_lengths', 30 | 'mels': 'padded_mel', 31 | 'output_lengths': 'output_lengths', 32 | } 33 | params = {k: batch[v].to(self.env['device']) for k, v in model_params.items()} 34 | with torch.no_grad(): 35 | pred = self.model(**params) 36 | 37 | targets = ['padded_mel', 'padded_gate'] 38 | targets = [batch[t].to(self.env['device']) for t in targets] 39 | total_error += self.criterion(pred, targets).item() 40 | counter += 1 41 | self.model.train() 42 | 43 | return {"validation-score": total_error / counter} 44 | 45 | -------------------------------------------------------------------------------- /codes/trainer/eval/sr_diffusion_fid.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import os.path as osp 4 | import torchvision 5 | from torch.nn.functional import interpolate 6 | from tqdm import tqdm 7 | 8 | import trainer.eval.evaluator as evaluator 9 | 10 | from pytorch_fid import fid_score 11 | from data import create_dataset 12 | from torch.utils.data import DataLoader, DistributedSampler, SequentialSampler 13 | 14 | from trainer.injectors.gaussian_diffusion_injector import GaussianDiffusionInferenceInjector 15 | from utils.util import opt_get 16 | 17 | 18 | # Performs a FID evaluation on a diffusion network 19 | class SrDiffusionFidEvaluator(evaluator.Evaluator): 20 | def __init__(self, model, opt_eval, env): 21 | super().__init__(model, opt_eval, env) 22 | self.batch_sz = opt_eval['batch_size'] 23 | self.fid_batch_size = opt_get(opt_eval, ['fid_batch_size'], 64) 24 | assert self.batch_sz is not None 25 | self.dataset = create_dataset(opt_eval['dataset']) 26 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 27 | self.sampler = DistributedSampler(self.dataset, shuffle=False, drop_last=True) 28 | else: 29 | self.sampler = SequentialSampler(self.dataset) 30 | self.fid_real_samples = opt_eval['dataset']['paths'] # This is assumed to exist for the given dataset. 31 | assert isinstance(self.fid_real_samples, str) 32 | self.gd = GaussianDiffusionInferenceInjector(opt_eval['diffusion_params'], env) 33 | self.out_key = opt_eval['diffusion_params']['out'] 34 | 35 | def perform_eval(self): 36 | # Attempt to make the dataset deterministic. 37 | self.dataset.reset_random() 38 | dataloader = DataLoader(self.dataset, self.batch_sz, sampler=self.sampler, num_workers=0) 39 | 40 | fid_fake_path = osp.join(self.env['base_path'], "..", "fid", str(self.env["step"])) 41 | os.makedirs(fid_fake_path, exist_ok=True) 42 | counter = 0 43 | for batch in tqdm(dataloader): 44 | batch = {k: v.to(self.env['device']) if isinstance(v, torch.Tensor) else v for k, v in batch.items()} 45 | gen = self.gd(batch)[self.out_key] 46 | 47 | # All gather if we're in distributed mode. 
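            # Each rank generated images for its own shard of the sampler, so gather every rank's tensor before
            # rank 0 (below) writes the images to disk and computes FID over the combined set.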
48 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 49 | gather_list = [torch.zeros_like(gen) for _ in range(torch.distributed.get_world_size())] 50 | torch.distributed.all_gather(gather_list, gen) 51 | gen = torch.cat(gather_list, dim=0) 52 | 53 | if self.env['rank'] <= 0: 54 | for g in gen: 55 | torchvision.utils.save_image(g, osp.join(fid_fake_path, f"{counter}.png")) 56 | counter += 1 57 | 58 | if self.env['rank'] <= 0: 59 | return {"fid": fid_score.calculate_fid_given_paths([self.fid_real_samples, fid_fake_path], self.fid_batch_size, 60 | True, 2048)} 61 | else: 62 | return {} 63 | -------------------------------------------------------------------------------- /codes/trainer/experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/experiments/__init__.py -------------------------------------------------------------------------------- /codes/trainer/inject.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | import pkgutil 4 | import re 5 | import sys 6 | 7 | import torch.nn 8 | 9 | 10 | # Base class for all other injectors. 11 | class Injector(torch.nn.Module): 12 | def __init__(self, opt, env): 13 | super(Injector, self).__init__() 14 | self.opt = opt 15 | self.env = env 16 | if 'in' in opt.keys(): 17 | self.input = opt['in'] 18 | if 'out' in opt.keys(): 19 | self.output = opt['out'] 20 | 21 | # This should return a dict of new state variables. 22 | def forward(self, state): 23 | raise NotImplementedError 24 | 25 | 26 | def format_injector_name(name): 27 | # Formats by converting from CamelCase to snake_case and removing trailing "_injector" 28 | name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 29 | name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() 30 | return name.replace("_injector", "") 31 | 32 | 33 | # Works by loading all python modules in the injectors/ directory and sniffing out subclasses of Injector. 34 | # field will be properly populated. 35 | def find_registered_injectors(base_path="trainer/injectors"): 36 | module_iter = pkgutil.walk_packages([base_path]) 37 | results = {} 38 | for mod in module_iter: 39 | if mod.ispkg: 40 | EXCLUSION_LIST = [] 41 | if mod.name not in EXCLUSION_LIST: 42 | results.update(find_registered_injectors(f'{base_path}/{mod.name}')) 43 | else: 44 | mod_name = f'{base_path}/{mod.name}'.replace('/', '.') 45 | importlib.import_module(mod_name) 46 | classes = inspect.getmembers(sys.modules[mod_name], inspect.isclass) 47 | for name, obj in classes: 48 | if 'Injector' in [mro.__name__ for mro in inspect.getmro(obj)]: 49 | results[format_injector_name(name)] = obj 50 | return results 51 | 52 | 53 | class CreateInjectorError(Exception): 54 | def __init__(self, name, available): 55 | super().__init__(f'Could not find the specified injector name: {name}. Available injectors:' 56 | f'{available}') 57 | 58 | 59 | # Injectors are a way to synthesize data within a step that can then be used (and reused) by loss functions. 
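# Illustrative sketch only (not part of the original file): a minimal Injector
# subclass. Assuming the step config supplies 'in', 'out' and a hypothetical
# 'scale' key, it reads state[self.input], scales it, and publishes the result
# under state[self.output]. Real injectors live under trainer/injectors/ so
# that find_registered_injectors() can discover them.
class ExampleScaleInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.scale = opt.get('scale', 1.0)

    def forward(self, state):
        return {self.output: state[self.input] * self.scale}
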
60 | def create_injector(opt_inject, env): 61 | injectors = find_registered_injectors() 62 | type = opt_inject['type'] 63 | if type not in injectors.keys(): 64 | raise CreateInjectorError(type, list(injectors.keys())) 65 | return injectors[opt_inject['type']](opt_inject, env) 66 | -------------------------------------------------------------------------------- /codes/trainer/injectors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/trainer/injectors/__init__.py -------------------------------------------------------------------------------- /codes/trainer/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from utils.colors import rgb2ycbcr 5 | 6 | 7 | class CharbonnierLoss(nn.Module): 8 | """Charbonnier Loss (L1)""" 9 | 10 | def __init__(self, eps=1e-6): 11 | super(CharbonnierLoss, self).__init__() 12 | self.eps = eps 13 | 14 | def forward(self, x, y): 15 | diff = x - y 16 | loss = torch.sum(torch.sqrt(diff * diff + self.eps)) 17 | return loss 18 | 19 | 20 | class ZeroSpreadLoss(nn.Module): 21 | def __init__(self): 22 | super(ZeroSpreadLoss, self).__init__() 23 | 24 | def forward(self, x, _): 25 | return 2 * torch.nn.functional.sigmoid(1 / torch.abs(torch.mean(x))) - 1 26 | 27 | 28 | # Define GAN loss: [vanilla | lsgan] 29 | class GANLoss(nn.Module): 30 | def __init__(self, gan_type, real_label_val=1.0, fake_label_val=0.0): 31 | super(GANLoss, self).__init__() 32 | self.gan_type = gan_type.lower() 33 | self.real_label_val = real_label_val 34 | self.fake_label_val = fake_label_val 35 | 36 | if self.gan_type in ['gan', 'ragan', 'pixgan', 'pixgan_fea', 'crossgan', 'crossgan_lrref']: 37 | self.loss = nn.BCEWithLogitsLoss() 38 | elif self.gan_type == 'lsgan': 39 | self.loss = nn.MSELoss() 40 | elif self.gan_type == 'max_spread': 41 | self.loss = ZeroSpreadLoss() 42 | else: 43 | raise NotImplementedError('GAN type [{:s}] is not found'.format(self.gan_type)) 44 | 45 | def get_target_label(self, input, target_is_real): 46 | if target_is_real: 47 | return torch.empty_like(input).fill_(self.real_label_val) 48 | else: 49 | return torch.empty_like(input).fill_(self.fake_label_val) 50 | 51 | def forward(self, input, target_is_real): 52 | if self.gan_type in ['pixgan', 'pixgan_fea', 'crossgan', 'crossgan_lrref'] and not isinstance(target_is_real, bool): 53 | target_label = target_is_real 54 | else: 55 | target_label = self.get_target_label(input, target_is_real) 56 | loss = self.loss(input.float(), target_label.float()) 57 | return loss 58 | -------------------------------------------------------------------------------- /codes/trainer/optimizers/sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Optimizer 3 | 4 | 5 | class SGDNoBiasMomentum(Optimizer): 6 | r""" 7 | Copy of pytorch implementation of SGD with a modification which turns off momentum for params marked 8 | with `is_norm` or `is_bias`. 
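Mark a parameter by setting `p.is_norm = True` or `p.is_bias = True` before constructing the optimizer; unmarked parameters are updated exactly as in torch.optim.SGD.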
9 | """ 10 | 11 | def __init__(self, params, lr, momentum=0, dampening=0, 12 | weight_decay=0, nesterov=False): 13 | if lr < 0.0: 14 | raise ValueError("Invalid learning rate: {}".format(lr)) 15 | if momentum < 0.0: 16 | raise ValueError("Invalid momentum value: {}".format(momentum)) 17 | if weight_decay < 0.0: 18 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 19 | 20 | defaults = dict(lr=lr, momentum=momentum, dampening=dampening, 21 | weight_decay=weight_decay, nesterov=nesterov) 22 | if nesterov and (momentum <= 0 or dampening != 0): 23 | raise ValueError("Nesterov momentum requires a momentum and zero dampening") 24 | super().__init__(params, defaults) 25 | 26 | def __setstate__(self, state): 27 | super().__setstate__(state) 28 | for group in self.param_groups: 29 | group.setdefault('nesterov', False) 30 | 31 | @torch.no_grad() 32 | def step(self, closure=None): 33 | """Performs a single optimization step. 34 | 35 | Arguments: 36 | closure (callable, optional): A closure that reevaluates the model 37 | and returns the loss. 38 | """ 39 | loss = None 40 | if closure is not None: 41 | with torch.enable_grad(): 42 | loss = closure() 43 | 44 | for group in self.param_groups: 45 | weight_decay = group['weight_decay'] 46 | momentum = group['momentum'] 47 | dampening = group['dampening'] 48 | nesterov = group['nesterov'] 49 | 50 | for p in group['params']: 51 | if p.grad is None: 52 | continue 53 | d_p = p.grad 54 | if weight_decay != 0: 55 | d_p = d_p.add(p, alpha=weight_decay) 56 | # **this is the only modification over standard torch.optim.SGD: 57 | is_bn_or_bias = (hasattr(p, 'is_norm') and p.is_norm) or (hasattr(p, 'is_bias') and p.is_bias) 58 | if not is_bn_or_bias and momentum != 0: 59 | param_state = self.state[p] 60 | if 'momentum_buffer' not in param_state: 61 | buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() 62 | else: 63 | buf = param_state['momentum_buffer'] 64 | buf.mul_(momentum).add_(d_p, alpha=1 - dampening) 65 | if nesterov: 66 | d_p = d_p.add(buf, alpha=momentum) 67 | else: 68 | d_p = buf 69 | 70 | p.add_(d_p, alpha=-group['lr']) 71 | 72 | return loss 73 | -------------------------------------------------------------------------------- /codes/use_discriminator_as_filter.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import logging 3 | import time 4 | import argparse 5 | 6 | import os 7 | 8 | from torchvision.transforms import CenterCrop 9 | 10 | from trainer.ExtensibleTrainer import ExtensibleTrainer 11 | from utils import options as option 12 | import utils.util as util 13 | from data import create_dataset, create_dataloader 14 | from tqdm import tqdm 15 | import torch 16 | import torchvision 17 | 18 | 19 | if __name__ == "__main__": 20 | bin_path = "f:\\tmp\\binned" 21 | good_path = "f:\\tmp\\good" 22 | os.makedirs(bin_path, exist_ok=True) 23 | os.makedirs(good_path, exist_ok=True) 24 | 25 | 26 | torch.backends.cudnn.benchmark = True 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/train_quality_detectors/train_resnet_jpeg.yml') 29 | opt = option.parse(parser.parse_args().opt, is_train=False) 30 | opt = option.dict_to_nonedict(opt) 31 | opt['dist'] = False 32 | 33 | util.mkdirs( 34 | (path for key, path in opt['path'].items() 35 | if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) 36 | util.setup_logger('base', opt['path']['log'], 
'test_' + opt['name'], level=logging.INFO, 37 | screen=True, tofile=True) 38 | logger = logging.getLogger('base') 39 | logger.info(option.dict2str(opt)) 40 | 41 | #### Create test dataset and dataloader 42 | test_loaders = [] 43 | for phase, dataset_opt in sorted(opt['datasets'].items()): 44 | test_set = create_dataset(dataset_opt) 45 | test_loader = create_dataloader(test_set, dataset_opt, opt=opt) 46 | logger.info('Number of test images in [{:s}]: {:d}'.format(dataset_opt['name'], len(test_set))) 47 | test_loaders.append(test_loader) 48 | 49 | model = ExtensibleTrainer(opt) 50 | fea_loss = 0 51 | for test_loader in test_loaders: 52 | test_set_name = test_loader.dataset.opt['name'] 53 | logger.info('\nTesting [{:s}]...'.format(test_set_name)) 54 | test_start_time = time.time() 55 | dataset_dir = osp.join(opt['path']['results_root'], test_set_name) 56 | util.mkdir(dataset_dir) 57 | 58 | tq = tqdm(test_loader) 59 | removed = 0 60 | means = [] 61 | for k, data in enumerate(tq): 62 | model.feed_data(data, k) 63 | model.test() 64 | results = torch.argmax(torch.nn.functional.softmax(model.eval_state['logits'][0], dim=-1), dim=1) 65 | for i in range(results.shape[0]): 66 | if results[i] == 0: 67 | imname = osp.basename(data['HQ_path'][i]) 68 | # For VERIFICATION: 69 | #torchvision.utils.save_image(data['hq'][i], osp.join(bin_path, imname)) 70 | # 4 REALZ: 71 | os.remove(data['HQ_path'][i]) 72 | removed += 1 73 | 74 | print("Removed %i/%i images" % (removed, len(test_set))) -------------------------------------------------------------------------------- /codes/utils/UI_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/utils/UI_icon.png -------------------------------------------------------------------------------- /codes/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/codes/utils/__init__.py -------------------------------------------------------------------------------- /codes/utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None): 6 | fig, axs = plt.subplots(1, 1) 7 | axs.set_title(title or "Spectrogram (db)") 8 | axs.set_ylabel(ylabel) 9 | axs.set_xlabel("frame") 10 | im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect) 11 | if xmax: 12 | axs.set_xlim((0, xmax)) 13 | fig.colorbar(im, ax=axs) 14 | plt.show(block=False) -------------------------------------------------------------------------------- /codes/utils/convert_model.py: -------------------------------------------------------------------------------- 1 | # Tool that can be used to add a new layer into an existing model save file. Primarily useful for "progressive" 2 | # models which can be trained piecemeal. 
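# Edit the hard-coded option-file paths in the __main__ block below, then run this script directly; it writes converted_g.pth, converted_d.pth and converted_state.pth to the current working directory.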
3 | 4 | from utils import options as option 5 | from models import create_model 6 | import torch 7 | import os 8 | 9 | def get_model_for_opt_file(filename): 10 | opt = option.parse(filename, is_train=True) 11 | opt = option.dict_to_nonedict(opt) 12 | model = create_model(opt) 13 | return model, opt 14 | 15 | def copy_state_dict_list(l_from, l_to): 16 | for i, v in enumerate(l_from): 17 | if isinstance(v, list): 18 | copy_state_dict_list(v, l_to[i]) 19 | elif isinstance(v, dict): 20 | copy_state_dict(v, l_to[i]) 21 | else: 22 | l_to[i] = v 23 | 24 | def copy_state_dict(dict_from, dict_to): 25 | for k in dict_from.keys(): 26 | if k == 'optimizers': 27 | for j in range(len(dict_from[k][0]['param_groups'])): 28 | for p in dict_to[k][0]['param_groups'][j]['params']: 29 | del dict_to[k][0]['state'] 30 | dict_to[k][0]['param_groups'][j] = dict_from[k][0]['param_groups'][j] 31 | dict_to[k][0]['state'].update(dict_from[k][0]['state']) 32 | print(len(dict_from[k][0].keys()), dict_from[k][0].keys()) 33 | print(len(dict_to[k][0].keys()), dict_to[k][0].keys()) 34 | assert k in dict_to.keys() 35 | if isinstance(dict_from[k], dict): 36 | copy_state_dict(dict_from[k], dict_to[k]) 37 | elif isinstance(dict_from[k], list): 38 | copy_state_dict_list(dict_from[k], dict_to[k]) 39 | else: 40 | dict_to[k] = dict_from[k] 41 | return dict_to 42 | 43 | if __name__ == "__main__": 44 | os.chdir("..") 45 | model_from, opt_from = get_model_for_opt_file("../options/train_imgset_pixgan_progressive_srg2.yml") 46 | model_to, _ = get_model_for_opt_file("../options/train_imgset_pixgan_progressive_srg2_.yml") 47 | 48 | ''' 49 | model_to.netG.module.update_for_step(1000000000000) 50 | l = torch.nn.MSELoss() 51 | o, _ = model_to.netG(torch.randn(1, 3, 64, 64)) 52 | l(o, torch.randn_like(o)).backward() 53 | model_to.optimizer_G.step() 54 | o = model_to.netD(torch.randn(1, 3, 128, 128)) 55 | l(o, torch.randn_like(o)).backward() 56 | model_to.optimizer_D.step() 57 | ''' 58 | 59 | torch.save(copy_state_dict(model_from.netG.state_dict(), model_to.netG.state_dict()), "converted_g.pth") 60 | torch.save(copy_state_dict(model_from.netD.state_dict(), model_to.netD.state_dict()), "converted_d.pth") 61 | 62 | # Also convert the state. 63 | resume_state_from = torch.load(opt_from['path']['resume_state']) 64 | resume_state_to = model_to.save_training_state({}, return_state=True) 65 | resume_state_from['optimizers'][0]['param_groups'].append(resume_state_to['optimizers'][0]['param_groups'][-1]) 66 | torch.save(resume_state_from, "converted_state.pth") 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /codes/utils/distributed_checkpont.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import warnings 3 | 4 | 5 | def detach_variable(inputs): 6 | if isinstance(inputs, tuple): 7 | out = [] 8 | for inp in inputs: 9 | x = inp.detach() 10 | x.requires_grad = inp.requires_grad 11 | out.append(x) 12 | return tuple(out) 13 | else: 14 | raise RuntimeError( 15 | "Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__) 16 | 17 | 18 | def check_backward_validity(inputs): 19 | if not any(inp.requires_grad for inp in inputs): 20 | warnings.warn("None of the inputs have requires_grad=True. 
Gradients will be None") 21 | 22 | 23 | class CheckpointFunction(torch.autograd.Function): 24 | @staticmethod 25 | def forward(ctx, run_function, length, *args): 26 | ctx.run_function = run_function 27 | ctx.input_tensors = list(args[:length]) 28 | ctx.input_params = list(args[length:]) 29 | with torch.no_grad(): 30 | output_tensors = ctx.run_function(*ctx.input_tensors) 31 | return output_tensors 32 | 33 | @staticmethod 34 | def backward(ctx, *output_grads): 35 | for i in range(len(ctx.input_tensors)): 36 | temp = ctx.input_tensors[i] 37 | ctx.input_tensors[i] = temp.detach() 38 | ctx.input_tensors[i].requires_grad = temp.requires_grad 39 | with torch.enable_grad(): 40 | output_tensors = ctx.run_function(*ctx.input_tensors) 41 | input_grads = torch.autograd.grad(output_tensors, ctx.input_tensors + ctx.input_params, output_grads, allow_unused=True) 42 | return (None, None) + input_grads 43 | 44 | 45 | def checkpoint(module, *params): 46 | differentiable_params = tuple(filter(lambda p: p.requires_grad, module.parameters())) 47 | if len(differentiable_params) > 0: 48 | args = params + differentiable_params 49 | return CheckpointFunction.apply(module, len(params), *args) 50 | else: 51 | return module(*params) -------------------------------------------------------------------------------- /codes/utils/weight_scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | from matplotlib import pyplot as plt 3 | 4 | # Base class for weight schedulers. Holds weight at a fixed initial value. 5 | class WeightScheduler: 6 | def __init__(self, initial_weight): 7 | self.initial_weight = initial_weight 8 | 9 | def get_weight_for_step(self, step): 10 | return self.initial_weight 11 | 12 | 13 | class LinearDecayWeightScheduler(WeightScheduler): 14 | def __init__(self, initial_weight, steps_to_decay, lower_bound, initial_step=0): 15 | super(LinearDecayWeightScheduler, self).__init__(initial_weight) 16 | self.steps_to_decay = steps_to_decay 17 | self.lower_bound = lower_bound 18 | self.initial_step = initial_step 19 | self.decrease_per_step = (initial_weight - lower_bound) / self.steps_to_decay 20 | 21 | def get_weight_for_step(self, step): 22 | step = step - self.initial_step 23 | if step < 0: 24 | return self.initial_weight 25 | return max(self.lower_bound, self.initial_weight - step * self.decrease_per_step) 26 | 27 | 28 | class SinusoidalWeightScheduler(WeightScheduler): 29 | def __init__(self, upper_weight, lower_weight, period_steps, initial_step=0): 30 | super(SinusoidalWeightScheduler, self).__init__(upper_weight) 31 | self.center = (upper_weight + lower_weight) / 2 32 | self.amplitude = (upper_weight - lower_weight) / 2 33 | self.period = period_steps 34 | self.initial_step = initial_step 35 | 36 | def get_weight_for_step(self, step): 37 | step = step - self.initial_step 38 | if step < 0: 39 | return self.initial_weight 40 | # Use cosine because it starts at y=1 for x=0. 
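# i.e. weight(step) = center + amplitude * cos(2 * pi * step / period), so step 0 yields upper_weight.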
41 | return math.cos(step * math.pi * 2 / self.period) * self.amplitude + self.center 42 | 43 | 44 | def get_scheduler_for_opt(opt): 45 | if opt['type'] == 'fixed': 46 | return WeightScheduler(opt['weight']) 47 | elif opt['type'] == 'linear_decay': 48 | return LinearDecayWeightScheduler(opt['initial_weight'], opt['steps'], opt['lower_bound'], opt['start_step']) 49 | elif opt['type'] == 'sinusoidal': 50 | return SinusoidalWeightScheduler(opt['upper_weight'], opt['lower_weight'], opt['period'], opt['start_step']) 51 | else: 52 | raise NotImplementedError 53 | 54 | 55 | # Do some testing. 56 | if __name__ == "__main__": 57 | #sched = SinusoidalWeightScheduler(1, .1, 50, 10) 58 | sched = LinearDecayWeightScheduler(10, 5000, .9, 2000) 59 | 60 | x = [] 61 | y = [] 62 | for s in range(8000): 63 | x.append(s) 64 | y.append(sched.get_weight_for_step(s)) 65 | plt.plot(x, y) 66 | plt.show() -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: DLAS 2 | channels: 3 | - conda-forge 4 | - nvidia 5 | - pytorch 6 | dependencies: 7 | - nvidia::cudatoolkit 8 | - git 9 | - numpy 10 | - pip 11 | - python=3.10.0 12 | - torchvision 13 | - torchaudio 14 | - pytorch::pytorch 15 | - pip: 16 | - -r codes/requirements.laxed.txt -------------------------------------------------------------------------------- /experiments/clips_mel_norms.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/experiments/clips_mel_norms.pth -------------------------------------------------------------------------------- /experiments/train_diffusion_vocoder_22k_level.yml: -------------------------------------------------------------------------------- 1 | path: 2 | pretrain_model_dvae: '../experiments/dvae.pth' 3 | strict_load: true 4 | #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state. 5 | networks: 6 | dvae: 7 | type: generator 8 | which_model_G: lucidrains_dvae 9 | kwargs: 10 | channels: 80 11 | codebook_dim: 512 12 | hidden_dim: 512 13 | kernel_size: 3 14 | num_layers: 2 15 | num_resnet_blocks: 3 16 | num_tokens: 8192 17 | positional_dims: 1 18 | use_transposed_convs: false 19 | -------------------------------------------------------------------------------- /experiments/train_gpt_tts_unified.yml: -------------------------------------------------------------------------------- 1 | path: 2 | #pretrain_model_dvae: '../experiments/dvae.pth' 3 | strict_load: true 4 | #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state. 
5 | networks: 6 | gpt: 7 | type: generator 8 | which_model_G: unified_voice2 9 | kwargs: 10 | layers: 30 # WAS 8 11 | model_dim: 1024 # WAS 512 12 | heads: 16 # WAS 8 13 | max_text_tokens: 402 # WAS 120 14 | max_mel_tokens: 604 # WAS 250 15 | max_conditioning_inputs: 2 # WAS 1 16 | mel_length_compression: 1024 17 | number_text_tokens: 256 # supposed to be 255 for newer unified_voice files 18 | number_mel_codes: 8194 19 | start_mel_token: 8192 20 | stop_mel_token: 8193 21 | start_text_token: 255 22 | train_solo_embeddings: False # missing in uv3/4 23 | use_mel_codes_as_input: True # ditto 24 | checkpointing: True 25 | freeze_everything_but_position_embeddings: True 26 | tortoise_compat: True 27 | -------------------------------------------------------------------------------- /recipes/byol/README.md: -------------------------------------------------------------------------------- 1 | # Working with BYOL in DLAS 2 | 3 | [BYOL](https://arxiv.org/abs/2006.07733) is a technique for pretraining an arbitrary image processing 4 | neural network. It is built upon previous self-supervised architectures like SimCLR. 5 | 6 | BYOL in DLAS is adapted from an implementation written by [lucidrains](https://github.com/lucidrains/byol-pytorch). 7 | It is implemented via two wrappers: 8 | 9 | 1. A Dataset wrapper that augments the LQ and HQ inputs from a typical DLAS dataset. Since differentiable 10 | augmentations don't actually matter for BYOL, it makes more sense (to me) to do this on the CPU at the 11 | dataset layer, so your GPU can focus on processing gradients. 12 | 1. A model wrapper that attaches a small MLP to the end of your input network to produce a fixed 13 | size latent. This latent is used to produce the BYOL loss which trains the master weights from 14 | your network. 15 | 16 | Thanks to the excellent implementation from lucidrains, this wrapping process makes training your 17 | network on unsupervised datasets extremely easy. 18 | 19 | The DLAS version improves on lucidrains implementation adding some important training details, such as 20 | a custom LARS optimizer implementation that aligns with the recommendations from the paper. By moving augmentation 21 | to the dataset level, additional augmentation options are unlocked - like being able to take two similar video frames 22 | as the image pair. 23 | 24 | # Training BYOL 25 | 26 | In this directory, you will find a sample training config for training BYOL on DIV2K. You will 27 | likely want to insert your own model architecture first. 28 | 29 | Run the trainer by: 30 | 31 | `python train.py -opt train_div2k_byol.yml` 32 | 33 | BYOL is data hungry, as most unsupervised training methods are. If you're providing your own dataset, make sure it is 34 | the hundreds of K-images or more! 35 | 36 | ## Using your own model 37 | 38 | Training your own model on this BYOL implementation is trivial: 39 | 1. Add your nn.Module model implementation to the models/ directory. 40 | 2. Register your model with `trainer/networks.py` as a generator. This file tells DLAS how to build your model from 41 | a set of configuration options. 42 | 3. Copy the sample training config. Change the `subnet` and `hidden_layer` params. 43 | 4. Run your config with `python train.py -opt `. 
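For orientation, here is a minimal, hypothetical subnet sketch (module and layer names are illustrative, not taken from this repo): the `subnet` option points at a registered network such as this one, and `hidden_layer` names the layer whose activations the BYOL wrapper hooks (here, `tail`).

```python
import torch.nn as nn

class ToyEncoder(nn.Module):
    """Illustrative only: any image network with a nameable hidden layer will do."""
    def __init__(self, nf=64):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, nf, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(nf, nf * 2, 3, stride=2, padding=1), nn.ReLU(),
        )
        self.tail = nn.AdaptiveAvgPool2d(1)  # <-- referenced by `hidden_layer: tail`

    def forward(self, x):
        return self.tail(self.features(x)).flatten(1)
```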
44 | 45 | *hint: Your network architecture (including layer names) is printed out when running train.py 46 | against your network.* -------------------------------------------------------------------------------- /recipes/diffusion/README.md: -------------------------------------------------------------------------------- 1 | # Working with Gaussian Diffusion models in DLAS 2 | 3 | Diffusion Models are a method of generating structured data using a gradual de-noising process. This process allows a 4 | simple network training regime. 5 | 6 | This implementation of Gaussian Diffusion is largely based on the work done by OpenAI in their papers ["Diffusion Models 7 | Beat GANs on Image Synthesis"](https://arxiv.org/pdf/2105.05233.pdf) and ["Improved Denoising Diffusion Probabilistic 8 | Models"](https://arxiv.org/pdf/2102.09672). 9 | 10 | OpenAI open-sourced their reference implementations [here](https://github.com/openai/guided-diffusion). The diffusion 11 | model that DLAS trains uses the [gaussian_diffusion.py](https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/gaussian_diffusion.py) 12 | script from that repo for training and inference with these models. We also include the UNet from that repo as a model 13 | that can be used to train a diffusion network. 14 | 15 | Diffusion networks can be re-purposed to pretty much any image generation task, including super-resolution. Even though 16 | they are trained with MSE losses, they produce incredibly crisp images with FID scores competitive with the best GANs. 17 | More importantly, it is easy to track training progress since diffusion networks use a "normal" loss. 18 | 19 | Diffusion networks are unique in that, during inference, they perform multiple forward passes to generate a single image. 20 | During training, these networks are trained to denoise images over 4000 steps. At inference, this sample rate can be 21 | adjusted. For the purposes of super-resolution, I have found images sampled in 50 steps to be of very good quality. 22 | This still means that a diffusion generator is 50x slower than generators trained in other ways. 23 | 24 | What's more, I have found that diffusion networks can be trained with the tiled methodology used by ESRGAN: instead 25 | of training on whole images, you can train on tiles of larger images. At inference time, the network can be applied to 26 | larger images than it was initially trained on. I have found this works well on inference images within ~3x 27 | the training size. I have not tried larger, because the size of the UNet model means that inference at ultra-high 28 | resolutions is impossible (I run out of GPU memory). 29 | 30 | I have provided a reference configuration for training a diffusion model in this manner. The config performs a 2x 31 | upsampling to 256px, de-blurs the result and removes JPEG artifacts. The deblurring and image repairs are done on a configurable 32 | scale. The scale is a value in [0,1] passed to the model as `corruption_entropy`; `1` represents the maximum correction factor. 33 | You can try reducing the target size to 128px for faster training. It should work fine. 34 | 35 | Diffusion models also have a fairly arcane inference method. To help you along, I've provided an inference configuration 36 | that can be used with models trained in DLAS.
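To make the sampling-speed tradeoff concrete, here is a small sketch of the timestep-respacing idea behind the `respaced_timestep_spacing` option in the inference config that follows (an illustration only, not DLAS's or guided-diffusion's implementation): the sampler walks a subset of the 4000 training timesteps, so 50 evenly spaced steps means 50 forward passes per image instead of 4000.

```python
import numpy as np

def respaced_timesteps(n_train: int = 4000, n_sample: int = 50):
    """Pick n_sample evenly spaced timesteps out of the n_train steps the model
    was trained with, in descending order (pure noise -> clean image).
    Hypothetical helper for illustration only."""
    return np.linspace(0, n_train - 1, n_sample).round().astype(int)[::-1]

# Prints something like [3999 3917 3836 3754 3673]; the denoising loop then
# runs one model forward pass per entry.
print(respaced_timesteps()[:5])
```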
-------------------------------------------------------------------------------- /recipes/diffusion/test_diffusion_unet.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: test_diffusion_unet 3 | use_tb_logger: true 4 | model: extensibletrainer 5 | scale: 1 6 | gpu_ids: [0] 7 | start_step: -1 8 | checkpointing_enabled: true 9 | fp16: false 10 | wandb: false 11 | 12 | datasets: 13 | train: 14 | name: my_inference_images 15 | n_workers: 0 16 | batch_size: 1 17 | mode: imagefolder 18 | rgb_n1_to_1: true 19 | disable_flip: true 20 | force_square: false 21 | paths: 22 | scale: 1 23 | skip_lq: true 24 | fixed_parameters: 25 | # Specify correction factors here. For networks trained with the paired training configuration, the first number 26 | # is a JPEG correction factor, and the second number is a deblurring factor. Testing shows that if you attempt to 27 | # deblur too far, you get extremely distorted images. It's actually pretty cool - the network clearly knows how 28 | # much deblurring is appropriate. 29 | corruption_entropy: [.2, .5] 30 | 31 | networks: 32 | generator: 33 | type: generator 34 | which_model_G: unet_diffusion 35 | args: 36 | image_size: 256 37 | in_channels: 3 38 | num_corruptions: 2 39 | model_channels: 192 40 | out_channels: 6 41 | num_res_blocks: 2 42 | attention_resolutions: [8,16] 43 | dropout: 0 44 | channel_mult: [1,1,2,2,4,4] 45 | num_heads: 4 46 | num_heads_upsample: -1 47 | use_scale_shift_norm: true 48 | 49 | #### path 50 | path: 51 | pretrain_model_generator: 52 | strict_load: true 53 | 54 | steps: 55 | generator: 56 | training: generator 57 | injectors: 58 | visual_debug: 59 | type: gaussian_diffusion_inference 60 | generator: generator 61 | output_batch_size: 1 62 | output_scale_factor: 2 63 | respaced_timestep_spacing: 50 # This can be tweaked to perform inference faster or slower. 50-200 seems to be the sweet spot. At 4000 steps, the quality is actually worse often. 64 | undo_n1_to_1: true 65 | beta_schedule: 66 | schedule_name: linear 67 | num_diffusion_timesteps: 4000 68 | diffusion_args: 69 | model_mean_type: epsilon 70 | model_var_type: learned_range 71 | loss_type: mse 72 | model_input_keys: 73 | low_res: hq 74 | corruption_factor: corruption_entropy 75 | out: sample 76 | 77 | eval: 78 | output_state: sample -------------------------------------------------------------------------------- /recipes/esrgan/rrdb_process_video.yml: -------------------------------------------------------------------------------- 1 | name: video_process 2 | suffix: ~ # add suffix to saved images 3 | model: extensibletrainer 4 | scale: 4 5 | gpu_ids: [0] 6 | fp16: true 7 | minivid_crf: 12 # Defines the 'crf' output video quality parameter fed to FFMPEG 8 | frames_per_mini_vid: 360 # How many frames to process before generating a small video segment. Used to reduce number of images you must store to convert an entire video. 9 | minivid_start_no: 360 10 | recurrent_mode: false 11 | 12 | dataset: 13 | n_workers: 1 14 | name: myvideo 15 | video_file: # <-- Path to your video file here. any format supported by ffmpeg works. 16 | frame_rate: 30 # Set to the frame rate of your video. 17 | start_at_seconds: 0 # Set this if you want to start somewhere other than the beginning of the video. 18 | end_at_seconds: 5000 # Set to the time you want to stop at. 19 | batch_size: 1 # Set to the number of frames to convert at once. Larger batches provide a modest performance increase. 
20 | vertical_splits: 1 # Used for 3d binocular videos. Leave at 1. 21 | force_multiple: 1 22 | 23 | #### network structures 24 | networks: 25 | generator: 26 | type: generator 27 | which_model_G: RRDBNet 28 | in_nc: 3 29 | out_nc: 3 30 | initial_stride: 1 31 | nf: 64 32 | nb: 23 33 | scale: 4 34 | blocks_per_checkpoint: 3 35 | 36 | #### path 37 | path: 38 | pretrain_model_generator: # <-- Set your generator path here. 39 | 40 | steps: 41 | generator: 42 | training: generator 43 | generator: generator 44 | 45 | # Optimizer params. Not used, but currently required to initialize ExtensibleTrainer, even in eval mode. 46 | lr: !!float 5e-6 47 | weight_decay: 0 48 | beta1: 0.9 49 | beta2: 0.99 50 | 51 | injectors: 52 | gen_inj: 53 | type: generator 54 | generator: generator 55 | in: lq 56 | out: gen 57 | 58 | # Train section is required, even though we are just evaluating. 59 | train: 60 | niter: 500000 61 | warmup_iter: -1 62 | mega_batch_factor: 1 63 | val_freq: 500 64 | default_lr_scheme: MultiStepLR 65 | gen_lr_steps: [20000, 40000, 80000, 100000, 140000, 180000] 66 | lr_gamma: 0.5 67 | 68 | eval: 69 | output_state: gen -------------------------------------------------------------------------------- /recipes/glean/README.md: -------------------------------------------------------------------------------- 1 | # GLEAN 2 | 3 | DLAS contains an attempt at implementing [GLEAN](https://ckkelvinchan.github.io/papers/glean.pdf), which performs image 4 | super-resolution guided by pretrained StyleGAN networks. Since this paper is currently closed-source, it was 5 | implemented entirely on what information I could glean from the paper. 6 | 7 | ## Training 8 | 9 | GLEAN requires a pre-trained StyleGAN network to operate. DLAS currently only has support for StyleGAN2 models, so 10 | you will need to use one of those. The pre-eminent StyleGAN 2 model is the one trained on FFHQ faces, so I will use 11 | that in this training example. 12 | 13 | 1. Download the ffhq model from [nVidias Drive](https://drive.google.com/drive/folders/1yanUI9m4b4PWzR0eurKNq6JR1Bbfbh6L). 14 | This repo currently only supports the "-f.pkl" files without further modifications, so choose one of those. 15 | 1. Download and extract the [FFHQ dataset](https://github.com/NVlabs/ffhq-dataset). 16 | 1. Convert the TF model to a Pytorch one supported by DLAS: 17 | 18 | `python scripts/stylegan2/convert_weights_rosinality.py stylegan2-ffhq-config-f.pkl` 19 | 20 | 1. The above conversion script outputs a *.pth file as well as JPG preview of model outputs. Check the JPG to ensure 21 | the StyleGAN is performing as expected. If so, copy the *.pth file to your experiments/ directory within DLAS. 22 | 1. Edit the provided trainer configuration. Find comments starting with '<--' and make changes as indicated. 23 | 1. 
Train the model: 24 | 25 | `python train.py -opt train_ffhq_glean.yml` -------------------------------------------------------------------------------- /recipes/segformer/train_byol_segformer.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: train_byol_segformer 3 | use_tb_logger: true 4 | model: extensibletrainer 5 | distortion: sr 6 | scale: 1 7 | gpu_ids: [0] 8 | fp16: false 9 | start_step: -1 10 | checkpointing_enabled: false 11 | wandb: false 12 | 13 | datasets: 14 | train: 15 | n_workers: 1 16 | batch_size: 96 17 | mode: byol_dataset 18 | crop_size: 224 19 | key1: hq 20 | key2: hq 21 | dataset: 22 | mode: imagefolder 23 | paths: <> 24 | target_size: 224 25 | scale: 1 26 | fetch_alt_image: false 27 | skip_lq: true 28 | normalize: imagenet 29 | 30 | networks: 31 | generator: 32 | type: generator 33 | which_model_G: pixel_local_byol 34 | image_size: 224 35 | hidden_layer: tail 36 | subnet: 37 | which_model_G: segformer 38 | 39 | #### path 40 | path: 41 | strict_load: true 42 | #resume_state: <> 43 | 44 | steps: 45 | generator: 46 | training: generator 47 | optimizer: lars 48 | optimizer_params: 49 | # All parameters from appendix J of BYOL. 50 | lr: .08 # From BYOL: LR=.2*/256 51 | weight_decay: !!float 1.5e-6 52 | lars_coefficient: .001 53 | momentum: .9 54 | 55 | injectors: 56 | gen_inj: 57 | type: generator 58 | generator: generator 59 | in: aug1 60 | out: loss 61 | 62 | losses: 63 | byol_loss: 64 | type: direct 65 | key: loss 66 | weight: 1 67 | 68 | train: 69 | warmup_iter: -1 70 | mega_batch_factor: 2 71 | val_freq: 1000 72 | niter: 300000 73 | 74 | # Default LR scheduler options 75 | default_lr_scheme: CosineAnnealingLR_Restart 76 | T_period: [120000, 120000, 120000] 77 | warmup: 10000 78 | eta_min: .01 # Unspecified by the paper.. 79 | restarts: [140000, 280000] # Paper says no re-starts, but this scheduler will add them automatically if we don't set them. 80 | # likely I won't train this far. 81 | restart_weights: [.5, .25] 82 | 83 | 84 | eval: 85 | output_state: loss 86 | evaluators: 87 | single_point_pair_contrastive_eval: 88 | for: generator 89 | type: single_point_pair_contrastive_eval 90 | batch_size: 16 91 | quantity: 96 92 | similar_set_args: 93 | path: <> 94 | size: 256 95 | dissimilar_set_args: 96 | path: <> 97 | size: 256 98 | 99 | logger: 100 | print_freq: 30 101 | save_checkpoint_freq: 1000 102 | visuals: [hq, aug1] 103 | visual_debug_rate: 100 -------------------------------------------------------------------------------- /recipes/srflow/convert_official_weights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Quick script that can be used to convert from pretrained SRFlow weights to the variants used in this repo. The only 4 | # differences between the two is the variable naming conventions used by the RRDBNet. (FWIW this repo is using the 5 | # more up-to-date names that conform to Python standards). 
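# Edit the two filenames below to point at your pretrained SRFlow weights, then run the script; the renamed state_dict is written to `output`.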
6 | 7 | official_weight_file = 'SRFlow_CelebA_8X.pth' 8 | output = 'CelebA_converted.pth' 9 | 10 | sd = torch.load(official_weight_file) 11 | sdp = {} 12 | for k,v in sd.items(): 13 | k = k.replace('RRDB.RRDB_trunk', 'RRDB.body') 14 | k = k.replace('.RDB', '.rdb') 15 | k = k.replace('trunk_conv.', 'conv_body.') 16 | k = k.replace('.upconv', '.conv_up') 17 | k = k.replace('.HRconv', '.conv_hr') 18 | sdp[k] = v 19 | torch.save(sdp, output) 20 | -------------------------------------------------------------------------------- /recipes/srflow/train_div2k_rrdb_psnr.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: train_div2k_rrdb_psnr 3 | use_tb_logger: true 4 | model: extensibletrainer 5 | distortion: sr 6 | scale: 2 7 | gpu_ids: [0] 8 | fp16: false 9 | start_step: 0 10 | checkpointing_enabled: true # <-- Highly recommended for single-GPU training. Will not work with DDP. 11 | wandb: false 12 | 13 | datasets: 14 | train: 15 | n_workers: 4 16 | batch_size: 32 17 | name: div2k 18 | mode: single_image_extensible 19 | paths: /content/div2k # <-- Put your path here. 20 | target_size: 128 21 | force_multiple: 1 22 | scale: 4 23 | eval: False 24 | num_corrupts_per_image: 0 25 | strict: false 26 | val: 27 | name: val 28 | mode: fullimage 29 | dataroot_GT: /content/set14 30 | scale: 4 31 | force_multiple: 16 32 | 33 | networks: 34 | generator: 35 | type: generator 36 | which_model_G: RRDBNet 37 | in_nc: 3 38 | out_nc: 3 39 | nf: 64 40 | nb: 23 41 | scale: 4 42 | blocks_per_checkpoint: 3 43 | 44 | #### path 45 | path: 46 | #pretrain_model_generator: 47 | strict_load: true 48 | #resume_state: ../experiments/train_div2k_rrdb_psnr/training_state/0.state # <-- Set this to resume from a previous training state. 49 | 50 | steps: 51 | generator: 52 | training: generator 53 | 54 | optimizer_params: 55 | # Optimizer params 56 | lr: !!float 2e-4 57 | weight_decay: 0 58 | beta1: 0.9 59 | beta2: 0.99 60 | 61 | injectors: 62 | gen_inj: 63 | type: generator 64 | generator: generator 65 | in: lq 66 | out: gen 67 | 68 | losses: 69 | pix: 70 | type: pix 71 | weight: 1 72 | criterion: l1 73 | real: hq 74 | fake: gen 75 | 76 | train: 77 | niter: 500000 78 | warmup_iter: -1 79 | mega_batch_factor: 1 # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8]. 80 | val_freq: 2000 81 | 82 | # Default LR scheduler options 83 | default_lr_scheme: MultiStepLR 84 | gen_lr_steps: [50000, 100000, 150000, 200000] 85 | lr_gamma: 0.5 86 | 87 | eval: 88 | output_state: gen 89 | 90 | logger: 91 | print_freq: 30 92 | save_checkpoint_freq: 1000 93 | visuals: [gen, hq, lq] 94 | visual_debug_rate: 100 -------------------------------------------------------------------------------- /recipes/stylegan/README.md: -------------------------------------------------------------------------------- 1 | # StyleGAN Implementations 2 | DLAS supports two different StyleGAN2 implementations: 3 | 4 | - [@rosinality implementation](https://github.com/rosinality/stylegan2-pytorch/commits/master) 5 | Designed to reach parity with the nVidia reference implementation in TF1.5 6 | - [@lucidrains implementation](https://github.com/lucidrains/stylegan2-pytorch) 7 | Designed with simplicity and readability in mind. 8 | 9 | I prefer the readability of @lucidrains implementation, but you cannot (yet) use pretrained weights 10 | with it. I'm working on that. 
11 | -------------------------------------------------------------------------------- /recipes/tacotron2/test_tacotron2_lj.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: test_tacotron2_lj 3 | use_tb_logger: true 4 | gpu_ids: [0] 5 | start_step: -1 6 | fp16: false 7 | checkpointing_enabled: true 8 | wandb: false 9 | 10 | datasets: 11 | train: 12 | name: lj 13 | n_workers: 0 14 | batch_size: 1 15 | mode: nv_tacotron 16 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_train_filelist.txt 17 | 18 | networks: 19 | mel_gen: 20 | type: generator 21 | which_model_G: nv_tacotron2 22 | args: 23 | encoder_kernel_size: 5 24 | encoder_n_convolutions: 3 25 | encoder_embedding_dim: 512 26 | decoder_rnn_dim: 1024 27 | prenet_dim: 256 28 | max_decoder_steps: 1000 29 | attention_rnn_dim: 1024 30 | attention_dim: 128 31 | attention_location_n_filters: 32 32 | attention_location_kernel_size: 31 33 | postnet_embedding_dim: 512 34 | postnet_kernel_size: 5 35 | postnet_n_convolutions: 5 36 | waveglow: 37 | type: generator 38 | which_model_G: nv_waveglow 39 | args: 40 | n_mel_channels: 80 41 | n_flows: 12 42 | n_group: 8 43 | n_early_every: 4 44 | n_early_size: 2 45 | WN_config: 46 | n_layers: 8 47 | n_channels: 256 48 | kernel_size: 3 49 | 50 | #### path 51 | path: 52 | pretrain_model_mel_gen: ../experiments/train_tacotron2_lj/models/22000_mel_gen_ema.pth 53 | pretrain_model_waveglow: ../experiments/waveglow_256channels_universal_v5.pth 54 | strict_load: true 55 | #resume_state: ../experiments/train_imgset_unet_diffusion/training_state/54000.state 56 | 57 | steps: 58 | generator: 59 | training: mel_gen 60 | injectors: 61 | mel: 62 | type: generator 63 | generator: mel_gen 64 | in: [padded_text, input_lengths, padded_mel, output_lengths] 65 | out: [mel_outputs, mel_outputs_postnet, gate_outputs, alignments] 66 | wave: 67 | type: generator 68 | generator: waveglow 69 | method: infer 70 | in: mel_outputs 71 | out: waveform 72 | 73 | eval: 74 | output_state: waveform -------------------------------------------------------------------------------- /recipes/tacotron2/train_tacotron2_lj.yml: -------------------------------------------------------------------------------- 1 | #### general settings 2 | name: train_tacotron2_lj 3 | use_tb_logger: true 4 | gpu_ids: [0] 5 | start_step: -1 6 | fp16: false 7 | checkpointing_enabled: true 8 | wandb: false 9 | 10 | datasets: 11 | train: 12 | name: lj 13 | n_workers: 1 14 | batch_size: 72 15 | mode: nv_tacotron 16 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_train_filelist.txt 17 | 18 | networks: 19 | mel_gen: 20 | type: generator 21 | which_model_G: nv_tacotron2 22 | args: 23 | encoder_kernel_size: 5 24 | encoder_n_convolutions: 3 25 | encoder_embedding_dim: 512 26 | decoder_rnn_dim: 1024 27 | prenet_dim: 256 28 | max_decoder_steps: 1000 29 | attention_rnn_dim: 1024 30 | attention_dim: 128 31 | attention_location_n_filters: 32 32 | attention_location_kernel_size: 31 33 | postnet_embedding_dim: 512 34 | postnet_kernel_size: 5 35 | postnet_n_convolutions: 5 36 | 37 | #### path 38 | path: 39 | #pretrain_model_generator: ../experiments/diffusion_unet_128_imageset_22000.pt 40 | strict_load: true 41 | #resume_state: ../experiments/train_imgset_unet_diffusion/training_state/54000.state 42 | 43 | steps: 44 | generator: 45 | training: mel_gen 46 | 47 | optimizer: adamw 48 | optimizer_params: 49 | lr: !!float 1.2e-3 50 | weight_decay: !!float 1e-6 51 | beta1: 0.9 52 | beta2: 0.9999 53 | 
clip_grad_eps: 1.0 54 | 55 | injectors: 56 | mel: 57 | type: generator 58 | generator: mel_gen 59 | in: [padded_text, input_lengths, padded_mel, output_lengths] 60 | out: [mel_outputs, mel_outputs_postnet, gate_outputs, alignments] 61 | losses: 62 | tacotron_loss: 63 | type: nv_tacotron2_loss 64 | weight: 1 65 | mel_target_key: padded_mel 66 | mel_output_key: mel_outputs 67 | mel_output_postnet_key: mel_outputs_postnet 68 | gate_target_key: padded_gate 69 | gate_output_key: gate_outputs 70 | 71 | train: 72 | niter: 500000 73 | warmup_iter: -1 74 | mega_batch_factor: 3 75 | ema_rate: .999 76 | val_freq: 500 77 | 78 | default_lr_scheme: MultiStepLR 79 | gen_lr_steps: [ 50000, 100000, 150000 ] 80 | lr_gamma: 0.5 81 | 82 | eval: 83 | evaluators: 84 | val: 85 | type: mel 86 | for: mel_gen 87 | batch_size: 16 88 | dataset: 89 | mode: nv_tacotron 90 | path: E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_val_filelist.txt 91 | 92 | 93 | logger: 94 | print_freq: 30 95 | save_checkpoint_freq: 500 96 | visuals: [mel_outputs, padded_mel] 97 | is_mel_spectrogram: true 98 | visual_debug_rate: 100 -------------------------------------------------------------------------------- /recipes/vqvae2/README.md: -------------------------------------------------------------------------------- 1 | # VQVAE2 in Pytorch 2 | 3 | [VQVAE2](https://arxiv.org/pdf/1906.00446.pdf) is a generative autoencoder developed by Deepmind. It's unique innovation is 4 | discretizing the latent space into a fixed set of "codebook" vectors. This codebook 5 | can then be used in downstream tasks to rebuild images from the training set. 6 | 7 | This model is in DLAS thanks to work [@rosinality](https://github.com/rosinality) did 8 | [converting the Deepmind model](https://github.com/rosinality/vq-vae-2-pytorch) to Pytorch. 9 | 10 | # Training VQVAE2 11 | 12 | VQVAE2 is trained in two steps: 13 | 14 | ## Training the autoencoder 15 | 16 | This first step is to train the autoencoder itself. The config file `train_imgnet_vqvae_stage1.yml` provided shows how to do this 17 | for imagenet with the hyperparameters specified by deepmind. You'll need to bring your own imagenet folder for this. 18 | 19 | ## Training the PixelCNN encoder 20 | 21 | The second step is to train the PixelCNN model which will create "codebook" vectors given an 22 | input image. -------------------------------------------------------------------------------- /resources/bitsandbytes_windows/cextension.py: -------------------------------------------------------------------------------- 1 | import ctypes as ct 2 | from pathlib import Path 3 | from warnings import warn 4 | 5 | from .cuda_setup.main import evaluate_cuda_setup 6 | 7 | 8 | class CUDALibrary_Singleton(object): 9 | _instance = None 10 | 11 | def __init__(self): 12 | raise RuntimeError("Call get_instance() instead") 13 | 14 | def initialize(self): 15 | binary_name = evaluate_cuda_setup() 16 | package_dir = Path(__file__).parent 17 | binary_path = package_dir / binary_name 18 | 19 | if not binary_path.exists(): 20 | print(f"CUDA SETUP: TODO: compile library for specific version: {binary_name}") 21 | legacy_binary_name = "libbitsandbytes.so" 22 | print(f"CUDA SETUP: Defaulting to {legacy_binary_name}...") 23 | binary_path = package_dir / legacy_binary_name 24 | if not binary_path.exists(): 25 | print('CUDA SETUP: CUDA detection failed. 
Either CUDA driver not installed, CUDA not installed, or you have multiple conflicting CUDA libraries!') 26 | print('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.') 27 | raise Exception('CUDA SETUP: Setup Failed!') 28 | # self.lib = ct.cdll.LoadLibrary(binary_path) 29 | self.lib = ct.cdll.LoadLibrary(str(binary_path)) # $$$ 30 | else: 31 | print(f"CUDA SETUP: Loading binary {binary_path}...") 32 | # self.lib = ct.cdll.LoadLibrary(binary_path) 33 | self.lib = ct.cdll.LoadLibrary(str(binary_path)) # $$$ 34 | 35 | @classmethod 36 | def get_instance(cls): 37 | if cls._instance is None: 38 | cls._instance = cls.__new__(cls) 39 | cls._instance.initialize() 40 | return cls._instance 41 | 42 | 43 | lib = CUDALibrary_Singleton.get_instance().lib 44 | try: 45 | lib.cadam32bit_g32 46 | lib.get_context.restype = ct.c_void_p 47 | lib.get_cusparse.restype = ct.c_void_p 48 | COMPILED_WITH_CUDA = True 49 | except AttributeError: 50 | warn( 51 | "The installed version of bitsandbytes was compiled without GPU support. " 52 | "8-bit optimizers and GPU quantization are unavailable." 53 | ) 54 | COMPILED_WITH_CUDA = False 55 | -------------------------------------------------------------------------------- /resources/bitsandbytes_windows/libbitsandbytes_cpu.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/resources/bitsandbytes_windows/libbitsandbytes_cpu.dll -------------------------------------------------------------------------------- /resources/bitsandbytes_windows/libbitsandbytes_cuda116.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/resources/bitsandbytes_windows/libbitsandbytes_cuda116.dll -------------------------------------------------------------------------------- /sandbox.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from PIL import Image 4 | from pytorch_wavelets import DWTForward, DWTInverse 5 | import torch.nn.functional as F 6 | 7 | def load_img(path): 8 | im = Image.open(path).convert(mode="RGB") 9 | return torchvision.transforms.ToTensor()(im) 10 | 11 | def save_img(t, path): 12 | torchvision.utils.save_image(t, path) 13 | 14 | img = load_img("pu.jpg") 15 | img = img.unsqueeze(0) 16 | 17 | # Reshape image to be multiple of 32 18 | w, h = img.shape[2:] 19 | w = (w // 32) * 32 20 | h = (h // 32) * 32 21 | img = F.interpolate(img, size=(w, h)) 22 | print("Input shape:", img.shape) 23 | 24 | J_spec = 5 25 | 26 | Yl, Yh = DWTForward(J=J_spec, mode='periodization', wave='db3')(img) 27 | print(Yl.shape, [h.shape for h in Yh]) 28 | 29 | imgLR = F.interpolate(img, scale_factor=.5) 30 | LQYl, LQYh = DWTForward(J=J_spec-1, mode='periodization', wave='db3')(imgLR) 31 | print(LQYl.shape, [h.shape for h in LQYh]) 32 | 33 | for i in range(J_spec): 34 | smd = torch.sum(Yh[i], dim=2).cpu() 35 | save_img(smd, "high_%i.png" % (i,)) 36 | save_img(Yl, "lo.png") 37 | 38 | ''' 39 | Following code reconstructs the image with different high passes cancelled out. 
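Zeroing the i-th high-pass band removes detail at that scale; the final reconstruction instead replaces the low-pass band with its mean, which flattens global brightness while keeping detail.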
40 | ''' 41 | for i in range(J_spec): 42 | corrupted_im = [y for y in Yh] 43 | corrupted_im[i] = torch.zeros_like(corrupted_im[i]) 44 | im = DWTInverse(mode='periodization', wave='db3')((Yl, corrupted_im)) 45 | save_img(im, "corrupt_%i.png" % (i,)) 46 | im = DWTInverse(mode='periodization', wave='db3')((torch.full_like(Yl, fill_value=torch.mean(Yl)), Yh)) 47 | save_img(im, "corrupt_im.png") 48 | 49 | 50 | ''' 51 | Following code reconstructs a hybrid image with the first high pass from the HR and the rest of the data from the LR. 52 | highpass = [Yh[0]] + LQYh 53 | im = DWTInverse(mode='periodization', wave='db3')((LQYl, highpass)) 54 | save_img(im, "hybrid_lrhr.png") 55 | save_img(F.interpolate(imgLR, scale_factor=2), "upscaled.png") 56 | ''' -------------------------------------------------------------------------------- /static/drive_copied_file_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/drive_copied_file_tree.png -------------------------------------------------------------------------------- /static/export_to_gdrive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/export_to_gdrive.png -------------------------------------------------------------------------------- /static/file_directory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/file_directory.png -------------------------------------------------------------------------------- /static/good_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/good_gpu.png -------------------------------------------------------------------------------- /static/hyperparam_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/hyperparam_dataset.png -------------------------------------------------------------------------------- /static/ljspeech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/ljspeech.png -------------------------------------------------------------------------------- /static/notebook_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/notebook_header.png -------------------------------------------------------------------------------- /static/params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/params.png -------------------------------------------------------------------------------- /static/runtime_type.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/runtime_type.png -------------------------------------------------------------------------------- /static/settings_options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/settings_options.png -------------------------------------------------------------------------------- /static/stop_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/stop_training.png -------------------------------------------------------------------------------- /static/training_button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/training_button.png -------------------------------------------------------------------------------- /static/very_long_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/very_long_process.png -------------------------------------------------------------------------------- /static/very_recent_save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/very_recent_save.png -------------------------------------------------------------------------------- /static/warning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/warning.png -------------------------------------------------------------------------------- /static/yml_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/static/yml_file.png -------------------------------------------------------------------------------- /voice_samples/kk_500/kk_0_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_0.wav -------------------------------------------------------------------------------- /voice_samples/kk_500/kk_0_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_1.wav -------------------------------------------------------------------------------- /voice_samples/kk_500/kk_0_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500/kk_0_2.wav -------------------------------------------------------------------------------- /voice_samples/kk_500_emma/emma_0_0.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_0.wav -------------------------------------------------------------------------------- /voice_samples/kk_500_emma/emma_0_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_1.wav -------------------------------------------------------------------------------- /voice_samples/kk_500_emma/emma_0_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_500_emma/emma_0_2.wav -------------------------------------------------------------------------------- /voice_samples/kk_orig/kk_0_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_0.wav -------------------------------------------------------------------------------- /voice_samples/kk_orig/kk_0_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_1.wav -------------------------------------------------------------------------------- /voice_samples/kk_orig/kk_0_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/152334H/DL-Art-School/5ab4d9ed415e0a5c0b7ce9aae02aa2e76fe0eccd/voice_samples/kk_orig/kk_0_2.wav --------------------------------------------------------------------------------